# Assignment 1: NYC Taxi Data

ETL processing of the NYC TLC dataset. The process is written in four parts.

1. Extract data from S3
2. Transform datatypes and create new features
3. Clean data - remove trips with questionable data
4. Load data into parquet files

In [2]:
# Import required packages
from pyspark.sql import SparkSession
from pyspark.sql.types import BooleanType, DoubleType, IntegerType, StringType, StructType, StructField, TimestampType
import pyspark.sql.functions as F

In [3]:
# Run parameters for the ETL
bucket_name = "nyc-tlc"  # S3 bucket name with the required NYC TLC files
years = ["2017", "2018"]
# Only green cabs are processed; use ["yellow", "green"] to include both colours
tlc_colours = ["green"]
# Lookup CSV read from the bucket's misc/ folder.
# NOTE(review): the space in this name is kept exactly as written - confirm against the bucket listing
zone_lookup = "taxi _zone_lookup.csv"
# Columns cast from string to timestamp
dt_columns = ["pickup_datetime", "dropoff_datetime"]
# Columns cast from string to integer
int_columns = ["passenger_count", "year"]
# Columns cast from string to double
num_columns = [
    "trip_distance",
    "fare_amount",
    "extra",
    "mta_tax",
    "improvement_surcharge",
    "tip_amount",
    "tolls_amount",
    "ehail_fee",
    "total_amount",
]
# Columns (and their order) selected from every extract before it is unioned
initial_columns = [
    "VendorID",
    "pickup_datetime",
    "dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "PULocationID",
    "DOLocationID",
    "RatecodeID",
    "store_and_fwd_flag",
    "payment_type",
    "fare_amount",
    "extra",
    "mta_tax",
    "improvement_surcharge",
    "tip_amount",
    "tolls_amount",
    "ehail_fee",
    "total_amount",
    "trip_type",
    "taxi_type",
    "year",
    "filename",
]

In [4]:
# Get (or create) the Spark session named 'nyc-taxi-etl' used by the cells below
spark = (
    SparkSession.builder
    .appName('nyc-taxi-etl')
    .getOrCreate()
)

In [1]:
# Create function to read S3 bucket
# Want to expand this out to be able to create a list of files to extract
def bucket_contents_to_list(bucket, match=''):
    """Return the keys of the objects in an S3 bucket whose key contains ``match``.

    Args:
        bucket: Name of the S3 bucket to list.
        match: Substring a key must contain to be returned. The default empty
            string is contained in every key, so every key is returned.

    Returns:
        A list of the matching object keys, in the order the listing yields them.
    """
    # Imported locally because boto3 is not among this file's top-level
    # imports; without this the bare name raises NameError when called.
    import boto3

    bucket_resource = boto3.resource('s3').Bucket(bucket)
    return [obj.key for obj in bucket_resource.objects.all() if match in obj.key]

In [5]:
# Function to extract data from S3 bucket
def extract_data_from_bucket(bucket, year, colour):
    """Read the trip CSV files for one year and taxi colour from an S3 bucket.

    Files are matched with the pattern
    ``s3a://{bucket}/trip data/{colour}_tripdata_{year}-*.csv`` and read with
    their header row as column names.

    Args:
        bucket: Name of the S3 bucket holding the trip data.
        year: Year used in the file name pattern; also stored in a ``year`` column.
        colour: Taxi colour used in the file name pattern; also stored in a
            ``taxi_type`` column.

    Returns:
        The data frame read from the CSVs with ``taxi_type``, ``year`` and
        ``filename`` columns added. When ``colour`` is "yellow", constant
        ``trip_type`` and ``ehail_fee`` columns are added as well.
    """
    df = spark.read.csv(f"s3a://{bucket}/trip data/{colour}_tripdata_{year}-*.csv", header=True)
    # Tag every row with its taxi colour, year and the file it was read from
    df = (
        df.withColumn("taxi_type", F.lit(colour))
        .withColumn("year", F.lit(year))
        .withColumn("filename", F.input_file_name())
    )
    if colour == "yellow":
        # Add trip_type and ehail_fee so the columns match the green data frame.
        # withColumn requires a Column, so the constants are wrapped in F.lit
        # (a bare Python string is rejected).
        df = (
            df.withColumn("trip_type", F.lit("1"))
            .withColumn("ehail_fee", F.lit("0"))
        )
    return df

In [6]:
# Function to extract lookup data from NYC TLC
def extract_lookup_data_from_bucket(bucket, filename):
    """Read a lookup CSV from the ``misc`` folder of an S3 bucket.

    Args:
        bucket: Name of the S3 bucket.
        filename: Name of the CSV file inside the bucket's ``misc`` folder.

    Returns:
        The data frame read from the CSV, using its header row as column names.
    """
    lookup_path = f"s3a://{bucket}/misc/{filename}"
    return spark.read.csv(lookup_path, header=True)

In [7]:
# Generate an empty data frame for data loaded from CSV
def generate_empty_dataframe():
    """Build an empty data frame in which every column is a nullable string.

    Returns:
        A data frame with no rows and one nullable ``StringType`` column per
        name listed below, in that order.
    """
    column_names = (
        'VendorID',
        'pickup_datetime',
        'dropoff_datetime',
        'passenger_count',
        'trip_distance',
        'PULocationID',
        'DOLocationID',
        'RateCodeID',
        'store_and_fwd_flag',
        'payment_type',
        'fare_amount',
        'extra',
        'mta_tax',
        'improvement_surcharge',
        'tip_amount',
        'tolls_amount',
        'ehail_fee',
        'total_amount',
        'trip_type',
        'taxi_type',
        'year',
        'filename',
    )
    string_fields = [StructField(name, StringType(), True) for name in column_names]
    return spark.createDataFrame([], StructType(string_fields))

In [8]:
def get_trip_duration_category(time):
    """Bin a trip duration given in seconds into a labelled duration category.

    Args:
        time: Trip duration in seconds, or None when the duration is unknown.

    Returns:
        None when ``time`` is None; otherwise one of "Under 5 mins",
        "5-10 mins", "10-20 mins", "20-30 mins" or "Above 30 mins". Each lower
        bound is inclusive, so exactly 30 minutes falls in "Above 30 mins".
    """
    # A missing duration cannot be binned; pass the null through instead of
    # failing on None / 60.
    if time is None:
        return None
    minutes = time / 60
    # Each check is only reached when the earlier ones failed, so the lower
    # bound of every bin is already guaranteed.
    if minutes < 5:
        return "Under 5 mins"
    if minutes < 10:
        return "5-10 mins"
    if minutes < 20:
        return "10-20 mins"
    if minutes < 30:
        return "20-30 mins"
    return "Above 30 mins"

# Wrap the duration categoriser as a Spark UDF that returns a string
udf_get_trip_duration_category = F.udf(get_trip_duration_category, StringType())

In [9]:
# Function to extract the month from the filename - maybe more useful than relying on the month in the datetime fields
def get_month_from_filename(filename):
    """Return the text between the last "-" in ``filename`` and the next ".".

    For a name ending in ``..._2017-01.csv`` this is ``"01"``.

    Args:
        filename: File name or path to take the month from.

    Returns:
        The part of ``filename`` after its last "-" (the whole string when
        there is no "-"), cut off at the first "." that follows.
    """
    _, _, after_last_dash = filename.rpartition("-")
    before_first_dot, _, _ = after_last_dash.partition(".")
    return before_first_dot
    
# Wrap the filename month extractor as a Spark UDF that returns a string
udf_get_month_from_filename = F.udf(get_month_from_filename, StringType())

In [10]:
def get_airport_location(location):
    """Tell whether a service zone value marks an airport.

    Args:
        location: Service zone value to check.

    Returns:
        True when ``location`` equals "EWR" or "Airports", otherwise False.
    """
    return location in ("EWR", "Airports")
    
# Wrap the airport check as a Spark UDF that returns a boolean
udf_get_airport_location = F.udf(get_airport_location, BooleanType())

## Extract NYC Yellow and Green Taxi Cab Data

Extract data from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page)

In [11]:
# Extract trip data
# Start from an empty frame so that every extract can be appended with union
df = generate_empty_dataframe()

# Build one data frame holding the trips for every requested year and colour
for year in years:
    for tlc_colour in tlc_colours:
        # The source pickup/dropoff datetime columns are named per taxi colour
        if tlc_colour == "yellow":
            pickup_source, dropoff_source = "tpep_pickup_datetime", "tpep_dropoff_datetime"
        else:
            pickup_source, dropoff_source = "lpep_pickup_datetime", "lpep_dropoff_datetime"
        df_initial = (
            extract_data_from_bucket(bucket_name, year, tlc_colour)
            # Standardise the names of the pickup/dropoff datetime columns
            .withColumnRenamed(pickup_source, "pickup_datetime")
            .withColumnRenamed(dropoff_source, "dropoff_datetime")
            # Put the columns in one fixed order before appending
            .select(initial_columns)
        )
        # Append this extract to the combined data frame
        df = df.union(df_initial)

In [12]:
# Read the taxi zone lookup file from the same bucket
zone_df = extract_lookup_data_from_bucket(bucket=bucket_name, filename=zone_lookup)

## Transform Data
### Modify data types

* pickup_datetime: string -> timestamp
* dropoff_datetime: string -> timestamp
* passenger_count: string -> integer
* trip_distance: string -> double
* fare_amount: string -> double
* extra: string -> double
* mta_tax: string -> double
* tip_amount: string -> double
* tolls_amount: string -> double
* improvement_surcharge: string -> double
* total_amount: string -> double
* ehail_fee: string -> double

### Rename Columns

* **PULocationID** -> pickup_location_id
* **DOLocationID** -> dropoff_location_id

### Join Datasets

* Join trips to zone lookups

### Create new features

* **taxi_type**: whether it is a green or yellow cab - created in extract
* **trip_duration_seconds**: time, in seconds, between trip start and trip end
* **trip_duration_category**: bins of trip durations; under 5 mins, 5-10 mins, 10-20 mins, 20-30 mins, above 30 mins
* **year**: the year the trip took place in - created in extract
* **month**: the month the trip took place in
* **pickup_hour**: the hour of the day the trip started in
* **from_airport**: whether the trip started in a zone whose service zone is `EWR` (Newark) or `Airports` (e.g. LaGuardia)
* **to_airport**: whether the trip ended in a zone whose service zone is `EWR` (Newark) or `Airports` (e.g. LaGuardia)

In [14]:
# Bind a separate name for the transform steps. This is a plain assignment,
# so df_transform starts out as the same data frame object as df, not a copy.
df_transform = df

In [15]:
# Replace each datetime string column with its timestamp-typed equivalent
for column in dt_columns:
    df_transform = df_transform.withColumn(column, df_transform[column].cast(TimestampType()))

In [16]:
# Replace each integer-valued string column with its integer-typed equivalent
for column in int_columns:
    df_transform = df_transform.withColumn(column, df_transform[column].cast(IntegerType()))

In [17]:
# Replace each numeric string column with its double-typed equivalent
for column in num_columns:
    df_transform = df_transform.withColumn(column, df_transform[column].cast(DoubleType()))

In [18]:
# Give the pickup/dropoff location id columns descriptive snake_case names
df_transform = (
    df_transform
    .withColumnRenamed("PULocationID", "pickup_location_id")
    .withColumnRenamed("DOLocationID", "dropoff_location_id")
)

In [19]:
# Join trip data with the zone lookup, once per end of the trip.
# Each join is a left join, so trips without a matching zone are kept.

# Pickup end: keep only the borough and service zone, prefixed with "pickup_"
df_transform = (
    df_transform
    .join(zone_df, df_transform.pickup_location_id == zone_df.LocationID, how="left")
    .drop("LocationID", "Zone")
    .withColumnRenamed("service_zone", "pickup_service_zone")
    .withColumnRenamed("Borough", "pickup_borough")
)

# Dropoff end: same lookup again, prefixed with "dropoff_"
df_transform = (
    df_transform
    .join(zone_df, df_transform.dropoff_location_id == zone_df.LocationID, how="left")
    .drop("LocationID", "Zone")
    .withColumnRenamed("service_zone", "dropoff_service_zone")
    .withColumnRenamed("Borough", "dropoff_borough")
)

In [20]:
# Create features for trip duration (in seconds) and trip duration category.
# Both timestamps are cast to long before subtracting; the category is
# derived from the resulting number of seconds.
df_transform = (
    df_transform
    .withColumn(
        "trip_duration_seconds",
        F.col("dropoff_datetime").cast("long") - F.col("pickup_datetime").cast("long"),
    )
    .withColumn(
        "trip_duration_category",
        udf_get_trip_duration_category(F.col("trip_duration_seconds")),
    )
)

In [21]:
# Take the trip month from the source filename rather than the pickup datetime
df_transform = df_transform.withColumn(
    "month",
    udf_get_month_from_filename(df_transform["filename"]).cast(IntegerType()),
)

In [22]:
# Add the hour of the pickup timestamp as its own column
df_transform = df_transform.withColumn("pickup_hour", F.hour(df_transform["pickup_datetime"]))

In [23]:
# Flag trips whose pickup / dropoff service zone is an airport zone
df_transform = (
    df_transform
    .withColumn("from_airport", udf_get_airport_location(df_transform["pickup_service_zone"]))
    .withColumn("to_airport", udf_get_airport_location(df_transform["dropoff_service_zone"]))
)

## Data Clean
### Remove records

* **RateCodeID**: trips with a rate code of 7 or above, such as the 99 rate code
* **fare_amount**: trips with a fare amount of zero or below
* **trip_duration_seconds**: trips with a duration of zero, or less, seconds
* **pickup_datetime**: outside month of file period
* **passenger_count**: not removed - where equal to zero, change to one

In [26]:
# Bind a separate name for the cleaning steps. This is a plain assignment,
# so df_clean starts out as the same data frame object as df_transform.
df_clean = df_transform

In [27]:
# Keep only records whose rate code id is below 7; this removes the 99 rate code
df_clean = df_clean.where(df_clean["RateCodeID"] < 7)

In [28]:
# Keep only records with a strictly positive fare_amount
df_clean = df_clean.where(df_clean["fare_amount"] > 0.0)

In [29]:
# Keep only records with a strictly positive trip duration
df_clean = df_clean.where(df_clean["trip_duration_seconds"] > 0)

In [30]:
# Keep only trips whose pickup year and month equal the year and month
# recorded for the file the trip was read from
df_clean = (
    df_clean
    .where(F.year(F.col("pickup_datetime")) == F.col("year"))
    .where(F.month(F.col("pickup_datetime")) == F.col("month"))
)

In [32]:
# Set zero-passenger trips to 1, the mode for non zero passenger trips based on EDA;
# every other passenger_count is left as it is
df_clean = df_clean.withColumn(
    "passenger_count",
    F.when(F.col("passenger_count") == 0, F.lit(1)).otherwise(F.col("passenger_count")),
)

## Write data
Write data to Parquet files for analysis and loading into an ML model at a later date.

In [35]:
# Write the cleaned trips to ./output as parquet, partitioned by year and month,
# replacing any output that is already there
(
    df_clean.write
    .mode("overwrite")
    .partitionBy("year", "month")
    .parquet("./output")
)