# ETL of the data

### The following records were found to be faulty and can be removed:
* Records with Passenger count < 1
* Records where the trip distance is negative or greater than 50 miles (the greatest distance between the city's boundaries is 35 miles, via https://www.walksofnewyork.com/blog/nyc-by-the-numbers)
* Records with total_amount < 1 or total_amount > 200 ($2.50 initial charge, plus 50 cents per 1/5 mile when traveling above 12 mph or per 60 seconds in slow traffic or when the vehicle is stopped, plus 50 cents MTA State Surcharge for all trips that end in certain places - via https://www1.nyc.gov/site/tlc/passengers/taxi-fare.page)
* Records with payment type = 6 (voided)
* Records where the total trip time is negative or greater than 180 minutes (3 hours)
* Records with RatecodeID 99- undefined
* Records with a speed under 2 miles per hour (3 to 4 mph is the average walking speed) or over 70 miles per hour (55 mph is the speed limit, via https://wazeopedia.waze.com/wiki/USA/New_York/Speed_limits)


### Columns that can be removed
* VendorID
* pickup_longitude
* pickup_latitude
* dropoff_longitude
* dropoff_latitude
* store_and_fwd_flag

### New columns
* Taxi colour- Green or yellow
* Trip type- 1- Street hail, 2- Dispatch. For yellow, all the trips are Street hail
* Trip duration in minutes
* Trip Duration bins
* Year
* Month
* Day
* Week Number

## Start code

In [1]:
# Import required packages
import boto3
import datetime as dt
import multiprocessing as mp
from pyspark.sql import SparkSession
from pyspark.sql.types import BooleanType, DoubleType, IntegerType, StringType, StructType, StructField, TimestampType
import pyspark.sql.functions as F

In [2]:
# Job configuration
bucket_name = "nyc-tlc"  # S3 bucket the trip files are read from
years = ["2015", "2016"]
tlc_colours = ["green", "yellow"]
# Zero-padded month numbers "01" .. "12", as used in the file names
months = [f"{month_number:02d}" for month_number in range(1, 13)]

In [3]:
# Get the active Spark session, or create one named 'nyc-taxi-etl'
spark = (
    SparkSession.builder
    .appName('nyc-taxi-etl')
    .getOrCreate()
)

## Extract data

In [4]:
# Read one monthly trip file from the S3 bucket
def extract_data_from_bucket(bucket, year, colour, month):
    """Load the CSV for one taxi colour, year and month into a DataFrame.

    The file is read from
    ``s3a://{bucket}/trip data/{colour}_tripdata_{year}-{month}.csv`` with
    its first row treated as the header. No schema is passed to the reader.

    Returns the DataFrame produced by ``spark.read.csv``.
    """
    source_path = f"s3a://{bucket}/trip data/{colour}_tripdata_{year}-{month}.csv"
    return spark.read.csv(source_path, header=True)

## Functions to Transform the extracted data

### filter data
* Records with Passenger count < 1
* Records with trip distance < 0
* Records with total_amount <1
* Records with payment type = 6 (voided)
* Records where Total time is negative
* Records with RatecodeID 99- undefined
* Green Trip Records with Trip type null

In [5]:
# Transform one month of green or yellow taxi data
def _build_transform_query(year, month, taxi_colour):
    """Return the Spark SQL text that cleans and reshapes the ``v_df`` view.

    Raises ValueError when ``taxi_colour`` is neither 'green' nor 'yellow'.
    """
    # Green files carry their own trip_type column; yellow trips get the
    # constant 1.
    trip_type_by_colour = {'green': '`trip_type`', 'yellow': '1'}
    if taxi_colour not in trip_type_by_colour:
        raise ValueError(
            f"Unsupported taxi colour {taxi_colour!r}; expected 'green' or 'yellow'"
        )
    trip_type_expr = trip_type_by_colour[taxi_colour]

    # Shared sub-expressions, so the select list and the filters cannot drift apart
    duration_secs = "(unix_timestamp(lpep_dropoff_datetime)-unix_timestamp(lpep_pickup_datetime))"
    duration_mins = f"{duration_secs}/(60)"
    speed_mph = f"float(trip_distance)/({duration_secs}/3600)"

    return f'''SELECT
        "{taxi_colour}" as taxi_colour,
        "{year}" as year,
        "{month}" as month,
        date_format(lpep_pickup_datetime,'dd') as day,
        date_format(lpep_pickup_datetime,'W') as week,
        date_format(lpep_pickup_datetime, "H") AS pickup_hour,
        to_timestamp(lpep_pickup_datetime, 'yyyy-MM-dd HH:mm:ss') as pickup_datetime,
        to_timestamp(lpep_dropoff_datetime, 'yyyy-MM-dd HH:mm:ss') as dropoff_datetime,
        {duration_mins} as duration_mins,
        case when {duration_mins} < 5 then "Under 5 mins"
             when {duration_mins} between 5 and 10 then "5-10 mins"
             when {duration_mins} between 10 and 20 then "10-20 mins"
             when {duration_mins} between 20 and 30 then "20-30 mins"
             else "Above 30 mins" end as cat_duration,
        INT(passenger_count) as passenger_count,
        FLOAT(trip_distance) as trip_distance,
        INT(RatecodeID) as RatecodeID,
        payment_type as payment_type,
        FLOAT(fare_amount) as fare_amount,
        FLOAT(extra) as extra,
        FLOAT(mta_tax) as mta_tax,
        FLOAT(tip_amount) as tip_amount,
        FLOAT(tolls_amount) as tolls_amount,
        FLOAT(improvement_surcharge) as improvement_surcharge,
        FLOAT(total_amount) as total_amount,
        ROUND({speed_mph}, 6) as speed_mph,
        {trip_type_expr} as trip_type
    from v_df
    where
        INT(passenger_count) > 0
        and float(trip_distance) between 0 and 50
        and float(total_amount) between 1 and 200
        and payment_type != 6
        and float({duration_mins}) between 0 and 180
        and RatecodeID != 99
        and {speed_mph} between 2 and 70
    '''


def transform_data(df, year, month, taxi_colour):
    """Clean one monthly trip DataFrame and derive the analysis columns.

    Parameters:
        df: raw trip DataFrame for a single file.
        year, month: values written verbatim into the ``year`` and ``month``
            output columns.
        taxi_colour: 'green' or 'yellow'. Yellow files have their ``tpep_*``
            timestamp columns renamed to the green ``lpep_*`` names and get a
            constant ``trip_type`` of 1.

    Returns:
        A new DataFrame with typed fare/distance columns plus the derived
        columns taxi_colour, year, month, day, week, pickup_hour,
        duration_mins, cat_duration, speed_mph and trip_type. Rows are kept
        only when passenger_count > 0, trip_distance is within 0-50,
        total_amount is within 1-200, payment_type is not 6, the duration is
        within 0-180 minutes, RatecodeID is not 99 and the speed is within
        2-70 mph.

    Raises:
        ValueError: if ``taxi_colour`` is not 'green' or 'yellow'.

    Side effect: (re)registers the temporary view ``v_df``.

    NOTE(review): the 'W' pattern used for ``week`` is a week-based pattern;
    Spark 3.0+ appears to reject these in date_format unless the legacy time
    parser policy is set - confirm against the Spark version in use.
    NOTE(review): rows with a null trip_type are not filtered out here -
    confirm whether they should be.
    """
    # Build the query first so an unsupported colour fails before any Spark work
    sql_query = _build_transform_query(year, month, taxi_colour)

    # Give yellow data the same timestamp column names as green data
    if taxi_colour == 'yellow':
        df = df.withColumnRenamed("tpep_pickup_datetime", "lpep_pickup_datetime").\
                withColumnRenamed("tpep_dropoff_datetime", "lpep_dropoff_datetime")

    # Some files spell the trip type header with a trailing space
    for padded_name in ("trip_type ", "Trip_type "):
        if padded_name in df.columns:
            df = df.withColumnRenamed(padded_name, "trip_type")

    df.createOrReplaceTempView("v_df")

    return spark.sql(sql_query)

In [6]:
# Single entry point for the transform stage
def data_processing_transform(df, year, month, colour):
    """Apply the transform stage to ``df`` and return the resulting DataFrame.

    This forwards its arguments unchanged to ``transform_data``.
    """
    return transform_data(df, year, month, colour)

## Write data
Write data to Parquet files for analysis and for loading into an ML model at a later date.

In [7]:
# Persist a DataFrame as Parquet files partitioned by year and month
def write_data_to_parquet(df, mode):
    """Write ``df`` to ``./output`` in Parquet format.

    The output is partitioned on the ``year`` and ``month`` columns, and
    ``mode`` is handed to the Parquet writer as its save mode.
    """
    partitioned_writer = df.write.partitionBy("year", "month")
    partitioned_writer.parquet("./output", mode=mode)

## Process Data
For each year, month and taxi colour, process the CSV file and load it into Parquet files. The data is partitioned by year and month to speed up processing. The process is expected to run in full each time; it could be made incremental if required.

In [9]:
loop_num = 1

# Process every (year, taxi colour, month) file in turn and load it into parquet
for year in years:
    for tlc_colour in tlc_colours:
        for month in months:
            start = dt.datetime.now()

            df_extract = extract_data_from_bucket(bucket_name, year, tlc_colour, month)
            df_transform = data_processing_transform(df_extract, year, month, tlc_colour)

            # The first file overwrites whatever is already in the output
            # location; every later file is appended to it.
            mode = "overwrite" if loop_num == 1 else "append"
            write_data_to_parquet(df_transform, mode)

            loop_num += 1
            end = dt.datetime.now()
            # total_seconds() covers the whole elapsed time, whereas .seconds
            # silently drops the days component of the timedelta.
            process_time = int((end - start).total_seconds())
            string = f"Data file for month: {month}, year: {year} and taxi colour: {tlc_colour} successfully loaded in {process_time} seconds"
            print(string)
            

        
    

Data file for month: 01, year: 2015 and taxi colour: green successfully loaded in 138 seconds
Data file for month: 02, year: 2015 and taxi colour: green successfully loaded in 113 seconds
Data file for month: 03, year: 2015 and taxi colour: green successfully loaded in 114 seconds
Data file for month: 04, year: 2015 and taxi colour: green successfully loaded in 117 seconds
Data file for month: 05, year: 2015 and taxi colour: green successfully loaded in 128 seconds
Data file for month: 06, year: 2015 and taxi colour: green successfully loaded in 108 seconds
Data file for month: 07, year: 2015 and taxi colour: green successfully loaded in 112 seconds
Data file for month: 08, year: 2015 and taxi colour: green successfully loaded in 106 seconds
Data file for month: 09, year: 2015 and taxi colour: green successfully loaded in 103 seconds
Data file for month: 10, year: 2015 and taxi colour: green successfully loaded in 114 seconds
Data file for month: 11, year: 2015 and taxi colour: green s