# ETL of the data

### The following records were found to be faulty and can be removed:
* Records with passenger count < 1
* Records with trip distance <= 0
* Records with total_amount <= 0
* Records with payment type = 6 (voided)
* Records where the total trip time is zero or negative
* Records with RatecodeID 99 (undefined)
* Green trip records with a null trip type


### Columns that can be removed
* VendorID
* pickup_longitude
* pickup_latitude
* dropoff_longitude
* dropoff_latitude
* store_and_fwd_flag

### New columns
* Taxi colour- Green or yellow
* Trip type- 1- Street hail, 2- Dispatch. For yellow, all the trips are Street hail
* Trip duration in minutes
* Trip Duration bins
* Year
* Month
* Day
* Week Number

## Start code

In [2]:
# Import required packages
import boto3
import datetime as dt
import multiprocessing as mp
from pyspark.sql import SparkSession
from pyspark.sql.types import BooleanType, DoubleType, IntegerType, StringType, StructType, StructField, TimestampType
import pyspark.sql.functions as F

In [80]:
# Pipeline parameters
bucket_name = "nyc-tlc"  # S3 bucket the source CSV files are read from
years = [str(y) for y in (2015, 2016)]
tlc_colours = ["green", "yellow"]
# Zero-padded month numbers "01".."12", as used in the source file names
months = [f"{m:02d}" for m in range(1, 13)]

In [6]:
# Reuse the active Spark session if there is one, otherwise create it.
# No master is set here, so it comes from the surrounding Spark configuration.
spark = (
    SparkSession.builder
    .appName('nyc-taxi-etl')
    .getOrCreate()
)

## Extract data

In [8]:
# Load one monthly trip-data CSV from the S3 bucket
def extract_data_from_bucket(bucket, year, colour, month):
    """Read a single month of trip records for one taxi colour.

    Args:
        bucket: Name of the S3 bucket that holds the files.
        year: Four-digit year as it appears in the file name, e.g. "2015".
        colour: Taxi colour prefix of the file name, e.g. "green".
        month: Two-digit month as it appears in the file name, e.g. "01".

    Returns:
        A Spark DataFrame whose column names come from the CSV header row.
        No schema is supplied or inferred here.
    """
    csv_path = f"s3a://{bucket}/trip data/{colour}_tripdata_{year}-{month}.csv"
    return spark.read.csv(csv_path, header=True)

## Functions to Transform the extracted data

In [None]:
### Filter data
* Records with passenger count < 1
* Records with trip distance <= 0
* Records with total_amount <= 0
* Records with payment type = 6 (voided)
* Records where the total trip time is zero or negative
* Records with RatecodeID 99 (undefined)
* Green trip records with a null trip type

In [120]:
# Clean one month of green or yellow trip data and map it onto a common schema
def transform_data(df, year, month, taxi_colour):
    """Filter faulty trip records and project them onto the shared output schema.

    The raw DataFrame is registered as the temporary view ``v_df`` and queried
    with Spark SQL. Rows are kept only when passenger count, trip distance,
    total amount and trip duration are all greater than zero, the payment type
    is not 6 and the RatecodeID is not 99. Green rows with a null trip type
    are dropped as well.

    Args:
        df: Raw trip DataFrame for a single month.
        year: Year label written to the ``year`` output column.
        month: Month label written to the ``month`` output column.
        taxi_colour: Either 'green' or 'yellow'.

    Returns:
        A DataFrame with the columns taxi_colour, year, month, day, week,
        pickup_datetime, dropoff_datetime, duration_mins, cat_duration,
        passenger_count, trip_distance, RatecodeID, payment_type, fare_amount,
        extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge,
        total_amount and trip_type.

    Raises:
        ValueError: If ``taxi_colour`` is neither 'green' nor 'yellow'.
    """
    # Validate before touching Spark: an unknown colour would otherwise leave
    # the SELECT list with a dangling comma and fail with a SQL parse error.
    if taxi_colour == 'green':
        trip_type_expr = '`trip_type`'
        trip_type_filter = 'and `trip_type` is not null'
    elif taxi_colour == 'yellow':
        # Yellow files carry no trip type; every yellow trip is recorded as 1.
        trip_type_expr = '1'
        trip_type_filter = ''
    else:
        raise ValueError(
            f"Unsupported taxi_colour {taxi_colour!r}; expected 'green' or 'yellow'"
        )

    # Yellow files prefix the timestamp columns with "tpep"; align them with
    # the green names so a single query serves both colours.
    if taxi_colour == 'yellow':
        df = (df.withColumnRenamed("tpep_pickup_datetime", "lpep_pickup_datetime")
                .withColumnRenamed("tpep_dropoff_datetime", "lpep_dropoff_datetime"))

    # Some headers spell the trip type column with a trailing space.
    for padded_name in ("trip_type ", "Trip_type "):
        if padded_name in df.columns:
            df = df.withColumnRenamed(padded_name, "trip_type")

    df.createOrReplaceTempView("v_df")

    # Trip duration in minutes, reused by the select list and the bins below.
    duration_mins = ("(unix_timestamp(lpep_dropoff_datetime)"
                     " - unix_timestamp(lpep_pickup_datetime)) / 60")

    # weekofyear() replaces date_format(..., 'W'): that pattern letter means
    # week-of-month on Spark 2.x and is rejected by Spark 3.x. The result is
    # cast to string so the column keeps the type date_format() produced.
    sql_query = f'''
    SELECT
        '{taxi_colour}' as taxi_colour,
        '{year}' as year,
        '{month}' as month,
        date_format(lpep_pickup_datetime, 'dd') as day,
        cast(weekofyear(lpep_pickup_datetime) as string) as week,
        to_timestamp(lpep_pickup_datetime, 'yyyy-MM-dd HH:mm:ss') as pickup_datetime,
        to_timestamp(lpep_dropoff_datetime, 'yyyy-MM-dd HH:mm:ss') as dropoff_datetime,
        {duration_mins} as duration_mins,
        case when {duration_mins} < 5 then 'Under 5 mins'
             when {duration_mins} between 5 and 10 then '5-10 mins'
             when {duration_mins} between 10 and 20 then '10-20 mins'
             when {duration_mins} between 20 and 30 then '20-30 mins'
             else 'Above 30 mins' end as cat_duration,
        INT(passenger_count) as passenger_count,
        FLOAT(trip_distance) as trip_distance,
        INT(RatecodeID) as RatecodeID,
        payment_type as payment_type,
        FLOAT(fare_amount) as fare_amount,
        FLOAT(extra) as extra,
        FLOAT(mta_tax) as mta_tax,
        FLOAT(tip_amount) as tip_amount,
        FLOAT(tolls_amount) as tolls_amount,
        FLOAT(improvement_surcharge) as improvement_surcharge,
        FLOAT(total_amount) as total_amount,
        {trip_type_expr} as trip_type
    from v_df
    where
        passenger_count > 0
        and trip_distance > 0
        and total_amount > 0
        and payment_type != 6
        and unix_timestamp(lpep_dropoff_datetime) - unix_timestamp(lpep_pickup_datetime) > 0
        and RatecodeID != 99
        {trip_type_filter}
    '''

    return spark.sql(sql_query)

In [105]:
# Single entry point for all transform steps applied to an extracted file
def data_processing_transform(df, year, month, colour):
    """Run every transform step on one extracted DataFrame.

    Currently the only step is ``transform_data``.

    Args:
        df: Extracted trip DataFrame.
        year: Year label of the file being processed.
        month: Month label of the file being processed.
        colour: Taxi colour of the file being processed.

    Returns:
        The transformed DataFrame returned by ``transform_data``.
    """
    return transform_data(df, year, month, colour)

## Write data
Write data to Parquet files for analysis and for loading into an ML model at a later date.

In [101]:
# Persist a transformed DataFrame as Parquet under ./output
def write_data_to_parquet(df, mode):
    """Write ``df`` to ``./output`` as Parquet, partitioned by year and month.

    Args:
        df: DataFrame to write; it must contain ``year`` and ``month`` columns.
        mode: Save mode handed to the Parquet writer, e.g. "overwrite" or
            "append".
    """
    partitioned_writer = df.write.partitionBy("year", "month")
    partitioned_writer.parquet("./output", mode=mode)

## Process Data
For each year, month and taxi colour process csv and load into parquet files. Data is partitioned by year and month to speed up processing. Process is expected to run in full each time. Could make incremental if required.

In [121]:
# Running number of the file being processed. Only the very first write may
# replace existing output; every later write has to append to it.
loop_num = 1

# For each applicable year, month and taxi colour process files and load into parquet
for year in years:
    for tlc_colour in tlc_colours:
        for month in months:
            start = dt.datetime.now()

            df_extract = extract_data_from_bucket(bucket_name, year, tlc_colour, month)
            df_transform = data_processing_transform(df_extract,
                                                     year,
                                                     month,
                                                     tlc_colour)

            # Now write data to parquet
            mode = "overwrite" if loop_num == 1 else "append"
            write_data_to_parquet(df_transform, mode)

            loop_num += 1
            end = dt.datetime.now()
            # total_seconds() covers the whole elapsed span, whereas
            # timedelta.seconds silently drops any full days.
            process_time = abs(int((end - start).total_seconds()))
            string = f"Data file for month: {month}, year: {year} and taxi colour: {tlc_colour} successfully loaded in {process_time} seconds"
            print(string)
        
    

Data file for month: 01, year: 2015 and taxi colour: green successfully loaded in 133 seconds
Data file for month: 02, year: 2015 and taxi colour: green successfully loaded in 136 seconds
Data file for month: 03, year: 2015 and taxi colour: green successfully loaded in 130 seconds
Data file for month: 04, year: 2015 and taxi colour: green successfully loaded in 135 seconds
Data file for month: 05, year: 2015 and taxi colour: green successfully loaded in 137 seconds
Data file for month: 06, year: 2015 and taxi colour: green successfully loaded in 131 seconds
Data file for month: 07, year: 2015 and taxi colour: green successfully loaded in 114 seconds
Data file for month: 08, year: 2015 and taxi colour: green successfully loaded in 116 seconds
Data file for month: 09, year: 2015 and taxi colour: green successfully loaded in 125 seconds
Data file for month: 10, year: 2015 and taxi colour: green successfully loaded in 120 seconds
Data file for month: 11, year: 2015 and taxi colour: green s