In [1]:
# Import required packages
import boto3
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, DoubleType
import pyspark.sql.functions as F

In [2]:
# Create a local spark session
spark = SparkSession.builder \
        .appName('nyc-taxi-eda') \
        .getOrCreate()

In [3]:
# Set parameters 
bucket_name = "nyc-tlc" # s3 bucket name with required nyc tlc files

In [4]:
# Create function to read S3 bucket
def list_bucket_contents(bucket, match=''):
    files = []
    s3_resource = boto3.resource('s3')
    bucket_resource = s3_resource.Bucket(bucket)
    for key in bucket_resource.objects.all():
        if match in key.key:
            files.append(key.key)
    return files

In [5]:
colours = ["yellow","green"]
years = ["2015","2016"]
files = []

for year in years:
    for colour in colours:
        match = colour + "_tripdata_" + year
        files.extend(list_bucket_contents(bucket=bucket_name, match=match))

## Yellow Trip Data

In [6]:
# Read January 2018 yellow taxi cab data from S3 bucket
yellow_df = spark.read.csv(f"s3a://{bucket_name}/trip data/yellow_tripdata_2015-01.csv", header=True)

yellow_df.show(10)

+--------+--------------------+---------------------+---------------+-------------+-------------------+------------------+----------+------------------+-------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|   pickup_longitude|   pickup_latitude|RateCodeID|store_and_fwd_flag|  dropoff_longitude|  dropoff_latitude|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|
+--------+--------------------+---------------------+---------------+-------------+-------------------+------------------+----------+------------------+-------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|       2| 2015-01-15 19:05:39|  2015-01-15 19:23:42|              1|         1.59|   -73.993896484375|40.750110626220703|        

In [7]:
# Skipped as this takes a lot of time
# yellow_df.count()

In [8]:
yellow_df.show(10)

+--------+--------------------+---------------------+---------------+-------------+-------------------+------------------+----------+------------------+-------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|   pickup_longitude|   pickup_latitude|RateCodeID|store_and_fwd_flag|  dropoff_longitude|  dropoff_latitude|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|
+--------+--------------------+---------------------+---------------+-------------+-------------------+------------------+----------+------------------+-------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|       2| 2015-01-15 19:05:39|  2015-01-15 19:23:42|              1|         1.59|   -73.993896484375|40.750110626220703|        

In [9]:
yellow_df.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- pickup_longitude: string (nullable = true)
 |-- pickup_latitude: string (nullable = true)
 |-- RateCodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: string (nullable = true)
 |-- dropoff_latitude: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)



In [10]:
type(yellow_df)

pyspark.sql.dataframe.DataFrame

In [11]:
yellow_df.createOrReplaceTempView("v_yellow_df")

In [12]:
yellow_df_tbl_new = spark.sql('''SELECT 
    VendorID, 
    
    to_timestamp(tpep_pickup_datetime, 'yyyy-MM-dd HH:mm:ss') as tpep_pickup_datetime,
    
    to_timestamp(tpep_dropoff_datetime, 'yyyy-MM-dd HH:mm:ss') as tpep_dropoff_datetime,
        
    INT(passenger_count) as passenger_count,

    FLOAT(trip_distance) as trip_distance,

    FLOAT(pickup_longitude) as pickup_longitude,

    FLOAT(pickup_latitude) as pickup_latitude,

    INT(RatecodeID) as RatecodeID,
    
    store_and_fwd_flag,

    FLOAT(dropoff_longitude) as dropoff_longitude,

    FLOAT(dropoff_latitude) as dropoff_latitude,

    payment_type as payment_type,
    
    FLOAT(fare_amount) as fare_amount,

    FLOAT(extra) as extra,

    FLOAT(mta_tax) as mta_tax,

    FLOAT(tip_amount) as tip_amount,

    FLOAT(tolls_amount) as tolls_amount,

    FLOAT(improvement_surcharge) as improvement_surcharge,

    FLOAT(total_amount) as total_amount
    
    from v_yellow_df
''')
yellow_df_tbl_new.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- pickup_longitude: float (nullable = true)
 |-- pickup_latitude: float (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: float (nullable = true)
 |-- dropoff_latitude: float (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: float (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- total_amount: float (nullable = true)



In [13]:
yellow_df_tbl_new.show(10)

+--------+--------------------+---------------------+---------------+-------------+----------------+---------------+----------+------------------+-----------------+----------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|pickup_longitude|pickup_latitude|RatecodeID|store_and_fwd_flag|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|
+--------+--------------------+---------------------+---------------+-------------+----------------+---------------+----------+------------------+-----------------+----------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|       2| 2015-01-15 19:05:39|  2015-01-15 19:23:42|              1|         1.59|        -73.9939|       40.75011|         1|                 N|       -73.974

In [14]:
# Skipped as this takes a lot of time
# yellow_df_tbl_new.summary('count', 'min', 'max').show()

## Green Taxi Data

In [15]:
# Read January 2018 green taxi cab data from S3 bucket
green_df = spark.read.csv(f"s3a://{bucket_name}/trip data/green_tripdata_2016-01.csv", header=True)

In [16]:
green_df.show(10)

+--------+--------------------+---------------------+------------------+----------+-------------------+------------------+-------------------+------------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+----------+
|VendorID|lpep_pickup_datetime|Lpep_dropoff_datetime|Store_and_fwd_flag|RateCodeID|   Pickup_longitude|   Pickup_latitude|  Dropoff_longitude|  Dropoff_latitude|Passenger_count|Trip_distance|Fare_amount|Extra|MTA_tax|Tip_amount|Tolls_amount|Ehail_fee|improvement_surcharge|Total_amount|Payment_type|Trip_type |
+--------+--------------------+---------------------+------------------+----------+-------------------+------------------+-------------------+------------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+----------+
|       2| 2016-01-01 00:29:24|  2016-01-01 00:39:36|              

## There is location ID only from 2016-07. Before that, it has PU Location Longitude and Latitude.

In [17]:
green_df.summary('count', 'min', 'max').show()

+-------+--------+--------------------+---------------------+------------------+----------+-------------------+------------------+-------------------+------------------+---------------+-------------+-----------+-------+-------+----------+------------+---------+---------------------+------------+------------+----------+
|summary|VendorID|lpep_pickup_datetime|Lpep_dropoff_datetime|Store_and_fwd_flag|RateCodeID|   Pickup_longitude|   Pickup_latitude|  Dropoff_longitude|  Dropoff_latitude|Passenger_count|Trip_distance|Fare_amount|  Extra|MTA_tax|Tip_amount|Tolls_amount|Ehail_fee|improvement_surcharge|Total_amount|Payment_type|Trip_type |
+-------+--------+--------------------+---------------------+------------------+----------+-------------------+------------------+-------------------+------------------+---------------+-------------+-----------+-------+-------+----------+------------+---------+---------------------+------------+------------+----------+
|  count| 1445285|             144528

In [18]:
# Create a view with green dataframe table
green_df.createOrReplaceTempView("v_green_df")

In [19]:
spark.sql('select * from v_green_df').show()

+--------+--------------------+---------------------+------------------+----------+-------------------+------------------+-------------------+------------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+----------+
|VendorID|lpep_pickup_datetime|Lpep_dropoff_datetime|Store_and_fwd_flag|RateCodeID|   Pickup_longitude|   Pickup_latitude|  Dropoff_longitude|  Dropoff_latitude|Passenger_count|Trip_distance|Fare_amount|Extra|MTA_tax|Tip_amount|Tolls_amount|Ehail_fee|improvement_surcharge|Total_amount|Payment_type|Trip_type |
+--------+--------------------+---------------------+------------------+----------+-------------------+------------------+-------------------+------------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+----------+
|       2| 2016-01-01 00:29:24|  2016-01-01 00:39:36|              

In [21]:
# do type conversions of green dataframe table
green_df_new = spark.sql('''SELECT 
    VendorID,
    
    to_timestamp(lpep_pickup_datetime, 'yyyy-MM-dd HH:mm:ss') as tpep_pickup_datetime,
    
    to_timestamp(lpep_dropoff_datetime, 'yyyy-MM-dd HH:mm:ss') as tpep_dropoff_datetime,
    
    INT(passenger_count) as passenger_count,

    FLOAT(trip_distance) as trip_distance,

    FLOAT(pickup_longitude) as pickup_longitude,

    FLOAT(pickup_latitude) as pickup_latitude,

    INT(RatecodeID) as RatecodeID,
    
    store_and_fwd_flag,

    FLOAT(dropoff_longitude) as dropoff_longitude,

    FLOAT(dropoff_latitude) as dropoff_latitude,

    payment_type as payment_type,
    
    FLOAT(fare_amount) as fare_amount,

    FLOAT(extra) as extra,

    FLOAT(mta_tax) as mta_tax,

    FLOAT(tip_amount) as tip_amount,

    FLOAT(tolls_amount) as tolls_amount,

    FLOAT(improvement_surcharge) as improvement_surcharge,

    FLOAT(total_amount) as total_amount,
    
    `Trip_type ` as trip_type
    
    from v_green_df
''')
green_df_new.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- pickup_longitude: float (nullable = true)
 |-- pickup_latitude: float (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: float (nullable = true)
 |-- dropoff_latitude: float (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: float (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- total_amount: float (nullable = true)
 |-- trip_type: string (nullable = true)



## EDA Starts- For both Green and Yellow Trip data
### Questions to ask
* How many total records?
* How many records are there with passenger_count = 0?
* Trip distance = 0.00
* negative total_amount
* payment type = 6 (voided)
* Trip types- Hail and Dispatch
* Negotiated Fare- RateCodeID
* Total time in negative

### Green Trip Data

In [22]:
green_df_new.createOrReplaceTempView("v_green_df_new")

In [23]:
# How many total records?
# How many records are there with passenger_count = 0?
spark.sql('''
    select count(*) from v_green_df_new
''').show()

+--------+
|count(1)|
+--------+
| 1445285|
+--------+



In [24]:
# How many records are there with passenger_count = 0?
spark.sql('''
    select count(*) from v_green_df_new where passenger_count <1
''').show()

+--------+
|count(1)|
+--------+
|     411|
+--------+



In [25]:
# Trip distance < 0.00
spark.sql('''
    select count(*) from v_green_df_new where trip_distance < 0
''').show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



In [26]:
# negative total_amount
spark.sql('''
    select count(*) from v_green_df_new where total_amount <1
''').show()

+--------+
|count(1)|
+--------+
|    6836|
+--------+



In [27]:
# group by payment type
spark.sql('''
    select payment_type, count(*) from v_green_df_new group by payment_type
''').show()

+------------+--------+
|payment_type|count(1)|
+------------+--------+
|           3|    6084|
|           5|     101|
|           1|  713168|
|           4|    4464|
|           2|  721468|
+------------+--------+



In [28]:
# group by trip type
spark.sql('''
    select trip_type, count(*) from v_green_df_new group by trip_type
''').show()

+---------+--------+
|trip_type|count(1)|
+---------+--------+
|     null|       2|
|        1| 1412368|
|        2|   32915|
+---------+--------+



In [29]:
# group by RatecodeID
spark.sql('''
    select RatecodeID, count(*) from v_green_df_new group by RatecodeID
''').show()

+----------+--------+
|RatecodeID|count(1)|
+----------+--------+
|         1| 1406863|
|         6|      21|
|         3|    1002|
|         5|   33034|
|         4|     743|
|         2|    3620|
|        99|       2|
+----------+--------+



In [30]:
# Find the count of records which have Dropoff time greater than Pickup time 
spark.sql('''
    select count(*) from v_green_df_new where tpep_dropoff_datetime < tpep_pickup_datetime
''').show()

+--------+
|count(1)|
+--------+
|       0|
+--------+

