In [2]:
# Import required packages
import boto3
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, DoubleType
import pyspark.sql.functions as F

In [3]:
# Create a local spark session
spark = SparkSession.builder \
        .appName('nyc-taxi-eda') \
        .getOrCreate()

In [4]:
# Set parameters 
bucket_name = "nyc-tlc" # s3 bucket name with required nyc tlc files

In [5]:
# Create function to read S3 bucket
def list_bucket_contents(bucket, match=''):
    files = []
    s3_resource = boto3.resource('s3')
    bucket_resource = s3_resource.Bucket(bucket)
    for key in bucket_resource.objects.all():
        if match in key.key:
            files.append(key.key)
    return files

In [6]:
colours = ["yellow","green"]
years = ["2015","2016"]
files = []

for year in years:
    for colour in colours:
        match = colour + "_tripdata_" + year
        files.extend(list_bucket_contents(bucket=bucket_name, match=match))

In [7]:
# Read January 2018 yellow taxi cab data from S3 bucket
yellow_df = spark.read.csv(f"s3a://{bucket_name}/trip data/yellow_tripdata_2016-01.csv", header=True)

In [8]:
yellow_df.show(10)

+--------+--------------------+---------------------+---------------+-------------+-------------------+------------------+----------+------------------+-------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|   pickup_longitude|   pickup_latitude|RatecodeID|store_and_fwd_flag|  dropoff_longitude|  dropoff_latitude|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|
+--------+--------------------+---------------------+---------------+-------------+-------------------+------------------+----------+------------------+-------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|       2| 2016-01-01 00:00:00|  2016-01-01 00:00:00|              2|         1.10|-73.990371704101563|40.734695434570313|        

In [9]:
yellow_df.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- pickup_longitude: string (nullable = true)
 |-- pickup_latitude: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: string (nullable = true)
 |-- dropoff_latitude: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)



In [10]:
type(yellow_df)

pyspark.sql.dataframe.DataFrame

In [11]:
yellow_df.createOrReplaceTempView("yellow_df_tbl")

In [20]:
yellow_df_tbl_new = spark.sql('''SELECT 
    VendorID,
    date_format(tpep_pickup_datetime, 'YYYY-MM-DD hh:mm:ss'),
    date_format(tpep_dropoff_datetime, 'YYYY-MM-DD hh:mm:ss'),
    INT(passenger_count),

    FLOAT(trip_distance),

    FLOAT(pickup_longitude),

    FLOAT(pickup_latitude),

    INT(RatecodeID),
    
    store_and_fwd_flag,

    FLOAT(dropoff_longitude),

    FLOAT(dropoff_latitude),

    payment_type,
    
    FLOAT(fare_amount),

    FLOAT(extra),

    FLOAT(mta_tax),

    FLOAT(tip_amount),

    FLOAT(tolls_amount),

    FLOAT(improvement_surcharge),

    FLOAT(total_amount)
    
    from yellow_df_tbl
''')
yellow_df_tbl_new.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- date_format(CAST(tpep_pickup_datetime AS TIMESTAMP), YYYY-MM-DD hh:mm:ss): string (nullable = true)
 |-- date_format(CAST(tpep_dropoff_datetime AS TIMESTAMP), YYYY-MM-DD hh:mm:ss): string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- pickup_longitude: float (nullable = true)
 |-- pickup_latitude: float (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: float (nullable = true)
 |-- dropoff_latitude: float (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: float (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- total_amount: float (nullable = true)



In [21]:
yellow_df_tbl_new.show(10)

+--------+-------------------------------------------------------------------------+--------------------------------------------------------------------------+---------------+-------------+----------------+---------------+----------+------------------+-----------------+----------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|VendorID|date_format(CAST(tpep_pickup_datetime AS TIMESTAMP), YYYY-MM-DD hh:mm:ss)|date_format(CAST(tpep_dropoff_datetime AS TIMESTAMP), YYYY-MM-DD hh:mm:ss)|passenger_count|trip_distance|pickup_longitude|pickup_latitude|RatecodeID|store_and_fwd_flag|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|
+--------+-------------------------------------------------------------------------+--------------------------------------------------------------------------+---------------+-------------+----------------+---------------+------

In [23]:
yellow_df_tbl_new.summary('count', 'min', 'max').show()

+-------+--------+-------------------------------------------------------------------------+--------------------------------------------------------------------------+---------------+-------------+----------------+---------------+----------+------------------+-----------------+----------------+------------+-----------+--------+--------+----------+------------+---------------------+------------+
|summary|VendorID|date_format(CAST(tpep_pickup_datetime AS TIMESTAMP), YYYY-MM-DD hh:mm:ss)|date_format(CAST(tpep_dropoff_datetime AS TIMESTAMP), YYYY-MM-DD hh:mm:ss)|passenger_count|trip_distance|pickup_longitude|pickup_latitude|RatecodeID|store_and_fwd_flag|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|   extra| mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|
+-------+--------+-------------------------------------------------------------------------+--------------------------------------------------------------------------+---------------+-------------+-------