**Project Question: Predict the hourly demand for NYC taxi services in each zone.**


In [0]:
# RUN
# import libraries needed
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, expr, dayofweek, weekofyear, date_format
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# RUN
# load the data in
# data comes from https://learn.microsoft.com/en-us/azure/open-datasets/dataset-taxi-yellow?tabs=pyspark#azure-databricks

# create spark session
spark = SparkSession.builder.appName("FinalProject").getOrCreate()

# Azure storage access info
blob_account_name = "azureopendatastorage"
blob_container_name = "nyctlc"
blob_relative_path = "yellow"
blob_sas_token = "r"

# Allow SPARK to read from Blob remotely
wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (blob_container_name, blob_account_name, blob_relative_path)
spark.conf.set(
  'fs.azure.sas.%s.%s.blob.core.windows.net' % (blob_container_name, blob_account_name),
  blob_sas_token)
print('Remote blob path: ' + wasbs_path)

# SPARK read parquet, note that it won't load any data yet by now
df = spark.read.parquet(wasbs_path)
print('Register the DataFrame as a SQL temporary view: source')
df.createOrReplaceTempView('source')

# Display top 10 rows
print('Displaying top 10 rows: ')
display(spark.sql('SELECT * FROM source LIMIT 10'))

Remote blob path: wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/yellow
Register the DataFrame as a SQL temporary view: source
Displaying top 10 rows: 


vendorID,tpepPickupDateTime,tpepDropoffDateTime,passengerCount,tripDistance,puLocationId,doLocationId,startLon,startLat,endLon,endLat,rateCodeId,storeAndFwdFlag,paymentType,fareAmount,extra,mtaTax,improvementSurcharge,tipAmount,tollsAmount,totalAmount,puYear,puMonth
CMT,2012-02-29T23:53:14Z,2012-03-01T00:00:43Z,1,2.1,,,-73.980494,40.730601,-73.983532,40.752311,1,N,CSH,7.3,0.5,0.5,,0.0,0.0,8.3,2012,3
VTS,2012-03-17T08:01:00Z,2012-03-17T08:15:00Z,1,11.06,,,-73.986067,40.699862,-73.814838,40.737052,1,,CRD,24.5,0.0,0.5,,4.9,0.0,29.9,2012,3
CMT,2012-02-29T23:58:51Z,2012-03-01T00:15:48Z,1,3.4,,,-73.968967,40.754359,-73.957048,40.743289,1,N,CRD,12.5,0.5,0.5,,1.5,0.0,15.0,2012,3
CMT,2012-03-01T19:24:16Z,2012-03-01T19:31:22Z,1,1.3,,,-73.99374,40.75307,-74.005428,40.741118,1,N,CRD,6.1,1.0,0.5,,0.0,0.0,7.6,2012,3
CMT,2012-02-29T23:46:32Z,2012-03-01T00:05:18Z,3,2.0,,,-73.973723,40.752323,-73.948275,40.769413,1,N,CSH,11.7,0.5,0.5,,0.0,0.0,12.7,2012,3
VTS,2012-03-07T15:17:00Z,2012-03-07T15:26:00Z,5,1.87,,,-73.988237,40.75929,-73.97114,40.78275,1,,CSH,7.7,0.0,0.5,,0.0,0.0,8.2,2012,3
CMT,2012-02-29T23:41:58Z,2012-03-01T00:02:29Z,1,12.4,,,-73.954536,40.727742,-73.768994,40.760246,1,N,CSH,28.5,0.5,0.5,,0.0,0.0,29.5,2012,3
VTS,2012-03-18T15:21:00Z,2012-03-18T15:32:00Z,6,2.51,,,-74.001705,40.732345,-73.974888,40.750835,1,,CSH,8.9,0.0,0.5,,0.0,0.0,9.4,2012,3
CMT,2012-02-29T23:47:08Z,2012-03-01T00:06:42Z,4,6.3,,,-73.992319,40.724503,-73.923589,40.76113,1,N,CRD,16.5,0.5,0.5,,4.37,0.0,21.87,2012,3
VTS,2012-03-13T22:26:00Z,2012-03-13T22:37:00Z,1,1.34,,,-74.009907,40.706292,-74.000512,40.71733,1,,CSH,7.3,0.5,0.5,,0.0,0.0,8.3,2012,3


In [0]:
# RUN
# Part Zero: Data PreProcessing
# subset dataset bc too big. Will use the truncated df for the project
df_truncated = df.filter((col("puYear") >= 2017) & (col("puYear") <= 2018))

# drop startLon, startLat, endLon, and endLat columns
# they are null & not needed for this project scope
columns_to_drop = ['startLon','startLat', 'endLon', 'endLat']
df_truncated = df_truncated.drop(*columns_to_drop)

# transform non-numeric columns to numeric data types
df_truncated = df_truncated \
    .withColumn("vendorID", col("vendorID").cast("int")) \
    .withColumn("puLocationId", col("puLocationId").cast("int")) \
    .withColumn("doLocationId", col("doLocationId").cast("int")) \
    .withColumn("paymentType", col("paymentType").cast("int")) \
    .withColumn("storeAndFwdFlag", when(col("storeAndFwdFlag") == 'Y', 1).otherwise(0).cast("int")) \
    .withColumn("improvementSurcharge", col("improvementSurcharge").cast("double"))
####################################################################################
# Part One : Data Cleaning
# Remove tripDistance less than 0 and greater than 12. 
# Remove under 1 min rides and greater than 45 min rides.
# Remove rides with no passenegers and more than 5 passengers.

# calculate ride time in minutes (ride_duration)
df_truncated = df_truncated.withColumn('ride_duration', # in minutes
                                       (expr('unix_timestamp(tpepDropoffDateTime) - unix_timestamp(tpepPickupDateTime)') / 60).cast('double'))

# TO DO: Need to decided cut off for speed column.
# calculate speed, remember this is NOT per hour, it is per minute
df_truncated = df_truncated.withColumn('speed', col('tripDistance') / col('ride_duration'))

# Now, data cleaning
# TO DO: Remove extreme outliers for fare amounts. Find a cut off. 
cleaned_df = df_truncated[(df_truncated['tripDistance'] >= 0) & (df_truncated ['tripDistance'] <= 12) & (df_truncated['ride_duration'] >= 1) & (df_truncated['ride_duration'] <= 45) & (df_truncated['passengerCount'] >= 1) & (df_truncated['passengerCount'] <= 5)]
##################################################################################
# Part Two : Derived Features
# Create indication if weekday/weekend as this could affect the number of taxis needed.
# Create indication if holiday (1) or not (0).
# Create week number, hour of day, and day of week.
# Create demand column that counts number of taxis per hour for EACH pickup zone.

# indication if weekday/weekend
cleaned_df = cleaned_df.withColumn('is_weekend', when(dayofweek(cleaned_df['tpepPickupDateTime']) >= 6, 1).otherwise(0)) 
# use the dayofweek() function to get the day of the week (1 for Sunday, 2 for Monday, ..., 7 for Saturday). If the day of the week is 6 or 7 (which is Saturday or Sunday), we set the is_weekend column to 1 (weekend); otherwise, it's set to 0 (weekday).

# Recreate a SparkSession. Issue with holiday dates and get this error: You may get a different result due to the upgrading to Spark >= 3.0: Fail to recognize 'w' pattern in the DateTimeFormatter. 1) You can set "spark.sql.legacy.timeParserPolicy" to "LEGACY" to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from 'https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html'. To fix add spark code below to recreate spark session
spark = SparkSession.builder \
    .appName("YourApp") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .getOrCreate()

# indication if holiday
# a list of holidays in `holiday_dates`: New Years, MLK day, Presidents day, Memorial Day, 4th of July, Labor Day, Thanksgiving, and Christmas
holiday_dates = ['2017-01-01', '2018-01-01', '2017-01-16', '2018-01-15','2017-02-20','2018-02-19','2017-05-29', '2018-05-28','2017-07-04', '2018-07-04','2017-09-4','2018-09-03','2017-11-23', '2018-11-22', '2017-12-25', '2018-12-25'
]  

# 1 if holiday, 0 if not
cleaned_df = cleaned_df.withColumn('is_holiday', when(date_format(cleaned_df['tpepPickupDateTime'], 'yyyy-MM-dd').isin(holiday_dates), 1).otherwise(0))

# indicate week number of year, hour of day, and day of week
cleaned_df = cleaned_df.withColumn('week_number', weekofyear(cleaned_df['tpepPickupDateTime']))
cleaned_df = cleaned_df.withColumn("hour_of_day", F.hour("tpepPickupDateTime"))
cleaned_df = cleaned_df.withColumn("day_of_week", F.dayofweek("tpepPickupDateTime"))

# Need to find the demand of taxis for each hour for each zone
# Use the window function to define a window spec for 1 hour and each pickup location ID
windowSpec = Window.partitionBy(F.hour("tpepPickupDateTime"), "puLocationId")

# count the number of trips per hour per zone
cleaned_df = cleaned_df.withColumn("demand", F.count("tpepPickupDateTime").over(windowSpec))
##################################################################################
# Part Three: Drop Columns

# list of columns to drop not useful for project
# TO DO: finialize the columns to drop
columns_to_drop = ["vendorID", "rateCodeId", "storeAndFwdFlag", "paymentType", "fareAmount", "extra", "mtaTax", "improvementSurcharge", "tipAmount", "tollsAmount", "totalAmount", "ride_duration"]

# drop the columns
cleaned_df = cleaned_df.drop(*columns_to_drop)

display(cleaned_df)

tpepPickupDateTime,tpepDropoffDateTime,passengerCount,tripDistance,puLocationId,doLocationId,puYear,puMonth,speed,is_weekend,is_holiday,week_number,hour_of_day,day_of_week,demand
2017-03-24T00:04:22Z,2017-03-24T00:12:19Z,1,2.1,190,61,2017,3,0.2641509433962264,1,0,12,0,6,504
2017-03-24T00:07:51Z,2017-03-24T00:12:48Z,2,1.3,190,181,2017,3,0.2626262626262626,1,0,12,0,6,504
2017-05-18T00:32:13Z,2017-05-18T00:59:19Z,1,6.6,190,37,2017,5,0.2435424354243542,0,0,20,0,5,504
2017-05-09T00:39:30Z,2017-05-09T00:50:05Z,1,3.1,190,72,2017,5,0.2929133858267716,0,0,19,0,3,504
2017-04-29T00:16:49Z,2017-04-29T00:46:16Z,1,5.76,190,79,2017,4,0.1955857385398981,1,0,17,0,7,504
2017-10-03T00:00:24Z,2017-10-03T00:07:00Z,1,1.9,190,190,2017,10,0.2878787878787879,0,0,40,0,3,504
2017-06-18T00:27:58Z,2017-06-18T00:37:59Z,1,1.7,190,106,2017,6,0.1697171381031613,0,0,24,0,1,504
2017-06-20T00:02:20Z,2017-06-20T00:06:23Z,1,1.0,190,62,2017,6,0.2469135802469136,0,0,25,0,3,504
2017-01-29T00:04:48Z,2017-01-29T00:32:36Z,2,7.79,190,48,2017,1,0.2802158273381295,0,0,4,0,1,504
2017-01-12T00:48:18Z,2017-01-12T00:55:00Z,1,1.9,190,188,2017,1,0.2835820895522388,0,0,2,0,5,504


In [0]:
# DON'T RUN
# sanity check they are gone
cleaned_df.printSchema() 

root
 |-- tpepPickupDateTime: timestamp (nullable = true)
 |-- tpepDropoffDateTime: timestamp (nullable = true)
 |-- passengerCount: integer (nullable = true)
 |-- tripDistance: double (nullable = true)
 |-- puLocationId: integer (nullable = true)
 |-- doLocationId: integer (nullable = true)
 |-- puYear: integer (nullable = true)
 |-- puMonth: integer (nullable = true)
 |-- speed: double (nullable = true)
 |-- is_weekend: integer (nullable = false)
 |-- is_holiday: integer (nullable = false)
 |-- week_number: integer (nullable = true)
 |-- hour_of_day: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- demand: long (nullable = false)



In [0]:
# DON'T RUN
# how many rows in new clean_df?
row_count = cleaned_df.count()
print("Number of rows in truncated data:", row_count)
# should be 196M rows

Number of rows in truncated data: 196496512
