**Project Question: Predict the hourly demand for NYC taxi services in each zone.**



# Read in the data

In [0]:
# RUN
# import libraries needed
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, expr, dayofweek, date_format, sum
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import weekofyear
from pyspark.sql.window import Window
from pyspark.sql.types import TimestampType

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.linalg import Vectors

In [0]:
# RUN
# create spark session
spark = SparkSession.builder.appName("FinalProject").getOrCreate()

In [0]:
# RUN
# data : https://learn.microsoft.com/en-us/azure/open-datasets/dataset-taxi-yellow?tabs=pyspark#azure-databricks

# Azure storage access info
blob_account_name = "azureopendatastorage"
blob_container_name = "nyctlc"
blob_relative_path = "yellow"
blob_sas_token = "r"

# Allow SPARK to read from Blob remotely
wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (blob_container_name, blob_account_name, blob_relative_path)
spark.conf.set(
  'fs.azure.sas.%s.%s.blob.core.windows.net' % (blob_container_name, blob_account_name),
  blob_sas_token)
print('Remote blob path: ' + wasbs_path)

# SPARK read parquet, note that it won't load any data yet by now
df = spark.read.parquet(wasbs_path)
print('Register the DataFrame as a SQL temporary view: source')
df.createOrReplaceTempView('source')

# Display top 10 rows
print('Displaying top 10 rows: ')
display(spark.sql('SELECT * FROM source LIMIT 10'))

Remote blob path: wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/yellow
Register the DataFrame as a SQL temporary view: source
Displaying top 10 rows: 


vendorID,tpepPickupDateTime,tpepDropoffDateTime,passengerCount,tripDistance,puLocationId,doLocationId,startLon,startLat,endLon,endLat,rateCodeId,storeAndFwdFlag,paymentType,fareAmount,extra,mtaTax,improvementSurcharge,tipAmount,tollsAmount,totalAmount,puYear,puMonth
CMT,2012-02-29T23:53:14Z,2012-03-01T00:00:43Z,1,2.1,,,-73.980494,40.730601,-73.983532,40.752311,1,N,CSH,7.3,0.5,0.5,,0.0,0.0,8.3,2012,3
VTS,2012-03-17T08:01:00Z,2012-03-17T08:15:00Z,1,11.06,,,-73.986067,40.699862,-73.814838,40.737052,1,,CRD,24.5,0.0,0.5,,4.9,0.0,29.9,2012,3
CMT,2012-02-29T23:58:51Z,2012-03-01T00:15:48Z,1,3.4,,,-73.968967,40.754359,-73.957048,40.743289,1,N,CRD,12.5,0.5,0.5,,1.5,0.0,15.0,2012,3
CMT,2012-03-01T19:24:16Z,2012-03-01T19:31:22Z,1,1.3,,,-73.99374,40.75307,-74.005428,40.741118,1,N,CRD,6.1,1.0,0.5,,0.0,0.0,7.6,2012,3
CMT,2012-02-29T23:46:32Z,2012-03-01T00:05:18Z,3,2.0,,,-73.973723,40.752323,-73.948275,40.769413,1,N,CSH,11.7,0.5,0.5,,0.0,0.0,12.7,2012,3
VTS,2012-03-07T15:17:00Z,2012-03-07T15:26:00Z,5,1.87,,,-73.988237,40.75929,-73.97114,40.78275,1,,CSH,7.7,0.0,0.5,,0.0,0.0,8.2,2012,3
CMT,2012-02-29T23:41:58Z,2012-03-01T00:02:29Z,1,12.4,,,-73.954536,40.727742,-73.768994,40.760246,1,N,CSH,28.5,0.5,0.5,,0.0,0.0,29.5,2012,3
VTS,2012-03-18T15:21:00Z,2012-03-18T15:32:00Z,6,2.51,,,-74.001705,40.732345,-73.974888,40.750835,1,,CSH,8.9,0.0,0.5,,0.0,0.0,9.4,2012,3
CMT,2012-02-29T23:47:08Z,2012-03-01T00:06:42Z,4,6.3,,,-73.992319,40.724503,-73.923589,40.76113,1,N,CRD,16.5,0.5,0.5,,4.37,0.0,21.87,2012,3
VTS,2012-03-13T22:26:00Z,2012-03-13T22:37:00Z,1,1.34,,,-74.009907,40.706292,-74.000512,40.71733,1,,CSH,7.3,0.5,0.5,,0.0,0.0,8.3,2012,3


In [0]:
# RUN
# smaller dataset bc data is big. Will use the truncated df for the project
df_truncated = df.filter((col("puYear") >= 2017) & (col("puYear") <= 2018))

# view the truncated df
df_truncated.show(10) 

+--------+-------------------+-------------------+--------------+------------+------------+------------+--------+--------+------+------+----------+---------------+-----------+----------+-----+------+--------------------+---------+-----------+-----------+------+-------+
|vendorID| tpepPickupDateTime|tpepDropoffDateTime|passengerCount|tripDistance|puLocationId|doLocationId|startLon|startLat|endLon|endLat|rateCodeId|storeAndFwdFlag|paymentType|fareAmount|extra|mtaTax|improvementSurcharge|tipAmount|tollsAmount|totalAmount|puYear|puMonth|
+--------+-------------------+-------------------+--------------+------------+------------+------------+--------+--------+------+------+----------+---------------+-----------+----------+-----+------+--------------------+---------+-----------+-----------+------+-------+
|       1|2017-03-02 07:59:01|2017-03-02 08:12:48|             1|         1.4|         238|         236|    NULL|    NULL|  NULL|  NULL|         1|              N|          1|       9.5|  0.

In [0]:
# DON'T RUN
# count the number of rows in the truncated df
row_count = df_truncated.count()
print("Number of rows in truncated data:", row_count)  

# about 216M rows

Number of rows in truncated data: 216300320


In [0]:
# DON'T RUN
# print schema to inspect data types
df_truncated.printSchema()

root
 |-- vendorID: string (nullable = true)
 |-- tpepPickupDateTime: timestamp (nullable = true)
 |-- tpepDropoffDateTime: timestamp (nullable = true)
 |-- passengerCount: integer (nullable = true)
 |-- tripDistance: double (nullable = true)
 |-- puLocationId: string (nullable = true)
 |-- doLocationId: string (nullable = true)
 |-- startLon: double (nullable = true)
 |-- startLat: double (nullable = true)
 |-- endLon: double (nullable = true)
 |-- endLat: double (nullable = true)
 |-- rateCodeId: integer (nullable = true)
 |-- storeAndFwdFlag: string (nullable = true)
 |-- paymentType: string (nullable = true)
 |-- fareAmount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mtaTax: double (nullable = true)
 |-- improvementSurcharge: string (nullable = true)
 |-- tipAmount: double (nullable = true)
 |-- tollsAmount: double (nullable = true)
 |-- totalAmount: double (nullable = true)
 |-- puYear: integer (nullable = true)
 |-- puMonth: integer (nullable = true)



# Milestone 2


### 1. EDA on your dataset that will be used for training and evaluation.
#### a. Address missing data
#### b. Address non-numerical features
#### c. List out raw features, derived features that you plan to implement/use
#### d. Do you need any dimensionality reduction? (e.g., LASSO regularization, forward/backward selection, PCA, etc..)

##### a. Address missing data

In [0]:
# DON'T RUN
# inspect if any data is missing

# count missing values for each column
missing_values_counts = df_truncated.select([sum(col(c).isNull().cast("int")).alias(c) for c in df_truncated.columns])

missing_values_counts.show()

# We can see that there is only missing values in startLon, startLat, endLon, and endLat. Since this is not needed for our time series analysis we can drop these columns and therefore do not need to handle missing data. 

+--------+------------------+-------------------+--------------+------------+------------+------------+---------+---------+---------+---------+----------+---------------+-----------+----------+-----+------+--------------------+---------+-----------+-----------+------+-------+
|vendorID|tpepPickupDateTime|tpepDropoffDateTime|passengerCount|tripDistance|puLocationId|doLocationId| startLon| startLat|   endLon|   endLat|rateCodeId|storeAndFwdFlag|paymentType|fareAmount|extra|mtaTax|improvementSurcharge|tipAmount|tollsAmount|totalAmount|puYear|puMonth|
+--------+------------------+-------------------+--------------+------------+------------+------------+---------+---------+---------+---------+----------+---------------+-----------+----------+-----+------+--------------------+---------+-----------+-----------+------+-------+
|       0|                 0|                  0|             0|           0|           0|           0|216300320|216300320|216300320|216300320|         0|              0

In [0]:
# RUN
# drop startLon, startLat, endLon, and endLat columns
columns_to_drop = ['startLon','startLat', 'endLon', 'endLat']
df_truncated = df_truncated.drop(*columns_to_drop)

df_truncated.printSchema() # sanity check, check schema if columns gone

root
 |-- vendorID: string (nullable = true)
 |-- tpepPickupDateTime: timestamp (nullable = true)
 |-- tpepDropoffDateTime: timestamp (nullable = true)
 |-- passengerCount: integer (nullable = true)
 |-- tripDistance: double (nullable = true)
 |-- puLocationId: string (nullable = true)
 |-- doLocationId: string (nullable = true)
 |-- rateCodeId: integer (nullable = true)
 |-- storeAndFwdFlag: string (nullable = true)
 |-- paymentType: string (nullable = true)
 |-- fareAmount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mtaTax: double (nullable = true)
 |-- improvementSurcharge: string (nullable = true)
 |-- tipAmount: double (nullable = true)
 |-- tollsAmount: double (nullable = true)
 |-- totalAmount: double (nullable = true)
 |-- puYear: integer (nullable = true)
 |-- puMonth: integer (nullable = true)



In [0]:
# DON'T RUN
# double check no missing values in df_truncated after drop lat/long columns
missing_values_counts = df_truncated.select([sum(col(c).isNull().cast("int")).alias(c) for c in df_truncated.columns])

missing_values_counts.show()

+--------+------------------+-------------------+--------------+------------+------------+------------+----------+---------------+-----------+----------+-----+------+--------------------+---------+-----------+-----------+------+-------+
|vendorID|tpepPickupDateTime|tpepDropoffDateTime|passengerCount|tripDistance|puLocationId|doLocationId|rateCodeId|storeAndFwdFlag|paymentType|fareAmount|extra|mtaTax|improvementSurcharge|tipAmount|tollsAmount|totalAmount|puYear|puMonth|
+--------+------------------+-------------------+--------------+------------+------------+------------+----------+---------------+-----------+----------+-----+------+--------------------+---------+-----------+-----------+------+-------+
|       0|                 0|                  0|             0|           0|           0|           0|         0|              0|          0|         0|    0|     0|                   0|        0|          0|          0|     0|      0|
+--------+------------------+-------------------+---

##### b. Address non-numerical features

In [0]:
# DON'T RUN
# From the schema we can see the following columns are non-numerical: vendorID (string), tpepPickupDateTime (timestamp), tpepDropoffDateTime (timestamp), puLocationId (string), doLocationId (string), storeAndFwdFlag (string), paymentType (string), and improvementSurcharge (string). Let's look at those columns so we can see how to address them. 

# select specific columns to view
selected_df = df_truncated.select("vendorID", "tpepPickupDateTime", "tpepDropoffDateTime", "puLocationId", "doLocationId", "storeAndFwdFlag", "paymentType","improvementSurcharge")

selected_df.show()

+--------+-------------------+-------------------+------------+------------+---------------+-----------+--------------------+
|vendorID| tpepPickupDateTime|tpepDropoffDateTime|puLocationId|doLocationId|storeAndFwdFlag|paymentType|improvementSurcharge|
+--------+-------------------+-------------------+------------+------------+---------------+-----------+--------------------+
|       1|2017-03-02 07:59:01|2017-03-02 08:12:48|         238|         236|              N|          1|                 0.3|
|       2|2017-02-28 12:03:31|2017-03-01 11:58:41|         263|         140|              N|          1|                 0.3|
|       1|2017-03-02 00:46:47|2017-03-02 00:52:13|          48|         100|              N|          1|                 0.3|
|       2|2017-02-28 17:21:44|2017-03-01 16:26:31|         138|         170|              N|          2|                 0.3|
|       2|2017-03-02 00:07:44|2017-03-02 00:26:32|         144|         179|              N|          1|              

In [0]:
# DON'T RUN
# how many unique values in these selected non-numeric columns?

selected_columns = ["vendorID", "tpepPickupDateTime", "tpepDropoffDateTime", "puLocationId", "doLocationId", "storeAndFwdFlag", "paymentType","improvementSurcharge"]

unique_counts = {}

for col_name in selected_columns:
    unique_counts[col_name] = df_truncated.select(col_name).distinct().count()

print("Number of unique values in each selected column:")
for col_name, count in unique_counts.items():
    print(f"{col_name}: {count}")

# Since tpepPickupDateTime and tpepDropoffDateTime are DateTime we do not want to transform those into numeric values bc we are doing a time series analysis and want to predict for the next hour. We want to ensure that the timestamps are in military time (if not we need to transform the data to be). This makes it easier to predict hourly and not worry if the time is AM or PM. Looking at the data the timestamps are in military time already.

Number of unique values in each selected column:
vendorID: 3
tpepPickupDateTime: 55505188
tpepDropoffDateTime: 55564631
puLocationId: 265
doLocationId: 264
storeAndFwdFlag: 2
paymentType: 5
improvementSurcharge: 28


In [0]:
# DON'T RUN
# list out the unique values for all those non-numeric columns except tpepPickupDateTime and tpepDropoffDateTime

selected_columns = ["vendorID", "puLocationId", "doLocationId", "storeAndFwdFlag", "paymentType","improvementSurcharge"]

for col_name in selected_columns:
    unique_values = df_truncated.select(col_name).distinct().rdd.flatMap(lambda x: x).collect()
    print(f"Unique values in column '{col_name}': {unique_values}")


# From the selected df with non-numeric data (except the datetime columns we excluded here) we can see that vendorID is numeric, pulLocationID is numeric, doLocationID numeric, storeAndFWDFlag is either Y or N (can change that to int, 1 = Y and 0 = N), paymentType is numeric, and improvementSurcharge is a float. 

Unique values in column 'vendorID': ['1', '4', '2']
Unique values in column 'puLocationId': ['125', '7', '124', '51', '169', '205', '234', '232', '54', '15', '155', '132', '154', '200', '11', '101', '138', '69', '29', '42', '112', '87', '73', '64', '3', '113', '30', '34', '133', '162', '59', '139', '146', '250', '8', '160', '258', '22', '28', '203', '184', '199', '85', '251', '52', '35', '16', '171', '183', '187', '71', '188', '98', '223', '195', '47', '99', '107', '214', '179', '202', '248', '96', '221', '43', '5', '163', '31', '100', '18', '70', '174', '206', '168', '224', '61', '218', '27', '75', '166', '219', '140', '17', '131', '126', '26', '227', '120', '46', '130', '164', '207', '147', '78', '208', '89', '228', '77', '198', '136', '257', '6', '118', '185', '230', '256', '201', '177', '246', '68', '90', '244', '229', '60', '194', '19', '41', '128', '23', '102', '55', '238', '263', '111', '220', '197', '167', '93', '95', '40', '38', '25', '189', '233', '190', '135', '44', '156', '

In [0]:
# RUN
# transform non-numeric columns to numeric data types

df_truncated = df_truncated \
    .withColumn("vendorID", col("vendorID").cast("int")) \
    .withColumn("puLocationId", col("puLocationId").cast("int")) \
    .withColumn("doLocationId", col("doLocationId").cast("int")) \
    .withColumn("paymentType", col("paymentType").cast("int")) \
    .withColumn("storeAndFwdFlag", when(col("storeAndFwdFlag") == 'Y', 1).otherwise(0).cast("int")) \
    .withColumn("improvementSurcharge", col("improvementSurcharge").cast("double"))

df_truncated.printSchema()

root
 |-- vendorID: integer (nullable = true)
 |-- tpepPickupDateTime: timestamp (nullable = true)
 |-- tpepDropoffDateTime: timestamp (nullable = true)
 |-- passengerCount: integer (nullable = true)
 |-- tripDistance: double (nullable = true)
 |-- puLocationId: integer (nullable = true)
 |-- doLocationId: integer (nullable = true)
 |-- rateCodeId: integer (nullable = true)
 |-- storeAndFwdFlag: integer (nullable = false)
 |-- paymentType: integer (nullable = true)
 |-- fareAmount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mtaTax: double (nullable = true)
 |-- improvementSurcharge: double (nullable = true)
 |-- tipAmount: double (nullable = true)
 |-- tollsAmount: double (nullable = true)
 |-- totalAmount: double (nullable = true)
 |-- puYear: integer (nullable = true)
 |-- puMonth: integer (nullable = true)



In [0]:
# DON'T RUN
# inspect data to make sure no issues with transformation of data types
df_truncated.show()

+--------+-------------------+-------------------+--------------+------------+------------+------------+----------+---------------+-----------+----------+-----+------+--------------------+---------+-----------+-----------+------+-------+
|vendorID| tpepPickupDateTime|tpepDropoffDateTime|passengerCount|tripDistance|puLocationId|doLocationId|rateCodeId|storeAndFwdFlag|paymentType|fareAmount|extra|mtaTax|improvementSurcharge|tipAmount|tollsAmount|totalAmount|puYear|puMonth|
+--------+-------------------+-------------------+--------------+------------+------------+------------+----------+---------------+-----------+----------+-----+------+--------------------+---------+-----------+-----------+------+-------+
|       1|2017-03-02 07:59:01|2017-03-02 08:12:48|             1|         1.4|         238|         236|         1|              0|          1|       9.5|  0.0|   0.5|                 0.3|      0.0|        0.0|       10.3|  2017|      3|
|       2|2017-02-28 12:03:31|2017-03-01 11:58:4

##### c. List out raw features, derived features that you plan to implement/use


In [0]:
# First, data cleaning
# Remove zero distance trips
# Remove under 1 minute rides
# Remove no passengers

# Second, derived features
# create indication if weekday/weekend as this could affect the number of taxis needed
# create indication if holiday (1) or not (0).
# create week number

# Third, remove columns from df_truncated not needed and explain why keep/remove columns. Keep only columns that could be useful in predicting taxi demand in the zones. 
# To predict the hourly demand for NYC taxi services in each zone, certain columns from the dataset would be more relevant while others may not contribute. Initial ideas for useful and non-useful columns are:

# Useful Columns:

# 1. tpepPickupDateTime and tpepDropoffDateTime: These columns provide the timestamp information for when the meter was engaged and disengaged. They are crucial for analyzing the temporal aspect of taxi demand.

# 2. passengerCount: Number of passengers in the vehicle could impact the demand, especially during peak hours or events.

# 3. tripDistance: Elapsed trip distance provides insight into the length of trips, which can correlate with demand.

# 4. puLocationId and doLocationId: These columns indicate the Taxi Zone where the pickup and dropoff occurred, which is essential for spatial analysis and understanding demand patterns across different zones.

# 5. paymentType: Knowing how passengers paid for the trip can provide insights into customer preferences and behavior.

# 6. fareAmount and totalAmount: These columns represent the fare and total amount charged to passengers, respectively, which could be indicative of demand trends.

# 7. puYear and puMonth: These columns provide additional temporal information, enabling analysis at yearly and monthly levels.

# Less Useful Columns:

# 1. vendorID: The provider of the record might not directly impact taxi demand analysis.

# 2. rateCodeId: Final rate code might not directly influence demand prediction.

# 3. storeAndFwdFlag: This flag indicating whether the trip record was held in vehicle memory may not be relevant for demand analysis.

# 4. extra, improvementSurcharge, mtaTax, tipAmount, tollsAmount: While these columns are relevant for fare calculation and billing, they might not directly affect demand prediction.

# 5. startLat, startLon, endLat, endLon: These columns provide latitude and longitude information, which might be useful for spatial analysis but less so for hourly demand prediction. (already dropped these columns)

# Do correlation analysis for columns to see if there is important columns we should use. 

In [0]:
# RUN
# First part: Data Cleaning. 
# Remove Zero distance (tripDistance > 0)
# Remove Under 1 minute rides
# Remove No passengers (passengerCount > 0)

# Convert tpepPickupDateTime and tpepDropoffDateTime to dt objects
df_truncated = df_truncated.withColumn('tpepPickupDateTime', df_truncated['tpepPickupDateTime'].cast('timestamp'))
df_truncated = df_truncated.withColumn('tpepDropoffDateTime', df_truncated['tpepDropoffDateTime'].cast('timestamp'))

# calculate ride time in minutes
df_truncated = df_truncated.withColumn('ride_duration', # in minutes
                                       (expr('unix_timestamp(tpepDropoffDateTime) - unix_timestamp(tpepPickupDateTime)') / 60).cast('double'))


# data cleaning
cleaned_df = df_truncated[(df_truncated['tripDistance'] > 0) & (df_truncated['passengerCount'] > 0) & (df_truncated['ride_duration'] >= 1)]

cleaned_df.show()

+--------+-------------------+-------------------+--------------+------------+------------+------------+----------+---------------+-----------+----------+-----+------+--------------------+---------+-----------+-----------+------+-------+------------------+
|vendorID| tpepPickupDateTime|tpepDropoffDateTime|passengerCount|tripDistance|puLocationId|doLocationId|rateCodeId|storeAndFwdFlag|paymentType|fareAmount|extra|mtaTax|improvementSurcharge|tipAmount|tollsAmount|totalAmount|puYear|puMonth|     ride_duration|
+--------+-------------------+-------------------+--------------+------------+------------+------------+----------+---------------+-----------+----------+-----+------+--------------------+---------+-----------+-----------+------+-------+------------------+
|       1|2017-03-02 07:59:01|2017-03-02 08:12:48|             1|         1.4|         238|         236|         1|              0|          1|       9.5|  0.0|   0.5|                 0.3|      0.0|        0.0|       10.3|  2017|

In [0]:
# DON'T RUN
cleaned_df.printSchema()

root
 |-- vendorID: integer (nullable = true)
 |-- tpepPickupDateTime: timestamp (nullable = true)
 |-- tpepDropoffDateTime: timestamp (nullable = true)
 |-- passengerCount: integer (nullable = true)
 |-- tripDistance: double (nullable = true)
 |-- puLocationId: integer (nullable = true)
 |-- doLocationId: integer (nullable = true)
 |-- rateCodeId: integer (nullable = true)
 |-- storeAndFwdFlag: integer (nullable = false)
 |-- paymentType: integer (nullable = true)
 |-- fareAmount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mtaTax: double (nullable = true)
 |-- improvementSurcharge: double (nullable = true)
 |-- tipAmount: double (nullable = true)
 |-- tollsAmount: double (nullable = true)
 |-- totalAmount: double (nullable = true)
 |-- puYear: integer (nullable = true)
 |-- puMonth: integer (nullable = true)
 |-- ride_duration: double (nullable = true)



In [0]:
# DON'T RUN
# how many rows in clean_df?
row_count = cleaned_df.count()
print("Number of rows in truncated data:", row_count)

# About 213M rows left in clean df

Number of rows in truncated data: 213027293


In [0]:
# RUN
# can drop ride_duration now that df is clean
cleaned_df = cleaned_df.drop('ride_duration')
cleaned_df.printSchema() # sanity check it is gone

root
 |-- vendorID: integer (nullable = true)
 |-- tpepPickupDateTime: timestamp (nullable = true)
 |-- tpepDropoffDateTime: timestamp (nullable = true)
 |-- passengerCount: integer (nullable = true)
 |-- tripDistance: double (nullable = true)
 |-- puLocationId: integer (nullable = true)
 |-- doLocationId: integer (nullable = true)
 |-- rateCodeId: integer (nullable = true)
 |-- storeAndFwdFlag: integer (nullable = false)
 |-- paymentType: integer (nullable = true)
 |-- fareAmount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mtaTax: double (nullable = true)
 |-- improvementSurcharge: double (nullable = true)
 |-- tipAmount: double (nullable = true)
 |-- tollsAmount: double (nullable = true)
 |-- totalAmount: double (nullable = true)
 |-- puYear: integer (nullable = true)
 |-- puMonth: integer (nullable = true)



In [0]:
# RUN
# Second, derived features
# Create indication if weekday/weekend as this could affect the number of taxis needed
# Create indication if holiday (1) or not (0).
# Create week number

# indication if weekday/weekend
cleaned_df = cleaned_df.withColumn('is_weekend', when(dayofweek(cleaned_df['tpepPickupDateTime']) >= 6, 1).otherwise(0)) 
# use the dayofweek() function to get the day of the week (1 for Sunday, 2 for Monday, ..., 7 for Saturday). If the day of the week is 6 or 7 (which is Saturday or Sunday), we set the is_weekend column to 1 (weekend); otherwise, it's set to 0 (weekday).

# Recreate a SparkSession. Issue with holiday dates and get this error: You may get a different result due to the upgrading to Spark >= 3.0: Fail to recognize 'w' pattern in the DateTimeFormatter. 1) You can set "spark.sql.legacy.timeParserPolicy" to "LEGACY" to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from 'https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html'. To fix add spark code below to recreate spark session
spark = SparkSession.builder \
    .appName("YourApp") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .getOrCreate()

# indication if holiday
# a list of holidays in `holiday_dates`: New Years, MLK day, Presidents day, Memorial Day, 4th of July, Labor Day, Thanksgiving, and Christmas
holiday_dates = ['2017-01-01', '2018-01-01', '2017-01-16', '2018-01-15','2017-02-20','2018-02-19','2017-05-29', '2018-05-28','2017-07-04', '2018-07-04','2017-09-4','2018-09-03','2017-11-23', '2018-11-22', '2017-12-25', '2018-12-25'
]  

# 1 if holiday, 0 if not
cleaned_df = cleaned_df.withColumn('is_holiday', when(date_format(cleaned_df['tpepPickupDateTime'], 'yyyy-MM-dd').isin(holiday_dates), 1).otherwise(0))

# indicate week number of year
cleaned_df = cleaned_df.withColumn('week_number', weekofyear(cleaned_df['tpepPickupDateTime']))
cleaned_df = cleaned_df.withColumn("hour_of_day", F.hour("tpepPickupDateTime"))
cleaned_df = cleaned_df.withColumn("day_of_week", F.dayofweek("tpepPickupDateTime"))



cleaned_df.show()


+--------+-------------------+-------------------+--------------+------------+------------+------------+----------+---------------+-----------+----------+-----+------+--------------------+---------+-----------+-----------+------+-------+----------+----------+-----------+-----------+-----------+
|vendorID| tpepPickupDateTime|tpepDropoffDateTime|passengerCount|tripDistance|puLocationId|doLocationId|rateCodeId|storeAndFwdFlag|paymentType|fareAmount|extra|mtaTax|improvementSurcharge|tipAmount|tollsAmount|totalAmount|puYear|puMonth|is_weekend|is_holiday|week_number|hour_of_day|day_of_week|
+--------+-------------------+-------------------+--------------+------------+------------+------------+----------+---------------+-----------+----------+-----+------+--------------------+---------+-----------+-----------+------+-------+----------+----------+-----------+-----------+-----------+
|       1|2017-03-02 07:59:01|2017-03-02 08:12:48|             1|         1.4|         238|         236|        

In [0]:
# DON'T RUN
# check dervived features are correct

# first check week number, min should be 1 and max should be 52 bc only 52 weeks in a yr
min_week_number = cleaned_df.agg(F.min('week_number')).collect()[0][0]
max_week_number = cleaned_df.agg(F.max('week_number')).collect()[0][0]

print("Minimum week number:", min_week_number)
print("Maximum week number:", max_week_number)

Minimum week number: 1
Maximum week number: 52


In [0]:
# DON'T RUN
# validate columns

# Look at regular nonholiday, should be 1
# specific_date = '2017-11-07' # this is week 45 so should have 45 in week_number and a 0 in is_holiday and 0 for is_weekend. This is good, the numbers are correct in the df. Check another date.
#specific_date = '2018-02-03' # this is week 5 so should have 5 in week_number and 0 in is_holiday and 1 for is_weekend. This is good too. Now check for a holiday.
specific_date = '2018-07-04' # this is week 27, 1 for is_holiday, and 0 for is_weekend

filtered_df = cleaned_df.filter(col('tpepPickupDateTime').cast('date') == specific_date)

filtered_df.show()

# validation check a success, all derived features are correct.

+--------+-------------------+-------------------+--------------+------------+------------+------------+----------+---------------+-----------+----------+-----+------+--------------------+---------+-----------+-----------+------+-------+----------+----------+-----------+
|vendorID| tpepPickupDateTime|tpepDropoffDateTime|passengerCount|tripDistance|puLocationId|doLocationId|rateCodeId|storeAndFwdFlag|paymentType|fareAmount|extra|mtaTax|improvementSurcharge|tipAmount|tollsAmount|totalAmount|puYear|puMonth|is_weekend|is_holiday|week_number|
+--------+-------------------+-------------------+--------------+------------+------------+------------+----------+---------------+-----------+----------+-----+------+--------------------+---------+-----------+-----------+------+-------+----------+----------+-----------+
|       2|2018-07-04 20:38:06|2018-07-04 21:17:13|             1|       17.24|         132|         164|         2|              0|          2|      52.0|  0.0|   0.5|                 

In [0]:
# Third/Last, remove columns from cleaned_df not needed and explain why keep/remove columns. Keep only columns that could be useful in predicting taxi demand in the zones. 
# To predict the hourly demand for NYC taxi services in each zone, certain columns from the dataset would be more relevant while others may not contribute significantly. Initially we believe

# Useful Columns:

# 1. tpepPickupDateTime and tpepDropoffDateTime: These columns provide the timestamp information for when the meter was engaged and disengaged. They are crucial for analyzing the temporal aspect of taxi demand.

# 2. passengerCount: Number of passengers in the vehicle could impact the demand, especially during peak hours or events.

# 3. tripDistance: Elapsed trip distance provides insight into the length of trips, which can correlate with demand.

# 4. puLocationId and doLocationId: These columns indicate the Taxi Zone where the pickup and dropoff occurred, which is essential for spatial analysis and understanding demand patterns across different zones.

# 5. paymentType: Knowing how passengers paid for the trip can provide insights into customer preferences and behavior.

# 6. fareAmount and totalAmount: These columns represent the fare and total amount charged to passengers, respectively, which could be indicative of demand trends.

# 7. puYear and puMonth: These columns provide additional temporal information, enabling analysis at yearly and monthly levels.

# 8. Derived columns are useful.

# Less Useful Columns:

# 1. vendorID: The provider of the record might not directly impact taxi demand analysis.

# 2. rateCodeId: Final rate code might not directly influence demand prediction.

# 3. storeAndFwdFlag: This flag indicating whether the trip record was held in vehicle memory may not be relevant for demand analysis.

# 4. extra, improvementSurcharge, mtaTax, tipAmount, tollsAmount: While these columns are relevant for fare calculation and billing, they might not directly affect demand prediction.

# 5. startLat, startLon, endLat, endLon: These columns provide latitude and longitude information, which might be useful for spatial analysis but less so for hourly demand prediction. (already dropped these columns)

# We will do correlation analysis and feature importance for columns to see what columns can be useful for modelling/forecasting.

cleaned_df.printSchema()
cleaned_df.show()

root
 |-- vendorID: integer (nullable = true)
 |-- tpepPickupDateTime: timestamp (nullable = true)
 |-- tpepDropoffDateTime: timestamp (nullable = true)
 |-- passengerCount: integer (nullable = true)
 |-- tripDistance: double (nullable = true)
 |-- puLocationId: integer (nullable = true)
 |-- doLocationId: integer (nullable = true)
 |-- rateCodeId: integer (nullable = true)
 |-- storeAndFwdFlag: integer (nullable = false)
 |-- paymentType: integer (nullable = true)
 |-- fareAmount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mtaTax: double (nullable = true)
 |-- improvementSurcharge: double (nullable = true)
 |-- tipAmount: double (nullable = true)
 |-- tollsAmount: double (nullable = true)
 |-- totalAmount: double (nullable = true)
 |-- puYear: integer (nullable = true)
 |-- puMonth: integer (nullable = true)
 |-- is_weekend: integer (nullable = false)
 |-- is_holiday: integer (nullable = false)
 |-- week_number: integer (nullable = true)

+--------+------------

In [0]:
# DON'T RUN
# put features into a single vector. Use all columns for now to gain a better understanding
feature_cols = ['vendorID','passengerCount', 'tripDistance', 'rateCodeId','storeAndFwdFlag', 'paymentType','fareAmount', 'extra', 'mtaTax', 'improvementSurcharge', 'tipAmount', 'tollsAmount', 'totalAmount','puYear','puMonth', 'is_weekend','is_holiday', 'week_number']
# didn't use pick or drop off timestamp datatypes and taxi zone where person picked up and dropped off (puLocationId and doLocationId)

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# apply VectorAssembler to the df
data = assembler.transform(cleaned_df).select("features")

# calculate correlation matrix
correlation_matrix = Correlation.corr(data, "features").head()
corr_matrix = correlation_matrix[0].toArray()

# define feature labels
feature_labels = feature_cols

print("Correlation Matrix:")
for i, row_label in enumerate(feature_labels):
    for j, col_label in enumerate(feature_labels):
        print(f"{row_label} vs {col_label}: {corr_matrix[i][j]}")


# Notes from correlation matrix results:
# we want to look for correlation coefficients that are close to 1 or -1, as these indicate a strong linear relationship between the two variables. Many correlations are close to zero, indicating weak or no linear relationship between the variables. Overall, from the correlation matrix we see there is no very strong indicators. 

Correlation Matrix:
vendorID vs vendorID: 1.0
vendorID vs passengerCount: 0.23859317024104335
vendorID vs tripDistance: 0.0071163645725295386
vendorID vs rateCodeId: 0.010098291710613343
vendorID vs storeAndFwdFlag: -0.07304537719629776
vendorID vs paymentType: -0.016763189659596913
vendorID vs fareAmount: 0.0012973062098037547
vendorID vs extra: 0.002872012897936415
vendorID vs mtaTax: -0.0024100434010393126
vendorID vs improvementSurcharge: -0.0005992939003506492
vendorID vs tipAmount: 0.01191047607245094
vendorID vs tollsAmount: 0.010940768844313532
vendorID vs totalAmount: 0.0017121333439620407
vendorID vs puYear: 0.04597077792617179
vendorID vs puMonth: 0.03035670092469391
vendorID vs is_weekend: -0.0009761750321477059
vendorID vs is_holiday: 0.005822139842876883
vendorID vs week_number: 0.030076731686786883
passengerCount vs vendorID: 0.23859317024104335
passengerCount vs passengerCount: 1.0
passengerCount vs tripDistance: 0.0037948268282406972
passengerCount vs rateCodeId: 0.001

In [0]:
# RUN
# create final dataset with desired columns useful for forecasting
# since our correlation analysis was not as insightful as expected we will stick with out initial thoughts on what columns are useful and not useful. We wil drop the not useful columns discussed above and keep our derived features.

# list of columns to drop
columns_to_drop = ["vendorID", "rateCodeId", "storeAndFwdFlag", "extra", "improvementSurcharge", "mtaTax", "tipAmount", "tollsAmount"]

# drop the columns
cleaned_df = cleaned_df.drop(*columns_to_drop)

# sanity check they are gone
cleaned_df.printSchema() # this df will be used for initial pipeline


root
 |-- tpepPickupDateTime: timestamp (nullable = true)
 |-- tpepDropoffDateTime: timestamp (nullable = true)
 |-- passengerCount: integer (nullable = true)
 |-- tripDistance: double (nullable = true)
 |-- puLocationId: integer (nullable = true)
 |-- doLocationId: integer (nullable = true)
 |-- paymentType: integer (nullable = true)
 |-- fareAmount: double (nullable = true)
 |-- totalAmount: double (nullable = true)
 |-- puYear: integer (nullable = true)
 |-- puMonth: integer (nullable = true)
 |-- is_weekend: integer (nullable = false)
 |-- is_holiday: integer (nullable = false)
 |-- week_number: integer (nullable = true)
 |-- hour_of_day: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)



##### d. Do you need any dimensionality reduction? (e.g., LASSO regularization, forward/backward selection, PCA, etc..)

Dimensionality reduction techniques like LASSO regularization, forward/backward selection, and PCA (Principal Component Analysis) are used in machine learning and data analysis to address various challenges associated with high-dimensional data. Since our data is not high-dimensional we do not need to apply dimension reduction techniques.

### 2. Specify the feature transformations for the pipeline and justify these features given the target

The dataset provided indicates that the 'demand' column will serve as the target variable, as our objective is to forecast hourly taxi demand. We have engineered several features, including 'is_weekend', 'is_holiday', 'week_number', 'hour_of_day', and 'day_of_week', which were derived through feature transformation. These features are crucial because the time and day can significantly influence taxi demand. For instance, demand may escalate during holidays and be higher on weekdays compared to weekends. Consequently, these features are likely to have a substantial effect on predicting taxi demand, which justifies their creation and inclusion in our model.

In [0]:
display(cleaned_df)

tpepPickupDateTime,tpepDropoffDateTime,passengerCount,tripDistance,puLocationId,doLocationId,paymentType,fareAmount,totalAmount,puYear,puMonth,is_weekend,is_holiday,week_number,hour_of_day,day_of_week,demand
2017-03-15T03:56:48Z,2017-03-15T04:02:17Z,1,1.14,68,107,2,6.0,7.3,2017,3,0,0,11,3,4,94432
2017-03-15T03:19:18Z,2017-03-15T03:27:05Z,2,1.9,48,68,1,8.0,11.15,2017,3,0,0,11,3,4,94432
2017-03-15T03:17:40Z,2017-03-15T03:35:31Z,1,4.9,74,223,2,17.5,18.8,2017,3,0,0,11,3,4,94432
2017-03-15T03:24:19Z,2017-03-15T03:27:40Z,1,0.5,237,263,1,4.5,6.75,2017,3,0,0,11,3,4,94432
2017-03-15T03:12:41Z,2017-03-15T03:21:21Z,2,1.6,61,188,2,8.0,9.3,2017,3,0,0,11,3,4,94432
2017-03-15T03:17:04Z,2017-03-15T03:28:01Z,1,2.9,79,80,2,12.0,13.3,2017,3,0,0,11,3,4,94432
2017-03-15T03:03:07Z,2017-03-15T03:21:51Z,1,9.38,263,82,1,27.5,41.21,2017,3,0,0,11,3,4,94432
2017-03-15T03:01:23Z,2017-03-15T03:07:53Z,2,0.72,162,233,1,5.5,8.5,2017,3,0,0,11,3,4,94432
2017-03-15T03:38:28Z,2017-03-15T04:07:56Z,5,18.0,132,17,2,48.5,49.8,2017,3,0,0,11,3,4,94432
2017-03-15T03:49:08Z,2017-03-15T03:58:45Z,5,2.82,179,226,2,10.5,11.8,2017,3,0,0,11,3,4,94432


In [0]:
cleaned_df = cleaned_df.withColumn("tpepPickupDateTime", cleaned_df["tpepPickupDateTime"].cast(TimestampType()))
cleaned_df = cleaned_df.withColumn("tpepDropoffDateTime", cleaned_df["tpepDropoffDateTime"].cast(TimestampType()))

# Use the window function to define a window spec for 1 hour
windowSpec = Window.partitionBy(F.hour("tpepPickupDateTime"), F.dayofmonth("tpepPickupDateTime"))

# Count the number of trips per hour
cleaned_df = cleaned_df.withColumn("demand", F.count("tpepPickupDateTime").over(windowSpec))

# Show the result
cleaned_df.select(["tpepPickupDateTime", "demand"]).show()

+-------------------+------+
| tpepPickupDateTime|demand|
+-------------------+------+
|2017-03-25 00:59:10|229258|
|2017-03-25 00:21:17|229258|
|2017-03-25 00:37:13|229258|
|2017-03-25 00:55:20|229258|
|2017-03-25 00:26:12|229258|
|2017-03-25 00:01:42|229258|
|2017-03-25 00:17:42|229258|
|2017-03-25 00:49:31|229258|
|2017-03-25 00:11:45|229258|
|2017-03-25 00:57:54|229258|
|2017-03-25 00:18:30|229258|
|2017-03-25 00:24:49|229258|
|2017-03-25 00:16:21|229258|
|2017-03-25 00:33:19|229258|
|2017-03-25 00:50:07|229258|
|2017-03-25 00:16:13|229258|
|2017-03-25 00:01:13|229258|
|2017-03-25 00:27:23|229258|
|2017-03-25 00:46:08|229258|
|2017-03-25 00:58:51|229258|
+-------------------+------+
only showing top 20 rows



### 3. Create baseline pipelines and do experiments on your data
#### a. Set a blind test set that is never seen during training.
#### b. Report evaluation metrics over the training set
#### c. Report evaluation metrics over blind test dataset
#### d. Create a baseline model using logistic/linear regression, ensemble models
#### e. Discussion of experimental results on training and testing dataset

##### a. Set a blind test set that is never seen during training.

In [0]:
### New Truncated data (only 2017)
cleaned_df = cleaned_df.filter((col("puYear") == 2017))

In [0]:
# Assemble the vector 
feature_columns = ['tripDistance','puLocationId','doLocationId','totalAmount','puYear','puMonth','is_weekend', 'is_holiday', 'week_number', 'hour_of_day', 'day_of_week']  # Add other feature columns as needed
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Data Splitting
# Setting a blind test for test data
featured_df = assembler.transform(cleaned_df)
split_date = featured_df.select(F.max("puYear"), F.max("puMonth")).first()
train_data = featured_df.filter((F.col("puYear") < split_date[0]) | (F.col("puMonth") < split_date[1]))
test_data = featured_df.filter((F.col("puYear") == split_date[0]) & (F.col("puMonth") == split_date[1]))

# Model Training
lr = LinearRegression(featuresCol='features', labelCol='demand')
lr_model = lr.fit(train_data)
print("Coefficients:", lr_model.coefficients)
print("Intercept:", lr_model.intercept)

Coefficients: [-798.6027632722396,40.430325518718455,38.169672066378084,1.2641054253683812,0.0,9528.041578021295,-3600.3394645873327,15552.891326155814,-2186.8058915407996,10306.835079496173,-119.24228351804142]
Intercept: 179390.89674940117


##### b. Report evaluation metrics over the training set

Since this model is a linear regression model, out evaluation metric for the linear regression model on the training set will just Coefficients and Intercepts:

Coefficients: [-798.6027632722396,40.430325518718455,38.169672066378084,1.2641054253683812,0.0,9528.041578021295,-3600.3394645873327,15552.891326155814,-2186.8058915407996,10306.835079496173,-119.24228351804142]
Intercept: 179390.89674940117


##### c. Report evaluation metrics over blind test dataset

In [0]:
# Assuming lr_model is your trained model from lr.fit(train_data)
predictions = lr_model.transform(test_data)

# Create an instance of the evaluator
evaluator = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")

# Evaluate the predictions
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data =", rmse)

Root Mean Squared Error (RMSE) on test data = 59973.71971378482


##### d. Create a baseline model using logistic/linear regression, ensemble models

In [0]:
# Model Training
lr = LinearRegression(featuresCol='features', labelCol='demand')
lr_model = lr.fit(train_data)
print("Coefficients:", lr_model.coefficients)
print("Intercept:", lr_model.intercept)

Result is under the 3a code chunk

##### e. Discussion of experimental results on training and testing dataset