<a href="https://colab.research.google.com/github/felolivee/DVA-NYC_Congestion/blob/main/DVA_Project_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

#mount drive to save data
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [3]:
import kagglehub
import time

start_time = time.time()

# Download 2019-2022 Data
originalPath = kagglehub.dataset_download("jeffsinsel/nyc-fhvhv-data")

print("Path to dataset files:", originalPath)
!mv {originalPath}/* /content

Downloading from https://www.kaggle.com/api/v1/datasets/download/jeffsinsel/nyc-fhvhv-data?dataset_version_number=4...


100%|██████████| 17.8G/17.8G [04:23<00:00, 72.8MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/jeffsinsel/nyc-fhvhv-data/versions/4


In [None]:
# OPTIONAL TO TEST SCALING: Download Nov 2024, Dec 2024, and Jan 2025 Data for Scalability
extendedPath = kagglehub.dataset_download("folive8/jan2025nyc")

print("Path to dataset files:", extendedPath)
!mv {extendedPath}/* /content

In [4]:
end_download = time.time()
print(f"Data download execution time: {end_download - start_time:.2f} seconds")

Data download execution time: 528.91 seconds


In [5]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
import glob
import pandas as pd
import numpy as np

# initialize spark
spark = SparkSession.builder.appName("NYC_Rides").getOrCreate()

# list of all parquet files from content folder
parquet_files = glob.glob('/content/*.parquet')

removed_summary = {}
combined_df = None

rawTotalCount = 0
cleanTotalCount = 0

for file in parquet_files:
    # read parquet files get features
    df = spark.read.parquet(file).select(
        "Pickup_datetime", "DropOff_datetime", "PULocationID", "DOLocationID",
        "base_passenger_fare", "trip_miles", "tips", "driver_pay", "trip_time",
        "Hvfhs_license_num", "congestion_surcharge"
    )

    initial_count = df.count()
    rawTotalCount += initial_count

    # Clean data by filtering out null and invalid values
    df_clean = df.filter(
        (F.col("Pickup_datetime").isNotNull()) &
        (F.col("DropOff_datetime").isNotNull()) &
        (F.col("PULocationID").isNotNull()) &
        (F.col("DOLocationID").isNotNull()) &
        (F.col("base_passenger_fare").isNotNull()) &
        (F.col("trip_miles").isNotNull()) &
        (F.col("tips").isNotNull()) &
        (F.col("driver_pay").isNotNull()) &
        (F.col("trip_time").isNotNull()) &
        (F.col("Hvfhs_license_num").isNotNull()) &
        (F.col("congestion_surcharge").isNotNull()) &


        # Ensure numeric columns have valid positive values
        (F.col("base_passenger_fare") > 0) &
        (F.col("trip_miles") > 0) &
        (F.col("driver_pay") > 0) &
        (F.col("trip_time") > 0)
    )

    clean_count = df_clean.count()
    cleanTotalCount += clean_count
    removed_summary[file] = initial_count - clean_count

    # combine clean DFs into one big DF
    if combined_df is None:
        combined_df = df_clean
    else:
        combined_df = combined_df.union(df_clean)

# check how many of each file were removed
print("Removal Summary:", removed_summary)

Removal Summary: {'/content/fhvhv_tripdata_2021-12.parquet': 94795, '/content/fhvhv_tripdata_2021-07.parquet': 140020, '/content/fhvhv_tripdata_2019-02.parquet': 2781621, '/content/fhvhv_tripdata_2022-07.parquet': 77160, '/content/fhvhv_tripdata_2021-03.parquet': 140061, '/content/fhvhv_tripdata_2020-04.parquet': 69415, '/content/fhvhv_tripdata_2021-11.parquet': 97481, '/content/fhvhv_tripdata_2022-01.parquet': 77981, '/content/fhvhv_tripdata_2021-02.parquet': 123443, '/content/fhvhv_tripdata_2020-10.parquet': 130980, '/content/fhvhv_tripdata_2021-05.parquet': 142950, '/content/fhvhv_tripdata_2019-05.parquet': 2197902, '/content/fhvhv_tripdata_2020-03.parquet': 949378, '/content/fhvhv_tripdata_2019-09.parquet': 2193661, '/content/fhvhv_tripdata_2019-04.parquet': 2150489, '/content/fhvhv_tripdata_2022-02.parquet': 84755, '/content/fhvhv_tripdata_2022-03.parquet': 84930, '/content/fhvhv_tripdata_2019-03.parquet': 2346635, '/content/fhvhv_tripdata_2021-06.parquet': 155107, '/content/fhvhv

In [6]:
# calculate how many total rows of data were removed
percent_removed = (1 - cleanTotalCount/rawTotalCount) * 100

print("Raw data count: {}".format(rawTotalCount))
print("Cleaned data count: {}".format(cleanTotalCount))
print("{}% was removed from a total of {} rides".format(percent_removed, rawTotalCount))

Raw data count: 745287023
Cleaned data count: 708208434
4.97507508593773% was removed from a total of 745287023 rides


In [7]:
end_cleaning = time.time()
print(f"Data cleaning execution time: {end_cleaning - start_time:.2f} seconds")

Data cleaning execution time: 1031.00 seconds


In [8]:
# aggregate features for Tableau file

add_hour = combined_df.withColumn("hour", F.date_format(F.col("Pickup_datetime"), "k"))

tableau_df = add_hour.groupBy(["PULocationID", "hour", "Hvfhs_license_num"]).agg(F.count("PULocationID").alias("num_rides"),F.mean("trip_miles").alias("avg_trip_miles"), F.mean("driver_pay").alias("avg_driver_pay"))

tableau_df = tableau_df.withColumn("rideType", F.when(F.col("Hvfhs_license_num") == "HV0002", "Juno")
     .when(F.col("Hvfhs_license_num") == "HV0003", "Uber")
     .when(F.col("Hvfhs_license_num") == "HV0004", "Via")
     .when(F.col("Hvfhs_license_num") == "HV0005", "Lyft"))

In [9]:
# Save data for Tableau
tableau_df = tableau_df.toPandas()
tableau_df.to_csv('/content/drive/MyDrive/dva_project.csv')

In [10]:
end_tableau = time.time()
print(f"Save CSV for Tableau execution time: {end_tableau - start_time:.2f} seconds")

Save CSV for Tableau execution time: 2278.30 seconds


In [11]:
!pip install -q category_encoders

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
from pyspark.sql.functions import hour

# Extract Hour from Pickup_datetime
combined_df = combined_df.withColumn("hour", hour("Pickup_datetime"))


In [13]:
combined_df.show(5)
combined_df.printSchema()

+-------------------+-------------------+------------+------------+-------------------+----------+----+----------+---------+-----------------+--------------------+----+
|    Pickup_datetime|   DropOff_datetime|PULocationID|DOLocationID|base_passenger_fare|trip_miles|tips|driver_pay|trip_time|Hvfhs_license_num|congestion_surcharge|hour|
+-------------------+-------------------+------------+------------+-------------------+----------+----+----------+---------+-----------------+--------------------+----+
|2021-12-01 00:09:16|2021-12-01 00:58:03|         164|         265|              72.32|     17.54| 0.0|     53.67|     2928|           HV0003|                2.75|   0|
|2021-12-01 00:06:05|2021-12-01 00:19:05|          80|         112|              13.27|      2.39| 0.0|      9.41|      780|           HV0003|                 0.0|   0|
|2021-12-01 00:22:45|2021-12-01 00:43:47|         112|         189|              22.59|      4.91|5.05|     16.23|     1262|           HV0003|             

In [14]:
from pyspark.sql.functions import col, sum, min

# Group by Pickup Zone, Hour, and Ride-Hailing Company to Aggregate Driver Pay
df_grouped = combined_df.groupBy("PULocationID", "hour", "Hvfhs_license_num").agg(
    sum("driver_pay").alias("total_driver_pay"),
    sum("tips").alias("total_tips"),
    sum("trip_miles").alias("total_miles"),
    sum("trip_time").alias("total_time_seconds"),
    sum("base_passenger_fare").alias("total_base_fare"),
    min("Pickup_datetime").alias("earliest_pickup_time")
)


In [15]:
# Convert to Pandas dataframe
df = df_grouped.toPandas()

In [16]:
df_pandas = df.copy()

# Add relevant variables
df_pandas["earliest_pickup_time"] = pd.to_datetime(df_pandas["earliest_pickup_time"])
df_pandas["day_of_week"] = df_pandas["earliest_pickup_time"].dt.dayofweek
df_pandas["is_weekend"] = df_pandas["day_of_week"].isin([5, 6]).astype(int)

# Convert trip time from seconds to hours
df_pandas["total_time_hours"] = df_pandas["total_time_seconds"] / 3600

# Handle division by zero
df_pandas["total_time_hours"].replace(0, np.nan, inplace=True)

# Compute total driver earnings (including tips)
df_pandas["total_driver_earnings"] = df_pandas["total_driver_pay"] + df_pandas["total_tips"]

# Compute earnings per hour
df_pandas["earnings_per_hour"] = df_pandas["total_driver_earnings"] / df_pandas["total_time_hours"]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_pandas["total_time_hours"].replace(0, np.nan, inplace=True)


In [17]:
# get column names
df_pandas.columns.tolist()

['PULocationID',
 'hour',
 'Hvfhs_license_num',
 'total_driver_pay',
 'total_tips',
 'total_miles',
 'total_time_seconds',
 'total_base_fare',
 'earliest_pickup_time',
 'day_of_week',
 'is_weekend',
 'total_time_hours',
 'total_driver_earnings',
 'earnings_per_hour']

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Drop unnecessary columns
X = df_pandas.drop(columns=["total_driver_earnings", "earnings_per_hour", "earliest_pickup_time", "total_driver_pay", "total_time_seconds"])
y = df_pandas["earnings_per_hour"]

# Encode PULocationID using target mean encoding
pulo_mean = df_pandas.groupby("PULocationID")["earnings_per_hour"].mean()
df_pandas["PULocationID_encoded"] = df_pandas["PULocationID"].map(pulo_mean)
X = X.drop(columns=["PULocationID"])

# One-hot encode Hvfhs_license_num
X = pd.get_dummies(X, columns=["Hvfhs_license_num"], drop_first=True)
X = X.astype(float)

# Fix extreme outliers in target variable
y = y.clip(upper=y.quantile(0.99))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Train different models to choose the best performing one

models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    results[name] = {"R² Score": r2, "MAE": mae}

# Convert results dictionary to DataFrame
results_df = pd.DataFrame(results).T

# Display the results
print(results_df)


                  R² Score       MAE
RandomForest      0.727405  2.769593
XGBoost           0.796635  2.329759
GradientBoosting  0.803475  2.295832


In [20]:
#GradientBoosting is the best model, therefore I'll use it to predict driver earnings for your heatmap

# Train GradientBoosting with the whole dataset
best_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
best_model.fit(X, y)

# Predict earnings_per_hour
y_pred = best_model.predict(X)

# Attach Predictions to PULocationID
df_pandas["predicted_earnings_per_hour"] = y_pred


In [21]:
# Create a dataframe with PULocationID and predicted_earnings_per_hour for heatmap

# Group by PULocationID and Hour to Get Mean Earnings Per Hour
earnings_by_location_time = (
    df_pandas.groupby(["PULocationID", "hour"])["predicted_earnings_per_hour"]
    .mean()
    .reset_index()
)

# Display first few rows of Results
print(earnings_by_location_time.head())

   PULocationID  hour  predicted_earnings_per_hour
0             1     0                    58.385701
1             1     1                    64.568775
2             1     2                    67.868063
3             1     3                    69.569105
4             1     4                    72.479634


In [22]:
#write earnings by location file
earnings_by_location_time.to_csv('/content/drive/MyDrive/earnings_by_location_time.csv')

In [23]:
end_earningPrediction = time.time()
print(f"Save CSV of earnings prediction execution time: {end_earningPrediction - start_time:.2f} seconds")

Save CSV of earnings prediction execution time: 3254.14 seconds


In [24]:
# Group by Pickup Zone, Hour to aggregate City Revenue
df_grouped = combined_df.groupBy("PULocationID", "hour", "Hvfhs_license_num").agg(
    sum("trip_miles").alias("total_miles"),
    sum("trip_time").alias("total_time_seconds"),
    sum("base_passenger_fare").alias("total_base_fare"),
    min("Pickup_datetime").alias("earliest_pickup_time"),
    sum("congestion_surcharge").alias("total_congestion_surcharge")
)

In [25]:
print(df_grouped.columns)
pand_df = df_grouped.toPandas()

['PULocationID', 'hour', 'Hvfhs_license_num', 'total_miles', 'total_time_seconds', 'total_base_fare', 'earliest_pickup_time', 'total_congestion_surcharge']


In [26]:
pand_df['year'] = pand_df['earliest_pickup_time'].dt.year
pand_df["day_of_week"] = pand_df["earliest_pickup_time"].dt.dayofweek
pand_df['city_revenue'] = (pand_df['total_base_fare'] * 0.0875) + pand_df['total_congestion_surcharge']
pand_df['city_revenue_per_hour'] = pand_df['city_revenue'] / (pand_df['total_time_seconds'] / 3600 )
print(pand_df.columns)

Index(['PULocationID', 'hour', 'Hvfhs_license_num', 'total_miles',
       'total_time_seconds', 'total_base_fare', 'earliest_pickup_time',
       'total_congestion_surcharge', 'year', 'day_of_week', 'city_revenue',
       'city_revenue_per_hour'],
      dtype='object')


In [27]:
# Regression for NYC Revenue


# creating new copy of dataframe
reg2 = pand_df.copy()


# getting rid of unccessary columns and creating x&y variables
x = reg2.drop(columns=['total_time_seconds', 'total_miles', 'total_base_fare', 'total_congestion_surcharge', 'earliest_pickup_time', 'city_revenue', 'city_revenue_per_hour'])
x = pd.get_dummies(x, columns=["Hvfhs_license_num"], drop_first=True)
x = pd.get_dummies(x, columns=["day_of_week"], drop_first = True)
x = pd.get_dummies(x, columns=["hour"], drop_first = True)
pulo_mean = pand_df.groupby("PULocationID")["city_revenue_per_hour"].mean()
pand_df["PULocationID_encoded"] = pand_df["PULocationID"].map(pulo_mean)
x = x.drop(columns=["PULocationID"])
print(x.columns)

y = reg2['city_revenue']
y = y.clip(upper=y.quantile(0.99))


# run model
city_revenue_model = GradientBoostingRegressor(n_estimators=100, learning_rate = 0.1, max_depth=6, random_state=42)
city_revenue_model.fit(x,y)

prediction = city_revenue_model.predict(x)
reg2['predicted_city_revenue_per_hour'] = prediction
reg2['predicted_city_revenue_per_hour'] = np.where(reg2['predicted_city_revenue_per_hour'] < 0 , 0, reg2['predicted_city_revenue_per_hour'])

# create dataframe for city revenue per hour
city_revenue = (reg2.groupby(['PULocationID','hour'])['predicted_city_revenue_per_hour'].mean().round(3).reset_index())
city_revenue.head(15)

Index(['year', 'Hvfhs_license_num_HV0003', 'Hvfhs_license_num_HV0004',
       'Hvfhs_license_num_HV0005', 'day_of_week_1', 'day_of_week_2',
       'day_of_week_3', 'day_of_week_4', 'day_of_week_5', 'day_of_week_6',
       'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7',
       'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_20', 'hour_21', 'hour_22', 'hour_23'],
      dtype='object')


Unnamed: 0,PULocationID,hour,predicted_city_revenue_per_hour
0,1,0,5513.796
1,1,1,72360.494
2,1,2,0.0
3,1,3,549.933
4,1,4,28088.007
5,1,5,32836.387
6,1,6,637.066
7,1,7,873.48
8,1,8,88748.941
9,1,9,82492.452


In [28]:
#write city revenue file
city_revenue.to_csv('/content/drive/MyDrive/city_revenue_.csv')

In [29]:
end_revenuePrediction = time.time()
print(f"Save CSV of city revenue execution time (total execution time): {end_revenuePrediction - start_time:.2f} seconds")

Save CSV of city revenue execution time (total execution time): 4145.00 seconds
