In [None]:
#download from Kaggle
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d jeffsinsel/nyc-fhvhv-data

In [None]:
# Alternative method to download data
# Uses the `kagglehub` client instead of the Kaggle CLI; run either this cell
# or the CLI cell, not both.
# NOTE(review): kagglehub presumably returns an already-extracted directory,
# in which case the unzip cell is not needed on this path — confirm.

import kagglehub

# Download latest version; `path` is the local location reported by kagglehub
path = kagglehub.dataset_download("jeffsinsel/nyc-fhvhv-data")

print("Path to dataset files:", path)
# Move everything into /content, where the loading cell globs for *.parquet
!mv {path}/* /content

In [None]:
#unzip files in nyc-fhvhv-data from Kaggle
! unzip nyc-fhvhv-data.zip

In [None]:
import glob
from functools import reduce

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

# initialize spark
spark = SparkSession.builder.appName("NYC_Rides").getOrCreate()

# Columns kept from every file. Each of them must be non-null for a ride to
# survive cleaning, so the selection and the null checks share this one list.
REQUIRED_COLUMNS = [
    "Pickup_datetime", "DropOff_datetime", "PULocationID", "DOLocationID",
    "base_passenger_fare", "trip_miles", "tips", "driver_pay", "trip_time",
    "Hvfhs_license_num", "congestion_surcharge",
]
# Numeric columns that must additionally be strictly positive.
POSITIVE_COLUMNS = ["base_passenger_fare", "trip_miles", "driver_pay", "trip_time"]

# list of all parquet files from content folder
# (sorted: glob returns files in arbitrary order, which would otherwise make
# the union order and the summary order differ between runs)
parquet_files = sorted(glob.glob('/content/*.parquet'))
if not parquet_files:
    # Fail here with a clear message instead of leaving combined_df as None
    # and crashing in a later cell.
    raise FileNotFoundError(
        "No parquet files found in /content - run one of the download cells first."
    )

# One reusable row filter: every required column present AND every
# must-be-positive column > 0. F.col() is unresolved, so the same expression
# can be applied to each file's DataFrame.
not_null_checks = [F.col(name).isNotNull() for name in REQUIRED_COLUMNS]
positive_checks = [F.col(name) > 0 for name in POSITIVE_COLUMNS]
is_valid_ride = reduce(lambda left, right: left & right, not_null_checks + positive_checks)

removed_summary = {}   # file path -> number of rows dropped by cleaning
combined_df = None     # union of all cleaned files

for file in parquet_files:
    # read parquet files get features
    df = spark.read.parquet(file).select(*REQUIRED_COLUMNS)
    initial_count = df.count()

    # Clean data by filtering out null and invalid values
    df_clean = df.filter(is_valid_ride)
    clean_count = df_clean.count()
    removed_summary[file] = initial_count - clean_count

    # combine clean DFs into one big DF (all frames share REQUIRED_COLUMNS
    # order, so the positional union lines the columns up)
    combined_df = df_clean if combined_df is None else combined_df.union(df_clean)

# check how many of which file were removed
print("Removal Summary:", removed_summary)

In [None]:
# calculate how many total rows of data were removed
# (explicit loop rather than the builtin sum(): a later cell in this notebook
# imports pyspark's `sum`, which would break a builtin call on re-run)
tot = 0
for val in removed_summary.values():
    tot += int(val)

# combined_df only holds the rows that SURVIVED cleaning, so the number of
# rides before cleaning is the surviving rows plus the removed rows.
count = combined_df.count()
total_rides = count + tot
# Percentage relative to the original total; guard against an empty dataset.
percent_removed = tot / total_rides * 100 if total_rides else 0.0

print("{}% was removed from a total of {} rides in 46 months".format(percent_removed, total_rides))

In [None]:
!pip install category_encoders


Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/85.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [None]:
from pyspark.sql.functions import col, hour

# Hour-of-day of each pickup, stored as a new "hour" column
pickup_hour = hour(col("Pickup_datetime"))
combined_df = combined_df.withColumn("hour", pickup_hour)


In [None]:
# Quick sanity check: first five cleaned rides, then the column types
combined_df.show(n=5)
combined_df.printSchema()

In [None]:
# Use the functions module through a namespace: importing `sum` and `min` by
# name would replace Python's builtins for every later cell in the kernel.
import pyspark.sql.functions as F
from pyspark.sql.functions import col

# Group by Pickup Zone, Hour, and Ride-Hailing Company to Aggregate Driver Pay
# Result: one row per (PULocationID, hour, Hvfhs_license_num) with summed
# totals over all rides in that group.
df_grouped = combined_df.groupBy("PULocationID", "hour", "Hvfhs_license_num").agg(
    F.sum("driver_pay").alias("total_driver_pay"),
    F.sum("tips").alias("total_tips"),
    F.sum("trip_miles").alias("total_miles"),
    F.sum("trip_time").alias("total_time_seconds"),
    F.sum("base_passenger_fare").alias("total_base_fare"),
    # NOTE(review): this is the single earliest pickup in the whole group, so
    # anything derived from it later (day of week, year) describes only that
    # first ride, not the group - confirm this is intended.
    F.min("Pickup_datetime").alias("earliest_pickup_time")
)


In [None]:
import pandas as pd

# Convert to Pandas dataframe
# toPandas() runs the Spark aggregation and brings the grouped result into
# local memory as a pandas DataFrame.
# Note: `df` was previously the per-file Spark DataFrame in the loading loop;
# from here on it refers to this pandas frame.

df = df_grouped.toPandas()



In [None]:
import numpy as np

# Work on a copy so the raw grouped frame `df` stays untouched and this cell
# can be re-run safely.
df_pandas = df.copy()

# Add relevant variables
# NOTE(review): earliest_pickup_time is the first pickup seen in each group,
# so day_of_week / is_weekend describe that one ride only - confirm intent.
df_pandas["earliest_pickup_time"] = pd.to_datetime(df_pandas["earliest_pickup_time"])
df_pandas["day_of_week"] = df_pandas["earliest_pickup_time"].dt.dayofweek
df_pandas["is_weekend"] = df_pandas["day_of_week"].isin([5, 6]).astype(int)

# Convert trip time from seconds to hours.
# Handle division by zero: a zero duration becomes NaN so the rate below is
# NaN rather than inf. Assigned back explicitly - calling
# `.replace(..., inplace=True)` on a selected column is chained assignment,
# which warns on pandas 2.x and does nothing under Copy-on-Write.
df_pandas["total_time_hours"] = (df_pandas["total_time_seconds"] / 3600).replace(0, np.nan)

# Compute total driver earnings (including tips)
df_pandas["total_driver_earnings"] = df_pandas["total_driver_pay"] + df_pandas["total_tips"]

# Compute earnings per hour
df_pandas["earnings_per_hour"] = df_pandas["total_driver_earnings"] / df_pandas["total_time_hours"]

In [None]:
# Column names currently present on the engineered frame
list(df_pandas.columns)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Target: earnings per hour, with extreme outliers capped at the 99th percentile
y = df_pandas["earnings_per_hour"]
y = y.clip(upper=y.quantile(0.99))

# Drop unnecessary columns (the target, its direct components, the raw timestamp)
X = df_pandas.drop(columns=["total_driver_earnings", "earnings_per_hour", "earliest_pickup_time", "total_driver_pay", "total_time_seconds"])
# df_pandas is mutated by this cell and by the prediction cell further down;
# on a re-run those derived columns must not leak into the features.
X = X.drop(columns=["PULocationID_encoded", "predicted_earnings_per_hour"], errors="ignore")
# NOTE(review): total_tips and total_time_hours remain features although the
# target is (driver pay + tips) / hours - confirm that is acceptable.

# The raw zone id is replaced by a target-mean encoding below
pickup_zone = X.pop("PULocationID")

# One-hot encode Hvfhs_license_num
X = pd.get_dummies(X, columns=["Hvfhs_license_num"], drop_first=True)
X = X.astype(float)

# Train-test split (same rows as before: the encoding is added afterwards)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode PULocationID using target mean encoding.
# For the evaluation split the means come from TRAINING rows only, so the
# held-out targets never influence their own feature; zones that do not
# appear in the training rows fall back to the overall training mean.
train_zone_mean = y_train.groupby(pickup_zone.loc[X_train.index]).mean()
train_global_mean = y_train.mean()
X_train = X_train.assign(
    PULocationID_encoded=pickup_zone.loc[X_train.index].map(train_zone_mean)
)
X_test = X_test.assign(
    PULocationID_encoded=pickup_zone.loc[X_test.index].map(train_zone_mean).fillna(train_global_mean)
)

# Full-data encoding (computed on the clipped target): stored on df_pandas for
# later cells and added to X, which is what the final model is fitted on.
# Previously the encoding was written to df_pandas only, after X had already
# been built, so X carried no location information at all.
pulo_mean = y.groupby(pickup_zone).mean()
df_pandas["PULocationID_encoded"] = pickup_zone.map(pulo_mean)
X = X.assign(PULocationID_encoded=df_pandas["PULocationID_encoded"])

In [None]:
# Show the feature matrix as the cell output to sanity-check its columns
X

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Candidate regressors; the two boosted models share the same hyper-parameters
boosting_params = dict(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    "XGBoost": XGBRegressor(**boosting_params),
    "GradientBoosting": GradientBoostingRegressor(**boosting_params),
}

# Fit each candidate on the training split and score it on the held-out split
results = {}
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    results[name] = {
        "R² Score": r2_score(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
    }

# One row per model, one column per metric
results_df = pd.DataFrame.from_dict(results, orient="index")

# Display the results
print(results_df)


In [None]:
# GradientBoosting scored best in the comparison above, so it is the model
# used to produce the driver-earnings predictions for the heatmap.

# Refit it on the whole dataset (no hold-out) ...
best_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
).fit(X, y)

# ... and predict earnings_per_hour for every row it was trained on
y_pred = best_model.predict(X)

# Store the predictions next to PULocationID on the main frame
df_pandas["predicted_earnings_per_hour"] = y_pred


In [None]:
# Heatmap input: mean predicted earnings per hour for each pickup zone and
# hour of day (averaging over the remaining rows of each zone/hour pair)
heatmap_keys = ["PULocationID", "hour"]
earnings_by_location_time = df_pandas.groupby(heatmap_keys, as_index=False)[
    "predicted_earnings_per_hour"
].mean()

# Peek at the first few rows
print(earnings_by_location_time.head())

In [None]:
# Regression for NYC Revenue
import pyspark.sql.functions as F

# Keys that identify one row of the grouped frame
group_keys = ["PULocationID", "hour", "Hvfhs_license_num"]

# creating new copy of dataframe
reg2 = df_pandas.copy()
reg2['year'] = reg2['earliest_pickup_time'].dt.year

# congestion_surcharge is selected into combined_df but was never aggregated
# into the grouped frame, so referencing it below raised KeyError. Sum it per
# group with the same keys and attach it here.
if 'congestion_surcharge' not in reg2.columns:
    surcharge_totals = (
        combined_df.groupBy(*group_keys)
        .agg(F.sum("congestion_surcharge").alias("congestion_surcharge"))
        .toPandas()
    )
    # validate: exactly one surcharge row per grouped row, so no rows are
    # duplicated or silently dropped by the join
    reg2 = reg2.merge(surcharge_totals, on=group_keys, how="left", validate="one_to_one")

# calculating city revenue column by hand
# this was the original method (kept for reference; it is not the model target)
# NOTE(review): 0.0875 is presumably a tax rate applied to the base fare and
# 4.25 / 2.75 a flat per-year fee - confirm the source of these constants.
reg2['city_revenue'] = reg2['total_base_fare'] * 0.0875
reg2['city_revenue'] = reg2['city_revenue'] + np.where(reg2['year'] == 2024, 4.25, 2.75)


# getting rid of unnecessary columns and creating x&y variables
# NOTE(review): `y` rebinds the earnings target defined earlier; re-running the
# earnings-model cells after this one would train on city revenue instead.
x = reg2.drop(columns=['predicted_earnings_per_hour','total_time_seconds', 'total_driver_pay', 'city_revenue', 'total_tips', 'is_weekend', 'total_driver_earnings', 'earliest_pickup_time', 'total_base_fare', 'PULocationID', 'congestion_surcharge'])
x = pd.get_dummies(x, columns=["Hvfhs_license_num"], drop_first=True)
print(x.columns)
# Target: summed congestion surcharge plus 8.75% of the summed base fare,
# capped at the 99th percentile
y = reg2['congestion_surcharge'] + reg2['total_base_fare'] * 0.0875
y = y.clip(upper=y.quantile(0.99))


# run model
city_revenue_model = GradientBoostingRegressor(n_estimators=100, learning_rate = 0.1, max_depth=6, random_state=42)
city_revenue_model.fit(x,y)

prediction = city_revenue_model.predict(x)
reg2['predicted_city_revenue_per_hour'] = prediction

# create dataframe for city revenue per hour
city_revenue = (reg2.groupby(['PULocationID','hour'])['predicted_city_revenue_per_hour'].mean().round(3).reset_index())
city_revenue.head()



In [None]:
# Wide view for inspection: one row per pickup zone, one column per hour
revenue_pivot = (
    city_revenue
    .set_index(['PULocationID', 'hour'])['predicted_city_revenue_per_hour']
    .unstack('hour')
)
revenue_pivot