In [None]:
#download from Kaggle
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d jeffsinsel/nyc-fhvhv-data

In [None]:
# Alternative method to download data

import kagglehub

# Download latest version
path = kagglehub.dataset_download("jeffsinsel/nyc-fhvhv-data")

print("Path to dataset files:", path)
!mv {path}/* /content

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
import glob
import pandas as pd
import numpy as np

# initialize spark
spark = SparkSession.builder.appName("NYC_Rides").getOrCreate()

# list of all parquet files from content folder
parquet_files = glob.glob('/content/drive/My Drive/Team86DemoData/*.parquet')

removed_summary = {}
combined_df = None

for file in parquet_files:
    # read parquet files get features
    df = spark.read.parquet(file).select(
        "Pickup_datetime", "DropOff_datetime", "PULocationID", "DOLocationID",
        "base_passenger_fare", "trip_miles", "tips", "driver_pay", "trip_time",
        "Hvfhs_license_num", "congestion_surcharge"
    )

    initial_count = df.count()

    # Clean data by filtering out null and invalid values
    df_clean = df.filter(
        (F.col("Pickup_datetime").isNotNull()) &
        (F.col("DropOff_datetime").isNotNull()) &
        (F.col("PULocationID").isNotNull()) &
        (F.col("DOLocationID").isNotNull()) &
        (F.col("base_passenger_fare").isNotNull()) &
        (F.col("trip_miles").isNotNull()) &
        (F.col("tips").isNotNull()) &
        (F.col("driver_pay").isNotNull()) &
        (F.col("trip_time").isNotNull()) &
        (F.col("Hvfhs_license_num").isNotNull()) &
        (F.col("congestion_surcharge").isNotNull()) &


        # Ensure numeric columns have valid positive values
        (F.col("base_passenger_fare") > 0) &
        (F.col("trip_miles") > 0) &
        (F.col("driver_pay") > 0) &
        (F.col("trip_time") > 0)
    )

    clean_count = df_clean.count()
    removed_summary[file] = initial_count - clean_count

    # combine clean DFs into one big DF
    if combined_df is None:
        combined_df = df_clean
    else:
        combined_df = combined_df.union(df_clean)

# check how many of which file were removed
print("Removal Summary:", removed_summary)

Removal Summary: {'/content/drive/My Drive/Team86DemoData/fhvhv_tripdata_2022-09.parquet': 85805, '/content/drive/My Drive/Team86DemoData/fhvhv_tripdata_2022-10.parquet': 159570}


In [6]:
# calculate how many total rows of data were removed
tot = 0
for key,val in removed_summary.items():
  tot += int(val)

count = combined_df.count()
percent_removed = tot/count * 100

print("{}% was removed from a total of {} rides in 3 months".format(percent_removed, count))

0.6657980924107945% was removed from a total of 36854266 rides in 46 months


In [22]:
# aggregate features for Tableau file

add_hour = combined_df.withColumn("hour", F.date_format(F.col("Pickup_datetime"), "k"))

tableau_df = add_hour.groupBy(["PULocationID", "hour", "Hvfhs_license_num"]).agg(F.count("PULocationID").alias("num_rides"),F.mean("trip_miles").alias("avg_trip_miles"), F.mean("driver_pay").alias("avg_driver_pay"))


tableau_df = tableau_df.withColumn("rideType", F.when(F.col("Hvfhs_license_num") == "HV0002", "Juno")
     .when(F.col("Hvfhs_license_num") == "HV0003", "Uber")
     .when(F.col("Hvfhs_license_num") == "HV0004", "Via")
     .when(F.col("Hvfhs_license_num") == "HV0005", "Lyft"))


tableau_df.toPandas().to_csv('/content/drive/MyDrive/Team86DemoData/DemoFile1.csv')

In [9]:
!pip install category_encoders


Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [10]:
from pyspark.sql.functions import hour

# Extract Hour from Pickup_datetime
combined_df = combined_df.withColumn("hour", hour("Pickup_datetime"))


In [11]:
combined_df.show(5)
combined_df.printSchema()

+-------------------+-------------------+------------+------------+-------------------+----------+----+----------+---------+-----------------+--------------------+----+
|    Pickup_datetime|   DropOff_datetime|PULocationID|DOLocationID|base_passenger_fare|trip_miles|tips|driver_pay|trip_time|Hvfhs_license_num|congestion_surcharge|hour|
+-------------------+-------------------+------------+------------+-------------------+----------+----+----------+---------+-----------------+--------------------+----+
|2022-09-01 00:03:50|2022-09-01 00:38:49|         114|         265|              60.46|      8.78| 0.0|     33.72|     2099|           HV0003|                 0.0|   0|
|2022-09-01 00:06:31|2022-09-01 00:11:35|         208|         208|               8.69|      0.66| 0.0|      6.71|      304|           HV0003|                 0.0|   0|
|2022-09-01 00:14:17|2022-09-01 00:27:12|         247|         244|              13.38|      2.48| 0.0|     10.74|      775|           HV0003|             

In [12]:
from pyspark.sql.functions import col, sum, min

# Group by Pickup Zone, Hour, and Ride-Hailing Company to Aggregate Driver Pay
df_grouped = combined_df.groupBy("PULocationID", "hour", "Hvfhs_license_num").agg(
    sum("driver_pay").alias("total_driver_pay"),
    sum("tips").alias("total_tips"),
    sum("trip_miles").alias("total_miles"),
    sum("trip_time").alias("total_time_seconds"),
    sum("base_passenger_fare").alias("total_base_fare"),
    min("Pickup_datetime").alias("earliest_pickup_time")
)


In [23]:


# Convert to Pandas dataframe

df = df_grouped.toPandas()



In [24]:


df_pandas = df.copy()

# Add relevant variables
df_pandas["earliest_pickup_time"] = pd.to_datetime(df_pandas["earliest_pickup_time"])
df_pandas["day_of_week"] = df_pandas["earliest_pickup_time"].dt.dayofweek
df_pandas["is_weekend"] = df_pandas["day_of_week"].isin([5, 6]).astype(int)

# Convert trip time from seconds to hours
df_pandas["total_time_hours"] = df_pandas["total_time_seconds"] / 3600

# Handle division by zero
df_pandas["total_time_hours"].replace(0, np.nan, inplace=True)

# Compute total driver earnings (including tips)
df_pandas["total_driver_earnings"] = df_pandas["total_driver_pay"] + df_pandas["total_tips"]

# Compute earnings per hour
df_pandas["earnings_per_hour"] = df_pandas["total_driver_earnings"] / df_pandas["total_time_hours"]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_pandas["total_time_hours"].replace(0, np.nan, inplace=True)


In [25]:
df_pandas.columns.tolist()

['PULocationID',
 'hour',
 'Hvfhs_license_num',
 'total_driver_pay',
 'total_tips',
 'total_miles',
 'total_time_seconds',
 'total_base_fare',
 'earliest_pickup_time',
 'day_of_week',
 'is_weekend',
 'total_time_hours',
 'total_driver_earnings',
 'earnings_per_hour']

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Drop unnecessary columns
X = df_pandas.drop(columns=["total_driver_earnings", "earnings_per_hour", "earliest_pickup_time", "total_driver_pay", "total_time_seconds"])
y = df_pandas["earnings_per_hour"]

# Encode PULocationID using target mean encoding
pulo_mean = df_pandas.groupby("PULocationID")["earnings_per_hour"].mean()
df_pandas["PULocationID_encoded"] = df_pandas["PULocationID"].map(pulo_mean)
X = X.drop(columns=["PULocationID"])

# One-hot encode Hvfhs_license_num
X = pd.get_dummies(X, columns=["Hvfhs_license_num"], drop_first=True)
X = X.astype(float)

# Fix extreme outliers in target variable
y = y.clip(upper=y.quantile(0.99))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X

In [17]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Train different models to choose the best performing one

models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    results[name] = {"R² Score": r2, "MAE": mae}

# Convert results dictionary to DataFrame
results_df = pd.DataFrame(results).T

# Display the results
print(results_df)


                  R² Score       MAE
RandomForest      0.795693  2.739237
XGBoost           0.856102  2.251448
GradientBoosting  0.867796  2.189844


In [18]:
#GradientBoosting is the best model, therefore I'll use it to predict driver earnings for your heatmap

# Train GradientBoosting with the whole dataset
best_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
best_model.fit(X, y)

# Predict earnings_per_hour
y_pred = best_model.predict(X)

# Attach Predictions to PULocationID
df_pandas["predicted_earnings_per_hour"] = y_pred


In [20]:
# Create a dataframe with PULocationID and predicted_earnings_per_hour for heatmap

# Group by PULocationID and Hour to Get Mean Earnings Per Hour
earnings_by_location_time = (
    df_pandas.groupby(["PULocationID", "hour"])["predicted_earnings_per_hour"]
    .mean()
    .reset_index()
)

# Display first few rows of Results
print(earnings_by_location_time.head())

#write results file
earnings_by_location_time.to_csv('/content/drive/MyDrive/Team86DemoData/DemoFile2.csv')

   PULocationID  hour  predicted_earnings_per_hour
0             1     8                    83.644539
1             2     0                    75.853476
2             2     2                    59.163087
3             2     4                    85.290625
4             2     5                    66.553821


In [26]:
df_grouped = combined_df.groupBy("PULocationID", "hour", "Hvfhs_license_num").agg(
    sum("trip_miles").alias("total_miles"),
    sum("trip_time").alias("total_time_seconds"),
    sum("base_passenger_fare").alias("total_base_fare"),
    min("Pickup_datetime").alias("earliest_pickup_time"),
    sum("congestion_surcharge").alias("total_congestion_surcharge")
)

In [27]:
pand_df = df_grouped.toPandas()

In [28]:
pand_df['year'] = pand_df['earliest_pickup_time'].dt.year
pand_df["day_of_week"] = pand_df["earliest_pickup_time"].dt.dayofweek
print(pand_df.columns)

Index(['PULocationID', 'hour', 'Hvfhs_license_num', 'total_miles',
       'total_time_seconds', 'total_base_fare', 'earliest_pickup_time',
       'total_congestion_surcharge', 'year', 'day_of_week'],
      dtype='object')


In [30]:
pand_df['city_revenue'] = (pand_df['total_base_fare'] * 0.0875) + pand_df['total_congestion_surcharge']
pand_df['city_revenue_per_hour'] = pand_df['city_revenue'] / (pand_df['total_time_seconds'] / 3600 )

In [31]:
# Regression for NYC Revenue


# creating new copy of dataframe
reg2 = pand_df.copy()


x = reg2.drop(columns=['total_time_seconds', 'total_miles', 'total_base_fare', 'total_congestion_surcharge', 'earliest_pickup_time', 'city_revenue', 'city_revenue_per_hour'])
x = pd.get_dummies(x, columns=["Hvfhs_license_num"])
x = pd.get_dummies(x, columns=["day_of_week"])
x = pd.get_dummies(x, columns=["hour"])
pulo_mean = pand_df.groupby("PULocationID")["city_revenue_per_hour"].mean()
pand_df["PULocationID_encoded"] = pand_df["PULocationID"].map(pulo_mean)
x = x.drop(columns=["PULocationID"])
print(x.columns)

y = reg2['city_revenue']
y = y.clip(upper=y.quantile(0.99))

# run model
city_revenue_model = GradientBoostingRegressor(n_estimators=100, learning_rate = 0.1, max_depth=6, random_state=42)
city_revenue_model.fit(x,y)

prediction = city_revenue_model.predict(x)
reg2['predicted_city_revenue_per_hour'] = prediction
reg2['predicted_city_revenue_per_hour'] = np.where(reg2['predicted_city_revenue_per_hour'] < 0 , 0, reg2['predicted_city_revenue_per_hour'])

# create dataframe for city revenue per hour
city_revenue = (reg2.groupby(['PULocationID','hour'])['predicted_city_revenue_per_hour'].mean().round(3).reset_index())
city_revenue.head()



Index(['year', 'Hvfhs_license_num_HV0003', 'Hvfhs_license_num_HV0005',
       'day_of_week_0', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3',
       'day_of_week_4', 'day_of_week_5', 'day_of_week_6', 'hour_0', 'hour_1',
       'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8',
       'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14',
       'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20',
       'hour_21', 'hour_22', 'hour_23'],
      dtype='object')


Unnamed: 0,PULocationID,hour,predicted_city_revenue_per_hour
0,1,8,395.221
1,2,0,39.392
2,2,2,0.0
3,2,4,0.0
4,2,5,0.0


In [32]:
#pivot column to also visualize data
revenue_pivot = city_revenue.pivot(index='PULocationID', columns='hour', values='predicted_city_revenue_per_hour')
revenue_pivot

hour,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
PULocationID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,395.221,,...,,,,,,,,,,
2,39.392,,0.000,,0.000,0.000,0.000,68.947,3004.526,919.825,...,331.699,260.702,323.882,149.792,3615.070,209.356,3628.154,250.048,150.741,
3,9623.642,6829.071,5225.434,4250.026,4172.220,4402.214,6626.251,9289.682,10832.814,10349.965,...,11029.576,11503.352,11694.529,12979.763,13582.450,13405.357,12677.437,12477.297,12811.713,12256.789
4,9623.642,6829.071,5225.434,4250.026,4172.220,4402.214,6626.251,9289.682,10832.814,10349.965,...,11029.576,11503.352,11694.529,12979.763,13582.450,13405.357,12677.437,12477.297,12811.713,12256.789
5,9623.642,4930.369,1536.602,104.486,2893.944,1366.136,6626.251,6921.315,10832.814,10349.965,...,11029.576,11503.352,11694.529,12979.763,13582.450,13405.357,12677.437,9148.526,9373.879,8912.840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,9623.642,6829.071,5225.434,4250.026,4172.220,4402.214,6626.251,9289.682,10832.814,10349.965,...,11029.576,11503.352,11694.529,12979.763,13582.450,13405.357,12677.437,12477.297,12811.713,12256.789
261,9623.642,6829.071,5225.434,4250.026,4172.220,4402.214,6626.251,9289.682,10832.814,10349.965,...,11029.576,11503.352,11694.529,12979.763,13582.450,13405.357,12677.437,12477.297,12811.713,12256.789
262,9623.642,6829.071,5225.434,4250.026,4172.220,4402.214,6626.251,9289.682,10832.814,10349.965,...,11029.576,11503.352,11694.529,12979.763,13582.450,13405.357,12677.437,12477.297,12811.713,12256.789
263,9623.642,6829.071,5225.434,4250.026,4172.220,4402.214,6626.251,9289.682,10832.814,10349.965,...,11029.576,11503.352,11694.529,12979.763,13582.450,13405.357,12677.437,12477.297,12811.713,12256.789


In [33]:
city_revenue.to_csv('/content/drive/My Drive/Team86DemoData/DemoFile3.csv')