# 04) Modeling

Here you will find the different models built in order to attempt predicting taxi fares. 
They include: 
- 4.01 GradientBoost (winner)
- 4.02 RandomForest
- 4.03 LinearRegression
- 4.04 XGBoost 

## 4.01 GradientBoost

In [3]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle


from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor

pd.set_option('display.max_columns', None)

In [4]:
#In order to run the model faster, I had to sample it down to 700,000 instead of nearly 2M
new_df = pd.read_csv('../data/clean/041324_taxi_recs.csv')
new_df = new_df.set_index('req_index')
new_df = new_df.sample(n=700000, random_state=2024)
print(new_df.shape)
new_df.head()

(700000, 14)


Unnamed: 0_level_0,trip_miles,tips,congestion_surcharge,temp,preciptype,zone,borough_name,trip_duration,month,day_of_month,driver_made,day_of_week,hour,minute
req_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-10-19,2.38,5.0,0.0,48.9,0,Saint George/New Brighton,Staten Island,9.0,10,19,14.82,Wednesday,14,27
2022-07-15,10.92,0.0,0.0,78.3,0,Old Astoria,Queens,35.0,7,15,30.91,Friday,22,24
2022-03-09,1.26,0.0,0.0,37.6,3,Flushing,Queens,6.0,3,9,6.32,Wednesday,15,51
2022-12-03,1.53,0.0,2.75,52.7,1,Upper East Side North,Manhattan,7.0,12,3,5.88,Saturday,20,56
2022-11-20,0.7,0.0,0.0,34.9,2,Greenpoint,Brooklyn,3.0,11,20,6.15,Sunday,10,18


In [5]:
new_df.dtypes

trip_miles              float64
tips                    float64
congestion_surcharge    float64
temp                    float64
preciptype                int64
zone                     object
borough_name             object
trip_duration           float64
month                     int64
day_of_month              int64
driver_made             float64
day_of_week              object
hour                      int64
minute                    int64
dtype: object

In [6]:
new_df = new_df.sort_index()
new_df.shape

(700000, 14)

**Note**: By sampling down the data using .sample(), one runs the risk of the training data not including all zones in the testing data. Should this happen, tune the .sample() size and pull again.

In [40]:
#Attempthing Gradient Boost

#setting X, y
X = new_df.drop(columns=['driver_made', 'tips',	'congestion_surcharge'])
y = new_df['driver_made']

#tts
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2024)

# Instantiating
gb = GradientBoostingRegressor(random_state=2024)
ohe = OneHotEncoder()

cat_var = ['borough_name', 'zone', 'day_of_week']
ct = ColumnTransformer([
    ('ohe', OneHotEncoder(), cat_var)], remainder ='passthrough')

pipe = Pipeline([('ct', ct), ('gb', gb)])

#Fitting
pipe.fit(X_train, y_train)

In [41]:
X_test.shape

(175000, 11)

In [42]:
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.8696095556417843, 0.8695550482254655)

In [79]:
with open('../Taxis/taxi_model.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [5]:
#Attempthing Gradient Boost with more data

#Setting X2, y2
X2 = new_df.drop(columns=['driver_made', 'tips'])
y2 = new_df['driver_made']

#Train Test Split
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=2024)

#Instantiating GradientBoost and Preprocessor
gb = GradientBoostingRegressor(random_state=2024)
ohe = OneHotEncoder()

#Creating my column transformer
cat_var = ['borough_name', 'zone', 'day_of_week']
ct = ColumnTransformer([
    ('ohe', OneHotEncoder(), cat_var)], remainder ='passthrough')

#Creating my pipeline
pipe2 = Pipeline([('ct', ct), ('gb', gb)])

#Fitting
pipe2.fit(X2_train, y2_train)

In [6]:
print(X2_train.shape)
print(X2_test.shape)
print(y2_train.shape)
print(y2_test.shape)

(525000, 12)
(175000, 12)
(525000,)
(175000,)


In [11]:
#Baseline
print(y2.mean())

#Model 
print(test_preds.mean())

19.925043828571425
19.892313136410248


In [8]:
#Predictions
train_preds = pipe2.predict(X2_train)
test_preds = pipe2.predict(X2_test)

# Evaluate the model
train_rmse = np.sqrt(mean_squared_error(y2_train, train_preds))
test_rmse = np.sqrt(mean_squared_error(y2_test, test_preds))

print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)


Train RMSE: 5.996746393279121
Test RMSE: 6.030936149491458


In [14]:
#Getting the scores
pipe2.score(X2_train, y2_train), pipe2.score(X2_test, y2_test)

(0.8688101985791591, 0.8742563177724201)

In [15]:
#Saving the model
with open('../final_model.pkl', 'wb') as a:
    pickle.dump(pipe2, a)

## 4.02 RandomForest

In [16]:
#Importing the models
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [None]:
#Creating a smaller dataset to make it quicker
new_df = new_df.sample(n=1000000)

In [18]:
#Make X, y
X = new_df.drop(columns=['driver_made', 'tips'])
y = new_df['driver_made']

#TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2024)

#Instantiating
ohe = OneHotEncoder()
sc = StandardScaler()
rf = RandomForestRegressor()

cat_var = ['borough_name', 'zone', 'day_of_week']

ct = ColumnTransformer([
    ('ohe', OneHotEncoder(), cat_var)], remainder ='passthrough')

#Building the pipeline
pipe = Pipeline([
    ('ct', ct),
    ('sc', StandardScaler(with_mean=False)),
    ('rf', RandomForestRegressor(n_estimators=200, 
                                 max_depth=30, min_samples_split=.05,
                                max_features='sqrt', n_jobs=4))
     ])

#Fitting the model
pipe.fit(X_train, y_train)

#Looking at r2 scores
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.6265111574769324, 0.6082260837883272)

In [19]:
#Feature engineering/tuning to build pipe2
pipe2 = Pipeline([
    ('ct', ct),
    ('sc', StandardScaler(with_mean=False)),
    ('rf', RandomForestRegressor(n_estimators=250, 
                                 max_depth=30, min_samples_split=300,
                                 max_features='sqrt', n_jobs=4))
     ])

#Fitting
pipe2.fit(X_train, y_train)

#Looking at r2 scores
pipe2.score(X_train, y_train), pipe2.score(X_test, y_test)

(0.812658200192468, 0.7868359562917979)

## 4.03 LinearRegression

In [7]:
#Imports I'll need
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn import metrics

In [None]:
#Creating a smaller dataset
new_df = new_df.sample(n=500000, random_state=2024)

In [8]:
#Ordinal mapping to get numerics

#Days of week
days = {'Sunday': 1, 'Monday': 2, 'Tuesday': 3, 'Wednesday': 4, 'Thursday': 5, 'Friday': 6, 'Saturday': 7}
new_df['day_of_week'] = new_df['day_of_week'].map(days)

#Boroughs
boroughs = {'Manhattan': 1, 'Brooklyn': 2, 'Queens': 3, 'Bronx': 4, 'Staten Island': 5}
new_df['borough_name'] = new_df['borough_name'].map(boroughs)

In [9]:
#Dummifying taxi zones
new_df = pd.get_dummies(columns=['zone'], data=new_df)

In [10]:
#Setting my X, y
new_df = new_df.dropna()
X = new_df.drop(columns=['driver_made', 'tips'])
y = new_df['driver_made']

#Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2024)

#Checking the baseline
y_train.mean()

19.938961044651858

In [11]:
#Instantiating preprocessor and model
sc = StandardScaler()
lr = LinearRegression()

#Building the pipe
pipe = Pipeline([
    ('sc', StandardScaler()),
    ('lr', LinearRegression())
])

#Fitting the pipe
pipe.fit(X_train, y_train)

#Testing the pipe
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.8595608222393563, -6.877529910322985e+19)

In [12]:
# More Feature Engineering/ building a pipe with less features

#Creating new X with less features
X2 = new_df.drop(columns=['driver_made', 'congestion_surcharge', 'tips'])
y= new_df['driver_made']

#Perfomring tts
X2_train, X2_test, y_train, y_test = train_test_split(X2, y, random_state=2024)


#Fitting the new tts
pipe.fit(X2_train, y_train)

#Getting Score
pipe.score(X2_test, y_test), pipe.score(X2_test, y_test)


(-3.12995915472969e+19, -3.12995915472969e+19)

In [33]:
# Trying a LassoCV

# Set up a list of Lasso alphas to check.
l_alphas = np.logspace(-3, 0, 100)

# Cross-validate over our list of Lasso alphas.
lasso_cv = LassoCV(alphas=l_alphas, cv=5, max_iter=10)

sc = StandardScaler()

pipe2 = Pipeline([
    ('sc', StandardScaler()),
    ('lasso_cv', lasso_cv)
])

# Fit model using best ridge alpha!
pipe2.fit(X_train, y_train)

#Checking the scores
pipe2.score(X_test, y_test), pipe2.score(X_test, y_test)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


(0.8602664886753134, 0.8602664886753134)

In [34]:
#Trying a RidgeRegression

#Setting alphas
r_alphas = np.logspace(0, 5, 100)

# Cross-validate over our list of ridge alphas.
ridge_cv = RidgeCV(alphas=r_alphas, scoring='r2', cv=5)

sc = StandardScaler()

pipe3 = Pipeline([
    ('sc', StandardScaler()),
    ('ridge_cv', ridge_cv)
])

# Fit model using best ridge alpha!
pipe3.fit(X_train, y_train)

#Getting score
pipe3.score(X_test, y_test), pipe3.score(X_test, y_test)

(0.8602934316071456, 0.8602934316071456)

In [35]:
#Checking the best alpha_
ridge_cv.alpha_

148.4968262254465

## 4.04 XGBoost

In [36]:
#Importing model
import xgboost as xgb
from xgboost.sklearn import XGBRegressor



In [39]:
#Making X, y
X = new_df.drop(columns=['driver_made', 'tips'])
y = new_df['driver_made']

#Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2024)

#Instantiating preprocessor
ohe = OneHotEncoder() 

#Creating pipeline and ColumnTransformer for preprocessing
cat_var = ['borough_name', 'zone', 'day_of_week']
cat_pipe = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

ct = ColumnTransformer([
    ('cat_pipe', cat_pipe, cat_var)])

#Instantiating XGBoost
xgb = XGBRegressor(n_estimators=250, 
                   max_depth=30, min_samples_split=200,
                   max_features=['sqrt', 'log2'], n_jobs=4, random_state=2024, 
                   enable_categorical=True)

#Building the pipeline of transformers and the model
pipe = Pipeline([
    ('ct', ct),
    ('xgb', xgb)
])

#Fitting the model
pipe.fit(X_train, y_train)

Parameters: { "max_features", "min_samples_split" } are not used.



In [40]:
#Checking the r2 scores
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.1254790827448825, 0.11795497969270252)

In [41]:
#Checking RMSE scores

#Predictions
train_preds = pipe.predict(X_train)
test_preds = pipe.predict(X_test)

#Evaluate the model
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

#Printing scores
print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)

Train RMSE: 15.532526394310093
Test RMSE: 15.952227323666603


In [42]:
#Feature engineering/tuning parameter for XGB2 model

#Instantiatingnew model and params
xgb2 = XGBRegressor(n_estimators=300, 
                   max_depth=20, min_samples_split=200,
                   max_features=['sqrt'], n_jobs=4, random_state=2024, 
                   enable_categorical=True)

#Building a second pipeline
pipe2 = Pipeline([
    ('ct', ct),
    ('xgb2', xgb2)
])

#Fitting to new pipe
pipe2.fit(X_train, y_train)

Parameters: { "max_features", "min_samples_split" } are not used.



In [43]:
#Checking XGB2 RMSE scores

#Predictions
train_preds = pipe2.predict(X_train)
test_preds = pipe2.predict(X_test)

# Evaluate the model
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)

#r2 scores
print(pipe2.score(X_train, y_train), pipe2.score(X_test, y_test))

Train RMSE: 15.532526412350197
Test RMSE: 15.952228991805969
0.12547908071347447 0.11795479522014352


In [44]:
#Feature engineering/tuning parameter for XGB3/pipe3 model

#Instantiating XGB3 with new params
xgb3 = XGBRegressor(n_estimators=500, 
                   max_depth=10, min_samples_split=200,
                   min_child_weight=1, random_state=2024, 
                   enable_categorical=True)

#Building pipe3
pipe3 = Pipeline([
    ('ct', ct),
    ('xgb3', xgb3)
])

#Fitting to pipe3
pipe3.fit(X_train, y_train)

Parameters: { "min_samples_split" } are not used.



In [45]:
#Checking XGB2 RMSE scores

#Predictions
train_preds = pipe3.predict(X_train)
test_preds = pipe3.predict(X_test)

# Evaluate the model
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)

#r2 scores
print(pipe3.score(X_train, y_train), pipe3.score(X_test, y_test))

Train RMSE: 15.5325289502805
Test RMSE: 15.952005176868525
0.1254787949295313 0.11797954580610048
