In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay

from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
from xgboost import plot_importance

import joblib

In [19]:
# Load dataset
df0 = pd.read_csv('/content/2017_Yellow_Taxi_Trip_Data.csv')
# Import predicted fares and mean distance
nyc_preds_avg = pd.read_csv('/content/nyc_preds_avgs.csv')

In [20]:
df0 = df0.merge(nyc_preds_avg,
                left_index=True,
                right_index=True)
df0.head()

Unnamed: 0,Unnamed: 0_x,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,Unnamed: 0_y,avg_duration,avg_distance,predicted_fare
0,24870114,2,03/25/2017 8:55:43 AM,03/25/2017 9:09:47 AM,6,3.34,1,N,100,231,...,0.0,0.5,2.76,0.0,0.3,16.56,0,22.847222,3.521667,16.434245
1,35634249,1,04/11/2017 2:53:28 PM,04/11/2017 3:19:58 PM,1,1.8,1,N,186,43,...,0.0,0.5,4.0,0.0,0.3,20.8,1,24.47037,3.108889,16.052218
2,106203690,1,12/15/2017 7:26:56 AM,12/15/2017 7:34:08 AM,1,1.0,1,N,262,236,...,0.0,0.5,1.45,0.0,0.3,8.75,2,7.25,0.881429,7.053706
3,38942136,2,05/07/2017 1:17:59 PM,05/07/2017 1:48:14 PM,1,3.7,1,N,188,97,...,0.0,0.5,6.39,0.0,0.3,27.69,3,30.25,3.7,18.73165
4,30841670,2,04/15/2017 11:32:20 PM,04/15/2017 11:49:03 PM,1,4.37,1,N,4,112,...,0.5,0.5,0.0,0.0,0.3,17.8,4,14.616667,4.435,15.845642


In [21]:
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22699 entries, 0 to 22698
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0_x           22699 non-null  int64  
 1   VendorID               22699 non-null  int64  
 2   tpep_pickup_datetime   22699 non-null  object 
 3   tpep_dropoff_datetime  22699 non-null  object 
 4   passenger_count        22699 non-null  int64  
 5   trip_distance          22699 non-null  float64
 6   RatecodeID             22699 non-null  int64  
 7   store_and_fwd_flag     22699 non-null  object 
 8   PULocationID           22699 non-null  int64  
 9   DOLocationID           22699 non-null  int64  
 10  payment_type           22699 non-null  int64  
 11  fare_amount            22699 non-null  float64
 12  extra                  22699 non-null  float64
 13  mta_tax                22699 non-null  float64
 14  tip_amount             22699 non-null  float64
 15  to

In [22]:
# Subset the dataset only with credit card payments
df1 = df0[df0['payment_type']==1].copy()

In [23]:
# Create 'tip percentage' column
df1['tip_percent'] = round(df1['tip_amount'] / (df1['total_amount'] - df1['tip_amount']), 3)

In [24]:
# Create target column: Generous
df1['generous'] = df1['tip_percent']
df1['generous'] = (df1['generous'] >= 0.2)
df1['generous'] = df1['generous'].astype(int)

In [25]:
# Convert pickup and dropoff cols to datetime
df1['tpep_pickup_datetime'] = pd.to_datetime(df1['tpep_pickup_datetime'], format='%m/%d/%Y %I:%M:%S %p')
df1['tpep_dropoff_datetime'] = pd.to_datetime(df1['tpep_dropoff_datetime'], format='%m/%d/%Y %I:%M:%S %p')

In [26]:
# Create a 'day' column
df1['day'] = df1['tpep_pickup_datetime'].dt.day_name().str.lower()

In [27]:
# Create 'am_rush' col
df1['am_rush'] = df1['tpep_pickup_datetime'].dt.hour

# Create 'daytime' col
df1['daytime'] = df1['tpep_pickup_datetime'].dt.hour

# Create 'pm_rush' col
df1['pm_rush'] = df1['tpep_pickup_datetime'].dt.hour

# Create 'nighttime' col
df1['nighttime'] = df1['tpep_pickup_datetime'].dt.hour

In [28]:
# Define 'am_rush()' conversion function [06:00–10:00)
def am_rush(hour):
    if 6 <= hour['am_rush'] < 10:
        val = 1
    else:
        val = 0
    return val

In [29]:
# Apply 'am_rush' function to the 'am_rush' series
df1['am_rush'] = df1.apply(am_rush, axis=1)
df1['am_rush'].head()

Unnamed: 0,am_rush
0,1
1,0
2,1
3,0
5,0


In [30]:
# Define 'daytime()' conversion function [10:00–16:00)
def daytime(hour):
    if 10 <= hour['daytime'] < 16:
        val = 1
    else:
        val = 0
    return val

# Apply 'daytime' function to the 'daytime' series
df1['daytime'] = df1.apply(daytime, axis=1)

In [31]:
# Define 'pm_rush()' conversion function [16:00–20:00)
def pm_rush(hour):
    if 16 <= hour['pm_rush'] < 20:
        val = 1
    else:
        val = 0
    return val

# Apply 'pm_rush' function to the 'pm_rush' series
df1['pm_rush'] = df1.apply(pm_rush, axis=1)

In [32]:
# Define 'nighttime()' conversion function [20:00–06:00)
def nighttime(hour):
    if 20 <= hour['nighttime'] < 24:
        val = 1
    elif 0 <= hour['nighttime'] < 6:
        val = 1
    else:
        val = 0
    return val

# Apply 'nighttime' function to the 'nighttime' series
df1['nighttime'] = df1.apply(nighttime, axis=1)

In [33]:
# Create 'month' col
df1['month'] = df1['tpep_pickup_datetime'].dt.strftime('%b').str.lower()

In [34]:
# Drop columns
drop_cols = ['Unnamed: 0_x', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
             'payment_type', 'trip_distance', 'store_and_fwd_flag', 'payment_type',
             'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
             'improvement_surcharge', 'total_amount', 'tip_percent', 'Unnamed: 0_y',
             'RatecodeID', 'PULocationID', 'DOLocationID', 'VendorID',
             'avg_duration', 'predicted_fare']

df1 = df1.drop(drop_cols, axis=1)
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15265 entries, 0 to 22698
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   passenger_count  15265 non-null  int64  
 1   avg_distance     15265 non-null  float64
 2   generous         15265 non-null  int64  
 3   day              15265 non-null  object 
 4   am_rush          15265 non-null  int64  
 5   daytime          15265 non-null  int64  
 6   pm_rush          15265 non-null  int64  
 7   nighttime        15265 non-null  int64  
 8   month            15265 non-null  object 
dtypes: float64(1), int64(6), object(2)
memory usage: 1.2+ MB


In [35]:
# Convert categoricals to binary
df2 = pd.get_dummies(df1, drop_first=True)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15265 entries, 0 to 22698
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   passenger_count  15265 non-null  int64  
 1   avg_distance     15265 non-null  float64
 2   generous         15265 non-null  int64  
 3   am_rush          15265 non-null  int64  
 4   daytime          15265 non-null  int64  
 5   pm_rush          15265 non-null  int64  
 6   nighttime        15265 non-null  int64  
 7   day_monday       15265 non-null  bool   
 8   day_saturday     15265 non-null  bool   
 9   day_sunday       15265 non-null  bool   
 10  day_thursday     15265 non-null  bool   
 11  day_tuesday      15265 non-null  bool   
 12  day_wednesday    15265 non-null  bool   
 13  month_aug        15265 non-null  bool   
 14  month_dec        15265 non-null  bool   
 15  month_feb        15265 non-null  bool   
 16  month_jan        15265 non-null  bool   
 17  month_jul        

In [36]:
# Isolate target variable (y)
y = df2['generous']

# Isolate the features (X)
X = df2.drop('generous', axis=1)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [37]:
final_model = RandomForestClassifier(max_depth= None,
 max_features= 1.0,
 max_samples= 0.7,
 min_samples_leaf= 1,
 min_samples_split= 2,
 n_estimators= 300, random_state=42)
final_model.fit(X_train, y_train)

In [38]:
y_pred = final_model.predict(X_test)

In [39]:
accuracy_score = round(final_model.score(X, y) * 100, 2)
print("Accuracy of LogisticRegression for train set:", accuracy_score, "%")

Accuracy of LogisticRegression for train set: 90.08 %


In [40]:
confusion_matrix(y_test, y_pred)

array([[689, 757],
       [684, 923]])

In [41]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.48      0.49      1446
           1       0.55      0.57      0.56      1607

    accuracy                           0.53      3053
   macro avg       0.53      0.53      0.53      3053
weighted avg       0.53      0.53      0.53      3053



In [43]:
# Save model
joblib.dump(final_model, "nextgentipper_model_v2.pkl")

['nextgentipper_model_v2.pkl']

In [42]:
X_train.shape

(12212, 23)