In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split


In [2]:
# Setting random seed for reproducibility
np.random.seed(42)


In [3]:
year = 2021
df = pd.read_csv(os.path.join(str(year), "./chicago_taxi_cleaned.csv"))
print("There are {} data points".format(len(df)))
df.columns

There are 3299523 data points


Index(['Trip ID', 'Taxi ID', 'Trip Start Timestamp', 'Trip End Timestamp',
       'Trip Seconds', 'Trip Miles', 'Pickup Census Tract',
       'Dropoff Census Tract', 'Pickup Community Area',
       'Dropoff Community Area', 'Fare', 'Tips', 'Tolls', 'Extras',
       'Trip Total', 'Payment Type', 'Company', 'Pickup Centroid Latitude',
       'Pickup Centroid Longitude', 'Pickup Centroid Location',
       'Dropoff Centroid Latitude', 'Dropoff Centroid Longitude',
       'Dropoff Centroid  Location'],
      dtype='object')

In [4]:
# Converting trip start and trip end into datetime + extracting time information

dtformat = "%m/%d/%Y %I:%M:%S %p"
tformat = '%H:%M:%S'
dformat = "%m/%d/%Y"

df["Trip Start Timestamp"] = pd.to_datetime(df["Trip Start Timestamp"], format=dtformat)
df["Trip End Timestamp"] = pd.to_datetime(df["Trip End Timestamp"], format=dtformat)


df["Trip Start Time"] = df["Trip Start Timestamp"].dt.time
df["Trip End Time"] = df["Trip End Timestamp"].dt.time

df['date'] = df['Trip Start Timestamp'].dt.date
df['year'] = df['Trip Start Timestamp'].dt.year
df['month'] = df['Trip Start Timestamp'].dt.month
df['day'] = df['Trip Start Timestamp'].dt.day
df['hour'] = df['Trip Start Timestamp'].dt.hour
df['weekday'] = df['Trip Start Timestamp'].dt.day_name()


According to the website from the chicago government https://www.chicago.gov/content/dam/city/depts/bacp/publicvehicleinfo/Chicabs/chicagotaxiplacard20200629.pdf the basic fare is calculated as follow:

- Base Fare \$3.25 
- Each additional mile \$2.25
- Every 36 seconds of elapsed time \$0.20
- First additional passenger (aged 13 through 64) \$1.00    
    - Each additional passenger \$0.50
- Convenience Fee for electronic payment \$0.50
- Vomit Clean-up Fee \$50.00
- Illinois Airport Departure Tax \$4.00 (for taxi leaving the airports)

Clearly the variable fare is affected by several factors:

- Whether payment is made in cash, or other electronic payment for which \$0.50 is incurred
- Whether pickup / dropoff involves the Chicago airport
- Trip miles 
- Trip seconds

We first perform a simple train-val-test split, allocating 30 \% of the dataset as the test-set and classify based on a simple rule-based method:

Fare = 3.25  + trip_miles * 2.25 + (trip_seconds / 36) * 0.20 + 0.50 * credit_card_payment + 4 * airport_trip

In the event where pickup or dropoff community area is not given, we will impute it with the average number of airport trip in the training data

To assess the performance of our classifiers, we are going to use the RMSE (root mean squared error) metric defined as:

$$ \text{RMSE} = \sqrt{\frac{\sum_{i=1}^{N}(\hat{y}_{i}-y_{i})^{2}}{N}} $$

where $\hat{y_{i}}$ and $y_{i}$ are the predicted and actual fare of the $i$-th data point out of $N$ data points. This intuitively measure the average difference between the predicted and measured fare

Alternatively, we can also measure the mean absolute error (MAE)

$$ \text{MAE} = \frac{\sum_{i=1}^{N}|y_{i} - \hat{y}_{i}|}{N} $$

In [5]:
# We set aside 30% of the data to be the test set 
# Then we further split 25% of the remaining data to be the validation set

train_val_df, test_df = train_test_split(df, test_size=0.3)
train_df, val_df = train_test_split(train_val_df, test_size=0.25)
print(len(train_df), len(val_df), len(test_df))

1732249 577417 989857


In [6]:
# Importing metric function and defining benchmark classifier calculation
# We give the option to include airport or credit card calculation for benchmark

from sklearn.metrics import mean_squared_error, mean_absolute_error

def calculate_benchmark_classifier(df, include_cc=True, include_airport=True):
    is_airport = ((df["Pickup Community Area"] == 76) | (df["Dropoff Community Area"] == 76))
    fare = 3.25 + df["Trip Miles"] * 2.25 + df["Trip Seconds"] / 36 * 0.20
    if include_cc:
        fare += (df["Payment Type"] != 'Cash') * 0.50
    if include_airport:
        fare += is_airport * 4
    return fare.values


def calculate_metric(y_true, y_pred, header='Validation'):
    print("{} results as follow:".format(header))
    print("RMSE of {}".format(mean_squared_error(y_true, y_pred, squared=False)))
    print("MAE of {}".format(mean_absolute_error(y_true, y_pred)))
    print("################")

In [14]:
# Setting squared=False to mean_squared_error gives us the RMSE

benchmark_prediction_no_cc_no_airport = calculate_benchmark_classifier(val_df, 
                                                                       include_cc=False,
                                                                       include_airport=False)
benchmark_prediction_cc_no_airport = calculate_benchmark_classifier(val_df, 
                                                                    include_cc=True,
                                                                    include_airport=False)

benchmark_prediction_no_cc_airport = calculate_benchmark_classifier(val_df, 
                                                                    include_cc=False,
                                                                    include_airport=True)

benchmark_prediction_cc_airport = calculate_benchmark_classifier(val_df, 
                                                                 include_cc=True,
                                                                 include_airport=True)

calculate_metric(val_df["Fare"], benchmark_prediction_no_cc_no_airport, 
                 header="Validation Benchmark No CC no Airport")
calculate_metric(val_df["Fare"], benchmark_prediction_cc_no_airport, 
                 header="Validation Benchmark CC no Airport")
calculate_metric(val_df["Fare"], benchmark_prediction_no_cc_airport, 
                 header="Validation Benchmark no CC Airport")
calculate_metric(val_df["Fare"], benchmark_prediction_cc_airport, 
                 header="Validation Benchmark CC Airport")


Validation Benchmark No CC no Airport results as follow:
RMSE of 7.72871045716822
MAE of 5.429368115051839
################
Validation Benchmark CC no Airport results as follow:
RMSE of 7.88667058655427
MAE of 5.675545551231702
################
Validation Benchmark no CC Airport results as follow:
RMSE of 8.401591827917427
MAE of 5.975091427281613
################
Validation Benchmark CC Airport results as follow:
RMSE of 8.575892129192477
MAE of 6.222267891123556
################


The result varies between RMSE of 7.72 to 8.57 with mean absolute error between 5.43 to 6.22 as a benchmark. This suggests some variation in the amount being charged but maybe within a certain range of city guideline. This variation leads us to question whether each company is charging differently or whether at any given time of the day 
Here we want to clarify a couple of questions with regards to model fit:

- Whether different company charges different rates per mile for taxi
- Whether airport trip or non-cash payment do incur extra charges
- Whether the time of the day or the community area pickup / dropoff affects the charges

A good way to answer some of these questions is to first start with a simple linear regression model so that we can see the importance of some of these variables before building mode complicated models. We begin by comparing the coefficient of the models with trip miles and trip seconds vs models with credit card and with airport details

In [17]:
benchmark_prediction_no_cc_no_airport = calculate_benchmark_classifier(test_df, 
                                                                       include_cc=False,
                                                                       include_airport=False)
benchmark_prediction_cc_no_airport = calculate_benchmark_classifier(test_df, 
                                                                    include_cc=True,
                                                                    include_airport=False)

benchmark_prediction_no_cc_airport = calculate_benchmark_classifier(test_df, 
                                                                    include_cc=False,
                                                                    include_airport=True)

benchmark_prediction_cc_airport = calculate_benchmark_classifier(test_df, 
                                                                 include_cc=True,
                                                                 include_airport=True)

calculate_metric(test_df["Fare"], benchmark_prediction_no_cc_no_airport, 
                 header="Test Benchmark No CC no Airport")
calculate_metric(test_df["Fare"], benchmark_prediction_cc_no_airport, 
                 header="Test Benchmark CC no Airport")
calculate_metric(test_df["Fare"], benchmark_prediction_no_cc_airport, 
                 header="Test Benchmark no CC Airport")
calculate_metric(test_df["Fare"], benchmark_prediction_cc_airport, 
                 header="Test Benchmark CC Airport")

Test Benchmark No CC no Airport results as follow:
RMSE of 7.714620095325074
MAE of 5.426843878290837
################
Test Benchmark CC no Airport results as follow:
RMSE of 7.872624581604764
MAE of 5.672779886107005
################
Test Benchmark no CC Airport results as follow:
RMSE of 8.391132989006024
MAE of 5.974203137703504
################
Test Benchmark CC Airport results as follow:
RMSE of 8.565388071962378
MAE of 6.221097582501541
################


This performance is similar in the test-set where the no cc no airport model performs the best with RMSE 7.71 and MAE of 5.43

In [7]:
def extract_features(df, features, include_non_cash=True, include_airport=True, 
                     include_community=False):
    features = df[features]
    if include_airport:
        features["is_airport"] = ((df["Pickup Community Area"] == 76) \
                                | (df["Dropoff Community Area"] == 76)).values * 1
    if include_non_cash:
        features["is_cash"] = (df["Payment Type"] == 'Cash').values * 1

    if include_community:
        ca_dtype = pd.CategoricalDtype(categories=np.arange(1, 79))

        df_pickup_ca = pd.get_dummies(df["Pickup Community Area"]\
                                          .astype(ca_dtype).fillna(78))
        df_dropoff_ca = pd.get_dummies(df["Dropoff Community Area"]\
                                           .astype(ca_dtype).fillna(78))
        features = pd.concat([features, df_pickup_ca, df_dropoff_ca], axis=1)
    return features


def fit_ols_model(features_df, fares, include_constant=True):
    model = sm.OLS(fares.values,
                   sm.add_constant(features_df))
    model = model.fit()
    return model


def predict_ols_model(model, features_df, include_constant=True):
    if include_constant:
        return model.predict(sm.add_constant(features_df))
    else:
        return model.predict(features_df)


In [9]:
features = ["Trip Miles", "Trip Seconds"]
include_non_cash = False
include_airport = False


train_features = extract_features(train_df, features, 
                                  include_non_cash=include_non_cash, 
                                  include_airport=include_airport)
train_fares = train_df["Fare"]

val_features = extract_features(val_df, features,
                                include_non_cash=include_non_cash, 
                                include_airport=include_airport)
val_fares = val_df["Fare"]

ols_model = fit_ols_model(train_features, train_fares, include_constant=True)
ols_model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.875
Model:,OLS,Adj. R-squared:,0.875
Method:,Least Squares,F-statistic:,6066000.0
Date:,"Mon, 24 Oct 2022",Prob (F-statistic):,0.0
Time:,00:04:27,Log-Likelihood:,-5398800.0
No. Observations:,1732249,AIC:,10800000.0
Df Residuals:,1732246,BIC:,10800000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.8246,0.007,544.269,0.000,3.811,3.838
Trip Miles,1.5357,0.001,1787.662,0.000,1.534,1.537
Trip Seconds,0.0063,6.54e-06,958.465,0.000,0.006,0.006

0,1,2,3
Omnibus:,1471455.104,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,66277527.113
Skew:,3.884,Prob(JB):,0.0
Kurtosis:,32.29,Cond. No.,2490.0


Firstly we see that trip seconds coefficient at 0.0063 is similar to the recommended guideline by the city which charges 0.20 for every 36 seconds passed (0.0063 * 36 = 0.2268). The constant value 3.82 is a bit off from the surcharge 3.25 and the additional miule charge amounts to 1.54. The model also indicates a good fit with Adj. R-squared of 0.875 

In [10]:
val_pred = predict_ols_model(ols_model, val_features)
calculate_metric(val_fares, val_pred, 
                 header="Validation Benchmark OLS model with 2 variables")

Validation Benchmark OLS model with 2 variables results as follow:
RMSE of 5.473495063078026
MAE of 2.882093096181407
################


Most importantly perhaps we see a much better RMSE and MAE compared to the benchmark model. This suggests and average of 2.88 prediction off with a standard deviation of 5.47. Next we try to include non-cash payment information and airport information

In [11]:
include_non_cash = True
include_airport = True
train_features = extract_features(train_df, features, 
                                  include_non_cash=include_non_cash, 
                                  include_airport=include_airport)
train_fares = train_df["Fare"]

val_features = extract_features(val_df, features,
                                include_non_cash=include_non_cash, 
                                include_airport=include_airport)
val_fares = val_df["Fare"]

ols_model = fit_ols_model(train_features, train_fares, include_constant=True)
ols_model.summary()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0,1,2,3
Dep. Variable:,y,R-squared:,0.881
Model:,OLS,Adj. R-squared:,0.881
Method:,Least Squares,F-statistic:,3200000.0
Date:,"Mon, 24 Oct 2022",Prob (F-statistic):,0.0
Time:,00:04:28,Log-Likelihood:,-5358000.0
No. Observations:,1732249,AIC:,10720000.0
Df Residuals:,1732244,BIC:,10720000.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.1699,0.008,518.975,0.000,4.154,4.186
Trip Miles,1.4424,0.001,1604.436,0.000,1.441,1.444
Trip Seconds,0.0061,6.42e-06,954.459,0.000,0.006,0.006
is_airport,3.5078,0.012,284.356,0.000,3.484,3.532
is_cash,-0.4761,0.009,-55.754,0.000,-0.493,-0.459

0,1,2,3
Omnibus:,1433390.201,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,65788379.333
Skew:,3.713,Prob(JB):,0.0
Kurtosis:,32.263,Cond. No.,4490.0


Despite the strong multiollinearity, the coefficient of the variables seem to be convincing enough. For example, the trip second is close to the 0.2 charge per 36 seconds $(0.0061 \times 36 = 0.22)$, while paying with cash saves us roughly 0.47 cents, which is similar to the electronic payment surcharge of 0.50 cents in the chicago city guidelines. When pickup point or dropoff is at the airport, there is an additional 3.5 to the fare as well, which is close to the guideline of 4 fee for entering the airport. 

The cost per mile however is a bit off from the baseline of 2.25 per mile and the minimum fare as indicated by the constant parameter is off from the 3.25 baseline as indicated in the guidelines. Again, these discrepancies suggest measurement error in the trip miles or seconds, or perhaps some underlying discrepancies charged by different companies. Let's try replacing the trip with interaction terms with each company

In [12]:
val_pred = predict_ols_model(ols_model, val_features)
calculate_metric(val_fares, val_pred, 
                 header="Validation Benchmark OLS model with 2 variables")

Validation Benchmark OLS model with 2 variables results as follow:
RMSE of 5.3472655752650216
MAE of 2.8834547597843216
################


Just a slight improvement on the RMSE  and MAE but regardless the coefficients are convincing enough that the slight improvement might be the result of a missing features. Next we try to see if different companies might charge differently per mile 

In [13]:
# Casting as an array for memory efficient computation

company_per_mile = pd.get_dummies(train_df["Company"])
company_cols = company_per_mile.columns

company_per_mile = company_per_mile.values \
    *  train_features["Trip Miles"].values.reshape(-1, 1)

train_features_company = pd.DataFrame(company_per_mile, columns=company_cols)
train_features_company["Trip Seconds"] = train_features["Trip Seconds"].values
train_features_company["is_cash"] = train_features["is_cash"].values
train_features_company["is_airport"] = train_features["is_airport"].values

ols_model = fit_ols_model(train_features_company, train_fares, include_constant=True)
ols_model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.882
Model:,OLS,Adj. R-squared:,0.882
Method:,Least Squares,F-statistic:,323600.0
Date:,"Mon, 24 Oct 2022",Prob (F-statistic):,0.0
Time:,00:04:37,Log-Likelihood:,-5349400.0
No. Observations:,1732249,AIC:,10700000.0
Df Residuals:,1732208,BIC:,10700000.0
Df Model:,40,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.1615,0.008,519.451,0.000,4.146,4.177
24 Seven Taxi,1.4233,0.004,363.253,0.000,1.416,1.431
2733 - 74600 Benny Jona,1.2927,0.015,86.932,0.000,1.264,1.322
3011 - 66308 JBL Cab Inc.,1.1897,0.289,4.114,0.000,0.623,1.757
312 Medallion Management Corp,1.3834,0.007,196.982,0.000,1.370,1.397
3556 - 36214 RC Andrews Cab,1.3865,0.023,60.030,0.000,1.341,1.432
3591 - 63480 Chuks Cab,1.0735,0.041,26.002,0.000,0.993,1.154
3620 - 52292 David K. Cab Corp.,1.3540,0.014,98.932,0.000,1.327,1.381
"3721 - Santamaria Express, Alvaro Santamaria",1.3443,0.126,10.691,0.000,1.098,1.591

0,1,2,3
Omnibus:,1434052.14,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,64537462.052
Skew:,3.724,Prob(JB):,0.0
Kurtosis:,31.96,Cond. No.,1370000.0


In [14]:
dtype = pd.CategoricalDtype(categories=company_cols)

company_per_mile_val = pd.get_dummies(val_df["Company"].astype(dtype))

company_per_mile_val = company_per_mile_val.values \
    *  val_features["Trip Miles"].values.reshape(-1, 1)

val_features_company = pd.DataFrame(company_per_mile_val, columns=company_cols)
val_features_company["Trip Seconds"] = val_features["Trip Seconds"].values
val_features_company["is_cash"] = val_features["is_cash"].values
val_features_company["is_airport"] = val_features["is_airport"].values

val_pred = predict_ols_model(ols_model, val_features_company)
calculate_metric(val_fares, val_pred, 
                 header="Validation Benchmark OLS model with 2 variables")

Validation Benchmark OLS model with 2 variables results as follow:
RMSE of 5.32051803669006
MAE of 2.878946204132115
################


Generally we don't see any difference between the cab providers based on the cofficients. Furthermore the RMSE, MAE and adjusted R-squared do not improve in general. As such, going forward we will not use these variables. We are going to use the trip miles, trip seconds, cash/non cash and whether the trip involves the OHARE airport area as the base features


In [11]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

In [27]:
include_non_cash = True
include_airport = True
features = ["Trip Miles", "Trip Seconds"]

train_features = extract_features(train_df, features, 
                                  include_non_cash=include_non_cash, 
                                  include_airport=include_airport)
train_fares = train_df["Fare"]

val_features = extract_features(val_df, features,
                                include_non_cash=include_non_cash, 
                                include_airport=include_airport)
val_fares = val_df["Fare"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [28]:
xgbr = XGBRegressor().fit(train_features, train_fares)

In [19]:
rf = RandomForestRegressor().fit(train_features, train_fares)
gbr = GradientBoostingRegressor().fit(train_features, train_fares)
xgbr = XGBRegressor().fit(train_features, train_fares)

In [21]:
calculate_metric(val_fares, rf.predict(val_features), 
                 header="Validation Benchmark for Random Forest Regressor")
calculate_metric(val_fares, gbr.predict(val_features), 
                 header="Validation Benchmark for Gradient Boosting Regressor")

Validation Benchmark for Random Forest Regressor results as follow:
RMSE of 3.427392140771912
MAE of 1.1342844868720536
################
Validation Benchmark for Gradient Boosting Regressor results as follow:
RMSE of 3.64364686215597
MAE of 1.38624668572024
################


In [29]:
calculate_metric(val_fares, xgbr.predict(val_features), 
                 header="Validation Benchmark for XGBoost Regressor")

Validation Benchmark for XGBoost Regressor results as follow:
RMSE of 3.195222129209245
MAE of 1.0710769445248707
################


The ensemble model achieves better MAE and RMSE on the validation set compared to the linear model, let's try using the community area features since these models might be able to learn better relationship between the features

In [16]:
include_non_cash = True
include_airport = True
include_community = True

train_features = extract_features(train_df, features, 
                                  include_non_cash=include_non_cash, 
                                  include_airport=include_airport,
                                  include_community=include_community)
train_fares = train_df["Fare"]

val_features = extract_features(val_df, features,
                                include_non_cash=include_non_cash, 
                                include_airport=include_airport,
                                include_community=include_community)
val_fares = val_df["Fare"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
rf = RandomForestRegressor().fit(train_features, train_fares)
gbr = GradientBoostingRegressor().fit(train_features, train_fares)
xgbr = XGBRegressor().fit(train_features.values, train_fares)



In [20]:
xgbr = XGBRegressor().fit(train_features.values, train_fares)

In [37]:
calculate_metric(val_fares, rf.predict(val_features), 
                 header="Validation Benchmark for Random Forest Regressor")
calculate_metric(val_fares, gbr.predict(val_features), 
                 header="Validation Benchmark for Gradient Boosting Regressor")



Validation Benchmark for Random Forest Regressor results as follow:
RMSE of 2.9985050432088545
MAE of 0.9785473193457149
################




Validation Benchmark for Gradient Boosting Regressor results as follow:
RMSE of 3.599175287182206
MAE of 1.445593875347446
################


In [21]:
calculate_metric(val_fares, xgbr.predict(val_features), 
                 header="Validation Benchmark for XGBoost Regressor")

Validation Benchmark for Gradient Boosting Regressor results as follow:
RMSE of 3.0271009793156978
MAE of 1.0696894433159443
################


Seems like we can manage better MAE with random forest model in combination with the community area. For now let's save the two models

In [8]:
import joblib

In [48]:
joblib.dump(rf, "./taxi_analysis/model/random_forest_model.joblib")
joblib.dump(gbr, "./taxi_analysis/model/gradient_boosting.joblib")

['./model/gradient_boosting.joblib']

Next we perform prediction on the test data

In [23]:
include_non_cash = True
include_airport = True
include_community = True
test_features = extract_features(test_df, features, 
                                  include_non_cash=include_non_cash, 
                                  include_airport=include_airport,
                                  include_community=include_community)
test_fares = test_df["Fare"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [50]:
calculate_metric(test_fares, rf.predict(test_features), 
                 header="Test Results for Random Forest Regressor")
calculate_metric(test_fares, gbr.predict(test_features), 
                 header="Test Results for Gradient Boosting Regressor")



Test Results for Random Forest Regressor results as follow:
RMSE of 2.9983245344989045
MAE of 0.9779574273792171
################




Test Results for Gradient Boosting Regressor results as follow:
RMSE of 3.578332729827094
MAE of 1.4423219477862224
################


In [24]:
calculate_metric(test_fares, xgbr.predict(test_features), 
                 header="Test Results for Gradient Boosting Regressor")

Test Results for Gradient Boosting Regressor results as follow:
RMSE of 3.0103655435604977
MAE of 1.0683409047085797
################


We observe similar performance on the test set and the validation set, next we perform inference on the pre-covid dataset just to see whether the model can still perform well

In [52]:
years = [2017, 2018, 2019, 2020]
for year in years:
    df = pd.read_csv(os.path.join(str(year), "./chicago_taxi_cleaned.csv"))
    df_features = extract_features(df, features, include_non_cash=True, 
                                   include_airport=True, include_community=True)
    df_fares = df["Fare"]
    calculate_metric(df_fares, rf.predict(df_features.values), 
                     header="Results for the year {}, Random Forest Regressor".format(year))
    calculate_metric(df_fares, gbr.predict(df_features.values), 
                     header="Results for the year {}, Gradient Boosting Regressor".format(year))
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Results for the year 2017, Random Forest Regressor results as follow:
RMSE of 2.018624251106619
MAE of 0.625294408403186
################
Results for the year 2017, Gradient Boosting Regressor results as follow:
RMSE of 1.9658744399370136
MAE of 1.0191167994738362
################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Results for the year 2018, Random Forest Regressor results as follow:
RMSE of 1.8134918995961353
MAE of 0.6017923582774529
################
Results for the year 2018, Gradient Boosting Regressor results as follow:
RMSE of 1.9643492388668597
MAE of 1.0185141830504145
################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Results for the year 2019, Random Forest Regressor results as follow:
RMSE of 2.0568767199614686
MAE of 0.6797031352799068
################
Results for the year 2019, Gradient Boosting Regressor results as follow:
RMSE of 2.238899742526288
MAE of 1.1369876958480518
################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Results for the year 2020, Random Forest Regressor results as follow:
RMSE of 2.3185807790613757
MAE of 0.7479134932839862
################
Results for the year 2020, Gradient Boosting Regressor results as follow:
RMSE of 2.400683833758345
MAE of 1.12538831059798
################


In [25]:
years = [2017, 2018, 2019, 2020]
for year in years:
    df = pd.read_csv(os.path.join(str(year), "./chicago_taxi_cleaned.csv"))
    df_features = extract_features(df, features, include_non_cash=True, 
                                   include_airport=True, include_community=True)
    df_fares = df["Fare"]
    calculate_metric(df_fares, xgbr.predict(df_features.values), 
                     header="Results for the year {}, Random Forest Regressor".format(year))
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Results for the year 2017, Random Forest Regressor results as follow:
RMSE of 1.755078440097429
MAE of 0.7298531646834807
################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Results for the year 2018, Random Forest Regressor results as follow:
RMSE of 1.6259528432889798
MAE of 0.6885473735252768
################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Results for the year 2019, Random Forest Regressor results as follow:
RMSE of 1.7600056390634595
MAE of 0.7471779608234222
################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Results for the year 2020, Random Forest Regressor results as follow:
RMSE of 1.9564162891017354
MAE of 0.8019389113688782
################


In [18]:
for year in years:
    df = pd.read_csv(os.path.join(str(year), "./chicago_taxi_cleaned.csv"))
    df_fares = df["Fare"]
    calculate_metric(df_fares, calculate_benchmark_classifier(df, include_cc=False, 
                                                              include_airport=False), 
                     header="Results for the year {}, Benchmark Model".format(year))


Results for the year 2017, Benchmark Model results as follow:
RMSE of 5.240923558889103
MAE of 3.468403479468315
################
Results for the year 2018, Benchmark Model results as follow:
RMSE of 5.60572279524452
MAE of 3.662089295811558
################
Results for the year 2019, Benchmark Model results as follow:
RMSE of 6.155571220105781
MAE of 4.002580472256923
################
Results for the year 2020, Benchmark Model results as follow:
RMSE of 5.871705655304449
MAE of 3.927683635003288
################
