# 1. Load files and EDA

In [None]:
#@title Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import normalize
import datetime as dt

# Seed NumPy's global random number generator with a fixed value (42).
np.random.seed(42)
# IPython magic: display matplotlib figures inline in the notebook output.
%matplotlib inline

## 1.1 load files

In [None]:
# Read the ride-share CSV and keep a reproducible 2% random sample of its rows.
df = pd.read_csv("share_ride_data.csv").sample(frac=0.02, random_state=42)

## 1.2 Feature engineering part 1: OneHotEncoding

In [None]:
# Use one-hot encoding to add features describing the hour of the day and the
# day of the week of the pickup, plus the pickup/dropoff community areas.
from sklearn.preprocessing import OneHotEncoder

# Parse the timestamp column once and derive both calendar features from it.
trip_start = pd.to_datetime(df["Trip Start Timestamp"], format='%m/%d/%Y %I:%M:%S %p')
df["pickupHour"] = trip_start.dt.hour
df["pickDayofweek"] = trip_start.dt.weekday

# Keep only the last row for each combination of the model's input columns.
df.drop_duplicates(subset=["pickupHour","pickDayofweek","Pickup Community Area","Dropoff Community Area",'Pickup Centroid Latitude','Pickup Centroid Longitude'], inplace= True, keep='last')

categorical_columns = ["pickupHour","pickDayofweek","Pickup Community Area","Dropoff Community Area"]
column_prefixes = ['hourofday', 'dayofweek','pickuparea','dropoffarea']

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name; try the new spelling first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
onehot_encoded = encoder.fit_transform(df[categorical_columns])

# Build the "<prefix>_<category>" column names directly from the fitted
# categories. This yields the same names as the former
# `encoder.get_feature_names(prefixes)` call, which was removed in
# scikit-learn 1.2, and works on every release.
onehot_column_names = [
    f"{prefix}_{category}"
    for prefix, categories in zip(column_prefixes, encoder.categories_)
    for category in categories
]
onehot_encoded_frame = pd.DataFrame(onehot_encoded, columns=onehot_column_names)

# 2. modeling
## 2.1 features for modeling

In [None]:
# Target: trip distance.
y = df['Trip Miles']
# Numeric features that are used as-is.
fea = df[['Pickup Centroid Latitude','Pickup Centroid Longitude']]
# Combine the original features with the one-hot encoded frame, row by row.
# drop=True discards the old indexes instead of inserting them as columns;
# without it two columns named "index" (row position and original row label)
# end up in the feature matrix and are fed to the model as predictors.
features = pd.concat(
    [onehot_encoded_frame.reset_index(drop=True), fea.reset_index(drop=True)],
    axis=1,
)
# The split below pairs features and target by position, so the lengths must agree.
assert len(features) == len(y), "features and target have different row counts"

## 2.2 Preparation
Split the data into training, validation and test sets; the combined training/validation part is also used for cross-validation (hyperparameter tuning).

In [None]:
# Hold out 20% of all rows as the test set; the remaining 80% is X_t / y_t.
X_t, X_test, y_t, y_test = train_test_split(features, y, test_size=0.2, random_state=42)
# Split the remaining 80% into 75% / 25%, i.e. 60% training and 20% validation
# of the full data set (test_size=0.2 here would give 64% / 16% instead).
X_train, X_val, y_train, y_val = train_test_split(X_t, y_t, test_size=0.25, random_state=42)

## 2.2 Basic Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Baseline random forest with default hyperparameters, fit on the training split.
# Note: the features are used as built above; no normalization is applied here.
forest_1 = RandomForestRegressor(random_state=0).fit(X_train, y_train)

# Predictions on the training split (in-sample) and on the validation split.
y_pred_train, y_pred_f = forest_1.predict(X_train), forest_1.predict(X_val)

In [None]:
# Report the RMSE of the baseline forest on the training and validation splits.
# The root is taken explicitly with np.sqrt because the `squared=False` argument
# of mean_squared_error is not accepted by recent scikit-learn releases.
print('in-sample rmse:', np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('out-sample rmse:', np.sqrt(mean_squared_error(y_val, y_pred_f)))

## 2.3 Hyperparameter tuning Random Forest

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, make_scorer

In [None]:
def RMSE(y_true,y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same length as ``y_true``.

    Returns
    -------
    float
        Square root of the mean squared error.
    """
    # The root is taken explicitly instead of passing `squared=False`, which
    # recent scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse

def R2(y_true,y_pred):
    """Return the R^2 score of ``y_pred`` against ``y_true`` (thin wrapper around ``r2_score``)."""
    return r2_score(y_true, y_pred)

def two_score(y_true,y_pred):
    """Score function used for the grid search: returns the R^2 score.

    The RMSE is also computed, but its value is discarded. To tune on RMSE
    instead, return that value here and flip ``greater_is_better`` in
    ``two_scorer``.
    """
    RMSE(y_true, y_pred)  # result intentionally unused; see docstring
    return R2(y_true, y_pred)

def two_scorer():
    """Wrap ``two_score`` as a scorer object that can be passed as ``scoring``."""
    # greater_is_better=True because two_score returns R^2; change to False
    # if two_score is switched to return an error metric such as MSE/RMSE.
    scorer = make_scorer(two_score, greater_is_better=True)
    return scorer

In [None]:
# Hyperparameter tuning for the random forest: exhaustive grid search with
# 5-fold cross-validation on the combined training/validation data (X_t, y_t).
parameters = {
    'max_depth': np.arange(2, 5),
    'min_samples_split': np.arange(2, 50, 10),
    'min_samples_leaf': np.arange(1, 50, 10),
}
# Give the estimator its own seed, as forest_1 does. With n_jobs=-1 the fits may
# run in separate worker processes, where the notebook-level np.random.seed(42)
# is not guaranteed to apply, so an unseeded forest makes the search
# non-reproducible.
model = RandomForestRegressor(random_state=0)
clf = GridSearchCV(model, parameters, cv=5, scoring=two_scorer(), n_jobs=-1)
clf.fit(X_t, y_t)

# Best value found for each tuned hyperparameter.
best_max_depth = clf.best_params_['max_depth']
best_min_samples_split = clf.best_params_['min_samples_split']
best_min_samples_leaf = clf.best_params_['min_samples_leaf']

In [None]:
# Refit a forest with the tuned hyperparameters on the training split only.
# Fitting on X_t would include the X_val rows, so an "out-sample" RMSE measured
# on X_val would really be in-sample. Training on X_train keeps X_val unseen
# and makes the result comparable with forest_1, which uses the same split and
# the same random_state.
forest_best = RandomForestRegressor(
    max_depth=best_max_depth,
    min_samples_split=best_min_samples_split,
    min_samples_leaf=best_min_samples_leaf,
    random_state=0,
)
forest_best.fit(X_train, y_train)

In [None]:
# Predictions of the tuned forest on the training and validation splits.
y_pred_train_b, y_pred_b = forest_best.predict(X_train), forest_best.predict(X_val)

In [None]:
# Report the RMSE of the tuned forest on the training and validation splits.
# The root is taken explicitly with np.sqrt because the `squared=False` argument
# of mean_squared_error is not accepted by recent scikit-learn releases.
print('in-sample rmse:', np.sqrt(mean_squared_error(y_train, y_pred_train_b)))
print('out-sample rmse:', np.sqrt(mean_squared_error(y_val, y_pred_b)))

## 2.4 summary
With hyperparameter tuning, the overfitting problem is solved.