# 1. Load files and EDA

In [None]:
#@title Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import normalize
import datetime as dt

np.random.seed(42)
%matplotlib inline

## 1.1 Load files and add the tipPay column for tip prediction

In [None]:
# Read the raw ride-share trips and keep a reproducible 2% random sample.
df = pd.read_csv("share_ride_data.csv").sample(frac=0.02, random_state=42)
# Binary target tipPay: 1 when the recorded tip is anything other than zero, else 0.
df["tipPay"] = df["Tip"].ne(0).astype(int)

## 1.2 Feature engineering part 1: OneHotEncoding

In [None]:
# One-hot encode the pickup hour, pickup weekday, and pickup/dropoff community area.
from sklearn.preprocessing import OneHotEncoder

# Parse the trip start timestamp once and derive both calendar features from it.
trip_start = pd.to_datetime(df["Trip Start Timestamp"], format='%m/%d/%Y %I:%M:%S %p')
df["pickupHour"] = trip_start.dt.hour
df["pickDayofweek"] = trip_start.dt.weekday

# Keep only the last row for each combination of hour, weekday, pickup/dropoff
# area and pickup centroid.
df.drop_duplicates(subset=["pickupHour","pickDayofweek","Pickup Community Area","Dropoff Community Area",'Pickup Centroid Latitude','Pickup Centroid Longitude'], inplace= True, keep='last')

# Let the encoder return its default sparse matrix and densify explicitly.
# The `sparse=` keyword was renamed to `sparse_output=` (and later removed) in
# newer scikit-learn releases, so this avoids depending on either spelling.
encoder = OneHotEncoder()
onehot_encoded = encoder.fit_transform(df[["pickupHour","pickDayofweek","Pickup Community Area","Dropoff Community Area"]]).toarray()

# Build "<prefix>_<category>" column names from the fitted categories.
# This reproduces what `encoder.get_feature_names(prefixes)` returned, without
# calling that method, which no longer exists in newer scikit-learn releases.
onehot_columns = [
    prefix + "_" + str(category)
    for prefix, categories in zip(['hourofday', 'dayofweek','pickuparea','dropoffarea'], encoder.categories_)
    for category in categories
]
onehot_encoded_frame = pd.DataFrame(onehot_encoded, columns=onehot_columns)

## 1.3 Examine datasets

In [None]:
# Print column names, dtypes and non-null counts of df.
df.info()

In [None]:
# Summary statistics of df, transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Count the distinct values in each column; a small count suggests a numeric
# column that is really categorical.
# DataFrame has no `.unique()` method (only Series does); `.nunique()` returns
# the per-column number of distinct values.
df.nunique().to_frame('Unique Values').T

I briefly examine the summary statistics here:

Features have largely heterogeneous distributions.
If I use linear methods such as logistic regression, I need to normalize the variables.
The labels are all zeros and ones. The mean is 0.16, implying that only 16% of the data are labeled 1 (the mean of a 0/1 label is the share of ones).
This is an imbalanced binary classification problem.
Timestamp variables seem to be continuous because of the high number of unique values in each column.

In [None]:
# Examine the data distribution: one histogram figure for each of the first 14 columns.
columnsName = df.columns
for col_idx in range(14):
  fig, ax = plt.subplots(figsize=(15,5))
  ax.hist(df.iloc[:, col_idx], bins=150, ec='w')
  ax.set_title(columnsName[col_idx]);

Trip timestamps show seasonality.
Trip miles, additional charges, and trip total are heavily right-skewed; a logarithmic transform can be used to deal with these features.
The data need to be normalized because the distributions are highly heterogeneous. Otherwise, models such as logistic regression may never converge.

## 1.4 Examine multicollinearity
There is not much cleaning for me to do because the dataset is already well structured: there are no missing values; variables are well-defined; all values are integers or floats; I have no further context for the columns.
So, I mainly consider the possibility of redundant features, in particular multicollinearity.
Multicollinearity can cause very unstable model performance, especially for ordinary linear regression: with perfectly collinear features $X^\top X$ is not full-rank and therefore not invertible, and with nearly collinear features it is close to singular.

In [None]:
# Heatmap of the pairwise correlations between features.
plt.figure(figsize=(15,12))
# Restrict to numeric/boolean columns explicitly. Recent pandas versions no
# longer drop non-numeric columns (e.g. the timestamp strings) inside
# DataFrame.corr() and raise an error instead.
corr_before = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr_before, vmin=-1, vmax=1, cmap="RdBu_r", lw=.1)
plt.title('Correlation heatmap of variables');

I plot the correlation of each variables in a heatmap:

Trip Total is calculated from fare + tip, so the fare and tip values add no predictive power once Trip Total is included.
Trip Total and Trip Miles are linearly correlated, so only Trip Total would be used for the tipPay prediction.
# 2. model experiment
## 2.1 features for modeling

In [None]:
# Pickup centroid coordinates taken straight from df.
fea = df[['Pickup Centroid Latitude','Pickup Centroid Longitude']]
# Combine the one-hot frame with the coordinate columns row by row.
# drop=True discards the old indexes. Without it, reset_index() inserts each
# old index as an extra 'index' column, and those row ids would be fed to the
# models as if they were features.
features = pd.concat(
    [onehot_encoded_frame.reset_index(drop=True), fea.reset_index(drop=True)],
    axis=1,
)

## 2.2 Preparation
Normalize the data because the distributions are highly heterogeneous. Otherwise, models such as logistic regression may never converge.
Split the training data into training/validation sets for cross validation (hyperparameter tuning).

In [None]:
# Build the scaled feature matrices and the train/validation/test splits.
#
# sklearn.preprocessing.normalize rescales each *row* (sample) to unit norm,
# which does not put the features on a common scale. Per-feature
# standardisation is used instead, with the scaler fitted on training rows
# only so that no validation/test statistics leak into the transform.
from sklearn.preprocessing import StandardScaler

# features (unscaled)
features_raw = features.to_numpy(dtype=float)
# label
y = df["tipPay"]

# 80% training pool / 20% test
X_t_raw, X_test_raw, y_t, y_test = train_test_split(features_raw, y, test_size=0.2, random_state=42)

# The training pool is split 80/20 again: 64% training / 16% validation of the full data
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X_t_raw, y_t, test_size=0.2, random_state=42)

# Scaler for model comparison: fitted on the training rows, applied to validation.
val_scaler = StandardScaler().fit(X_train_raw)
X_train = val_scaler.transform(X_train_raw)
X_val = val_scaler.transform(X_val_raw)

# Scaler for tuning and final evaluation: fitted on the whole training pool,
# applied to the held-out test rows.
final_scaler = StandardScaler().fit(X_t_raw)
X_t = final_scaler.transform(X_t_raw)
X_test = final_scaler.transform(X_test_raw)

# Full scaled feature matrix, using the training-pool statistics.
X = final_scaler.transform(features_raw)

## 2.3 Logistic regression
Use the F1 score.
Since there are many more samples with tipPay = 0 than with tipPay = 1, precision is useful: it does not use the number of true negatives in its calculation, so it is not dominated by the majority class.
Use the F1 score to incorporate both the quality of the predictions (precision) and the completeness of the predictions (recall).

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

In [None]:
# Baseline logistic regression on the normalized features, scored on the
# training and validation splits.
logit_1 = LogisticRegression()
logit_1.fit(X_train, y_train)
y_pred_1 = logit_1.predict(X_val)

training_accuracy = logit_1.score(X_train, y_train)
print('Training accuracy:', round(training_accuracy,3))
print('Validation accuracy:', round(logit_1.score(X_val, y_val),3))
# Each label is paired with its own metric (precision and recall had been swapped).
print('Validation precision:', round(precision_score(y_val, y_pred_1),3))
print('Validation recall:', round(recall_score(y_val, y_pred_1),3))
print('Validation f1 score:', round(f1_score(y_val, y_pred_1),3))

In [None]:
# Confusion matrix of the baseline logistic regression on the validation split.
plot_confusion_matrix(estimator=logit_1, X=X_val, y_true=y_val, values_format='d')
plt.gca().set_title('vanilla Logistic regression: confusion matrix of validation data');

## 2.4 Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline random forest on the normalized features, scored on the training
# and validation splits.
forest_1 = RandomForestClassifier(random_state=0)
forest_1.fit(X_train, y_train)
y_pred_f = forest_1.predict(X_val)

print('Training accuracy:', round(forest_1.score(X_train, y_train),3))
print('Validation accuracy:', round(forest_1.score(X_val, y_val),3))
# Each label is paired with its own metric (precision and recall had been swapped).
print('Validation precision:', round(precision_score(y_val, y_pred_f),3))
print('Validation recall:', round(recall_score(y_val, y_pred_f),3))
print('Validation f1 score:', round(f1_score(y_val, y_pred_f),3))

In [None]:
# Confusion matrix of the baseline random forest on the validation split.
plot_confusion_matrix(estimator=forest_1, X=X_val, y_true=y_val, values_format='d')
plt.gca().set_title('vanilla Random forest: confusion matrix of validation data');

# 3. model improvement
## 3.1 Re-weight Logistic regression

In [None]:
# Logistic regression with class_weight='balanced' on the normalized features,
# scored on the training and validation splits.
logit_2 = LogisticRegression(class_weight='balanced')
logit_2.fit(X_train, y_train)
y_pred_2 = logit_2.predict(X_val)

training_accuracy = logit_2.score(X_train, y_train)
print('Training accuracy:', round(training_accuracy,3))
print('Validation accuracy:', round(logit_2.score(X_val, y_val),3))
# Each label is paired with its own metric (precision and recall had been swapped).
print('Validation precision:', round(precision_score(y_val, y_pred_2),3))
print('Validation recall:', round(recall_score(y_val, y_pred_2),3))
print('Validation f1 score:', round(f1_score(y_val, y_pred_2),3))

In [None]:
# Confusion matrix of the re-weighted logistic regression on the validation split.
plot_confusion_matrix(estimator=logit_2, X=X_val, y_true=y_val, values_format='d')
plt.gca().set_title('re-weight Logistic regression: confusion matrix of validation data');

## 3.2 Re-weight Random forest

In [None]:
# Random forest with class_weight='balanced' on the normalized features,
# scored on the training and validation splits.
forest_2 = RandomForestClassifier(random_state=0,class_weight='balanced')
forest_2.fit(X_train, y_train)
y_pred_r2 = forest_2.predict(X_val)

print('Training accuracy:', round(forest_2.score(X_train, y_train),3))
print('Validation accuracy:', round(forest_2.score(X_val, y_val),3))
# Each label is paired with its own metric (precision and recall had been swapped).
print('Validation precision:', round(precision_score(y_val, y_pred_r2),3))
print('Validation recall:', round(recall_score(y_val, y_pred_r2),3))
print('Validation f1 score:', round(f1_score(y_val, y_pred_r2),3))

In [None]:
# Confusion matrix of the re-weighted random forest on the validation split.
plot_confusion_matrix(estimator=forest_2, X=X_val, y_true=y_val, values_format='d')
plt.gca().set_title('re-weight Random forest: confusion matrix of validation data');

# 4. Hyperparameter tuning with cross validation
## 4.1 Hyperparameter tuning Logistic regression

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid-search the regularisation parameter C of the class-weighted logistic
# regression with 5-fold cross validation on the training pool (X_t, y_t).
# NOTE(review): GridSearchCV uses the estimator's default score (accuracy) here;
# confirm whether an F1-based `scoring` was intended for this imbalanced label.
parameters = {'C': np.linspace(.001,1,20)}
model = LogisticRegression(class_weight='balanced')
clf = GridSearchCV(model, parameters, cv=5, n_jobs=-1).fit(X_t, y_t)
# best_params_ is the top-ranked candidate, i.e. the grid entry at
# rank_test_score.argmin().
best_C = clf.best_params_['C']

In [None]:
# Refit the class-weighted logistic regression with the C selected by the grid
# search (best_C), on the training pool (X_t, y_t).
logit_best = LogisticRegression(C=best_C, class_weight='balanced')
logit_best.fit(X_t, y_t)

## 4.2 Hyperparameter tuning Random Forest

In [None]:
# Grid-search max_depth, min_samples_split and min_samples_leaf of the
# class-weighted random forest with 5-fold cross validation on the training
# pool (X_t, y_t).
parameters = {'max_depth': np.arange(2,5), 'min_samples_split': np.arange(2,50,10), 'min_samples_leaf': np.arange(1,50,10)}
# A fixed random_state makes the search reproducible: with n_jobs=-1 the
# forests are fitted in worker processes, which the notebook-level
# np.random.seed does not control. 0 matches the other forests in this notebook.
model = RandomForestClassifier(random_state=0, class_weight='balanced')
clf = GridSearchCV(model, parameters, cv=5, n_jobs=-1)
clf.fit(X_t, y_t)
# Hyperparameters of the best-scoring candidate.
best_max_depth = clf.best_params_['max_depth']
best_min_samples_split = clf.best_params_['min_samples_split']
best_min_samples_leaf = clf.best_params_['min_samples_leaf']

In [None]:
# Read the selected random-forest hyperparameters from the fitted grid search.
best_max_depth, best_min_samples_split, best_min_samples_leaf = (
    clf.best_params_[param_name]
    for param_name in ('max_depth', 'min_samples_split', 'min_samples_leaf')
)

In [None]:
# Refit the class-weighted random forest with the tuned max_depth,
# min_samples_split and min_samples_leaf on the training pool (X_t, y_t).
# random_state=0 matches the other forests in this notebook and keeps the
# fitted model identical when the cell is re-run.
forest_best = RandomForestClassifier(
    random_state=0,
    class_weight='balanced',
    max_depth=best_max_depth,
    min_samples_split=best_min_samples_split,
    min_samples_leaf=best_min_samples_leaf,
)
forest_best.fit(X_t, y_t)

## 4.3 Evaluation for best models
### 4.3.1 Evaluation for best Logistic Regression

In [None]:
# same feature engineering has applied in earlier part the test data
# logistic regression prediction
y_pred_logit_best = logit_best.predict(X_test)
y_test_prob_logit_best =logit_best.predict_proba(X_test)
print('Train Accuracy:', round(logit_best.score(X_t, y_t),3))
print('Test Accuracy:', round(logit_best.score(X_test, y_test),3))
print('Test Precision:', round(recall_score(y_test, y_pred_logit_best),3))
print('Test Recall:', round(precision_score(y_test, y_pred_logit_best),3))
print('Test F1 score:', round(f1_score(y_test, y_pred_logit_best),3))

In [None]:
# Confusion matrix of the tuned logistic regression on the test split.
plot_confusion_matrix(estimator=logit_best, X=X_test, y_true=y_test, values_format='d')
plt.gca().set_title('Best logistic regression: confusion matrix of test data');

### 4.3.2 Evaluation for best Random Forest

In [None]:
# Evaluate the tuned random forest on the held-out test split.
y_pred_forest_best = forest_best.predict(X_test)
# Class probabilities, kept for the ROC comparison.
y_test_prob_forest_best = forest_best.predict_proba(X_test)
print('Train Accuracy:', round(forest_best.score(X_t, y_t),3))
print('Test Accuracy:', round(forest_best.score(X_test, y_test),3))
# Each label is paired with its own metric (precision and recall had been swapped).
print('Test Precision:', round(precision_score(y_test, y_pred_forest_best),3))
print('Test Recall:', round(recall_score(y_test, y_pred_forest_best),3))
print('Test F1 score:', round(f1_score(y_test, y_pred_forest_best),3))

In [None]:
# Confusion matrix of the tuned random forest on the test split.
plot_confusion_matrix(estimator=forest_best, X=X_test, y_true=y_test, values_format='d')
plt.gca().set_title('Best Random forest: confusion matrix of test data');

### 4.3.3 Compare performance of models

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
# ROC curves on the test split for comparison:
# red = best logistic regression, green = best random forest,
# dashed black = chance diagonal.
fpr_1, tpr_1, thresholds_1 = roc_curve(y_test, y_test_prob_logit_best[:,1])
plt.plot(fpr_1,tpr_1,color="red", label='best logistic regression')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC curve for learners")
fpr_2, tpr_2, thresholds_2 = roc_curve(y_test, y_test_prob_forest_best[:,1])
# The forest is drawn in green so the two curves can be told apart (both had been red).
plt.plot(fpr_2,tpr_2, color="green", label='best random forest')
plt.plot([0, 1], [0, 1], color="black", linestyle="--")
plt.legend()
plt.show()

### 4.3.4 Result
Re-weighted logistic regression has the best performance based on the trade-off between F1 score and AUC.
We want a relatively higher F1 score to ensure that the predictions have both quality and completeness.
We want a relatively higher accuracy to ensure predictive power.