# Execute final model and generate visualization

# Setup

In [1]:
import pandas as pd
import psycopg2 as pg
import os
import sys
import pandas.io.sql as pd_sql
import numpy as np
import sqlalchemy
# Put the parent of the current working directory on sys.path so that the
# project-local `src` package imported below can be resolved.
notebook_dir = os.path.abspath('')
project_dir = os.path.dirname(notebook_dir)
print(project_dir)
sys.path.append(project_dir)
import matplotlib.pyplot as plt
from src.pickle.pickle_util import save_pickle, load_pickle

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

# Presumably a seed for reproducible sklearn calls; no cell visible in this
# notebook reads it — TODO confirm whether it is still needed.
my_random_state = 72



/Users/erik/metis/data_hailing


## Load processed data as it was in 03_feat_eng_and_model notebook

In [2]:
# Read the merged trip/income table from the processed-data folder.
# No index_col / parse_dates are passed, so trip_start_timestamp stays a string column.
processed_dir = project_dir + '/data/processed/'
file_path = processed_dir + 'df_merge_trip_income_from_03_feat_notebook'
df_merge_trip_income = pd.read_csv(file_path)

In [3]:
# Display the loaded frame as a sanity check (pandas shows only the first and last rows).
df_merge_trip_income

Unnamed: 0,community_area_number,trip_start_timestamp,trip_seconds,trip_miles,fare,pickup_centroid_latitude,pickup_centroid_longitude,per_capita_income_,part_of_day,surge_estimate
0,8,2019-07-17 11:30:00,857.0,4.5,10.0,41.907520,-87.626659,88669,2,1.057538
1,8,2019-07-17 23:45:00,779.0,5.4,10.0,41.900221,-87.629105,88669,5,0.995199
2,8,2019-07-17 22:30:00,671.0,4.2,7.5,41.892042,-87.631864,88669,5,0.879183
3,8,2019-07-17 16:45:00,460.0,1.0,10.0,41.891972,-87.612945,88669,4,2.106109
4,8,2019-07-17 09:30:00,744.0,2.0,7.5,41.905858,-87.630865,88669,2,1.122956
...,...,...,...,...,...,...,...,...,...,...
185174,54,2019-07-17 14:30:00,864.0,3.9,7.5,41.650222,-87.599463,8201,3,0.842076
185175,55,2019-07-17 17:45:00,387.0,1.8,5.0,41.651922,-87.564929,22677,4,0.950209
185176,55,2019-07-17 18:00:00,1112.0,8.7,12.5,41.651922,-87.564929,22677,4,0.871371
185177,74,2019-07-17 08:30:00,2265.0,13.4,22.5,41.687239,-87.719313,34381,2,0.987042


## Load pickle of random forest model

In [4]:
# NOTE: the pickle's file name says "_02_feat_eng_and_model_"; per the original
# author's note it should read "_03_feat_". The name is kept as-is because it
# must match the file on disk.
# NOTE(review): load_pickle is a project helper; presumably it unpickles the
# file, so only point it at model files you trust.
models_dir = project_dir + '/models/'
file_path = models_dir + 'random_forest_from_02_feat_eng_and_model_notebook.pickle'
rf_model = load_pickle(file_path)

## Get X and y for modeling

In [5]:
def get_X(df_merge_trip_income, dummies_on_part_of_day=True):
    """Build the feature matrix from the merged trip/income frame.

    Parameters
    ----------
    df_merge_trip_income : pandas.DataFrame
        Must contain the columns 'pickup_centroid_latitude',
        'pickup_centroid_longitude', 'per_capita_income_' and 'part_of_day'.
    dummies_on_part_of_day : bool, default True
        If True, 'part_of_day' is one-hot encoded with ``pd.get_dummies``,
        giving one 'part_of_day_<value>' column per value present in the data.
        If False, 'part_of_day' is left out of the result entirely.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the feature columns; the input is not modified.

    Raises
    ------
    KeyError
        If any of the required columns is missing from the input frame.
    """
    feature_columns = [
        'pickup_centroid_latitude',
        'pickup_centroid_longitude',
        'per_capita_income_',
        'part_of_day',
    ]
    X = df_merge_trip_income[feature_columns]
    if dummies_on_part_of_day:
        return pd.get_dummies(X, columns=['part_of_day'])
    # DataFrame.drop is a method: it has to be called (the previous
    # `X.drop['part_of_day']` subscripted it and raised TypeError).
    return X.drop(columns=['part_of_day'])


def get_y(df_merge_trip_income, surge_cutoff=1.1):
    """Return the boolean target: True where 'surge_estimate' exceeds the cutoff.

    Parameters
    ----------
    df_merge_trip_income : pandas.DataFrame
        Must contain a 'surge_estimate' column.
    surge_cutoff : float, default 1.1
        Rows with a surge estimate strictly greater than this value are
        labelled True.

    Returns
    -------
    pandas.Series
        Boolean series aligned with the input frame's index.
    """
    surge = df_merge_trip_income['surge_estimate']
    return surge.gt(surge_cutoff)

In [6]:
# Features (part_of_day one-hot encoded) and boolean target (surge_estimate > 1.1).
# The arguments spelled out here are the functions' defaults.
X = get_X(df_merge_trip_income, dummies_on_part_of_day=True)
y = get_y(df_merge_trip_income, surge_cutoff=1.1)

# Use model to generate predictions for visualizing

In [7]:
# Probability above which a row is labelled as a predicted positive.
predict_proba_threshold = .45

# Score every row once and reuse the result; the previous version ran
# predict_proba over the full feature matrix twice.
# Column 1 of predict_proba corresponds to the second entry of
# rf_model.classes_ (True for a boolean target).
proba_positive = rf_model.predict_proba(X)[:, 1]
y_pred = proba_positive > predict_proba_threshold

# Attach ground truth, thresholded prediction and raw probability to the frame.
df_merge_trip_income['true_y'] = y
df_merge_trip_income['predict_y_hat'] = y_pred
df_merge_trip_income['predict_proba_y_hat'] = proba_positive

In [8]:
# Rows the model labelled positive; predict_y_hat is a boolean column, so it
# serves directly as the row mask.
df_merge_trip_income.loc[df_merge_trip_income['predict_y_hat']]

Unnamed: 0,community_area_number,trip_start_timestamp,trip_seconds,trip_miles,fare,pickup_centroid_latitude,pickup_centroid_longitude,per_capita_income_,part_of_day,surge_estimate,true_y,predict_y_hat,predict_proba_y_hat
259,8,2019-07-17 01:30:00,269.0,0.9,5.0,41.891972,-87.612945,88669,0,1.251008,True,True,0.511066
440,8,2019-07-17 16:30:00,907.0,2.5,7.5,41.907413,-87.640902,88669,4,0.971974,False,True,0.455459
532,8,2019-07-17 18:15:00,1227.0,2.0,7.5,41.907413,-87.640902,88669,4,0.899598,False,True,0.455459
706,8,2019-07-17 19:00:00,746.0,5.2,10.0,41.907413,-87.640902,88669,4,1.026293,False,True,0.455459
785,8,2019-07-17 18:45:00,537.0,1.4,5.0,41.907413,-87.640902,88669,4,0.926832,False,True,0.455459
...,...,...,...,...,...,...,...,...,...,...,...,...,...
185073,48,2019-07-17 18:15:00,730.0,3.9,10.0,41.728456,-87.575637,28887,4,1.183921,True,True,0.484464
185077,48,2019-07-17 19:15:00,599.0,3.0,7.5,41.735249,-87.575637,28887,4,1.050913,False,True,0.510417
185091,48,2019-07-17 18:15:00,2547.0,17.8,22.5,41.728456,-87.575637,28887,4,0.804477,False,True,0.484464
185138,18,2019-07-17 14:15:00,345.0,1.5,7.5,41.934487,-87.798636,22014,3,1.552440,True,True,0.824693


# Evaluate final model performance

In [9]:
# Rows are the true class, columns the predicted class (False first, then True),
# i.e. [[TN, FP], [FN, TP]].
# NOTE(review): this scores every loaded row; if rf_model was fitted on part of
# this data these are not held-out metrics — confirm against the training notebook.
confusion_matrix(
    y_true=df_merge_trip_income['true_y'],
    y_pred=df_merge_trip_income['predict_y_hat'],
)

array([[136520,    936],
       [ 46857,    866]])

In [10]:
# Fraction of rows whose thresholded prediction matches the true label.
accuracy_score(
    y_true=df_merge_trip_income['true_y'],
    y_pred=df_merge_trip_income['predict_y_hat'],
)

0.7419091797666042

In [11]:
# Of the rows predicted True, the fraction that are actually True.
precision_score(
    y_true=df_merge_trip_income['true_y'],
    y_pred=df_merge_trip_income['predict_y_hat'],
)

0.48057713651498335

In [12]:
# Of the rows that are actually True, the fraction the model predicted True.
recall_score(
    y_true=df_merge_trip_income['true_y'],
    y_pred=df_merge_trip_income['predict_y_hat'],
)

0.018146386438404962

In [14]:
# Importance of each feature in the fitted model, in the same order as X.columns.
rf_model.feature_importances_

array([0.29218812, 0.31202335, 0.14484304, 0.01142502, 0.01571451,
       0.02483924, 0.0262007 , 0.1447239 , 0.02804212])

In [15]:
# Feature names, to read alongside the feature_importances_ array.
X.columns

Index(['pickup_centroid_latitude', 'pickup_centroid_longitude',
       'per_capita_income_', 'part_of_day_0', 'part_of_day_1', 'part_of_day_2',
       'part_of_day_3', 'part_of_day_4', 'part_of_day_5'],
      dtype='object')

# Save to CSV for work in Tableau

In [13]:
# Destination for the per-trip table with the prediction columns added.
# The guard and the write below are commented out, so running this cell only
# sets file_path; uncomment both lines to create the CSV. The assert is there
# to refuse overwriting an existing export.
file_path = project_dir + '/data/processed/' + 'df_merge_trip_income_with_y_pred_from_scratch_08_viz.csv'
# assert not os.path.exists(file_path)
# df_merge_trip_income.to_csv(file_path)

In [14]:
# Display the frame again, now including true_y, predict_y_hat and predict_proba_y_hat.
df_merge_trip_income

Unnamed: 0,community_area_number,trip_start_timestamp,trip_seconds,trip_miles,fare,pickup_centroid_latitude,pickup_centroid_longitude,per_capita_income_,part_of_day,surge_estimate,true_y,predict_y_hat,predict_proba_y_hat
0,8,2019-07-17 11:30:00,857.0,4.5,10.0,41.907520,-87.626659,88669,2,1.057538,False,False,0.336938
1,8,2019-07-17 23:45:00,779.0,5.4,10.0,41.900221,-87.629105,88669,5,0.995199,False,False,0.263608
2,8,2019-07-17 22:30:00,671.0,4.2,7.5,41.892042,-87.631864,88669,5,0.879183,False,False,0.262218
3,8,2019-07-17 16:45:00,460.0,1.0,10.0,41.891972,-87.612945,88669,4,2.106109,True,False,0.406936
4,8,2019-07-17 09:30:00,744.0,2.0,7.5,41.905858,-87.630865,88669,2,1.122956,True,False,0.294096
...,...,...,...,...,...,...,...,...,...,...,...,...,...
185174,54,2019-07-17 14:30:00,864.0,3.9,7.5,41.650222,-87.599463,8201,3,0.842076,False,False,0.000000
185175,55,2019-07-17 17:45:00,387.0,1.8,5.0,41.651922,-87.564929,22677,4,0.950209,False,False,0.047727
185176,55,2019-07-17 18:00:00,1112.0,8.7,12.5,41.651922,-87.564929,22677,4,0.871371,False,False,0.047727
185177,74,2019-07-17 08:30:00,2265.0,13.4,22.5,41.687239,-87.719313,34381,2,0.987042,False,False,0.043755


Actually, the data should be grouped by community area, and also by part of day, before exporting; the cells below take the mean of each column within those groups.

In [15]:
# Mean of every numeric/boolean column per (community area, part of day);
# the boolean label columns therefore become the fraction of True rows.
# numeric_only=True keeps the string column trip_start_timestamp out of the
# aggregation: pandas >= 2.0 raises TypeError on it instead of silently
# dropping it as older versions did.
group_by_com_part = (
    df_merge_trip_income
    .groupby(['community_area_number', 'part_of_day'])
    .mean(numeric_only=True)
    .reset_index()
)

In [16]:
# Check which columns survived the aggregation and their dtypes.
group_by_com_part.dtypes

community_area_number          int64
part_of_day                    int64
trip_seconds                 float64
trip_miles                   float64
fare                         float64
pickup_centroid_latitude     float64
pickup_centroid_longitude    float64
per_capita_income_             int64
surge_estimate               float64
true_y                       float64
predict_y_hat                float64
predict_proba_y_hat          float64
dtype: object

In [17]:
# Display the aggregated frame: one row per (community_area_number, part_of_day) pair.
group_by_com_part

Unnamed: 0,community_area_number,part_of_day,trip_seconds,trip_miles,fare,pickup_centroid_latitude,pickup_centroid_longitude,per_capita_income_,surge_estimate,true_y,predict_y_hat,predict_proba_y_hat
0,1,0,727.666667,4.416667,10.416667,42.006743,-87.666754,23939,1.144967,0.250000,0.250000,0.193445
1,1,1,1759.525862,11.432759,17.758621,42.007966,-87.667126,23939,0.932143,0.051724,0.000000,0.034690
2,1,2,1519.996721,7.866557,14.032787,42.009273,-87.668791,23939,0.952416,0.131148,0.000000,0.121355
3,1,3,1424.195122,6.619919,12.063008,42.008554,-87.668277,23939,0.918597,0.154472,0.000000,0.159436
4,1,4,1308.970588,4.972876,10.882353,42.007989,-87.669120,23939,0.972037,0.267974,0.000000,0.265361
...,...,...,...,...,...,...,...,...,...,...,...,...
418,77,1,1508.385870,9.540217,15.611413,41.987117,-87.662642,33385,0.947962,0.076087,0.000000,0.072673
419,77,2,1391.269231,7.247059,13.540724,41.985874,-87.662695,33385,0.978527,0.140271,0.000000,0.141689
420,77,3,1213.179558,5.846133,11.180939,41.986197,-87.662646,33385,0.932613,0.154696,0.000000,0.145953
421,77,4,1112.619469,4.310619,10.039823,41.986124,-87.662472,33385,1.005718,0.269027,0.056637,0.267360


In [18]:
# Destination for the aggregated (community area x part of day) table.
# The guard and the write below are commented out, so running this cell only
# sets file_path; uncomment both lines to create the CSV. The assert is there
# to refuse overwriting an existing export.
file_path = project_dir + '/data/processed/' + 'group_by_com_part_from_scratch_08_viz.csv'
# assert not os.path.exists(file_path)
# group_by_com_part.to_csv(file_path)