# Analysis - 2021
Given Uber ride information and COVID information from NYC, what is the likelihood that a ride has its `SR_Flag` set?

In [1]:
import pandas as pd
import numpy as np

# Load the preprocessed 2021 sample into memory.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until the path is made configurable.
dataset_csv_path = '/home/felipe/repos/tcc/nyc_data/csv/2021_preprocessed_sample.csv'
dataset = pd.read_csv(dataset_csv_path)

In [2]:
# Split the frame into the feature matrix (every column except SR_Flag)
# and the target column (SR_Flag).
X = dataset.drop(columns='SR_Flag')
y = dataset['SR_Flag']

In [3]:
# Render floats with three decimals in every pandas repr from here on.
pd.options.display.float_format = lambda value: '%.3f' % value

# Index label of the row holding the largest trip_duration value.
trip_duration_column = X['trip_duration']
trip_max_idx = trip_duration_column.idxmax()

In [4]:
# Remove the row with the largest trip_duration from both the features and
# the target so they stay aligned.
#
# `trip_max_idx` is an index *label* (it comes from `idxmax()`), so it is
# passed straight to `drop(index=...)` instead of being used as a position.
# The keyword form also avoids the positional `axis` argument, which pandas
# deprecated and later removed. `errors='ignore'` makes the cell safe to
# re-run: a second execution is a no-op rather than dropping another row.
X = X.drop(index=trip_max_idx, errors='ignore')
y = y.drop(index=trip_max_idx, errors='ignore')

  """Entry point for launching an IPython kernel.
  


In [5]:
# Binarize the target: 0 stays 0, every other value becomes 1.
y = y.map(lambda flag: 1 if flag != 0 else 0)

In [6]:
#      trip_duration <= 5
#  5 < trip_duration <= 10
# 10 < trip_duration <= 15
# 15 < trip_duration <= 20
# 20 < trip_duration <= 30
# 30 < trip_duration <= 45
# 45 < trip_duration <= 60
# 60 < trip_duration

def get_trip_duration_category_from_duration(trip_duration):
    """Map a trip duration to an ordinal bucket from 1 to 8.

    The input is divided by 60 before bucketing (presumably seconds to
    minutes -- confirm against the preprocessing step). Buckets are closed
    on the right:

        1: <= 5, 2: (5, 10], 3: (10, 15], 4: (15, 20],
        5: (20, 30], 6: (30, 45], 7: (45, 60], 8: > 60

    Returns None when the value compares false against every bound
    (for example NaN).
    """
    minutes = trip_duration / 60
    upper_bounds = (5, 10, 15, 20, 30, 45, 60)

    # The first bound that the value does not exceed decides the bucket.
    for category, upper_bound in enumerate(upper_bounds, start=1):
        if minutes <= upper_bound:
            return category

    if minutes > upper_bounds[-1]:
        return 8
    return None

# Replace the raw trip_duration values with their bucket number (1-8).
# NOTE(review): this overwrites the column, so running the cell twice bins the
# already-binned values.
X['trip_duration'] = X['trip_duration'].map(get_trip_duration_category_from_duration)

In [7]:
# Display the feature frame to inspect it after trip_duration has been bucketed.
X

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,pickup_day_period,dropoff_day_period,trip_duration,CASE_COUNT,PROBABLE_CASE_COUNT,CASE_COUNT_7DAY_AVG,ALL_CASE_COUNT_7DAY_AVG
0,1609460100.000,1609461720.000,245,221,0,0,5,1336,784,3806,4752
1,1609515480.000,1609516620.000,206,115,2,2,4,1336,784,3806,4752
2,1609575095.000,1609575637.000,56,56,1,1,2,4188,1066,3911,4881
3,1609575264.000,1609575913.000,130,122,1,1,3,4188,1066,3911,4881
4,1609609179.000,1609609181.000,92,73,2,2,1,4188,1066,3911,4881
...,...,...,...,...,...,...,...,...,...,...,...
93952,1627775409.000,1627777548.000,114,72,3,0,6,845,324,1098,1467
93953,1627774607.000,1627775110.000,236,239,3,3,2,845,324,1098,1467
93954,1627774161.000,1627775213.000,90,88,3,3,4,845,324,1098,1467
93955,1627774386.000,1627775389.000,17,80,3,3,4,845,324,1098,1467


In [8]:
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, make_scorer

In [9]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree baseline: default hyper-parameters, 10-fold cross-validation.
decision_tree_clf = DecisionTreeClassifier()

# Scorer objects reused by the scoring dict below.
# NOTE(review): precision is weighted across classes while recall and F1 use
# the default binary averaging -- confirm that this mix is intended.
accuracy_scorer = make_scorer(accuracy_score)
precision_scorer = make_scorer(precision_score, zero_division=0, average='weighted')
recall_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
rmse_scorer = make_scorer(mean_squared_error, squared=False)

# Not referenced by any visible cell; kept for anything defined later.
k_folds = range(2,12)

decision_tree_scoring = {
    'accuracy': accuracy_scorer,
    'precision': precision_scorer,
    'recall': recall_scorer,
    'f1': f1_scorer,
    'rmse': rmse_scorer,
}
decision_tree_scores_cv = cross_validate(decision_tree_clf, X, y, cv=10,
                                         scoring=decision_tree_scoring)

# cross_validate reports each metric under 'test_<name>'; convert the
# per-fold arrays to plain lists so they can be serialized as JSON.
decision_tree_scores = {
    metric: decision_tree_scores_cv['test_' + metric].tolist()
    for metric in decision_tree_scoring
}

decision_tree_accuracy_scores = decision_tree_scores['accuracy']
decision_tree_precision_scores = decision_tree_scores['precision']
decision_tree_recall_scores = decision_tree_scores['recall']
decision_tree_f1_scores = decision_tree_scores['f1']
decision_tree_rmse_scores = decision_tree_scores['rmse']

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest sweep: 10-fold cross-validation for max_depth = 1..10.
accuracy_scorer = make_scorer(accuracy_score)
precision_scorer = make_scorer(precision_score, zero_division=0, average='weighted')
recall_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
rmse_scorer = make_scorer(mean_squared_error, squared=False)

depths = range(1,11)
random_forest_scores = []

for depth in depths:
    random_forest = RandomForestClassifier(max_depth=depth, bootstrap=False)
    # Evaluate the forest built for this depth. The previous version passed
    # `decision_tree_clf` here, so every "depth" entry was really a score of
    # the unrelated decision tree.
    scores = cross_validate(random_forest, X, y, cv=10, scoring={'accuracy': accuracy_scorer,
                                                                 'precision': precision_scorer,
                                                                 'recall': recall_scorer,
                                                                 'f1': f1_scorer,
                                                                 'rmse': rmse_scorer})
    # Per-fold arrays -> plain lists so the results can be serialized as JSON.
    random_forest_acc = scores['test_accuracy'].tolist()
    random_forest_prec = scores['test_precision'].tolist()
    random_forest_recall = scores['test_recall'].tolist()
    random_forest_f1 = scores['test_f1'].tolist()
    random_forest_rmse = scores['test_rmse'].tolist()

    random_forest_scores.append({'depth': depth,
                                 'accuracy': random_forest_acc,
                                 'precision': random_forest_prec,
                                 'recall': random_forest_recall,
                                 'f1': random_forest_f1,
                                 'rmse': random_forest_rmse})

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours sweep: 10-fold cross-validation for k = 1..5.
accuracy_scorer = make_scorer(accuracy_score)
precision_scorer = make_scorer(precision_score, zero_division=0, average='weighted')
recall_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
rmse_scorer = make_scorer(mean_squared_error, squared=False)

k_values = range(1,6)
knn_scores = []

# The same metric set is used for every k, so build it once.
knn_scoring = {
    'accuracy': accuracy_scorer,
    'precision': precision_scorer,
    'recall': recall_scorer,
    'f1': f1_scorer,
    'rmse': rmse_scorer,
}

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_validate(knn, X, y, cv=10, scoring=knn_scoring)

    # Per-fold arrays -> plain lists so the results can be serialized as JSON.
    knn_acc, knn_prec, knn_recall, knn_f1, knn_rmse = (
        scores['test_' + metric].tolist() for metric in knn_scoring
    )

    knn_scores.append(dict(k=k,
                           accuracy=knn_acc,
                           precision=knn_prec,
                           recall=knn_recall,
                           f1=knn_f1,
                           rmse=knn_rmse))

In [12]:
from sklearn.cluster import KMeans

# KMeans sweep: 10-fold cross-validation for n_clusters = 1..5.
#
# NOTE(review): KMeans is unsupervised; `predict` returns cluster indices that
# are not aligned with the class labels in `y`, so the classification metrics
# below need careful interpretation -- confirm this is the intended comparison.
accuracy_scorer = make_scorer(accuracy_score)
precision_scorer = make_scorer(precision_score, zero_division=0, average='weighted')
recall_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
rmse_scorer = make_scorer(mean_squared_error, squared=False)

k_values = range(1,6)
kmeans_scores = []

# Cluster ids can exceed the two classes in `y`, so recall and F1 use weighted
# averaging here instead of the binary default used for the classifiers.
kmeans_recall_scorer = make_scorer(recall_score, zero_division=1, average='weighted')
kmeans_f1_scorer = make_scorer(f1_score, zero_division=1, average='weighted')

kmeans_scoring = {'accuracy': accuracy_scorer,
                  'precision': precision_scorer,
                  'recall': kmeans_recall_scorer,
                  'f1': kmeans_f1_scorer,
                  'rmse': rmse_scorer}

for k in k_values:
    kmeans = KMeans(n_clusters=k)
    # One cross_validate call fits the model once per fold and scores that
    # single fit with every metric. The previous five cross_val_score calls
    # refit an unseeded KMeans for each metric, so the reported metrics came
    # from different clusterings (at five times the cost).
    kmeans_cv = cross_validate(kmeans, X, y, cv=10, scoring=kmeans_scoring)
    acc_scores = kmeans_cv['test_accuracy']
    prec_scores = kmeans_cv['test_precision']
    recall_scores = kmeans_cv['test_recall']
    f1_scores = kmeans_cv['test_f1']
    rmse_scores = kmeans_cv['test_rmse']

    # Per-fold arrays -> plain lists so the results can be serialized as JSON.
    kmeans_scores.append({'k': k,
                          'accuracy': acc_scores.tolist(),
                          'precision': prec_scores.tolist(),
                          'recall': recall_scores.tolist(),
                          'f1': f1_scores.tolist(),
                          'rmse': rmse_scores.tolist()})

In [13]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron: one configuration, 10-fold cross-validation.
accuracy_scorer = make_scorer(accuracy_score)
precision_scorer = make_scorer(precision_score, zero_division=0, average='weighted')
recall_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
rmse_scorer = make_scorer(mean_squared_error, squared=False)

mlp_scoring = {
    'accuracy': accuracy_scorer,
    'precision': precision_scorer,
    'recall': recall_scorer,
    'f1': f1_scorer,
    'rmse': rmse_scorer,
}

# Two hidden layers with 5 and 2 units.
mlp_classifier = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 2))
mlp_cv_result = cross_validate(mlp_classifier, X, y, cv=10, scoring=mlp_scoring)

# Per-fold arrays -> plain lists so the results can be serialized as JSON.
mlp_acc, mlp_prec, mlp_recall, mlp_f1, mlp_rmse = (
    mlp_cv_result['test_' + metric].tolist() for metric in mlp_scoring
)

mlp_scores = dict(accuracy=mlp_acc,
                  precision=mlp_prec,
                  recall=mlp_recall,
                  f1=mlp_f1,
                  rmse=mlp_rmse)

In [14]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes: default settings, 10-fold cross-validation.
accuracy_scorer = make_scorer(accuracy_score)
precision_scorer = make_scorer(precision_score, zero_division=0, average='weighted')
recall_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
rmse_scorer = make_scorer(mean_squared_error, squared=False)

gnb_scoring = {
    'accuracy': accuracy_scorer,
    'precision': precision_scorer,
    'recall': recall_scorer,
    'f1': f1_scorer,
    'rmse': rmse_scorer,
}

gnb_classifier = GaussianNB()
gnb_cv_result = cross_validate(gnb_classifier, X, y, cv=10, scoring=gnb_scoring)

# Per-fold arrays -> plain lists so the results can be serialized as JSON.
gnb_acc, gnb_prec, gnb_recall, gnb_f1, gnb_rmse = (
    gnb_cv_result['test_' + metric].tolist() for metric in gnb_scoring
)

gnb_scores = dict(accuracy=gnb_acc,
                  precision=gnb_prec,
                  recall=gnb_recall,
                  f1=gnb_f1,
                  rmse=gnb_rmse)

In [15]:
from sklearn.naive_bayes import ComplementNB

# Complement naive Bayes: default settings, 10-fold cross-validation.
accuracy_scorer = make_scorer(accuracy_score)
precision_scorer = make_scorer(precision_score, zero_division=0, average='weighted')
recall_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
rmse_scorer = make_scorer(mean_squared_error, squared=False)

cnb_scoring = {
    'accuracy': accuracy_scorer,
    'precision': precision_scorer,
    'recall': recall_scorer,
    'f1': f1_scorer,
    'rmse': rmse_scorer,
}

cnb_classifier = ComplementNB()
cnb_cv_result = cross_validate(cnb_classifier, X, y, cv=10, scoring=cnb_scoring)

# Per-fold arrays -> plain lists so the results can be serialized as JSON.
cnb_acc, cnb_prec, cnb_recall, cnb_f1, cnb_rmse = (
    cnb_cv_result['test_' + metric].tolist() for metric in cnb_scoring
)

cnb_scores = dict(accuracy=cnb_acc,
                  precision=cnb_prec,
                  recall=cnb_recall,
                  f1=cnb_f1,
                  rmse=cnb_rmse)

In [16]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes: default settings, 10-fold cross-validation.
accuracy_scorer = make_scorer(accuracy_score)
precision_scorer = make_scorer(precision_score, zero_division=0, average='weighted')
recall_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
rmse_scorer = make_scorer(mean_squared_error, squared=False)

mnb_scoring = {
    'accuracy': accuracy_scorer,
    'precision': precision_scorer,
    'recall': recall_scorer,
    'f1': f1_scorer,
    'rmse': rmse_scorer,
}

mnb_classifier = MultinomialNB()
mnb_cv_result = cross_validate(mnb_classifier, X, y, cv=10, scoring=mnb_scoring)

# Per-fold arrays -> plain lists so the results can be serialized as JSON.
mnb_acc, mnb_prec, mnb_recall, mnb_f1, mnb_rmse = (
    mnb_cv_result['test_' + metric].tolist() for metric in mnb_scoring
)

mnb_scores = dict(accuracy=mnb_acc,
                  precision=mnb_prec,
                  recall=mnb_recall,
                  f1=mnb_f1,
                  rmse=mnb_rmse)

In [17]:
# Gather every model's cross-validation results under one mapping, keyed by a
# short model name, ready to be written out as a single JSON document.
supervised_scores = dict(
    decision_tree=decision_tree_scores,
    random_forest=random_forest_scores,
    knn=knn_scores,
    kmeans=kmeans_scores,
    mlp=mlp_scores,
    gnb=gnb_scores,
    cnb=cnb_scores,
    mnb=mnb_scores,
)

In [18]:
import json

# Persist all collected scores as pretty-printed JSON in the working directory.
with open('2021_scores.json', 'w') as json_2021:
    json_2021.write(json.dumps(supervised_scores, indent=4))