# Analysis - 2020
Given Uber ride information and COVID information from NYC, what is the likelihood that a ride has `SR_Flag` set?

In [1]:
import pandas as pd
import numpy as np

# Preprocessed sample CSV used as the input for every later cell.
# NOTE(review): this is an absolute path on one machine; adjust before running elsewhere.
DATASET_CSV = '/home/felipe/repos/tcc/nyc_data/csv/preprocessed_sample.csv'
dataset = pd.read_csv(DATASET_CSV)

In [2]:
# Feature matrix: every column except the target.
X = dataset.drop(columns=['SR_Flag'])
# Target vector: the SR_Flag column on its own.
y = dataset['SR_Flag']

In [3]:
def _three_decimals(value):
    """Render a float with exactly three decimal places for pandas display."""
    return '%.3f' % value


# Display-only setting: does not change the stored values.
pd.set_option('display.float_format', _three_decimals)

# Index label of the row holding the largest trip_duration.
trip_max_idx = X['trip_duration'].idxmax()

In [4]:
# Remove the row with the largest trip_duration from both the features and
# the target so the two stay aligned.
#
# `trip_max_idx` comes from `idxmax()`, which returns an index *label*, so the
# row is dropped by label directly (the previous `X.index[trip_max_idx]`
# re-interpreted that label as a position, which is only correct for a default
# RangeIndex). `index=` is passed by keyword because a positional `axis`
# argument to `drop` is deprecated and rejected by newer pandas releases.
# Rebinding instead of `inplace=True`, together with `errors='ignore'`, keeps
# the cell safe to re-run: a second execution is a no-op rather than dropping
# another row.
X = X.drop(index=trip_max_idx, errors='ignore')
y = y.drop(index=trip_max_idx, errors='ignore')

  """Entry point for launching an IPython kernel.
  


In [5]:
# Binarise the target: 0 stays 0, every other value becomes 1.
y = y.ne(0).astype('int64')

In [6]:
#      trip_duration <= 5
#  5 < trip_duration <= 10
# 10 < trip_duration <= 15
# 15 < trip_duration <= 20
# 20 < trip_duration <= 30
# 30 < trip_duration <= 45
# 45 < trip_duration <= 60
# 60 < trip_duration

def get_trip_duration_category_from_duration(trip_duration):
    """Map a trip duration to one of eight ordinal duration buckets.

    The input is divided by 60 before bucketing, and the bucket limits
    (5, 10, 15, 20, 30, 45, 60) are inclusive upper bounds on that
    converted value.

    Args:
        trip_duration: Numeric duration; divided by 60 before comparison.

    Returns:
        An int from 1 to 8: 1 for values up to the first limit, 8 for
        values above the last limit. Returns None when the value compares
        false against every limit (e.g. NaN).
    """
    minutes = trip_duration / 60
    upper_limits = (5, 10, 15, 20, 30, 45, 60)

    # The first limit that is not exceeded decides the bucket.
    for category, limit in enumerate(upper_limits, start=1):
        if minutes <= limit:
            return category

    if minutes > 60:
        return 8

    # Reached only for values that are not comparable (such as NaN).
    return None

# Replace the raw durations with their bucket numbers.
# NOTE(review): this overwrites the column, so running the cell a second time
# would bucket the bucket numbers themselves.
trip_duration_categories = X['trip_duration'].map(get_trip_duration_category_from_duration)
X['trip_duration'] = trip_duration_categories

In [7]:
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, f1_score, make_scorer

# Wrap each metric function as a scorer object with default options; these are
# shared by the cross-validation cells below.
accuracy_scorer, recall_scorer, f1_scorer = (
    make_scorer(metric) for metric in (accuracy_score, recall_score, f1_score)
)

In [8]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree, evaluated with 10-fold cross-validation.
decision_tree_clf = DecisionTreeClassifier(random_state=0)

k_folds = range(2,12)  # not referenced by any cell visible here

decision_tree_scoring = {
    'accuracy': accuracy_scorer,
    'recall': recall_scorer,
    'f1': f1_scorer,
}
decision_tree_scores_cv = cross_validate(
    decision_tree_clf,
    X,
    y,
    scoring=decision_tree_scoring,
    cv=10,
)

# Convert the per-fold arrays to plain lists so they can be written to JSON later.
decision_tree_accuracy_scores, decision_tree_recall_scores, decision_tree_f1_scores = (
    decision_tree_scores_cv[key].tolist()
    for key in ('test_accuracy', 'test_recall', 'test_f1')
)

decision_tree_scores = {
    'accuracy': decision_tree_accuracy_scores,
    'recall': decision_tree_recall_scores,
    'f1': decision_tree_f1_scores,
}

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Evaluate a random forest at each maximum tree depth from 1 to 10 using
# 10-fold cross-validation, collecting the per-fold scores for every depth.
depths = range(1,11)
random_forest_scores = []

for depth in depths:
    random_forest = RandomForestClassifier(max_depth=depth, random_state=0)
    # Cross-validate the forest built for this depth. (Previously the decision
    # tree classifier was passed here, so every depth repeated the decision-tree
    # scores and `max_depth` had no effect on the results.)
    scores = cross_validate(random_forest, X, y, cv=10, scoring={'accuracy': accuracy_scorer, 'recall': recall_scorer, 'f1': f1_scorer})
    # Per-fold arrays become plain lists so the results are JSON-serialisable.
    random_forest_acc = scores['test_accuracy'].tolist()
    random_forest_recall = scores['test_recall'].tolist()
    random_forest_f1 = scores['test_f1'].tolist()
    random_forest_scores.append({'depth': depth, 'accuracy': random_forest_acc, 'recall': random_forest_recall, 'f1': random_forest_f1})

In [12]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours for k = 1..5, each evaluated with 10-fold cross-validation.
k_values = range(1,6)
knn_scores = []

knn_scoring = {
    'accuracy': accuracy_scorer,
    'recall': recall_scorer,
    'f1': f1_scorer,
}

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_validate(knn, X, y, scoring=knn_scoring, cv=10)
    # Per-fold arrays become plain lists so the results are JSON-serialisable.
    knn_acc, knn_recall, knn_f1 = (
        scores[key].tolist()
        for key in ('test_accuracy', 'test_recall', 'test_f1')
    )
    knn_scores.append({
        'k': k,
        'accuracy': knn_acc,
        'recall': knn_recall,
        'f1': knn_f1,
    })

In [14]:
from sklearn.cluster import KMeans

# KMeans for 1..5 clusters, scored against `y` with 10-fold cross-validation.
# NOTE(review): KMeans is a clustering model, so the labels it predicts are
# cluster numbers; confirm that comparing them with `y` is the intended
# evaluation.
k_values = range(1,6)
kmeans_scores = []

# Weighted averaging with zero_division=1 is used for recall and F1 here,
# unlike the default scorers used for the classifiers.
kmeans_recall_scorer = make_scorer(recall_score, zero_division=1, average='weighted')
kmeans_f1_scorer = make_scorer(f1_score, zero_division=1, average='weighted')

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=0)
    # One cross_validate call computes all three metrics from the same ten
    # fits. The previous three cross_val_score calls refitted the identical,
    # fixed-seed model on the identical folds for each metric (30 fits per k).
    kmeans_cv = cross_validate(
        kmeans,
        X,
        y,
        cv=10,
        scoring={
            'accuracy': accuracy_scorer,
            'recall': kmeans_recall_scorer,
            'f1': kmeans_f1_scorer,
        },
    )
    acc_scores = kmeans_cv['test_accuracy']
    recall_scores = kmeans_cv['test_recall']
    f1_scores = kmeans_cv['test_f1']
    kmeans_scores.append({'k': k, 'accuracy': acc_scores.tolist(), 'recall': recall_scores.tolist(), 'f1': f1_scores.tolist()})

In [16]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate

# Small multi-layer perceptron (hidden layers of 5 and 2 units), evaluated
# with 10-fold cross-validation.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(5, 2),
    solver='adam',
    alpha=1e-5,
    random_state=1,
)
mlp_cv_results = cross_validate(
    mlp_classifier,
    X,
    y,
    scoring=('accuracy', 'recall', 'f1'),
    cv=10,
)

# Per-fold arrays become plain lists so the results are JSON-serialisable.
mlp_acc, mlp_recall, mlp_f1 = (
    mlp_cv_results[key].tolist()
    for key in ('test_accuracy', 'test_recall', 'test_f1')
)
mlp_scores = {
    'accuracy': mlp_acc,
    'recall': mlp_recall,
    'f1': mlp_f1,
}

In [17]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, evaluated with 10-fold
# cross-validation.
gnb_classifier = GaussianNB()
gnb_cv_results = cross_validate(
    gnb_classifier,
    X,
    y,
    scoring=('accuracy', 'recall', 'f1'),
    cv=10,
)

# Per-fold arrays become plain lists so the results are JSON-serialisable.
gnb_acc, gnb_recall, gnb_f1 = (
    gnb_cv_results[key].tolist()
    for key in ('test_accuracy', 'test_recall', 'test_f1')
)
gnb_scores = {
    'accuracy': gnb_acc,
    'recall': gnb_recall,
    'f1': gnb_f1,
}

In [18]:
# Gather every model's cross-validation results under one mapping, keyed by
# model name, ready to be serialised.
supervised_scores = dict([
    ('decision_tree', decision_tree_scores),
    ('random_forest', random_forest_scores),
    ('knn', knn_scores),
    ('kmeans', kmeans_scores),
    ('mlp', mlp_scores),
    ('gnb', gnb_scores),
])

In [25]:
import json

# Persist all collected scores as indented JSON; the relative path resolves
# against the kernel's working directory.
with open('2020_scores.json', 'w') as json_2020:
    json_2020.write(json.dumps(supervised_scores, indent=4))