# Analysis
Given Uber ride information and COVID information from NYC, what what is the likelihood

In [1]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('/home/felipe/repos/tcc/nyc_data/csv/preprocessed_sample.csv')

In [2]:
y = dataset['SR_Flag']
X = dataset.drop('SR_Flag', axis=1)

In [3]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
trip_max_idx = X['trip_duration'].idxmax()

In [4]:
X.drop(X.index[trip_max_idx], 0, inplace=True)
y.drop(y.index[trip_max_idx], 0, inplace=True)

  """Entry point for launching an IPython kernel.
  


In [5]:
y = y.apply(lambda x: 0 if x == 0 else 1)

In [6]:
#      trip_duration <= 5
#  5 < trip_duration <= 10
# 10 < trip_duration <= 15
# 15 < trip_duration <= 20
# 20 < trip_duration <= 30
# 30 < trip_duration <= 45
# 45 < trip_duration <= 60
# 60 < trip_duration

def get_trip_duration_category_from_duration(trip_duration):
    trip_duration = trip_duration/60
    if trip_duration <= 5:
        return 1
    elif 5 < trip_duration <= 10:
        return 2
    elif 10 < trip_duration <= 15:
        return 3
    elif 15 < trip_duration <= 20:
        return 4
    elif 20 < trip_duration <= 30:
        return 5
    elif 30 < trip_duration <= 45:
        return 6
    elif 45 < trip_duration <= 60:
        return 7
    elif 60 < trip_duration:
        return 8

X['trip_duration'] = X['trip_duration'].apply(get_trip_duration_category_from_duration)

In [7]:
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, f1_score, make_scorer

accuracy_scorer = make_scorer(accuracy_score)
recall_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)

In [12]:
from sklearn.tree import DecisionTreeClassifier
decision_tree_clf = DecisionTreeClassifier(random_state=0)

k_folds = range(2,12)

decision_tree_scores_cv = cross_validate(decision_tree_clf, X, y, cv=10, scoring={'accuracy': accuracy_scorer, 'recall': recall_scorer, 'f1': f1_scorer})
decision_tree_accuracy_scores = decision_tree_scores_cv['test_accuracy']
decision_tree_recall_scores = decision_tree_scores_cv['test_recall']
decision_tree_f1_scores = decision_tree_scores_cv['test_f1']

decision_tree_scores = {'accuracy': decision_tree_accuracy_scores, 'recall': decision_tree_recall_scores, 'f1': decision_tree_f1_scores}

In [13]:
decision_tree_scores

{'accuracy': array([0.05411682, 0.05453906, 0.09845179, 0.61801548, 0.952076  ,
        0.95214638, 0.94764251, 0.95200563, 0.95193526, 0.95214638]),
 'recall': array([1.        , 0.        , 0.        , 0.00440529, 0.        ,
        0.00146843, 0.00293686, 0.        , 0.        , 0.00146843]),
 'f1': array([0.09200838, 0.        , 0.        , 0.00110416, 0.        ,
        0.00293255, 0.00534759, 0.        , 0.        , 0.00293255])}

In [14]:
from sklearn.ensemble import RandomForestClassifier

depths = range(1,11)
random_forest_scores = []

for depth in depths:
    random_forest = RandomForestClassifier(max_depth=depth, random_state=0)
    scores = cross_validate(decision_tree_clf, X, y, cv=10, scoring={'accuracy': accuracy_scorer, 'recall': recall_scorer, 'f1': f1_scorer})
    random_forest_scores.append(scores)

In [15]:
random_forest_scores

[{'fit_time': array([0.44908595, 0.37511659, 0.35484886, 0.37873745, 0.42142606,
         0.43230247, 0.42915583, 0.41904831, 0.39918923, 0.34738207]),
  'score_time': array([0.00647116, 0.00687599, 0.00703812, 0.00656533, 0.00664759,
         0.0060885 , 0.00596619, 0.00607467, 0.00633764, 0.00625205]),
  'test_accuracy': array([0.05411682, 0.05453906, 0.09845179, 0.61801548, 0.952076  ,
         0.95214638, 0.94764251, 0.95200563, 0.95193526, 0.95214638]),
  'test_recall': array([1.        , 0.        , 0.        , 0.00440529, 0.        ,
         0.00146843, 0.00293686, 0.        , 0.        , 0.00146843]),
  'test_f1': array([0.09200838, 0.        , 0.        , 0.00110416, 0.        ,
         0.00293255, 0.00534759, 0.        , 0.        , 0.00293255])},
 {'fit_time': array([0.43662333, 0.37405038, 0.35597682, 0.38113689, 0.42293882,
         0.42770171, 0.42503405, 0.41690803, 0.3948164 , 0.34085941]),
  'score_time': array([0.00652313, 0.00697899, 0.00732446, 0.00679851, 0.00624

In [16]:
from sklearn.neighbors import KNeighborsClassifier

k_values = range(1,6)
knn_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_validate(knn, X, y, cv=10, scoring={'accuracy': accuracy_scorer, 'recall': recall_scorer, 'f1': f1_scorer})
    knn_scores.append(scores)

In [17]:
knn_scores

[{'fit_time': array([14.84459066, 14.89941931, 15.0635047 , 14.51857686, 12.56020975,
         14.86787701, 15.76802921, 17.98520064, 15.79055738, 13.73194814]),
  'score_time': array([0.24651337, 0.2575345 , 0.24527502, 0.24595094, 0.24759889,
         0.25109887, 0.24246359, 0.24988508, 0.249156  , 0.25768733]),
  'test_accuracy': array([0.05439831, 0.04447572, 0.08719212, 0.61090781, 0.952076  ,
         0.952076  , 0.95087966, 0.952076  , 0.60190007, 0.952076  ]),
  'test_recall': array([1.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.00146843, 0.        ]),
  'test_f1': array([0.09203325, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.00035342, 0.        ])},
 {'fit_time': array([16.66842341, 15.97079182, 16.37093878, 14.59956241, 12.69203138,
         14.47011161, 14.99907589, 15.41337824, 15.81996465, 13.33957505]),
  'score_time': array([0.25226712, 0.2526691 , 0.25122786,

In [18]:
from sklearn.cluster import KMeans

k_values = range(1,6)
kmeans_scores = []

kmeans_recall_scorer = make_scorer(recall_score, zero_division=1, average='weighted')
kmeans_f1_scorer = make_scorer(f1_score, zero_division=1, average='weighted')

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=0)
    acc_scores = cross_val_score(kmeans, X, y, cv=10, scoring='accuracy')
    recall_scores = cross_val_score(kmeans, X, y, cv=10, scoring=kmeans_recall_scorer)
    f1_scores = cross_val_score(kmeans, X, y, cv=10, scoring=kmeans_f1_scorer)
    kmeans_scores.append(scores)

In [19]:
kmeans_scores

[{'fit_time': array([15.66473937, 15.21270442, 15.4752357 , 14.80301905, 13.345469  ,
         15.0468545 , 15.4168849 , 15.11970448, 14.83489203, 12.68433499]),
  'score_time': array([0.25272369, 0.25091243, 0.24864626, 0.25944495, 0.26091242,
         0.25060296, 0.25448966, 0.25154638, 0.2484591 , 0.25556231]),
  'test_accuracy': array([0.05524279, 0.01766362, 0.02695285, 0.58494018, 0.952076  ,
         0.952076  , 0.952076  , 0.952076  , 0.952076  , 0.952076  ]),
  'test_recall': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  'test_f1': array([0.09210793, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ])},
 {'fit_time': array([15.66473937, 15.21270442, 15.4752357 , 14.80301905, 13.345469  ,
         15.0468545 , 15.4168849 , 15.11970448, 14.83489203, 12.68433499]),
  'score_time': array([0.25272369, 0.25091243, 0.24864626, 0.25944495, 0.26091242,
         0.25060296, 0.25448966, 0.25154638, 0.2484591 , 0.25556

In [20]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate

mlp_classifier = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
mlp_scores = cross_validate(mlp_classifier, X, y, cv=10, scoring=('accuracy', 'recall', 'f1'))

In [21]:
from sklearn.naive_bayes import GaussianNB

gnb_classifier = GaussianNB()
gnb_scores = cross_validate(gnb_classifier, X, y, cv=10, scoring=('accuracy', 'recall', 'f1'))

In [None]:
supervised_scores = {'decision_tree': decision_tree_scores,
                     'random_forest': random_forest_scores,
                     'knn': knn_scores,
                     'kmeans': kmeans_scores,
                     'mlp': mlp_scores,
                     'gnb': gnb_scores}

In [None]:
supervised_scores