In [80]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from scipy.stats import pearsonr
from numpy import cov
from scipy.stats import spearmanr

In [40]:
df = pd.read_csv('data_location_040420')

In [41]:
# Drop 'Unnamed: 0' and the pickup/dropoff datetime, day and time columns
df = df.drop(
    columns=[
        'Unnamed: 0',
        'pickup_datetime', 'dropoff_datetime',
        'Pickup_Day', 'Pickup_Time',
        'Dropoff_Day', 'Dropoff_Time',
    ]
)

In [42]:
df.head()

Unnamed: 0,passenger_count,trip_distance,PULocationID,DOLocationID,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,Pickup_Month,Pickup_Year,Dropoff_Month,Dropoff_Year,diff_seconds,tip_percentage
0,2,2.6,79,163,12,0.3,0.5,2,0.0,16.56,1,2017,1,2017,959.0,0.166667
1,2,0.66,79,79,5,0.3,0.5,1,0.0,8.16,1,2017,1,2017,335.0,0.2
2,1,0.6,79,232,4,0.3,0.5,1,0.0,7.5,1,2017,1,2017,236.0,0.25
3,1,4.2,79,146,15,0.3,0.5,3,0.0,19.56,1,2017,1,2017,851.0,0.2
4,1,9.1,79,257,26,0.3,0.5,5,0.0,32.75,1,2017,1,2017,1140.0,0.192308


In [43]:
df.shape

(91802, 16)

In [44]:
# The pickup location ID is the label; every other column is a predictor
y = df['PULocationID']  # target
X = df.drop(columns='PULocationID')  # features

In [57]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)
# Fit the scaler on the training rows only, then apply the same
# transform to both partitions
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Neural Network

In [58]:
def nn(X_train, X_test, y_train, y_test, random_state=42):
    """Train a multilayer perceptron classifier and print its test-set metrics.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used to fit the network and to evaluate it.
    y_train, y_test : array-like
        Class labels matching ``X_train`` and ``X_test``.
    random_state : int or None, default 42
        Seed forwarded to ``MLPClassifier`` so repeated runs print the same
        numbers. Pass ``None`` to train without a fixed seed.

    Returns
    -------
    None
        The classification report, the confusion matrix and the accuracy
        (formatted as a percentage) are printed, not returned.
    """
    # Three hidden layers of 25 units each, up to 500 training iterations
    mlpc = MLPClassifier(
        hidden_layer_sizes=(25, 25, 25),
        max_iter=500,
        random_state=random_state,
    )
    mlpc.fit(X_train, y_train)
    pred_mlpc = mlpc.predict(X_test)
    # Per-class precision/recall/F1 followed by the raw confusion matrix
    print(classification_report(y_test, pred_mlpc))
    print(confusion_matrix(y_test, pred_mlpc))
    # Overall fraction of correctly classified test rows
    accuracy = accuracy_score(y_test, pred_mlpc)
    print("Model Accuracy: {:.2%}".format(accuracy))

In [59]:
df['PULocationID'].unique()

array([ 79,  48, 132, 230, 249, 148, 161, 234,  68, 164, 154, 108, 216,
       134,  10, 130,   8,  28,  56, 207, 138])

In [60]:
nn(X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

          10       0.14      0.15      0.15        13
          28       0.00      0.00      0.00         3
          48       0.34      0.21      0.26      3819
          56       0.00      0.00      0.00         5
          68       0.24      0.10      0.14      2548
          79       0.27      0.57      0.37      5156
         108       0.00      0.00      0.00         1
         130       0.00      0.00      0.00         5
         132       0.83      0.94      0.88      3243
         134       0.00      0.00      0.00         3
         138       0.59      0.74      0.65      1780
         148       0.29      0.30      0.29      3432
         154       0.00      0.00      0.00         2
         161       0.27      0.24      0.26      3271
         164       0.25      0.08      0.12      2611
         207       0.00      0.00      0.00         1
         216       0.00      0.00      0.00         5
         230       0.27    

  'precision', 'predicted', average, warn_for)


# Random Forest

### Ensemble Model of Decision Trees

In [61]:
def RandomForest_Model(estimators, random_state=42):
    """Fit a random forest on the current split and print its accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``,
    so the most recently executed split cell determines what is fitted.

    Parameters
    ----------
    estimators : int
        Number of trees, forwarded as ``n_estimators``.
    random_state : int or None, default 42
        Seed forwarded to ``RandomForestClassifier`` so repeated runs print
        the same scores. Pass ``None`` to fit without a fixed seed.

    Returns
    -------
    None
        The tree count and the test/train scores are printed.
    """
    rf = RandomForestClassifier(n_estimators=estimators, random_state=random_state)
    rf.fit(X_train, y_train)
    # Score the held-out rows first, then the training rows; a large gap
    # between the two indicates overfitting
    rf_score_test = rf.score(X_test, y_test)
    rf_score_train = rf.score(X_train, y_train)
    print("Number of Decision Trees: " + str(estimators))
    print("Test Score: " + str(rf_score_test))
    print("Train Score: " + str(rf_score_train))

#### 10 Estimators

In [62]:
RandomForest_Model(10)

Number of Decision Trees: 10
Test Score: 0.324010784020043
Train Score: 0.9883262831103284


#### 50 Estimators

In [63]:
RandomForest_Model(50)

Number of Decision Trees: 50
Test Score: 0.35170610822145365
Train Score: 0.9999818449193006


#### 100 Estimators

In [64]:
RandomForest_Model(100)

Number of Decision Trees: 100
Test Score: 0.35622668227989435
Train Score: 1.0


# Bagging

In [65]:
def Bagging_model(max_samples, max_features, n_estimators, random_state=42):
    """Fit a bagged ensemble of decision trees and print its accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``,
    so the most recently executed split cell determines what is fitted.

    Parameters
    ----------
    max_samples : int or float
        Forwarded to ``BaggingClassifier`` as ``max_samples``.
    max_features : int or float
        Forwarded to ``BaggingClassifier`` as ``max_features``.
    n_estimators : int
        Number of trees in the ensemble.
    random_state : int or None, default 42
        Seed forwarded to ``BaggingClassifier`` so repeated runs print the
        same scores. Pass ``None`` to fit without a fixed seed.

    Returns
    -------
    None
        The settings and the test/train scores are printed.
    """
    # The base tree is passed positionally because the keyword name differs
    # between scikit-learn releases (`base_estimator` vs `estimator`)
    bg = BaggingClassifier(
        DecisionTreeClassifier(),
        max_samples=max_samples,
        max_features=max_features,
        n_estimators=n_estimators,
        random_state=random_state,
    )
    bg.fit(X_train, y_train)
    bg_test_score = bg.score(X_test, y_test)
    bg_train_score = bg.score(X_train, y_train)
    print("Max Samples: " + str(max_samples))
    print("Max Features: " + str(max_features))
    print("Number of Decision Trees: " + str(n_estimators))
    print("Test Score: " + str(bg_test_score))
    print("Train Score: " + str(bg_train_score))

In [66]:
Bagging_model(0.5, 1.0, 20)

Max Samples: 0.5
Max Features: 1.0
Number of Decision Trees: 20
Test Score: 0.4378965714441328
Train Score: 0.93596703037345


# Boosting - Ada Boost

In [67]:
def Boosting_model(n_estimators, learning_rate, random_state=42):
    """Fit an AdaBoost ensemble of decision trees and print its accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``,
    so the most recently executed split cell determines what is fitted.

    Parameters
    ----------
    n_estimators : int
        Maximum number of boosted trees.
    learning_rate : float
        Forwarded to ``AdaBoostClassifier`` as ``learning_rate``.
    random_state : int or None, default 42
        Seed forwarded to ``AdaBoostClassifier`` so repeated runs print the
        same scores. Pass ``None`` to fit without a fixed seed.

    Returns
    -------
    None
        The settings and the test/train scores are printed.
    """
    # The base tree is passed positionally because the keyword name differs
    # between scikit-learn releases (`base_estimator` vs `estimator`)
    adb = AdaBoostClassifier(
        DecisionTreeClassifier(),
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=random_state,
    )
    adb.fit(X_train, y_train)
    adb_test_score = adb.score(X_test, y_test)
    adb_train_score = adb.score(X_train, y_train)
    print("Number of Decision Trees: " + str(n_estimators))
    print("Learning Rate: " + str(learning_rate))
    print("Test Score: " + str(adb_test_score))
    print("Train Score: " + str(adb_train_score))

In [68]:
Boosting_model(10,1)

Number of Decision Trees: 10
Learning Rate: 1
Test Score: 0.38610059638898725
Train Score: 1.0


In [72]:
# There appears to be a problem with overfitting even with a 90/10 split, an 80/20 split, and a 60/40 split
# The Bagging model had the lowest training score and the highest testing score
# Next, we remove some variables to see whether that reduces the overfitting observed during training

In [73]:
# Explore correlation between PULocationID and other variables
# Pearson's Correlation
# Above 0.5 and close to 1 indicates strong correlation
# Below -0.5 and close to -1 indicates strong correlation in opposite directions
# Range is from -1 to 1
# Spearman's Correlation
# Above 0.5 and close to 1 indicates strong correlation
# Below -0.5 and close to -1 indicates strong correlation in opposite directions
# Range is from -1 to 1
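# Covariance
# A value near 0 suggests little linear relationship between the variables;
# its magnitude depends on each variable's scale, so it is not directly comparable across features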

In [84]:
# We are always going to compare the variable to the PickUp location ID 
# so we can set that as data1 for all calculations
data1 = df['PULocationID']
def cor_cov(data2):
    """Print correlation and covariance between ``data2`` and the reference series.

    The reference is the module-level ``data1``. Pearson's and Spearman's
    coefficients are printed to three decimals, followed by the matrix
    returned by ``cov(data1, data2)``.
    """
    pearson_r, _p_pearson = pearsonr(data1, data2)
    print('Pearsons correlation: {:.3f}'.format(pearson_r))
    spearman_rho, _p_spearman = spearmanr(data1, data2)
    print('Spearmans correlation: {:.3f}'.format(spearman_rho))
    cov_matrix = cov(data1, data2)
    print("Covariance")
    print(cov_matrix)

# Passenger Count

In [85]:
cor_cov(df['passenger_count'])

Pearsons correlation: -0.007
Spearmans correlation: -0.003
Covariance
[[ 4.61184523e+03 -5.81564259e-01]
 [-5.81564259e-01  1.55389489e+00]]


# Trip Distance

In [86]:
cor_cov(df['trip_distance'])

Pearsons correlation: -0.074
Spearmans correlation: -0.104
Covariance
[[4611.84522696  -25.35121142]
 [ -25.35121142   25.51366209]]


# DropOff Location ID

In [87]:
cor_cov(df['DOLocationID'])

Pearsons correlation: 0.003
Spearmans correlation: 0.002
Covariance
[[4611.84522696   15.64676741]
 [  15.64676741 5445.23575684]]


# Fare Amount

In [88]:
cor_cov(df['fare_amount'])

Pearsons correlation: -0.067
Spearmans correlation: -0.096
Covariance
[[4611.84522696  -63.49599187]
 [ -63.49599187  192.74933461]]


# Surcharge

In [89]:
cor_cov(df['surcharge'])

Pearsons correlation: 0.004
Spearmans correlation: 0.005
Covariance
[[4.61184523e+03 3.80480040e-04]
 [3.80480040e-04 1.96072024e-06]]


# MTA Tax

In [90]:
cor_cov(df['mta_tax'])

Pearsons correlation: 0.007
Spearmans correlation: 0.010
Covariance
[[4.61184523e+03 1.24731903e-02]
 [1.24731903e-02 6.28839162e-04]]


# Tip Amount

In [91]:
cor_cov(df['tip_amount'])

Pearsons correlation: -0.060
Spearmans correlation: -0.094
Covariance
[[4611.84522696  -12.12148261]
 [ -12.12148261    8.93049957]]


# Tolls Amount

In [92]:
cor_cov(df['tolls_amount'])

Pearsons correlation: -0.042
Spearmans correlation: -0.083
Covariance
[[ 4.61184523e+03 -5.05609043e+00]
 [-5.05609043e+00  3.12196452e+00]]


# Total Amount

In [93]:
cor_cov(df['total_amount'])

Pearsons correlation: -0.069
Spearmans correlation: -0.103
Covariance
[[4611.84522696  -81.95583379]
 [ -81.95583379  310.12489226]]


# Pickup Month

In [94]:
cor_cov(df['Pickup_Month'])

Pearsons correlation: -0.012
Spearmans correlation: -0.007
Covariance
[[ 4.61184523e+03 -2.64032104e+00]
 [-2.64032104e+00  1.14021120e+01]]


# Pickup Year

In [95]:
cor_cov(df['Pickup_Year'])

Pearsons correlation: -0.042
Spearmans correlation: -0.047
Covariance
[[ 4.61184523e+03 -2.30594385e+00]
 [-2.30594385e+00  6.64016743e-01]]


# Dropoff Month

In [96]:
cor_cov(df['Dropoff_Month'])

Pearsons correlation: -0.011
Spearmans correlation: -0.007
Covariance
[[ 4.61184523e+03 -2.48688412e+00]
 [-2.48688412e+00  1.14020161e+01]]


# Dropoff Year

In [97]:
cor_cov(df['Dropoff_Year'])

Pearsons correlation: -0.042
Spearmans correlation: -0.047
Covariance
[[ 4.61184523e+03 -2.31390823e+00]
 [-2.31390823e+00  6.64134167e-01]]


# Difference Seconds (Trip Duration)

In [98]:
cor_cov(df['diff_seconds'])

Pearsons correlation: -0.003
Spearmans correlation: -0.064
Covariance
[[ 4.61184523e+03 -8.42701926e+02]
 [-8.42701926e+02  1.43098937e+07]]


# Tip Percentage

In [99]:
cor_cov(df['tip_percentage'])

Pearsons correlation: -0.000
Spearmans correlation: -0.011
Covariance
[[ 4.61184523e+03 -3.26597701e-04]
 [-3.26597701e-04  5.09342137e-03]]


In [100]:
# There are no strong correlations. All are close to zero.
# Trip Distance has a higher Covariance
# Also Dropoff Location ID, Fare Amount, Tip Amount, and Total Amount
# Therefore I will redo the models with only these variables

In [101]:
df.columns

Index(['passenger_count', 'trip_distance', 'PULocationID', 'DOLocationID',
       'fare_amount', 'surcharge', 'mta_tax', 'tip_amount', 'tolls_amount',
       'total_amount', 'Pickup_Month', 'Pickup_Year', 'Dropoff_Month',
       'Dropoff_Year', 'diff_seconds', 'tip_percentage'],
      dtype='object')

In [102]:
# Keep trip_distance, PULocationID, DOLocationID, fare_amount, tip_amount and
# total_amount by dropping every other column
df_updated = df.drop(
    columns=[
        'passenger_count', 'surcharge', 'mta_tax', 'tolls_amount',
        'Pickup_Month', 'Pickup_Year',
        'Dropoff_Month', 'Dropoff_Year',
        'diff_seconds', 'tip_percentage',
    ]
)

In [103]:
df_updated.head()

Unnamed: 0,trip_distance,PULocationID,DOLocationID,fare_amount,tip_amount,total_amount
0,2.6,79,163,12,2,16.56
1,0.66,79,79,5,1,8.16
2,0.6,79,232,4,1,7.5
3,4.2,79,146,15,3,19.56
4,9.1,79,257,26,5,32.75


In [106]:
df_updated.shape

(91802, 6)

In [118]:
# The pickup location ID is the label; the remaining reduced columns are predictors
y = df_updated['PULocationID']  # target
X = df_updated.drop(columns='PULocationID')  # features

In [119]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Fit the scaler on the training rows only, then apply the same
# transform to both partitions
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [120]:
nn(X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

          10       0.00      0.00      0.00         7
          28       0.00      0.00      0.00         0
          48       0.26      0.38      0.31      1904
          56       0.00      0.00      0.00         3
          68       0.30      0.10      0.15      1241
          79       0.32      0.62      0.42      2601
         130       0.00      0.00      0.00         2
         132       0.85      0.93      0.89      1644
         134       0.00      0.00      0.00         2
         138       0.56      0.70      0.62       868
         148       0.33      0.26      0.29      1718
         154       0.00      0.00      0.00         2
         161       0.29      0.39      0.33      1662
         164       0.28      0.11      0.16      1299
         207       0.00      0.00      0.00         0
         216       0.00      0.00      0.00         1
         230       0.26      0.17      0.20      1949
         234       0.23    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [121]:
RandomForest_Model(10)

Number of Decision Trees: 10
Test Score: 0.3997603616360765
Train Score: 0.9320543020928365


In [122]:
RandomForest_Model(50)

Number of Decision Trees: 50
Test Score: 0.4145743695877131
Train Score: 0.949156465734399


In [123]:
RandomForest_Model(100)

Number of Decision Trees: 100
Test Score: 0.41522792876205
Train Score: 0.949306245829986


In [124]:
Bagging_model(0.5, 1.0, 20)

Max Samples: 0.5
Max Features: 1.0
Number of Decision Trees: 20
Test Score: 0.44158814879363867
Train Score: 0.8680437357879114


In [125]:
Boosting_model(10,1)

Number of Decision Trees: 10
Learning Rate: 1
Test Score: 0.4049343717662437
Train Score: 0.9492790130853338


In [126]:
df_updated.head()

Unnamed: 0,trip_distance,PULocationID,DOLocationID,fare_amount,tip_amount,total_amount
0,2.6,79,163,12,2,16.56
1,0.66,79,79,5,1,8.16
2,0.6,79,232,4,1,7.5
3,4.2,79,146,15,3,19.56
4,9.1,79,257,26,5,32.75


In [127]:
# Three pickup zones: JFK (132), East Village (79), Astoria Park (8)
JFK = df_updated.loc[df_updated["PULocationID"].eq(132)]
EASTV = df_updated.loc[df_updated["PULocationID"].eq(79)]
ASTORIAPK = df_updated.loc[df_updated["PULocationID"].eq(8)]

In [128]:
three_frames = [JFK,EASTV,ASTORIAPK]

In [129]:
df_three = pd.concat(three_frames)

In [130]:
df_three.head()

Unnamed: 0,trip_distance,PULocationID,DOLocationID,fare_amount,tip_amount,total_amount
22821,20.3,132,61,58,5,64.3
22822,12.4,132,49,37,7,45.95
22823,18.83,132,41,52,8,67.09
22824,18.7,132,262,52,10,68.34
22825,11.4,132,138,31,4,36.3


In [131]:
df_three.shape

(21207, 6)

In [133]:
# The pickup location ID is the label; the remaining columns are predictors
y = df_three['PULocationID']  # target
X = df_three.drop(columns='PULocationID')  # features

In [134]:
# Hold out 20% of the three-zone rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Fit the scaler on the training rows only, then apply the same
# transform to both partitions
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [135]:
nn(X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           8       0.00      0.00      0.00         1
          79       0.99      0.99      0.99      2604
         132       0.98      0.98      0.98      1637

    accuracy                           0.99      4242
   macro avg       0.66      0.66      0.66      4242
weighted avg       0.99      0.99      0.99      4242

[[   0    1    0]
 [   0 2574   30]
 [   1   24 1612]]
Model Accuracy: 98.68%


In [136]:
RandomForest_Model(10)

Number of Decision Trees: 10
Test Score: 0.9884488448844885
Train Score: 0.9988800471559093


In [137]:
RandomForest_Model(50)

Number of Decision Trees: 50
Test Score: 0.9882131070249882
Train Score: 1.0


In [138]:
RandomForest_Model(100)

Number of Decision Trees: 100
Test Score: 0.987034417727487
Train Score: 1.0


In [139]:
Bagging_model(0.5, 1.0, 20)

Max Samples: 0.5
Max Features: 1.0
Number of Decision Trees: 20
Test Score: 0.9877416313059877
Train Score: 0.9957559681697613


In [140]:
Boosting_model(10,1)

Number of Decision Trees: 10
Learning Rate: 1
Test Score: 0.9827911362564828
Train Score: 1.0


In [141]:
# Three additional pickup zones: LaGuardia (138), Clinton East (48), Jamaica (130)
LaGuardia = df_updated.loc[df_updated["PULocationID"].eq(138)]
ClintonEast = df_updated.loc[df_updated["PULocationID"].eq(48)]
Jamaica = df_updated.loc[df_updated["PULocationID"].eq(130)]

In [142]:
six_frames = [JFK,EASTV,ASTORIAPK,LaGuardia,ClintonEast,Jamaica]

In [143]:
data_six = pd.concat(six_frames)

In [144]:
data_six.head()

Unnamed: 0,trip_distance,PULocationID,DOLocationID,fare_amount,tip_amount,total_amount
22821,20.3,132,61,58,5,64.3
22822,12.4,132,49,37,7,45.95
22823,18.83,132,41,52,8,67.09
22824,18.7,132,262,52,10,68.34
22825,11.4,132,138,31,4,36.3


In [145]:
data_six.shape

(35471, 6)

In [146]:
# The pickup location ID is the label; the remaining columns are predictors
y = data_six['PULocationID']  # target
X = data_six.drop(columns='PULocationID')  # features

In [147]:
# Hold out 20% of the six-zone rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Fit the scaler on the training rows only, then apply the same
# transform to both partitions
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [148]:
nn(X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           8       0.00      0.00      0.00         1
          48       0.79      0.65      0.71      1958
          79       0.76      0.85      0.81      2620
         130       0.00      0.00      0.00         1
         132       0.89      0.96      0.93      1640
         138       0.79      0.72      0.75       875

    accuracy                           0.80      7095
   macro avg       0.54      0.53      0.53      7095
weighted avg       0.80      0.80      0.80      7095

[[   0    1    0    0    0    0]
 [   0 1269  606    0   19   64]
 [   0  296 2231    0   31   62]
 [   0    0    0    0    0    1]
 [   0    7   12    0 1577   44]
 [   0   36   70    1  140  628]]
Model Accuracy: 80.41%


  'precision', 'predicted', average, warn_for)


In [149]:
RandomForest_Model(10)

Number of Decision Trees: 10
Test Score: 0.8679351656095842
Train Score: 0.9920002819283902


In [150]:
RandomForest_Model(50)

Number of Decision Trees: 50
Test Score: 0.8797744890768147
Train Score: 0.9971807160981111


In [151]:
RandomForest_Model(100)

Number of Decision Trees: 100
Test Score: 0.8816067653276956
Train Score: 0.9973216802932056


In [152]:
Bagging_model(0.5, 1.0, 20)

Max Samples: 0.5
Max Features: 1.0
Number of Decision Trees: 20
Test Score: 0.9028893587033122
Train Score: 0.971243304200733


In [153]:
Boosting_model(10,1)

Number of Decision Trees: 10
Learning Rate: 1
Test Score: 0.8941508104298802
Train Score: 0.9973216802932056
