#  Deep Learning Tests

## Preparation

In [1]:
# Import necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Open File
df = pd.read_csv('NYC_Cab_Fare_Wrangled_Time_Distance.csv')

In [3]:
# Show first few rows
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,euclidean_distance,...,dayofweek,cold_month,weekend,rush_hour,year,minute,second,15_min_intervals,total_seconds,summer_month
0,0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1,0.009436,...,0,0,0,1,2009,26,21,69,62781,1
1,1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1,0.079696,...,1,1,0,1,2010,52,16,67,60736,0
2,2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2,0.013674,...,3,0,0,0,2011,35,0,2,2100,1
3,3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1,0.02534,...,5,0,1,0,2012,30,42,18,16242,0
4,4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1,0.01947,...,1,1,0,1,2010,51,0,31,28260,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53465062 entries, 0 to 53465061
Data columns (total 22 columns):
Unnamed: 0            int64
key                   object
fare_amount           float64
pickup_datetime       object
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
passenger_count       int64
euclidean_distance    float64
month                 int64
hour                  int64
dayofweek             int64
cold_month            int64
weekend               int64
rush_hour             int64
year                  int64
minute                int64
second                int64
15_min_intervals      int64
total_seconds         int64
summer_month          int64
dtypes: float64(6), int64(14), object(2)
memory usage: 8.8+ GB


In [5]:
# Choose relevant columns
df = df[['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'euclidean_distance', 'cold_month', 'weekend', 'rush_hour', '15_min_intervals', 'summer_month']]

### 500,000 rows

In [6]:
df_sub = df.sample(n=500000)
df_sub.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,euclidean_distance,cold_month,weekend,rush_hour,15_min_intervals,summer_month
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,11.326031,-73.975557,40.750815,-73.974736,40.751204,1.69184,0.03467676,0.41909,0.282736,0.299016,55.59175,0.236896
std,9.550769,0.034416,0.026829,0.038199,0.068023,1.306473,0.0737706,0.493411,0.45033,0.457827,26.099228,0.425178
min,2.5,-74.258628,40.478775,-80.095871,5.65,1.0,1e-06,0.0,0.0,0.0,0.0,0.0
25%,6.0,-73.99233,40.736561,-73.991607,40.735622,1.0,0.01305657,0.0,0.0,0.0,37.0,0.0
50%,8.5,-73.982116,40.753304,-73.980676,40.753908,1.0,0.02208048,0.0,0.0,0.0,58.0,0.0
75%,12.5,-73.968466,40.767462,-73.96558,40.768387,2.0,0.03899589,1.0,1.0,1.0,78.0,0.0
max,459.9,-73.700982,40.916949,-69.729976,62.143497,6.0,35.12818,1.0,1.0,1.0,95.0,1.0


In [7]:
df_sub.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,euclidean_distance,cold_month,weekend,rush_hour,15_min_intervals,summer_month
2999477,2012-08-17 21:00:27.0000007,14.9,2012-08-17 21:00:27,-73.874523,40.774043,-73.869136,40.720666,1,0.053648,0,0,0,84,1
46094238,2014-01-08 18:07:00.000000185,14.5,2014-01-08 18:07:00,-73.954872,40.765432,-73.98983,40.756545,2,0.03607,1,0,1,72,0
28073526,2013-10-31 08:17:00.000000191,28.33,2013-10-31 08:17:00,-73.912388,40.746285,-73.992225,40.730687,5,0.081346,0,0,1,33,0
41858144,2015-02-21 18:39:29.00000012,4.5,2015-02-21 18:39:29,-73.962593,40.763248,-73.960449,40.759655,1,0.004184,1,1,0,74,0
29066295,2013-12-30 23:57:00.00000028,5.5,2013-12-30 23:57:00,-73.984115,40.761647,-73.974345,40.75747,1,0.010625,1,0,0,95,0


In [9]:
y = np.array(df_sub.fare_amount)
X = np.array(df_sub.iloc[:,3:14])
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Linear Regression                                  

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
fare_predictions = lin_reg.predict(X_train)
lin_mse = mean_squared_error(y_train, fare_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

7.603150293810446

## Decision Tree

In [13]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
fare_predictions = tree_reg.predict(X_train)
tree_mse = mean_squared_error(y_train, fare_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

3.845549308344409e-16

In [14]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
tree_rmse_scores=np.sqrt(-scores)
print(scores)

[-31.5691684  -31.95237707 -32.04110961 -29.86179506 -27.15530762]


In [15]:
lin_scores = cross_val_score(lin_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
lin_rmse_scores=np.sqrt(-lin_scores)
print(lin_rmse_scores)

[13.54238725 20.04310279  7.66763898  7.2101136   7.23181161]


## Deep Learning Tests

In [25]:
# keras_regression_test requires "from sklearn.model_selection import train_test_split"
def keras_regression_test(X, y, numbers=[128,64], batch_size=32, activation='relu', optimizer='adam', loss='mean_squared_error'):
        
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    # Save the number of columns in predictors: n_cols
    n_cols = X_train.shape[1]

    # Set up the model: model
    model = Sequential()
    
    # Add the first layer
    model.add(Dense(numbers[0], activation=activation, input_shape=(n_cols,)))
    
    # Add addition layers
    for i in range(len(numbers)-1):
        model.add(Dense(numbers[i+1], activation=activation))

    # Add the output layer
    model.add(Dense(1))

    # Compile the model
    model.compile(optimizer=optimizer, loss=loss)

    # Define early_stopping_monitor
    early_stopping_monitor = EarlyStopping(patience=2)

    # Fit the model
    model.fit(X_train, y_train, validation_split=0.3, epochs=30, batch_size=batch_size, callbacks=[early_stopping_monitor])

    # Get score for predictions
    score = model.evaluate(X_test, y_test)
    
    # Get root mean squared error
    rmse = np.sqrt(score)
    
    # Return root mean squared error
    print(rmse)
    
    return model

In [None]:
# keras_regression_test requires "from sklearn.model_selection import train_test_split"
def keras_regression_final(X, y, numbers=[128,64], batch_size=32, activation='relu', optimizer='adam', loss='mean_squared_error'):
    
    # Save the number of columns in predictors: n_cols
    n_cols = X.shape[1]

    # Set up the model: model
    model = Sequential()
    
    # Add the first layer
    model.add(Dense(numbers[0], activation=activation, input_shape=(n_cols,)))
    
    # Add addition layers
    for i in range(len(numbers)-1):
        model.add(Dense(numbers[i+1], activation=activation))

    # Add the output layer
    model.add(Dense(1))

    # Compile the model
    model.compile(optimizer=optimizer, loss=loss)

    # Define early_stopping_monitor
    early_stopping_monitor = EarlyStopping(patience=2)

    # Fit the model
    model.fit(X, y, epochs=30, batch_size=batch_size, callbacks=[early_stopping_monitor])
    
    return model

In [17]:
keras_regression_test(numbers=[100,50])

Train on 262500 samples, validate on 112500 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
4.535789480285946


<keras.engine.sequential.Sequential at 0x1a35fc6320>

In [22]:
df_sub = df.sample(n=5000000)

In [26]:
y = np.array(df_sub.fare_amount)
X = np.array(df_sub.iloc[:,3:14])
keras_regression_test(X, y, numbers=[100,50])

Train on 2625000 samples, validate on 1125000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
4.323052388678065


<keras.engine.sequential.Sequential at 0x1c09b33048>

In [27]:
y = np.array(df.fare_amount)
X = np.array(df.iloc[:,3:14])
keras_regression_test(X, y, numbers=[100,50])

Train on 28069157 samples, validate on 12029639 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
4.345326023849211


<keras.engine.sequential.Sequential at 0x1a3444d1d0>

In [None]:
# new instances where we do not know the answer
Xnew, a = make_regression(n_samples=3, n_features=2, noise=0.1, random_state=1)
Xnew = scalarX.transform(Xnew)
# make a prediction
ynew = model.predict(Xnew)
# show the inputs and predicted outputs
for i in range(len(Xnew)):
	print("X=%s, Predicted=%s" % (Xnew[i], ynew[i]))

In [19]:
model = keras_regression_test(numbers=[100,50], batch_size=36)

Train on 262500 samples, validate on 112500 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
4.3533494672510376


In [21]:
y = np.array(df.fare_amount)
X = np.array(df.iloc[:,3:14])
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = keras_regression_test(numbers=[100,50])

Train on 262500 samples, validate on 112500 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
5.668036938678171


In [170]:
batch_sizes = [18, 24, 30, 36]
for batch_size in batch_sizes:
    print('batch size: ', batch_size)
    keras_regression_test(numbers=[100,50], batch_size=batch_size)

batch size:  18
Train on 262500 samples, validate on 112500 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
4.46488461424254
batch size:  24
Train on 262500 samples, validate on 112500 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
4.441380409507338
batch size:  30
Train on 262500 samples, validate on 112500 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
4.527793049015641
batch size:  36
Train on 262500 samples, validate on 112500 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
4.363663320832129


In [51]:
keras_regression_test(X, y, numbers=[100,50])

Train on 262500 samples, validate on 112500 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30


4.307730960928071

In [54]:
df_sub2 = df.sample(n=5000000)
y2 = np.array(df_sub2.fare_amount)
X2 = np.array(df_sub2.iloc[:,3:12])

In [56]:
y2 = np.array(df_sub2.fare_amount)
X2 = np.array(df_sub2.iloc[:,3:12])

In [59]:
keras_regression_test(X2, y2, numbers=[100,50])

Train on 2625000 samples, validate on 1125000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30


4.300137875702223

Ideas to improve:
1T) Hours into hours minutes seconds (cut month)
2L) Airport 
3L) Boroughs
3T) Weekday, Weekend
4T) Rush Hour 
5T) Seasons
6T) Holidays
7T) Evenings
8T) Weather

### Full Amount (53,000,000 +)

In [164]:
y = np.array(df.fare_amount)
X = np.array(df.iloc[:,3:9])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

### 5,000,000 rows

In [None]:
keras_regression_test(X2, y2, layers=2, nodes=2, numbers=[128,64])

### More columns added

In [142]:
df_sub2 = df1.sample(n=5000000)
df_sub2.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,euclidean_distance,cold_month,weekend,rush_hour,15_min_intervals,summer_month
29352613,2015-04-07 11:54:02.0000002,14.0,2015-04-07 11:54:02,-73.998657,40.76384,-73.992523,40.735592,1,0.028906,0,0,0,47,0
51380817,2009-04-24 16:46:00.0000001,20.1,2009-04-24 16:46:00,-73.98416,40.72791,-73.986007,40.671605,1,0.056335,0,0,1,67,0
13315395,2012-08-07 21:08:13.0000004,6.1,2012-08-07 21:08:13,-73.99192,40.749328,-74.00505,40.728833,1,0.02434,0,0,0,84,1
21353531,2009-11-28 11:51:00.00000094,49.57,2009-11-28 11:51:00,-73.783647,40.648635,-74.000827,40.747402,1,0.238583,1,1,0,47,0
33664742,2015-04-04 18:58:07.0000002,8.0,2015-04-04 18:58:07,-73.983826,40.738216,-74.002022,40.739578,1,0.018247,0,1,0,75,0


In [150]:
X2 = np.array(df_sub2.iloc[:,3:14])
keras_regression_test(X2, y2, numbers=[100,50])

Train on 10500000 samples, validate on 4500000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30


4.336311837035749

In [152]:
keras_regression_test(X, y, numbers=[1000])

Train on 262500 samples, validate on 112500 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30


4.3363916224920676

In [10]:
keras_regression_test(numbers=[100,50])

Train on 262500 samples, validate on 112500 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
4.829188289492332


<keras.engine.sequential.Sequential at 0x127464ef0>

In [11]:
keras_regression_test()

Train on 262500 samples, validate on 112500 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
5.119094347493275


<keras.engine.sequential.Sequential at 0x127464550>