In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Project layout, resolved relative to the notebook's working directory:
#   <parent>/src           -> added to sys.path so local modules can be imported
#   <parent>/Data/Process  -> directory the input CSV files are read from
CURRENT_DIR = Path('.').resolve()
MODULES_DIR = CURRENT_DIR.parent / 'src'
DATA_DIR = CURRENT_DIR.parent / 'Data' / 'Process'

# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if str(MODULES_DIR) not in sys.path:
    sys.path.append(str(MODULES_DIR))

In [7]:
# Load the scaled dataset and its pre-scaling counterpart; in both files the
# first CSV column is used as the row index.
df = pd.read_csv(DATA_DIR.joinpath('taxi_sc.csv'), index_col=0)
df_prescaled = pd.read_csv(DATA_DIR.joinpath('taxi_sc_prescaling.csv'), index_col=0)

In [6]:
df.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,day_of_week,hour,distance,pickup_dist_JFK_Airport,pickup_dist_Laguardia_Airport,pickup_dist_Newark_Airport,dropoff_dist_JFK_Airport,dropoff_dist_Laguardia_Airport,dropoff_dist_Newark_Airport,fare_amount
0,3.916896,-1.128227,4.151116,-1.365537,-0.526888,-1.469611,-0.078187,-0.078219,-1.560801,0.535442,-0.646405,-3.957854,-2.321151,3.494461,-4.62807,-1.901119,3.621417,4.5
1,-1.203464,-1.510227,-0.146055,1.083475,-0.526888,-0.932852,-1.531625,-1.230176,-1.047515,0.381408,1.292368,0.651855,1.791541,-1.403329,0.631996,-0.076296,0.144374,16.9
2,-0.210322,0.395464,-0.519838,-0.020874,0.239932,-0.396092,0.503188,0.267369,-0.020945,-2.083145,-0.529455,0.30011,0.01054,-0.13733,0.425078,0.438538,-0.519138,5.7
3,-0.34127,-0.677271,-0.529984,0.243605,-0.526888,0.140667,-0.659562,0.612956,1.005625,-1.467007,-0.207528,0.018578,0.400941,-0.48241,0.557964,0.412867,-0.456748,7.7
4,0.226261,0.652445,0.559839,1.145221,-0.526888,-0.932852,-0.95025,-0.769393,-1.047515,-1.004903,-0.369533,0.017033,-0.590553,0.320629,0.03396,-0.964083,0.773758,5.3


In [8]:
df_prescaled.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,day_of_week,hour,distance,pickup_dist_JFK_Airport,pickup_dist_Laguardia_Airport,pickup_dist_Newark_Airport,dropoff_dist_JFK_Airport,dropoff_dist_Laguardia_Airport,dropoff_dist_Newark_Airport
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1,2009,6,15,0,17,0.009436,0.10134,0.055043,0.337147,0.09271,0.064326,0.339123
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1,2010,1,5,1,16,0.079696,0.245731,0.157402,0.16533,0.242961,0.109925,0.220812
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2,2011,8,18,3,0,0.013674,0.234714,0.113076,0.209742,0.23705,0.12279,0.198236
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1,2012,4,21,5,4,0.02534,0.225895,0.122792,0.197636,0.240846,0.122149,0.200358
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1,2010,3,9,1,7,0.01947,0.225847,0.098115,0.225807,0.225878,0.087741,0.242228


-----

# Deep feedforward networks

We'll train one in Keras for a regression task

## Model architecture

We shall use a deeper model to account for the additional complexity. The deep feedforward network will have four hidden layers. The first hidden layer will have 128 nodes, and each successive hidden layer is narrower than its predecessor (64, 32, and then 8 nodes). This neural network size is a good starting point for us, and it should not take too long to train. A general rule of thumb is to start with a small neural network and only increase its complexity as required.

![Imgur](https://imgur.com/5hXrZTV.png)

## Loss functions for regression problems

In regression, the **root mean squared error (RMSE)** is often used as the error metric.

The formula for RMSE is as follows:

$$ \textrm{RMSE} = \sqrt{\frac{1}{N}\sum_{i=1}^{N}\left(\textrm{prediction}_i-\textrm{actual}_i\right)^2} $$


## Model building in Python using keras

Now, let's implement our model architecture in Keras. 

First, split the DataFrame into the training features (`X`) and the target variable that we're trying to predict (`y`):

In [9]:
# Target: the fare column. Features: every remaining column.
y = df['fare_amount']
X = df.drop(columns=['fare_amount'])

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the rows for testing. Fixing random_state makes the
# train/test partition identical on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Next, let's build our `Sequential` model in keras according to the neural network architecture we outlined earlier:


In [16]:
from keras.models import Sequential
from keras.layers import Dense

In [17]:
# Four ReLU hidden layers (128 -> 64 -> 32 -> 8 units) followed by a single
# output unit with no activation, sized to the number of feature columns.
model = Sequential([
    Dense(128, activation='relu', input_dim=X_train.shape[1]),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1),
])

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               2304      
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 8)                 264       
                                                                 
 dense_4 (Dense)             (None, 1)                 9         
                                                                 
Total params: 12,913
Trainable params: 12,913
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Optimise mean squared error with Adam; MSE is also reported as a metric.
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mse'],
)

In [87]:
# Train for 20 epochs on the training split; batch size is left at its default.
model.fit(x=X_train, y=y_train, epochs=20)

Epoch 1/20
   30/12086 [..............................] - ETA: 42s - loss: 10.3740 - mse: 10.3740

2023-06-22 11:53:33.803192: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 52598000 exceeds 10% of free system memory.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f19786af2b0>

In [88]:
def predict_random(df_prescaled, X_test, model, random_state=None):
    """Print the actual and predicted fare for one randomly drawn test trip.

    Parameters
    ----------
    df_prescaled : pandas.DataFrame
        Frame sharing its index labels with ``X_test``. The columns
        ``fare_amount``, ``day_of_week`` and ``hour`` are read from it for
        the drawn row. ``day_of_week`` is used as a position in a
        Monday-first list of day names.
    X_test : pandas.DataFrame
        Rows to sample from. The drawn single-row frame is passed to
        ``model.predict`` unchanged.
    model : object
        Any object with a ``predict(frame)`` method whose result can be
        indexed as ``[0][0]`` to obtain the predicted fare.
    random_state : optional
        Forwarded to ``DataFrame.sample``. Leave as ``None`` for a fresh
        random draw on each call, or pass a seed to reproduce a draw.

    Returns
    -------
    None
        The trip summary is written to standard output.
    """
    day_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
                 'Saturday', 'Sunday')

    # Let pandas own the randomness instead of funnelling it through an
    # intermediate integer seed, which limited the draw to 10,000 outcomes.
    sample = X_test.sample(n=1, random_state=random_state)
    idx = sample.index[0]

    # Look each value up individually: selecting the whole row would upcast
    # the integer day/hour columns to float.
    actual_fare = df_prescaled.loc[idx, 'fare_amount']
    day_of_week = day_names[df_prescaled.loc[idx, 'day_of_week']]
    hour = df_prescaled.loc[idx, 'hour']

    predicted_fare = model.predict(sample)[0][0]
    # For a single observation the RMSE is simply the absolute error.
    rmse = abs(predicted_fare - actual_fare)

    print(f'Trip Details: {day_of_week}, {hour}:00 hrs')
    print(f'Actual fare: ${actual_fare: .2f}')
    print(f'Predicted fare: ${predicted_fare: .2f}')
    print(f'RMSE: ${rmse: .2f}')

In [89]:
predict_random(df_prescaled, X_test, model)

Trip Details: Thursday, 6:00 hrs
Actual fare: $ 4.90
Predicted fare: $  6.01
RMSE: $ 1.11


In [90]:
predict_random(df_prescaled, X_test, model)

Trip Details: Wednesday, 23:00 hrs
Actual fare: $ 10.50
Predicted fare: $  10.31
RMSE: $ 0.19


In [91]:
predict_random(df_prescaled, X_test, model)

Trip Details: Wednesday, 0:00 hrs
Actual fare: $ 37.00
Predicted fare: $  28.67
RMSE: $ 8.33


In [92]:
predict_random(df_prescaled, X_test, model)

Trip Details: Friday, 1:00 hrs
Actual fare: $ 8.50
Predicted fare: $  8.42
RMSE: $ 0.08


In [93]:
predict_random(df_prescaled, X_test, model)

Trip Details: Sunday, 1:00 hrs
Actual fare: $ 8.90
Predicted fare: $  8.07
RMSE: $ 0.83


In [94]:
predict_random(df_prescaled, X_test, model)

Trip Details: Monday, 22:00 hrs
Actual fare: $ 6.10
Predicted fare: $  6.27
RMSE: $ 0.17


In [95]:
predict_random(df_prescaled, X_test, model)

Trip Details: Thursday, 21:00 hrs
Actual fare: $ 17.70
Predicted fare: $  20.15
RMSE: $ 2.45


In [96]:
predict_random(df_prescaled, X_test, model)

Trip Details: Sunday, 21:00 hrs
Actual fare: $ 6.50
Predicted fare: $  4.96
RMSE: $ 1.54


In [97]:
from sklearn.metrics import mean_squared_error

# Predict on both splits, then reduce each to one figure: RMSE = sqrt(MSE).
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))

print(f"Train RMSE: {train_rmse:0.2f}")
print(f"Test RMSE: {test_rmse:0.2f}")

Train RMSE: 3.22
Test RMSE: 3.24
