<a href="https://colab.research.google.com/github/euguroglu/Machine-Learning-Projects/blob/master/Pytorch_Neural_Network_New_York_City_Taxi_Fare_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing libraries

In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing data and visualization

In [4]:
df = pd.read_csv('NYCTaxiFares.csv')

In [5]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


In [6]:
df['fare_amount'].describe()

count    120000.000000
mean         10.040326
std           7.500134
min           2.500000
25%           5.700000
50%           7.700000
75%          11.300000
max          49.900000
Name: fare_amount, dtype: float64

In [7]:
df.describe()

Unnamed: 0,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0
mean,10.040326,0.333333,-73.976626,40.751443,-73.974501,40.751695,1.347167
std,7.500134,0.471406,0.031497,0.025821,0.032419,0.030279,0.759263
min,2.5,0.0,-74.465447,40.121653,-74.443323,40.164927,1.0
25%,5.7,0.0,-73.992386,40.736594,-73.991478,40.735914,1.0
50%,7.7,0.0,-73.982084,40.753661,-73.980411,40.754441,1.0
75%,11.3,1.0,-73.96871,40.76802,-73.9655,40.76888,1.0
max,49.9,1.0,-73.311845,40.981292,-73.49614,40.993498,5.0


## Feature engineering

The longitude and latitude values all lie within a very narrow range, so on their own they are unlikely to be informative features. Instead, we use them to compute the distance between the pickup and drop-off locations with the haversine formula: https://en.wikipedia.org/wiki/Haversine_formula

1)Distance calculation

In [8]:
def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Great-circle (haversine) distance, in kilometers, between two GPS points per row.

    df    : DataFrame holding the coordinate columns (values in degrees)
    lat1  : name of the column with the first point's latitude
    long1 : name of the column with the first point's longitude
    lat2  : name of the column with the second point's latitude
    long2 : name of the column with the second point's longitude

    Returns the row-wise distances, computed with a mean Earth radius of 6371 km.
    """
    earth_radius_km = 6371

    # Latitudes of both endpoints in radians
    lat_start = np.radians(df[lat1])
    lat_end = np.radians(df[lat2])

    # Half of the latitude / longitude differences, in radians
    half_dlat = np.radians(df[lat2] - df[lat1]) / 2
    half_dlon = np.radians(df[long2] - df[long1]) / 2

    # Haversine of the central angle between the two points
    hav = np.sin(half_dlat)**2 + np.cos(lat_start) * np.cos(lat_end) * np.sin(half_dlon)**2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))

    return earth_radius_km * central_angle

In [9]:
df['dist_km'] = haversine_distance(df,'pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude')

In [10]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   pickup_datetime    120000 non-null  object 
 1   fare_amount        120000 non-null  float64
 2   fare_class         120000 non-null  int64  
 3   pickup_longitude   120000 non-null  float64
 4   pickup_latitude    120000 non-null  float64
 5   dropoff_longitude  120000 non-null  float64
 6   dropoff_latitude   120000 non-null  float64
 7   passenger_count    120000 non-null  int64  
 8   dist_km            120000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 8.2+ MB


2)Datetime calculation

In [12]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   pickup_datetime    120000 non-null  datetime64[ns, UTC]
 1   fare_amount        120000 non-null  float64            
 2   fare_class         120000 non-null  int64              
 3   pickup_longitude   120000 non-null  float64            
 4   pickup_latitude    120000 non-null  float64            
 5   dropoff_longitude  120000 non-null  float64            
 6   dropoff_latitude   120000 non-null  float64            
 7   passenger_count    120000 non-null  int64              
 8   dist_km            120000 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(6), int64(2)
memory usage: 8.2 MB


In [14]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321


In [15]:
my_time = df['pickup_datetime'][0]

In [16]:
my_time.hour

8

In [17]:
# Shift the UTC pickup timestamps back 4 hours to approximate New York local time.
# NOTE(review): a fixed UTC-4 offset matches Eastern Daylight Time only; confirm that
# every row falls inside the daylight-saving period, otherwise 'Hour' is off by one.
df['EDTdate'] = df['pickup_datetime'] - pd.Timedelta(hours=4)

3)Hour and am-pm calculation

In [18]:
df['Hour'] = df['EDTdate'].dt.hour

In [19]:
df['AMorPM'] = np.where(df['Hour']<12,'am','pm')

In [20]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56+00:00,4,am
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53+00:00,11,am
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26+00:00,7,am
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03+00:00,17,pm
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01+00:00,22,pm


4)Weekday or weekend calculation

In [21]:
df['Weekday'] = df['EDTdate'].dt.strftime('%a')

In [22]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM,Weekday
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56+00:00,4,am,Mon
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53+00:00,11,am,Sat
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26+00:00,7,am,Sat
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03+00:00,17,pm,Sun
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01+00:00,22,pm,Fri


In [23]:
cat_cols = ['Hour','AMorPM','Weekday']
cont_cols = ['pickup_longitude','pickup_latitude', 'dropoff_longitude', 'dropoff_latitude','passenger_count', 'dist_km']

In [24]:
y_col = ['fare_amount']

In [25]:
df.dtypes

pickup_datetime      datetime64[ns, UTC]
fare_amount                      float64
fare_class                         int64
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dist_km                          float64
EDTdate              datetime64[ns, UTC]
Hour                               int64
AMorPM                            object
Weekday                           object
dtype: object

In [26]:
for cat in cat_cols:
  df[cat] = df[cat].astype('category')

In [27]:
df.dtypes

pickup_datetime      datetime64[ns, UTC]
fare_amount                      float64
fare_class                         int64
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dist_km                          float64
EDTdate              datetime64[ns, UTC]
Hour                            category
AMorPM                          category
Weekday                         category
dtype: object

In [28]:
df['Hour'].head()

0     4
1    11
2     7
3    17
4    22
Name: Hour, dtype: category
Categories (24, int64): [0, 1, 2, 3, ..., 20, 21, 22, 23]

In [29]:
df['AMorPM'].head()

0    am
1    am
2    am
3    pm
4    pm
Name: AMorPM, dtype: category
Categories (2, object): [am, pm]

In [30]:
df['Weekday'].head()

0    Mon
1    Sat
2    Sat
3    Sun
4    Fri
Name: Weekday, dtype: category
Categories (7, object): [Fri, Mon, Sat, Sun, Thu, Tue, Wed]

In [31]:
df['AMorPM'].cat.categories

Index(['am', 'pm'], dtype='object')

In [32]:
df['AMorPM'].cat.codes

0         0
1         0
2         0
3         1
4         1
         ..
119995    0
119996    0
119997    1
119998    0
119999    1
Length: 120000, dtype: int8

In [33]:
df['Weekday'].cat.categories

Index(['Fri', 'Mon', 'Sat', 'Sun', 'Thu', 'Tue', 'Wed'], dtype='object')

In [34]:
df['Weekday'].cat.codes

0         1
1         2
2         2
3         3
4         0
         ..
119995    3
119996    0
119997    3
119998    5
119999    2
Length: 120000, dtype: int8

We can easily convert those category codes into a NumPy array using the code below.

In [35]:
df['Weekday'].cat.codes.values #it is important to change them into array when we convert them into tensors

array([1, 2, 2, ..., 3, 5, 2], dtype=int8)

In [36]:
hr = df['Hour'].cat.codes.values
ampm = df['AMorPM'].cat.codes.values
wkdy = df['Weekday'].cat.codes.values

In [37]:
wkdy

array([1, 2, 2, ..., 3, 5, 2], dtype=int8)

In [38]:
cats = np.stack([hr,ampm,wkdy],axis=1)

In [39]:
cats

array([[ 4,  0,  1],
       [11,  0,  2],
       [ 7,  0,  2],
       ...,
       [14,  1,  3],
       [ 4,  0,  5],
       [12,  1,  2]], dtype=int8)

In [None]:
# An alternative, more compact way to build the same array:
# cats = np.stack([df[col].cat.codes.values for col in cat_cols],1)

## Converting data sets to tensor array

In [40]:
cats = torch.LongTensor(cats)

In [41]:
conts = np.stack([df[col].values for col in cont_cols],1)

In [42]:
conts

array([[-73.992365  ,  40.730521  , -73.975499  ,  40.744746  ,
          1.        ,   2.12631159],
       [-73.990078  ,  40.740558  , -73.974232  ,  40.744114  ,
          1.        ,   1.39230687],
       [-73.994149  ,  40.751118  , -73.960064  ,  40.766235  ,
          2.        ,   3.32676344],
       ...,
       [-73.988574  ,  40.749772  , -74.011541  ,  40.707799  ,
          3.        ,   5.05252282],
       [-74.004449  ,  40.724529  , -73.992697  ,  40.730765  ,
          1.        ,   1.20892296],
       [-73.955415  ,  40.77192   , -73.967623  ,  40.763015  ,
          3.        ,   1.42739869]])

In [43]:
conts = torch.FloatTensor(conts)

In [44]:
conts

tensor([[-73.9924,  40.7305, -73.9755,  40.7447,   1.0000,   2.1263],
        [-73.9901,  40.7406, -73.9742,  40.7441,   1.0000,   1.3923],
        [-73.9941,  40.7511, -73.9601,  40.7662,   2.0000,   3.3268],
        ...,
        [-73.9886,  40.7498, -74.0115,  40.7078,   3.0000,   5.0525],
        [-74.0044,  40.7245, -73.9927,  40.7308,   1.0000,   1.2089],
        [-73.9554,  40.7719, -73.9676,  40.7630,   3.0000,   1.4274]])

In [45]:
y = torch.FloatTensor(df[y_col].values)

In [46]:
cats.shape

torch.Size([120000, 3])

In [47]:
conts.shape

torch.Size([120000, 6])

In [48]:
y.shape

torch.Size([120000, 1])

In [49]:
cat_szs = [len(df[col].cat.categories)for col in cat_cols]

In [50]:
cat_szs

[24, 2, 7]

## General rule for sizing embedding layers

In [51]:
emb_szs = [(size,min(50,(size+1)//2)) for size in cat_szs]

In [52]:
emb_szs

[(24, 12), (2, 1), (7, 4)]

In [53]:
cats

tensor([[ 4,  0,  1],
        [11,  0,  2],
        [ 7,  0,  2],
        ...,
        [14,  1,  3],
        [ 4,  0,  5],
        [12,  1,  2]])

In [54]:
catz = cats[:2]

In [55]:
catz

tensor([[ 4,  0,  1],
        [11,  0,  2]])

## Demonstration of embedding layer

In [56]:
selfembeds = nn.ModuleList([nn.Embedding(ni,nf) for ni,nf in emb_szs])

In [57]:
selfembeds

ModuleList(
  (0): Embedding(24, 12)
  (1): Embedding(2, 1)
  (2): Embedding(7, 4)
)

In [58]:
#Forward method
embeddingz = []
for i,e in enumerate(selfembeds):
  embeddingz.append(e(catz[:,i]))

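Each categorical column now has its own dense embedding tensor (a learned alternative to one-hot encoding), but the tensors still need to be concatenated into a single tensor.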

In [59]:
embeddingz

[tensor([[-0.2231,  0.2582,  0.7847,  1.2335, -0.5773,  2.2784,  2.5977,  0.1996,
           1.0024,  1.4496,  0.0629, -1.9375],
         [-1.2090,  0.7845,  0.7020,  0.4207, -1.0168,  0.2076, -0.9185, -0.0316,
           1.6519, -0.6008,  0.0722,  2.4305]], grad_fn=<EmbeddingBackward>),
 tensor([[0.7984],
         [0.7984]], grad_fn=<EmbeddingBackward>),
 tensor([[ 0.0909,  0.5444, -0.7376,  0.9686],
         [ 1.0283, -1.8250,  0.1308,  0.5596]], grad_fn=<EmbeddingBackward>)]

In [60]:
z = torch.cat(embeddingz,1)

In [61]:
z

tensor([[-0.2231,  0.2582,  0.7847,  1.2335, -0.5773,  2.2784,  2.5977,  0.1996,
          1.0024,  1.4496,  0.0629, -1.9375,  0.7984,  0.0909,  0.5444, -0.7376,
          0.9686],
        [-1.2090,  0.7845,  0.7020,  0.4207, -1.0168,  0.2076, -0.9185, -0.0316,
          1.6519, -0.6008,  0.0722,  2.4305,  0.7984,  1.0283, -1.8250,  0.1308,
          0.5596]], grad_fn=<CatBackward>)

Next we apply dropout to the concatenated embeddings. During training, dropout randomly zeroes a fraction of the values (and rescales the remaining ones), which helps to reduce overfitting.

In [62]:
selfembedrop = nn.Dropout(0.4)

In [63]:
z = selfembedrop(z)

In [64]:
z

tensor([[-0.3719,  0.0000,  0.0000,  0.0000, -0.9622,  3.7974,  0.0000,  0.3326,
          1.6706,  2.4160,  0.0000, -0.0000,  1.3307,  0.0000,  0.0000, -1.2294,
          1.6143],
        [-2.0149,  0.0000,  1.1699,  0.0000, -1.6946,  0.3460, -1.5309, -0.0527,
          2.7532, -0.0000,  0.0000,  4.0508,  1.3307,  1.7138, -0.0000,  0.2180,
          0.9326]], grad_fn=<MulBackward0>)

## Neural Networks

In [65]:
class TabularModel(nn.Module):
  """Feed-forward network for tabular data with categorical and continuous inputs.

  Every categorical column gets its own embedding. The embeddings are
  concatenated, passed through dropout, joined with the batch-normalised
  continuous features and fed through a stack of
  Linear -> ReLU -> BatchNorm1d -> Dropout blocks, followed by one final
  Linear output layer.

  Args:
    emb_szs: iterable of (number_of_categories, embedding_dim) pairs, one per
      categorical column, in the same order as the columns of ``x_cat``.
    n_cont: number of continuous features (columns of ``x_cont``).
    out_sz: number of output units.
    layers: sizes of the hidden layers, in order. May be empty, in which case
      the inputs are connected directly to the output layer.
    p: dropout probability used after the embeddings and after every hidden layer.
  """
  def __init__(self,emb_szs,n_cont,out_sz,layers,p=0.5):
    super().__init__()

    # emb_szs is iterated twice below, so materialise it once (also supports generators)
    emb_szs = list(emb_szs)

    self.embeds = nn.ModuleList([nn.Embedding(ni,nf) for ni,nf in emb_szs]) # one embedding table per categorical column
    self.emb_drop = nn.Dropout(p) # dropout applied to the concatenated embeddings
    self.bn_cont = nn.BatchNorm1d(n_cont) # normalises the continuous features

    n_embs = sum(nf for _,nf in emb_szs) # total width of all embedding vectors
    n_in = n_embs + n_cont # width of the first Linear layer's input

    layerlist = []
    for width in layers:
      layerlist.append(nn.Linear(n_in,width))
      layerlist.append(nn.ReLU(inplace=True))
      layerlist.append(nn.BatchNorm1d(width))
      layerlist.append(nn.Dropout(p))
      n_in = width # the next layer consumes this layer's output

    # n_in equals layers[-1] when hidden layers exist and the raw input width
    # otherwise, so an empty `layers` list no longer raises IndexError.
    layerlist.append(nn.Linear(n_in,out_sz))

    self.layers = nn.Sequential(*layerlist)

  def forward(self,x_cat,x_cont):
    """Run a forward pass.

    Args:
      x_cat: integer tensor whose column ``i`` holds the category codes for
        the ``i``-th embedding.
      x_cont: float tensor with ``n_cont`` continuous features per row.

    Returns:
      The output of the final Linear layer (``out_sz`` values per row).
    """
    # Look up each categorical column in its own embedding table
    embeddings = [emb(x_cat[:,idx]) for idx,emb in enumerate(self.embeds)]

    x = torch.cat(embeddings,1) # join the embeddings along the feature axis
    x = self.emb_drop(x)
    x_cont = self.bn_cont(x_cont)
    x = torch.cat([x,x_cont],1) # categorical embeddings followed by continuous features
    x = self.layers(x)
    return x
    

In [66]:
torch.manual_seed(33)

<torch._C.Generator at 0x7fcbf39550c0>

In [67]:
model = TabularModel(emb_szs,conts.shape[1],1,[200,100],p=0.4)

In [68]:
model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(24, 12)
    (1): Embedding(2, 1)
    (2): Embedding(7, 4)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=23, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=1, bias=True)
  )
)

Defining cost function and optimization method

In [69]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)

In [70]:
# NOTE: despite its name, batch_size is the total number of rows used from the dataset
# (train + test), not a mini-batch size.
batch_size = 60000
# Number of those rows held out for testing (20%)
test_size = int(batch_size*0.2)

Train test data split

In [71]:
# Positional split of the first `batch_size` rows: the leading (batch_size - test_size)
# rows become the training set and the following `test_size` rows the test set.
# No shuffling is done here, so the split follows the row order of the data.
cat_train = cats[:batch_size-test_size]
cat_test = cats[batch_size-test_size:batch_size]
con_train = conts[:batch_size-test_size]
con_test = conts[batch_size-test_size:batch_size]

In [72]:
# Targets are sliced with the same boundaries as the features so rows stay aligned
y_train = y[:batch_size-test_size]
y_test =  y[batch_size-test_size:batch_size]

In [73]:
len(cat_train)

48000

In [74]:
len(cat_test)

12000

## Training

In [None]:
import time

start_time = time.time()

epochs = 300

# One RMSE value per epoch, stored as plain Python floats so that the list can be
# plotted directly and no autograd graph is kept alive between epochs.
losses = []

# Make sure dropout and batch-norm are in training mode (matters when this cell is
# re-run after the model has been switched to evaluation mode).
model.train()

for i in range(1, epochs + 1):
  # Full-batch forward pass over the whole training set
  y_pred = model(cat_train,con_train)
  loss = torch.sqrt(criterion(y_pred,y_train))  # RMSE
  losses.append(loss.item())

  # Report the first epoch and then every 10th one (1, 11, 21, ...)
  if i%10 == 1:
    print('Epoch: {} and Loss: {}'.format(i,losses[-1]))
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

duration = time.time() - start_time
print('Training took {} minutes'.format(duration/60))

## Convergence observation

In [None]:
# Convert to plain floats first: the recorded losses may be tensors attached to the
# autograd graph, which matplotlib cannot convert on recent PyTorch versions.
loss_values = [float(l) for l in losses]
# Derive the x-axis from the recorded values so both sequences always match in length
plt.plot(range(1, len(loss_values) + 1), loss_values)
plt.xlabel('Epoch')
plt.ylabel('RMSE loss')
plt.title('Training loss per epoch');

## Evaluation using test set

In [None]:
# Switch to evaluation mode: dropout is disabled and batch-norm uses its running
# statistics, so the test predictions are deterministic and not distorted by
# training-time regularisation.
model.eval()

# Gradients are not needed for evaluation
with torch.no_grad():
    y_val = model(cat_test,con_test)

    # RMSE on the held-out test set
    loss = torch.sqrt(criterion(y_val,y_test))

In [None]:
loss

## Prediction

In [None]:
# Show the first 10 test predictions next to their true fare amounts
for i in range(10):
    
    print('{}.) Predicted:: {:8.2f} True: {:8.2f}'.format(i,y_val[i].item(),y_test[i].item()))

In [None]:
# Persist only the learned parameters (state_dict), not the model object itself;
# to reload, build a TabularModel with the same arguments and call load_state_dict.
torch.save(model.state_dict(),'TaxiModel.pt')