<a href="https://colab.research.google.com/github/farazabir/nyc-taxi-fare-prediction/blob/main/nyc_taxi_fare_tabularmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Load the taxi fare dataset from a CSV file in the working directory.
df = pd.read_csv('taxifare.csv')

In [6]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


In [7]:
df['fare_amount'].describe()


Unnamed: 0,fare_amount
count,120000.0
mean,10.040326
std,7.500134
min,2.5
25%,5.7
50%,7.7
75%,11.3
max,49.9


In [8]:
def haversine_distance(df,lat1,long1,lat2,long2):
  """Great-circle distance between two coordinate columns, in kilometres.

  Applies the haversine formula element-wise to the rows of ``df``.

  Args:
    df: DataFrame holding the coordinate columns (values in degrees).
    lat1: Name of the column with the first point's latitude.
    long1: Name of the column with the first point's longitude.
    lat2: Name of the column with the second point's latitude.
    long2: Name of the column with the second point's longitude.

  Returns:
    One distance per row of ``df``, in kilometres.
  """
  earth_radius_km = 6371  # mean Earth radius

  start_lat, end_lat = df[lat1], df[lat2]

  # Coordinate differences are taken in degrees first, then converted.
  lat_diff = np.radians(end_lat - start_lat)
  lon_diff = np.radians(df[long2] - df[long1])

  lat_term = np.sin(lat_diff / 2) ** 2
  lon_term = np.sin(lon_diff / 2) ** 2
  cos_product = np.cos(np.radians(start_lat)) * np.cos(np.radians(end_lat))

  # hav is the haversine of the central angle between the two points.
  hav = lat_term + cos_product * lon_term
  central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))

  return earth_radius_km * central_angle

In [9]:
# Add the pickup-to-dropoff haversine distance (km) as a new feature column.
df['dist_km'] = haversine_distance(df,'pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude')

In [10]:
df.columns

Index(['pickup_datetime', 'fare_amount', 'fare_class', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'dist_km'],
      dtype='object')

In [11]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   pickup_datetime    120000 non-null  object 
 1   fare_amount        120000 non-null  float64
 2   fare_class         120000 non-null  int64  
 3   pickup_longitude   120000 non-null  float64
 4   pickup_latitude    120000 non-null  float64
 5   dropoff_longitude  120000 non-null  float64
 6   dropoff_latitude   120000 non-null  float64
 7   passenger_count    120000 non-null  int64  
 8   dist_km            120000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 8.2+ MB


In [13]:
# Parse the timestamp strings into datetimes; the strings carry a "UTC"
# suffix, so the resulting column is timezone-aware (UTC).
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [14]:
df['pickup_datetime']

Unnamed: 0,pickup_datetime
0,2010-04-19 08:17:56+00:00
1,2010-04-17 15:43:53+00:00
2,2010-04-17 11:23:26+00:00
3,2010-04-11 21:25:03+00:00
4,2010-04-17 02:19:01+00:00
...,...
119995,2010-04-18 14:33:03+00:00
119996,2010-04-23 10:27:48+00:00
119997,2010-04-18 18:50:40+00:00
119998,2010-04-13 08:14:44+00:00


In [15]:
# Pull out the first pickup timestamp as a standalone value.
my_time = df['pickup_datetime'][0]

In [16]:
# Shift the UTC timestamps back 4 hours to approximate New York local time
# (EDT = UTC-4). The column keeps its UTC timezone label after the shift.
# NOTE(review): a fixed offset ignores daylight-saving changes; the rows
# shown above are all from April 2010 -- confirm the whole dataset is in EDT.
df['EDTdate'] = df['pickup_datetime']-pd.Timedelta(hours=4)

In [17]:
# Hour of day (0-23) taken from the shifted timestamp.
df['Hour'] = df['EDTdate'].dt.hour

In [18]:
# Label each ride 'am' when the hour is before 12, otherwise 'pm'.
df['AMorPM'] = np.where(df['Hour']<12,'am','pm')

In [19]:
# Abbreviated weekday name (strftime %a) of the shifted timestamp.
df['Weekday'] = df['EDTdate'].dt.strftime("%a")

In [20]:
df.columns

Index(['pickup_datetime', 'fare_amount', 'fare_class', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'dist_km', 'EDTdate', 'Hour', 'AMorPM', 'Weekday'],
      dtype='object')

In [21]:
# Feature columns split by how they are encoded below: cat_cols become
# integer category codes, cont_cols are stacked as floating-point values.
cat_cols = ['Hour','AMorPM','Weekday']
cont_cols = ['pickup_longitude','pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'dist_km']

In [22]:
# Target column, kept as a one-element list so df[y_col] is a 2-D selection.
y_col = ['fare_amount']

In [23]:
df.dtypes

Unnamed: 0,0
pickup_datetime,"datetime64[ns, UTC]"
fare_amount,float64
fare_class,int64
pickup_longitude,float64
pickup_latitude,float64
dropoff_longitude,float64
dropoff_latitude,float64
passenger_count,int64
dist_km,float64
EDTdate,"datetime64[ns, UTC]"


In [24]:
# Convert each categorical column to pandas 'category' dtype so that its
# integer codes and category list are available through the .cat accessor.
for cat in cat_cols:
  df[cat] = df[cat].astype('category')

In [25]:
df['Hour'].head()

Unnamed: 0,Hour
0,4
1,11
2,7
3,17
4,22


In [26]:
# Integer category codes of each categorical feature, as NumPy arrays.
hr = df['Hour'].cat.codes.values
ampm = df['AMorPM'].cat.codes.values
wkdy = df['Weekday'].cat.codes.values

In [27]:
# Combine the three code arrays column-wise into one (n_rows, 3) array.
cats = np.stack([hr,ampm,wkdy],axis=1)

In [29]:
# Convert the codes to an int64 tensor; they are used as embedding indices below.
cats = torch.tensor(cats,dtype=torch.int64)

In [30]:
# Gather the continuous feature columns, in cont_cols order, into a single
# float32 tensor of shape (n_rows, len(cont_cols)).
consts = torch.tensor(df[cont_cols].values,dtype=torch.float)

In [32]:
# Target values as a float32 tensor reshaped to (n_rows, 1).
y = torch.tensor(df[y_col].values,dtype=torch.float).reshape(-1,1)

In [33]:
y.shape

torch.Size([120000, 1])

In [34]:
# Number of distinct categories in each categorical column.
cat_szs = [len(df[col].cat.categories) for col in cat_cols]

In [35]:
cat_szs

[24, 2, 7]

In [36]:
# One (num_categories, embedding_dim) pair per categorical column; the
# embedding dimension is half the category count, rounded up and capped at 50.
emb_szs = [(size,min(50,(size+1)//2)) for size in cat_szs]

In [39]:
# One nn.Embedding layer per categorical column, built outside the model class.
selfembeds = nn.ModuleList([nn.Embedding(ni,nf) for ni,nf in emb_szs])

In [40]:
# Look up each categorical column's codes in its matching embedding layer,
# producing one (n_rows, embedding_dim) tensor per column.
embeddingz = [emb(cats[:,col]) for col,emb in enumerate(selfembeds)]

In [41]:
embeddingz

[tensor([[ 0.7356, -0.7243, -0.6232,  ..., -0.4508,  1.6464, -0.4285],
         [-0.8992, -0.1459,  0.6386,  ..., -0.7581,  0.2009, -1.1376],
         [-1.4035,  1.6818, -0.2509,  ..., -0.2642, -1.3553,  1.1293],
         ...,
         [-0.6449,  0.1402, -1.4706,  ..., -0.6659, -1.1868, -0.3038],
         [ 0.7356, -0.7243, -0.6232,  ..., -0.4508,  1.6464, -0.4285],
         [ 1.4760,  0.9357, -0.5699,  ..., -0.0443,  1.7841, -0.3992]],
        grad_fn=<EmbeddingBackward0>),
 tensor([[-0.2743],
         [-0.2743],
         [-0.2743],
         ...,
         [ 2.5238],
         [-0.2743],
         [ 2.5238]], grad_fn=<EmbeddingBackward0>),
 tensor([[-0.0623, -1.8948,  1.5727,  1.0980],
         [ 0.6280, -0.2794, -1.4631, -1.0468],
         [ 0.6280, -0.2794, -1.4631, -1.0468],
         ...,
         [ 0.5355, -1.8416,  0.5863, -0.4094],
         [-0.2855,  0.4123, -0.6508,  0.9049],
         [ 0.6280, -0.2794, -1.4631, -1.0468]], grad_fn=<EmbeddingBackward0>)]

In [42]:
# Concatenate the per-column embeddings along dim 1 into a single tensor.
z = torch.cat(embeddingz,1)

In [43]:
# Dropout layer with drop probability 0.4.
selfembdrop = nn.Dropout(0.4)

In [45]:
class TabularModel(nn.Module):
  """Feed-forward network for tabular data with categorical and continuous inputs.

  Each categorical column is passed through its own embedding layer. The
  embeddings are concatenated, passed through dropout, joined with the
  batch-normalised continuous features, and fed through a stack of
  Linear -> ReLU -> BatchNorm1d -> Dropout blocks followed by a final
  Linear output layer.

  Args:
    emb_szs: Iterable of ``(num_categories, embedding_dim)`` pairs, one per
      categorical column, in the column order of ``x_cat``.
    n_cont: Number of continuous feature columns.
    out_sz: Number of output units.
    layers: Sizes of the hidden layers, in order. May be empty, in which
      case the model is a single Linear layer on the combined features.
    p: Dropout probability used after the embeddings and in every hidden block.
  """

  def __init__(self,emb_szs,n_cont,out_sz,layers,p=0.5):
    super().__init__()
    self.embeds = nn.ModuleList([nn.Embedding(ni,nf) for ni,nf in emb_szs])
    self.emb_drop = nn.Dropout(p)
    self.bn_cont = nn.BatchNorm1d(n_cont)

    # Width of the first Linear layer: all embedding dims plus continuous columns.
    n_emb = sum(nf for _,nf in emb_szs)
    n_in = n_emb + n_cont

    layerlist = []
    for n_out in layers:
      layerlist.append(nn.Linear(n_in,n_out))
      layerlist.append(nn.ReLU(inplace=True))
      layerlist.append(nn.BatchNorm1d(n_out))
      layerlist.append(nn.Dropout(p))
      n_in = n_out

    # n_in now holds the width of the last hidden layer, or the raw input
    # width when `layers` is empty, so the output layer is always sized correctly.
    layerlist.append(nn.Linear(n_in,out_sz))

    self.layers = nn.Sequential(*layerlist)

  def forward(self,x_cat,x_cont):
    """Run a forward pass.

    Args:
      x_cat: Integer tensor of category codes, one column per embedding layer.
      x_cont: Float tensor of continuous features with ``n_cont`` columns.

    Returns:
      Tensor with ``out_sz`` columns, one row per input row.
    """
    # Column i of x_cat is looked up in the i-th embedding layer.
    embeddings = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]

    x = torch.cat(embeddings,1)
    x = self.emb_drop(x)
    x_cont = self.bn_cont(x_cont)

    x = torch.cat([x,x_cont],1)
    return self.layers(x)


