In [2]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('./PYTORCH_NOTEBOOKS/Data/NYCTaxiFares.csv')

In [4]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


In [5]:
df['fare_amount'].describe()

count    120000.000000
mean         10.040326
std           7.500134
min           2.500000
25%           5.700000
50%           7.700000
75%          11.300000
max          49.900000
Name: fare_amount, dtype: float64

In [6]:
def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Calculates the haversine distance between 2 sets of GPS coordinates in df
    """
    r = 6371  # average radius of Earth in kilometers
       
    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])
    
    delta_phi = np.radians(df[lat2]-df[lat1])
    delta_lambda = np.radians(df[long2]-df[long1])
     
    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = (r * c) # in kilometers

    return d

In [7]:
df['dist_km'] = haversine_distance(df, 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

In [8]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   pickup_datetime    120000 non-null  object 
 1   fare_amount        120000 non-null  float64
 2   fare_class         120000 non-null  int64  
 3   pickup_longitude   120000 non-null  float64
 4   pickup_latitude    120000 non-null  float64
 5   dropoff_longitude  120000 non-null  float64
 6   dropoff_latitude   120000 non-null  float64
 7   passenger_count    120000 non-null  int64  
 8   dist_km            120000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 8.2+ MB


In [10]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   pickup_datetime    120000 non-null  datetime64[ns, UTC]
 1   fare_amount        120000 non-null  float64            
 2   fare_class         120000 non-null  int64              
 3   pickup_longitude   120000 non-null  float64            
 4   pickup_latitude    120000 non-null  float64            
 5   dropoff_longitude  120000 non-null  float64            
 6   dropoff_latitude   120000 non-null  float64            
 7   passenger_count    120000 non-null  int64              
 8   dist_km            120000 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(6), int64(2)
memory usage: 8.2 MB


In [12]:
my_time = df['pickup_datetime'][0]
df['EDTdate'] = df['pickup_datetime'] - pd.Timedelta(hours=4)

In [13]:
df['Hour'] = df['EDTdate'].dt.hour

In [14]:
df['AMorPM'] = np.where(df['Hour'] < 12, 'am', 'pm')

In [15]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56+00:00,4,am
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53+00:00,11,am
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26+00:00,7,am
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03+00:00,17,pm
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01+00:00,22,pm


In [16]:

df['Weekday'] = df['EDTdate'].dt.strftime("%a")

In [17]:
df.columns

Index(['pickup_datetime', 'fare_amount', 'fare_class', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'dist_km', 'EDTdate', 'Hour', 'AMorPM', 'Weekday'],
      dtype='object')

In [18]:
cat_cols = ['Hour', 'AMorPM', 'Weekday']
cont_cols = ['pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'dist_km']

In [19]:
y_col = ['fare_amount']

In [20]:
df.dtypes

pickup_datetime      datetime64[ns, UTC]
fare_amount                      float64
fare_class                         int64
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dist_km                          float64
EDTdate              datetime64[ns, UTC]
Hour                               int32
AMorPM                            object
Weekday                           object
dtype: object

In [21]:
for cat in cat_cols:
    df[cat] = df[cat].astype('category')

In [22]:
df.dtypes

pickup_datetime      datetime64[ns, UTC]
fare_amount                      float64
fare_class                         int64
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dist_km                          float64
EDTdate              datetime64[ns, UTC]
Hour                            category
AMorPM                          category
Weekday                         category
dtype: object

In [23]:
df['Hour'].head()

0     4
1    11
2     7
3    17
4    22
Name: Hour, dtype: category
Categories (24, int32): [0, 1, 2, 3, ..., 20, 21, 22, 23]

In [24]:
df['AMorPM'].head()

0    am
1    am
2    am
3    pm
4    pm
Name: AMorPM, dtype: category
Categories (2, object): ['am', 'pm']

In [25]:
df['Weekday'].head()

0    Mon
1    Sat
2    Sat
3    Sun
4    Fri
Name: Weekday, dtype: category
Categories (7, object): ['Fri', 'Mon', 'Sat', 'Sun', 'Thu', 'Tue', 'Wed']

In [26]:
df['Weekday'].cat.codes.values 

array([1, 2, 2, ..., 3, 5, 2], dtype=int8)

In [27]:
hr = df['Hour'].cat.codes.values
ampm = df['AMorPM'].cat.codes.values
wkdy = df['Weekday'].cat.codes.values

In [28]:
cats = np.stack([hr, ampm, wkdy], axis = 1)

In [29]:
cats

array([[ 4,  0,  1],
       [11,  0,  2],
       [ 7,  0,  2],
       ...,
       [14,  1,  3],
       [ 4,  0,  5],
       [12,  1,  2]], dtype=int8)

In [30]:
cats = torch.tensor(cats, dtype=torch.int64)

In [31]:
conts = np.stack([df[col].values for col in cont_cols], axis=1)

In [32]:
conts

array([[-73.992365  ,  40.730521  , -73.975499  ,  40.744746  ,
          1.        ,   2.12631159],
       [-73.990078  ,  40.740558  , -73.974232  ,  40.744114  ,
          1.        ,   1.39230687],
       [-73.994149  ,  40.751118  , -73.960064  ,  40.766235  ,
          2.        ,   3.32676344],
       ...,
       [-73.988574  ,  40.749772  , -74.011541  ,  40.707799  ,
          3.        ,   5.05252282],
       [-74.004449  ,  40.724529  , -73.992697  ,  40.730765  ,
          1.        ,   1.20892296],
       [-73.955415  ,  40.77192   , -73.967623  ,  40.763015  ,
          3.        ,   1.42739869]])

In [33]:
conts = torch.tensor(conts, dtype=torch.float)

In [34]:
conts

tensor([[-73.9924,  40.7305, -73.9755,  40.7447,   1.0000,   2.1263],
        [-73.9901,  40.7406, -73.9742,  40.7441,   1.0000,   1.3923],
        [-73.9941,  40.7511, -73.9601,  40.7662,   2.0000,   3.3268],
        ...,
        [-73.9886,  40.7498, -74.0115,  40.7078,   3.0000,   5.0525],
        [-74.0044,  40.7245, -73.9927,  40.7308,   1.0000,   1.2089],
        [-73.9554,  40.7719, -73.9676,  40.7630,   3.0000,   1.4274]])

In [41]:
y = torch.tensor(df[y_col].values, dtype=torch.float) #.reshape(-1, 1)

In [42]:
cats.shape

torch.Size([120000, 3])

In [43]:
conts.shape

torch.Size([120000, 6])

In [44]:
y.shape

torch.Size([120000, 1])

In [45]:
cat_szs = [len(df[col].cat.categories) for col in cat_cols]

In [46]:
cat_szs

[24, 2, 7]

In [47]:
emb_szs = [(size, min(50, (size+1)//2)) for size in cat_szs]

In [48]:
emb_szs

[(24, 12), (2, 1), (7, 4)]

In [50]:
catz = cats[:4]

In [51]:
catz

tensor([[ 4,  0,  1],
        [11,  0,  2],
        [ 7,  0,  2],
        [17,  1,  3]])

In [54]:
selfembeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])

In [55]:
selfembeds

ModuleList(
  (0): Embedding(24, 12)
  (1): Embedding(2, 1)
  (2): Embedding(7, 4)
)

In [56]:
embeddingz = []

for i, e in enumerate(selfembeds):
    embeddingz.append(e(catz[:, i]))

In [57]:
embeddingz

[tensor([[-0.8364, -1.2074, -0.5773,  1.8038, -0.1008, -1.5992,  0.7422, -0.0498,
           0.1372, -0.1886, -0.1805,  0.0930],
         [-0.7978, -1.8534, -0.2289,  2.0801, -0.1855,  0.4218,  0.0442,  0.3258,
           0.5471, -0.6554, -0.1237,  1.4257],
         [ 1.1524, -1.4303, -0.2547, -0.2135, -1.9611,  0.2747, -0.8877,  2.3432,
          -0.6827, -1.7206, -0.0618, -1.7225],
         [ 0.1265, -0.3097,  0.6724,  0.6482, -0.2860, -0.3807,  0.3492,  0.6825,
          -1.0482, -0.8275, -0.0144,  0.0623]], grad_fn=<EmbeddingBackward0>),
 tensor([[ 0.7970],
         [ 0.7970],
         [ 0.7970],
         [-1.0884]], grad_fn=<EmbeddingBackward0>),
 tensor([[-0.1464, -0.5839, -0.4923,  1.8152],
         [-0.6657, -0.9739,  0.7996, -1.7298],
         [-0.6657, -0.9739,  0.7996, -1.7298],
         [ 1.0038, -1.3796,  0.2203, -0.2154]], grad_fn=<EmbeddingBackward0>)]

In [58]:
z = torch.cat(embeddingz, 1)

In [59]:
z

tensor([[-0.8364, -1.2074, -0.5773,  1.8038, -0.1008, -1.5992,  0.7422, -0.0498,
          0.1372, -0.1886, -0.1805,  0.0930,  0.7970, -0.1464, -0.5839, -0.4923,
          1.8152],
        [-0.7978, -1.8534, -0.2289,  2.0801, -0.1855,  0.4218,  0.0442,  0.3258,
          0.5471, -0.6554, -0.1237,  1.4257,  0.7970, -0.6657, -0.9739,  0.7996,
         -1.7298],
        [ 1.1524, -1.4303, -0.2547, -0.2135, -1.9611,  0.2747, -0.8877,  2.3432,
         -0.6827, -1.7206, -0.0618, -1.7225,  0.7970, -0.6657, -0.9739,  0.7996,
         -1.7298],
        [ 0.1265, -0.3097,  0.6724,  0.6482, -0.2860, -0.3807,  0.3492,  0.6825,
         -1.0482, -0.8275, -0.0144,  0.0623, -1.0884,  1.0038, -1.3796,  0.2203,
         -0.2154]], grad_fn=<CatBackward0>)

In [60]:
selfembeddrop = nn.Dropout(0.4)

In [61]:
z = selfembeddrop(z)

In [62]:
z

tensor([[-0.0000, -0.0000, -0.0000,  0.0000, -0.1680, -2.6653,  0.0000, -0.0000,
          0.2286, -0.0000, -0.3008,  0.1551,  1.3283, -0.0000, -0.0000, -0.0000,
          3.0254],
        [-0.0000, -3.0890, -0.3816,  0.0000, -0.3091,  0.7030,  0.0736,  0.0000,
          0.9118, -0.0000, -0.2062,  0.0000,  0.0000, -0.0000, -1.6232,  1.3326,
         -2.8830],
        [ 1.9206, -0.0000, -0.4244, -0.0000, -3.2686,  0.0000, -1.4795,  0.0000,
         -0.0000, -2.8676, -0.1030, -2.8708,  0.0000, -0.0000, -1.6232,  0.0000,
         -2.8830],
        [ 0.2109, -0.5162,  1.1207,  1.0803, -0.0000, -0.6344,  0.5820,  1.1374,
         -0.0000, -0.0000, -0.0240,  0.0000, -1.8140,  0.0000, -2.2993,  0.0000,
         -0.3590]], grad_fn=<MulBackward0>)

In [63]:
class TabularModel(nn.Module):
    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5) -> None:
        
        # layers = [200, 100, 50] -> 200 neurons in the 1st layer, then 100, and then 50.
        
        super().__init__()
        
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        
        layerlist = []
        n_emb = sum([nf for ni, nf in emb_szs])
        n_in = n_emb + n_cont 
        
        for i in layers: # num of neuron in each layers 
            layerlist.append(nn.Linear(n_in, i))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
        
        layerlist.append(nn.Linear(layers[-1], out_sz))
        
        self.layers = nn.Sequential(*layerlist)
    
    def forward(self, x_cat, x_cont):
        embeddings = []
        
        for i, e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:, i]))
        
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        return x 
        

In [64]:
torch.manual_seed(33)
model = TabularModel(emb_szs, conts.shape[1], 1, [200, 100], p=0.4)

In [65]:
model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(24, 12)
    (1): Embedding(2, 1)
    (2): Embedding(7, 4)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=23, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=1, bias=True)
  )
)

In [66]:
criterion = nn.MSELoss() # np.sqrt(MSE) --> RMSE
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [69]:
batch_size = 60000
test_size = int(batch_size * 0.2)

In [None]:
# Data shuffled already
cat_train = 
cat_test = 
con_train = 
con_test = 