In [2]:
pip install torch

Collecting torch
  Downloading torch-1.12.0-cp39-cp39-win_amd64.whl (161.8 MB)
Installing collected packages: torch
Successfully installed torch-1.12.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
from torch import nn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plot

In [4]:
dataset = pd.read_csv("Data/NYCTaxiFares.csv")   # Smaller subset of actual data to allow 
                                                 # most computers to run dataset and model effectively

In [5]:
dataset.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


In [6]:
len(dataset)

120000

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   pickup_datetime    120000 non-null  object 
 1   fare_amount        120000 non-null  float64
 2   fare_class         120000 non-null  int64  
 3   pickup_longitude   120000 non-null  float64
 4   pickup_latitude    120000 non-null  float64
 5   dropoff_longitude  120000 non-null  float64
 6   dropoff_latitude   120000 non-null  float64
 7   passenger_count    120000 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 7.3+ MB


In [8]:
dataset['pickup_datetime'] = pd.to_datetime(dataset['pickup_datetime'])
dataset['hour'] = dataset['pickup_datetime'].dt.hour
dataset['am_pm'] = np.where(dataset['hour']<12,'am','pm')
dataset['day_of_week'] = dataset['pickup_datetime'].dt.dayofweek  # from 0 (monday) to 6 (sunday)
dataset['weekday_weekend'] = np.where(dataset['day_of_week']<5,'weekday','weekend')

In [14]:
dataset.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,am_pm,day_of_week,weekday_weekend,distance
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,8,am,0,weekday,2.126312
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,15,pm,5,weekend,1.392307
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,11,am,5,weekend,3.326763
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,21,pm,6,weekend,1.864129
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,2,am,5,weekend,7.231321


In [10]:
from sklearn.metrics.pairwise import haversine_distances
from math import radians

def distance(lat1, long1, lat2, long2):
    location1 = [radians(lat1), radians(long1)]
    location2 = [radians(lat2), radians(long2)]
    result = haversine_distances([location1,location2]) * 6371000/1000  # multiply by earth radius
    
    return max(result[0])

In [11]:
distance(1.28927550956,103.851328261,1.30258712298,103.825826697)  # Distance from City Hall MRT to Orchard MRT in km

3.1980793177511013

In [12]:
dataset['distance'] = dataset.apply(lambda row: distance(row.pickup_latitude,row.pickup_longitude,row.dropoff_latitude,row.dropoff_longitude),axis=1)

In [13]:
dataset.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,am_pm,day_of_week,weekday_weekend,distance
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,8,am,0,weekday,2.126312
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,15,pm,5,weekend,1.392307
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,11,am,5,weekend,3.326763
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,21,pm,6,weekend,1.864129
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,2,am,5,weekend,7.231321


In [37]:
# Note: Passenger count should not affect fare price, hence not included 
cat= ['am_pm','weekday_weekend','day_of_week']  
cont = ['pickup_longitude',  # Maybe some locations are prone to traffic jams --> Higher taxi meter fare?
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
        'hour',
        'distance']
label = ['fare_amount']

In [39]:
for item in cat:
    dataset[item] = dataset[item].astype('category')

cat_stack = np.stack([dataset[item].cat.codes.values for item in cat], axis = 1)
cat_stack = torch.tensor(cat_stack, dtype = torch.int64)

tensor([[0, 0, 0],
        [1, 1, 5],
        [0, 1, 5],
        ...,
        [1, 1, 6],
        [0, 0, 1],
        [1, 1, 5]])

In [40]:
cont_stack = np.stack([dataset[item].values for item in cont], axis = 1)
cont_stack = torch.tensor(cont_stack, dtype = torch.float64)
cont_stack

tensor([[-73.9924,  40.7305, -73.9755,  40.7447,   8.0000,   2.1263],
        [-73.9901,  40.7406, -73.9742,  40.7441,  15.0000,   1.3923],
        [-73.9941,  40.7511, -73.9601,  40.7662,  11.0000,   3.3268],
        ...,
        [-73.9886,  40.7498, -74.0115,  40.7078,  18.0000,   5.0525],
        [-74.0044,  40.7245, -73.9927,  40.7308,   8.0000,   1.2089],
        [-73.9554,  40.7719, -73.9676,  40.7630,  16.0000,   1.4274]],
       dtype=torch.float64)

In [42]:
label = torch.tensor(dataset[label].values)
label

tensor([[ 6.5000],
        [ 6.9000],
        [10.1000],
        ...,
        [12.5000],
        [ 4.9000],
        [ 5.3000]], dtype=torch.float64)