In [3]:
import numpy as np
import pandas as pd
from datetime import datetime
import re
import math

In [4]:
#Read into Dataframe
taxi_data = pd.read_csv("kaggle_data/train.csv")
taxi_data['ORIGIN_STAND'] = taxi_data['ORIGIN_STAND'].fillna(0)

#Calculate and Create Time Column - only for training label creation and test validation
def travel_time(polyline):
    return max(polyline.count("[") - 2, 0) * 15

def parse_timestamp(taxi_data):
    date_time = datetime.fromtimestamp(taxi_data["TIMESTAMP"])
    return date_time.year, date_time.month, date_time.day, date_time.hour, date_time.weekday()

taxi_data["LEN"] = taxi_data["POLYLINE"].apply(travel_time)

mean_duration = taxi_data["LEN"].mean()
standard_deviation = taxi_data["LEN"].std()
median = taxi_data["LEN"].median()
taxi_data = taxi_data[taxi_data["LEN"] < mean_duration + 3*standard_deviation]

outlier_threshold = 3
total_size = len(taxi_data)
trimmed_taxi_data = taxi_data[taxi_data["LEN"] < mean_duration + outlier_threshold * standard_deviation]
print(f"Using: {len(trimmed_taxi_data)}/{total_size}")

trimmed_taxi_data[["YR", "MON", "DAY", "HR", "WK"]] = trimmed_taxi_data[["TIMESTAMP"]].apply(parse_timestamp, axis=1, result_type="expand")

Using: 1692771/1692771


### Incorporating Velocity in Dataset

This factor is calculated using an initial partial trajectory provided for the taxi. It is useful for the duration calculation since speed plays a major role in travel time along with the distance. Since there is no clear way to accurately determine the total length of a trip before it ends, we can assume that an average velocity based on a taxi's initial trajectory will be a good estimate of its overall speed and will be a factor in determining trip duration

In [5]:
#Velocity Calculation adapted from: 
#https://www.ridgesolutions.ie/index.php/2013/11/14/algorithm-to-calculate-speed-from-two-gps-latitude-and-longitude-points-and-time-difference/

def velocity(lat1, lon1, lat2, lon2):
    
    #Convert degrees to radians
    lat1 = lat1 * math.pi / 180.0;
    lon1 = lon1 * math.pi / 180.0;
    lat2 = lat2 * math.pi / 180.0;
    lon2 = lon2 * math.pi / 180.0;
    
    #radius of earth in metres
    r = 6378100;
    
    #P
    rho1 = r * math.cos(lat1)
    z1 = r * math.sin(lat1)
    x1 = rho1 * math.cos(lon1)
    y1 = rho1 * math.sin(lon1)
    
    #Q
    rho2 = r * math.cos(lat2)
    z2 = r * math.sin(lat2)
    x2 = rho2 * math.cos(lon2)
    y2 = rho2 * math.sin(lon2)
    
    #Dot product
    dot = (x1 * x2 + y1 * y2 + z1 * z2)
    cos_theta = dot / (r * r)
    
    if(cos_theta > 1):
        cos_theta = 1
        
    theta = math.acos(cos_theta)
    
    #Distance in Metres
    distance = r * theta
    
    return distance/15 #speed in meters per second

In [6]:
trimmed_taxi_data.reset_index(drop=True, inplace=True)

In [7]:
avg_velocities = []

#Average Velocity
def avg_velo(taxi_data):
    
    k = 10
    
    poly = taxi_data["POLYLINE"]
    
    for i in range(0, len(taxi_data), 1):
        coord = poly[i]
        coord = re.split(r',|\[|\]', coord)
        count = 0
        
        coordinates = []
        
        for value in coord:
            if (count > 2*(k-2) or count > len(coord)-2):
                break
            else:
                #print(value + str(value.isnumeric()))
                if(value != ''):
                    coordinates.append(float(value))
              
        velocities = []
        
        for j in range(0, int(len(coordinates)/4), 1):
            velocities.append(velocity(coordinates[j], coordinates[j+1], coordinates[j+2], coordinates[j+3]))
    
        sum_velo = 0.0
    
        for velo in velocities:
            sum_velo += velo
        
        if(len(velocities)==0):
            num_velo = 1
        else:
            num_velo = len(velocities)
            
        avg_velocities.append(sum_velo/num_velo)

In [8]:
avg_velo(trimmed_taxi_data)

In [9]:
#Mapping Call Type Letters to Numbers
letter_to_num = {
    "A" : 1,
    "B" : 2,
    "C" : 3
}
num_to_letter = {
    1 : "A",
    2 : "B",
    3 : "C"
}

duration = trimmed_taxi_data["LEN"].tolist()

hour = trimmed_taxi_data["HR"].tolist()
month = trimmed_taxi_data["MON"].tolist() 
week = trimmed_taxi_data["WK"].tolist()
day = trimmed_taxi_data["DAY"].tolist()
calltype = trimmed_taxi_data["CALL_TYPE"].tolist()
taxi = trimmed_taxi_data["TAXI_ID"].tolist()
origin = trimmed_taxi_data["ORIGIN_STAND"].tolist()

for count in range(0, len(calltype), 1):
    calltype[count] = (letter_to_num[calltype[count]]) 

In [10]:
#Combine Input Vectors
inputs = []
for count in range(0, len(hour), 1):
    inputs.append([hour[count], month[count], week[count], day[count], calltype[count], origin[count]])

dur_inputs = []
for count in range(0, len(hour), 1):
    dur_inputs.append([hour[count], month[count], week[count], day[count], calltype[count], origin[count], avg_velocities[count]])

In [11]:
import torch
from torch.utils.data import TensorDataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Model

In [12]:
velo_in_tensor = torch.tensor(inputs, dtype=torch.float32).to(device)
dur_in_tensor = torch.tensor(dur_inputs, dtype=torch.float32).to(device)
target_tensor = torch.tensor(duration, dtype=torch.float32).to(device)
target_velocity_tensor = torch.tensor(avg_velocities, dtype=torch.float32).to(device)

velo_dataset = TensorDataset(velo_in_tensor, target_velocity_tensor)
duration_dataset = TensorDataset(dur_in_tensor, target_tensor)

In [13]:
class MLR(torch.nn.Sequential):
    # Object Constructor
    def __init__(self, input_features, output_features):
        super().__init__()
        self.linear = torch.nn.Linear(input_features, 12)
        self.dropout = torch.nn.Dropout(0.5)
        self.linear2 = torch.nn.Linear(12, 16)
        self.linear3 = torch.nn.Linear(16, 20)
        self.linear4 = torch.nn.Linear(20, output_features)
        self.relu = torch.nn.ReLU()
        self.hiddennorm1 = torch.nn.BatchNorm1d(12)
        self.hiddennorm2 = torch.nn.BatchNorm1d(16)
        self.norm = torch.nn.BatchNorm1d(input_features)
        
    # define the forward function for prediction
    def forward(self, x):
        x = self.norm(x)
        x = self.dropout(self.relu(self.linear(x)))
        x = self.hiddennorm1(x)
        x = self.dropout(self.relu(self.linear2(x)))
        x = self.hiddennorm2(x)
        x = self.dropout(self.relu(self.linear3(x)))
        x = self.relu(self.linear4(x))
        return x
    
predict_velocity = MLR(6, 1).to(device)
predict_duration = MLR(7, 1).to(device)

print(predict_velocity)
print(predict_duration)

MLR(
  (linear): Linear(in_features=6, out_features=12, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (linear2): Linear(in_features=12, out_features=16, bias=True)
  (linear3): Linear(in_features=16, out_features=20, bias=True)
  (linear4): Linear(in_features=20, out_features=1, bias=True)
  (relu): ReLU()
  (hiddennorm1): BatchNorm1d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (hiddennorm2): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (norm): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
MLR(
  (linear): Linear(in_features=7, out_features=12, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (linear2): Linear(in_features=12, out_features=16, bias=True)
  (linear3): Linear(in_features=16, out_features=20, bias=True)
  (linear4): Linear(in_features=20, out_features=1, bias=True)
  (relu): ReLU()
  (hiddennorm1): BatchNorm1d(12, eps=1e-05, momentum=0.1, affine=True, track_ru

In [14]:
class MLR_highdim(torch.nn.Sequential):
    # Object Constructor
    def __init__(self, input_features, output_features):
        super().__init__()
        self.linear1 = torch.nn.Linear(input_features, 128)
        self.linear2 = torch.nn.Linear(128, 64)
        self.linear3 = torch.nn.Linear(64, 16)
        self.linear4 = torch.nn.Linear(16, 8)
        self.linear5 = torch.nn.Linear(8, output_features)
        
        self.relu = torch.nn.ReLU()
        
        self.dropout = torch.nn.Dropout(0.3)
            
        self.norm = torch.nn.BatchNorm1d(input_features)
        self.hiddennorm1 = torch.nn.BatchNorm1d(128)
        self.hiddennorm2 = torch.nn.BatchNorm1d(64)
        self.hiddennorm3 = torch.nn.BatchNorm1d(16)
        
    # define the forward function for prediction
    def forward(self, x):
        x = self.norm(x)
        x = self.dropout(self.relu(self.linear1(x)))
        x = self.hiddennorm1(x)
        x = self.dropout(self.relu(self.linear2(x)))
        x = self.hiddennorm2(x)
        x = self.dropout(self.relu(self.linear3(x)))
        x = self.hiddennorm3(x)
        x = self.dropout(self.relu(self.linear4(x)))
        x = self.relu(self.linear5(x))
        return x
    
predict_velocity_hd = MLR_highdim(6, 1).to(device)
predict_duration_hd = MLR_highdim(7, 1).to(device)

print(predict_velocity_hd)
print(predict_duration_hd)

MLR_highdim(
  (linear1): Linear(in_features=6, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=64, bias=True)
  (linear3): Linear(in_features=64, out_features=16, bias=True)
  (linear4): Linear(in_features=16, out_features=8, bias=True)
  (linear5): Linear(in_features=8, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (norm): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (hiddennorm1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (hiddennorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (hiddennorm3): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
MLR_highdim(
  (linear1): Linear(in_features=7, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=64, bias=True)
  (linear3): Linear(in_features=64, out_features=16, bias=True)
  (linear4): Linear(

In [15]:
# Define optimizer (this will perform your parameter updates use)
lr = 0.0001
slr = 1e-4
#opt_velocity = torch.optim.Adam(predict_velocity.parameters(), lr=lr)
#opt_duration = torch.optim.Adam(predict_duration.parameters(), lr=lr)
opt_velocity_hd = torch.optim.SGD(predict_velocity_hd.parameters(), lr=lr)
opt_duration_hd = torch.optim.SGD(predict_duration_hd.parameters(), lr=lr)

#opt_velocity = torch.optim.Adam(predict_velocity.parameters(), lr=lr)
#opt_duration = torch.optim.Adam(predict_duration.parameters(), lr=lr)

### Train

In [16]:
batch_size = 64

train_err = []
parameters = []

velocitytrainloader = DataLoader(velo_dataset, batch_size, shuffle=True)
trainloader = DataLoader(duration_dataset, batch_size, shuffle=True)

In [17]:
def train(epochs, model, optimize, loader):
    for epoch in range(epochs):
        for x, y in loader:
            model.train()
            prediction = model(x)
            loss = torch.sqrt(torch.nn.functional.mse_loss(prediction, torch.unsqueeze(y, 1)))
            optimize.zero_grad()
            loss.backward()
            optimize.step()
            
        print("Epoch: " + str(epoch) + "\t" + "Loss: " + str(loss.tolist()))

In [18]:
epochsV = 20
epochsD = 50
train(epochsV, predict_velocity_hd, opt_velocity_hd, velocitytrainloader)
train(epochsD, predict_duration_hd, opt_duration_hd, trainloader) #Best so far

#train(epochsV, predict_velocity, opt_velocity, velocitytrainloader)
#train(epochsD, predict_duration, opt_duration, trainloader) #Best so far

Epoch: 0	Loss: 4.13284969329834
Epoch: 1	Loss: 2.9492053985595703
Epoch: 2	Loss: 4.038605690002441
Epoch: 3	Loss: 3.3954899311065674
Epoch: 4	Loss: 4.525208950042725
Epoch: 5	Loss: 4.651245594024658
Epoch: 6	Loss: 4.604705810546875
Epoch: 7	Loss: 3.895814895629883
Epoch: 8	Loss: 3.986433744430542
Epoch: 9	Loss: 3.660259246826172
Epoch: 10	Loss: 2.8639087677001953
Epoch: 11	Loss: 3.572251558303833
Epoch: 12	Loss: 4.181982040405273
Epoch: 13	Loss: 4.146925449371338
Epoch: 14	Loss: 5.921876430511475
Epoch: 15	Loss: 3.9241158962249756
Epoch: 16	Loss: 3.593061923980713
Epoch: 17	Loss: 5.42855978012085
Epoch: 18	Loss: 3.128194570541382
Epoch: 19	Loss: 5.916820526123047
Epoch: 0	Loss: 854.5045166015625
Epoch: 1	Loss: 424.1079406738281
Epoch: 2	Loss: 437.63446044921875
Epoch: 3	Loss: 390.8373107910156
Epoch: 4	Loss: 497.8941345214844
Epoch: 5	Loss: 312.143310546875
Epoch: 6	Loss: 385.2196044921875
Epoch: 7	Loss: 378.6519775390625
Epoch: 8	Loss: 361.8486633300781
Epoch: 9	Loss: 403.202514648437

In [19]:
torch.save(predict_velocity_hd, "VelocityPredictor.pt")
torch.save(predict_duration_hd, "DurationPredictor.pt")

In [20]:
#epochs = 10
#train(epochs, predict_velocity, opt_velocity, velocitytrainloader)

In [21]:
#train(epochs, predict_duration, opt_duration, trainloader) #Best so far

In [71]:
#predict_without_velocity = MLR(6, 1).to(device)
#opt_without_velocity = torch.optim.Adam(predict_without_velocity.parameters(), lr=lr)
#without_velo_dataset = TensorDataset(velo_in_tensor, target_tensor)
#withoutvelocitytrainloader = DataLoader(without_velo_dataset, batch_size, shuffle=True)
#train(epochs, predict_without_velocity, opt_without_velocity, withoutvelocitytrainloader)

# PREDICT

In [45]:
#Read into Dataframe
test_data = pd.read_csv("kaggle_data/test_public.csv")
test_data['ORIGIN_STAND'] = taxi_data['ORIGIN_STAND'].fillna(0)
test_data[["YR", "MON", "DAY", "HR", "WK"]] = taxi_data[["TIMESTAMP"]].apply(parse_timestamp, axis=1, result_type="expand")

test_hour = test_data["HR"].tolist()
test_month = test_data["MON"].tolist() 
test_week = test_data["WK"].tolist()
test_day = test_data["DAY"].tolist()
test_calltype = test_data["CALL_TYPE"].tolist()
test_taxi = test_data["TAXI_ID"].tolist()
test_origin = test_data["ORIGIN_STAND"].tolist()

for count in range(0, len(test_calltype), 1):
    test_calltype[count] = (letter_to_num[test_calltype[count]]) 
    
test_inputs = []
for count in range(0, len(test_hour), 1):
    test_inputs.append([test_hour[count], test_month[count], test_week[count], test_day[count], test_origin[count], test_calltype[count]])
    
test_tensor = torch.tensor(test_inputs, dtype=torch.float32).to(device)

test_dataset = TensorDataset(test_tensor)
testloader = DataLoader(test_dataset, batch_size, shuffle=True)

In [73]:
#test_ids = test_data["TRIP_ID"].tolist()
#test_velo = predict_velocity(test_tensor)

#velo = []
#for i in test_velo.tolist():
#    velo.append(i[0])
    
#velo_test_inputs = []
#for count in range(0, len(test_hour), 1):
#    velo_test_inputs.append([test_hour[count], test_month[count], test_week[count], test_day[count], test_calltype[count], test_taxi[count], velo[count]])

#velo_test_tensor = torch.tensor(velo_test_inputs, dtype=torch.float32).to(device)

#test_duration = predict_duration(velo_test_tensor)
#test_duration = test_duration.tolist()
    
#for i in range (0, len(test_ids), 1):
#    print("\""+str(test_ids[i])+"\""","+str(test_duration[i][0]))

In [74]:
#test_duration_without_velocity = predict_without_velocity(test_tensor)
#test_duration_without_velocity = test_duration_without_velocity.tolist()

#print(len(test_ids))
    
#for i in range (0, len(test_ids), 1):
#    print("\""+str(test_ids[i])+"\""","+str(test_duration_without_velocity[i][0]))

In [48]:
predict_velocity.eval()
predict_duration.eval()

test_ids = test_data["TRIP_ID"].tolist()
test_velo = predict_velocity(test_tensor)

velo_hd = []
for i in test_velo.tolist():
    velo_hd.append(i[0])
    
velo_test_inputs = []
for count in range(0, len(test_hour), 1):
    velo_test_inputs.append([test_hour[count], test_month[count], test_week[count], test_day[count], test_calltype[count], test_origin[count], velo_hd[count]])
    
velo_hd_test_tensor = torch.tensor(velo_test_inputs, dtype=torch.float32).to(device)

test_duration_hd = predict_duration(velo_hd_test_tensor)
test_duration_hd = test_duration_hd.tolist()
    
for i in range (0, len(test_ids), 1):
    print("\""+str(test_ids[i])+"\""","+str(test_duration_hd[i][0]))    

"T1",624.7883911132812
"T2",620.5599975585938
"T3",624.7883911132812
"T4",624.7883911132812
"T5",624.7883911132812
"T6",627.220703125
"T7",624.7883911132812
"T8",627.220703125
"T9",624.7883911132812
"T10",624.7883911132812
"T11",624.7883911132812
"T12",617.8628540039062
"T13",617.8628540039062
"T14",617.8628540039062
"T15",617.8628540039062
"T16",596.8775634765625
"T17",584.4378051757812
"T18",617.8628540039062
"T19",624.7883911132812
"T20",617.8628540039062
"T21",624.7883911132812
"T22",627.220703125
"T23",627.220703125
"T24",582.2228393554688
"T25",624.7883911132812
"T26",624.7883911132812
"T27",624.7883911132812
"T28",626.2550048828125
"T29",573.1555786132812
"T30",626.2550048828125
"T31",624.7883911132812
"T32",624.7883911132812
"T33",617.8628540039062
"T34",617.8628540039062
"T35",617.8628540039062
"T36",617.8628540039062
"T37",627.220703125
"T38",624.7883911132812
"T39",592.5403442382812
"T40",624.7883911132812
"T41",593.4191284179688
"T42",617.8628540039062
"T43",617.86285400390