In [351]:
import numpy as np, pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [383]:
# Load the training trips; train.csv is the only data file read in this cell.
df_tr = pd.read_csv("train.csv")

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,LEN,YR,MON,DAY,HR,WK
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330,2013,7,1,0,0
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270,2013,7,1,0,0
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960,2013,7,1,0,0
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630,2013,7,1,0,0
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420,2013,7,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1710665,1404171463620000698,C,,,20000698,1404171463,A,False,"[[-8.612469,41.14602],[-8.612487,41.145993],[-...",465,2014,6,30,23,0
1710666,1404171367620000670,C,,,20000670,1404171367,A,False,"[[-8.610138,41.140845],[-8.610174,41.140935],[...",435,2014,6,30,23,0
1710667,1388745716620000264,C,,,20000264,1388745716,A,False,[],0,2014,1,3,10,4
1710668,1404141826620000248,B,,12.0,20000248,1404141826,A,False,"[[-8.630712,41.154885],[-8.63073,41.154813],[-...",915,2014,6,30,15,0


In [384]:
# Trip duration is derived from the number of coordinate pairs in each POLYLINE string.
def polyline_to_trip_duration(polyline):
  """Return the trip duration implied by a POLYLINE string.

  Every "[" after the outermost one opens a coordinate pair, so
  ``count("[") - 1`` is the number of recorded points and ``count("[") - 2``
  the number of gaps between consecutive points. Each gap counts for 15 time
  units (presumably seconds -- TODO confirm against the dataset description).
  Empty and single-point polylines yield 0.
  """
  opening_brackets = polyline.count("[")
  gaps = opening_brackets - 2
  if gaps < 0:
    gaps = 0
  return gaps * 15

# (Re)compute the LEN column for every trip from its POLYLINE string.
df_tr["LEN"] = df_tr["POLYLINE"].apply(polyline_to_trip_duration)

In [385]:
from datetime import datetime, timezone
def parse_time(x):
  """Split a row's Unix TIMESTAMP into calendar features.

  Parameters
  ----------
  x : mapping-like with a "TIMESTAMP" entry (when called through
      ``DataFrame.apply(..., axis=1)`` this is a one-element pandas Series).

  Returns
  -------
  tuple
      ``(year, month, day, hour, weekday)`` where weekday is
      ``datetime.weekday()`` (Monday == 0).
  """
  # We are using python's builtin datetime library
  # https://docs.python.org/3/library/datetime.html#datetime.datetime.fromtimestamp
  # The timestamp is interpreted in UTC explicitly: without `tz`, fromtimestamp() uses the
  # machine's local timezone, so the derived hour/day would change from one computer to another.
  dt = datetime.fromtimestamp(x["TIMESTAMP"], tz=timezone.utc)
  return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

# parse_time returns a (year, month, day, hour, weekday) tuple for each row. With axis=1 the function is applied
# row by row, and result_type="expand" spreads each returned tuple across columns so that the five values can be
# assigned to the five new columns in a single statement.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html
df_tr[["YR", "MON", "DAY", "HR", "WK"]] = df_tr[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

In [391]:
# Keep only the rows whose MISSING_DATA flag is False (df_tr is overwritten with the filtered frame).
df_tr = df_tr[df_tr["MISSING_DATA"] == False]

In [9]:
# cab_data = pd.read_csv("metaData_taxistandsID_name_GPSlocation.csv")
# cab_dict = {}
# for i in range(1, 64):
#     cab_dict[i] = (cab_data[cab_data["ID"] == i].values[0][2], cab_data[cab_data["ID"] == i].values[0][3])
# lat_mean = np.mean([cab_dict[i][0] for i in range(1, 64)])
# lat_std = np.std([cab_dict[i][0] for i in range(1, 64)])
# long_mean = np.mean([cab_dict[i][1] for i in range(1, 64)])
# long_std = np.std([cab_dict[i][1] for i in range(1, 64)])
# for i in range(1, 64):
#     old_lat = cab_dict[i][0]
#     old_long = cab_dict[i][1]
#     new_lat = (old_lat - lat_mean) / lat_std
#     new_long = (old_long - long_mean) / long_std
#     cab_dict[i] = (new_lat + 3, new_long + 3) # push up the z-normalized values so we can use 0 as a placeholder for "null"

In [10]:
# cab_data = pd.read_csv("metaData_taxistandsID_name_GPSlocation.csv")
# cab_dict = {}
# # for i in range(1, 64):
#     cab_dict[i] = (cab_data[cab_data["ID"] == i].values[0][2], cab_data[cab_data["ID"] == i].values[0][3])

In [12]:
# def get_lat_long(x):
#     i = x["ORIGIN_STAND"]
# #     if (i != i): return 0, 0 # placeholder for null values
#     i = int(i)
#     return cab_dict[i][0], cab_dict[i][1]

# df_tr[["LAT", "LONG"]] = df_tr[["ORIGIN_STAND"]].apply(get_lat_long, axis=1, result_type="expand")

In [None]:
# print(len(df_tr))

In [392]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [393]:
class TaxiDataset(Dataset):
    """Map-style dataset that turns rows of a trips DataFrame into (features, target) tensors.

    Features (float32, 68 values): YR, MON, WK, DAY, HR followed by a 63-way
    one-hot encoding of ORIGIN_STAND (all zeros when ORIGIN_STAND is NaN).
    Target (float32, 1 value): the row's LEN.

    Parameters
    ----------
    dataframe : DataFrame with the columns named above.
    transform : optional callable applied to the feature tensor.
    target_transform : optional callable applied to the target tensor.
    """

    def __init__(self, dataframe, transform=None, target_transform=None):
        self.dataframe = dataframe
        # Keep the caller's callables so __getitem__ can apply them (None means "no transform").
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        entry = self.dataframe.iloc[idx]
        time = torch.tensor([entry["LEN"]]).to(torch.float32).to(device)
        stand = entry["ORIGIN_STAND"]
        if stand != stand:  # NaN is the only value that is not equal to itself
            # idea to do one-hot encoding comes from https://towardsdatascience.com/deep-neural-networks-for-regression-problems-81321897ca33
            origin_stand = [0 for _ in range(63)]
        else:
            # Stand ids are shifted down by one so that they index the 63 one-hot slots from 0.
            origin_stand = F.one_hot(torch.tensor(int(stand) - 1), num_classes=63).tolist()
        feature_tuple = (entry["YR"], entry["MON"], entry["WK"], entry["DAY"], entry["HR"], *origin_stand)
        feature_tensor = torch.tensor(feature_tuple).to(torch.float32).to(device)
        if self.transform is not None:
            feature_tensor = self.transform(feature_tensor)
        if self.target_transform is not None:
            time = self.target_transform(time)
        return feature_tensor, time

In [413]:
len(df_tr)

1710660

In [414]:
# credit to https://stackoverflow.com/questions/54730276/how-to-randomly-split-a-dataframe-into-several-smaller-dataframes
# A fixed seed makes the shuffle, and therefore every chunk taken from `result`, reproducible across runs.
SPLIT_SEED = 0
shuffled = df_tr.sample(frac=1, random_state=SPLIT_SEED)
# Split the row positions into 100 near-equal chunks and slice the frame with them, instead of handing the
# DataFrame itself to np.array_split; the chunk sizes are the same either way.
result = [shuffled.iloc[chunk] for chunk in np.array_split(np.arange(len(shuffled)), 100)]

In [442]:
# Train on the first 4 of the 100 shuffled chunks.
train_df = pd.concat(result[:4])
outlier_threshold = 3 # taking out anomalies has helped from my experience
# Drop training rows whose LEN is `outlier_threshold` or more standard deviations above the mean LEN
# (mean and std are computed on the training chunks only).
mean, std = train_df["LEN"].mean(), train_df["LEN"].std()
train_df = train_df[train_df["LEN"] < mean + outlier_threshold * std]
# The last chunk is held out for testing; no outlier filtering is applied to it.
test_df = result[-1]
train_set = TaxiDataset(train_df)
test_set = TaxiDataset(test_df)

In [443]:
from torch.utils.data import DataLoader

In [444]:
# Both loaders serve shuffled mini-batches of 64 samples.
train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
test_loader = DataLoader(test_set, batch_size=64, shuffle=True)

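The model takes 68 input features (5 calendar values plus a 63-way one-hot vector):
- Year
- Month
- Day of the month
- Hour
- Day of the week (the `WK` column, from `datetime.weekday()`, so Monday is 0)
- One-hot encoding of the origin taxi stand (63 entries, all zeros when the stand is missing)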

In [524]:
class Net(nn.Module):
    """Fully connected regressor: four hidden Linear layers and a single non-negative output.

    Each hidden layer is followed by ReLU and then BatchNorm1d; dropout is
    applied to the raw outputs of the second and fourth hidden layers. The
    final Linear layer is passed through ReLU, so predictions are never negative.

    Parameters
    ----------
    in_features : number of input features per sample (default 68).
    hidden_size : width of every hidden layer (default 75).
    dropout_p : dropout probability (default 0.3).

    The defaults reproduce the original fixed-size network, and the attribute
    names are unchanged so existing state dicts still load.
    """

    def __init__(self, in_features=68, hidden_size=75, dropout_p=0.3):
        super().__init__()
        self.layer1 = nn.Linear(in_features, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, hidden_size)
        self.layer4 = nn.Linear(hidden_size, hidden_size)
        # Output layer (the attribute keeps its historical name `layer8`).
        self.layer8 = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(p=dropout_p)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size)
        self.batchnorm3 = nn.BatchNorm1d(hidden_size)
        self.batchnorm4 = nn.BatchNorm1d(hidden_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a (batch, in_features) tensor to a (batch, 1) tensor of predictions."""
        x = self.batchnorm1(self.relu(self.layer1(x)))
        x = self.batchnorm2(self.relu(self.dropout((self.layer2(x)))))
        x = self.batchnorm3(self.relu(self.layer3(x)))
        x = self.batchnorm4(self.relu(self.dropout(self.layer4(x))))
        x = self.relu(self.layer8(x))
        return x

In [334]:
# i tried many designs, but next time i'm gonna use https://towardsdatascience.com/deep-neural-networks-for-regression-problems-81321897ca33
# for inspiration, just make each layer 256 (and use lots of them)

In [484]:
# code taken from https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over `dataloader`, then leave `model` in eval mode.

    Parameters
    ----------
    dataloader : iterable of (X, y) batches.
    model : module being trained; switched to train mode for the pass.
    loss_fn : callable ``loss_fn(pred, y)`` returning a scalar tensor.
    optimizer : optimizer over the model's parameters.
    """
    model.train()
    for X, y in dataloader:
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation; gradients are cleared right after the step so the next batch starts clean.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # The evaluation helpers are called right after this, so hand the model over in eval mode.
    model.eval()

def eval_train_loop(dataloader, model, loss_fn):
    num_batches = len(dataloader)
    train_loss = 0    
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            train_loss += loss_fn(pred, y).item()
    train_loss /= num_batches
    print(f"Avg train loss: {train_loss:>8f} \n")

def test_loop(dataloader, model, loss_fn):
    num_batches = len(dataloader)
    test_loss = 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()

    test_loss /= num_batches
    print(f"Avg test loss: {test_loss:>8f} \n")

The results below demonstrate why we chose a learning rate of 1e-4 rather than the more common 1e-3: with 1e-3 the loss intermittently blows up by many orders of magnitude (see epochs 3 and 6).

In [508]:
# Baseline run: a fresh Net trained with plain SGD (no weight decay) at learning rate 1e-3.
# `model` is created at notebook scope here, so it stays available to later cells.
model = Net().to(device)
# code taken from https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html
learning_rate = 1e-3
# NOTE(review): batch_size is not referenced again in this cell; the loaders passed below are pre-built.
batch_size = 64

loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=0.0)
epochs = 10

# After each training epoch, report the average loss on the training loader and on the test loader.
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_loader, model, loss_fn, optimizer)
    eval_train_loop(train_loader, model, loss_fn)
    test_loop(test_loader, model, loss_fn)

Epoch 1
-------------------------------
Avg train loss: 635068.323427 

Avg test loss: 928501.506996 

Epoch 2
-------------------------------
Avg train loss: 257798.895544 

Avg test loss: 468819.074102 

Epoch 3
-------------------------------
Avg train loss: 4179242959083146838016.000000 

Avg test loss: 3990624343876694441984.000000 

Epoch 4
-------------------------------
Avg train loss: 3762059.166132 

Avg test loss: 9341374.018715 

Epoch 5
-------------------------------
Avg train loss: 175583.400580 

Avg test loss: 411409.032795 

Epoch 6
-------------------------------
Avg train loss: 1446575487985.299561 

Avg test loss: 1222745804070.749756 

Epoch 7
-------------------------------
Avg train loss: 190183.054724 

Avg test loss: 439402.916744 

Epoch 8
-------------------------------
Avg train loss: 174695.300122 

Avg test loss: 409606.637185 

Epoch 9
-------------------------------
Avg train loss: 207696.271067 

Avg test loss: 425557.110687 

Epoch 10
----------------

In [511]:
def run_tests(weight_decay, momentum, learning_rate=1e-4, epochs=10):
    """Train a fresh `Net` with SGD and print train/test loss after every epoch.

    Parameters
    ----------
    weight_decay : L2 penalty passed to the SGD optimizer.
    momentum : momentum passed to the SGD optimizer.
    learning_rate : SGD step size (default 1e-4).
    epochs : number of passes over `train_loader` (default 10).

    The batch size is whatever `train_loader` / `test_loader` were built with;
    it is not configured here. The trained model is local to this call.
    """
    model = Net().to(device)
    # code taken from https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay, momentum=momentum)

    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loop(train_loader, model, loss_fn, optimizer)
        eval_train_loop(train_loader, model, loss_fn)
        test_loop(test_loader, model, loss_fn)

In [512]:
run_tests(0.0, 0)

Epoch 1
-------------------------------
Avg train loss: 176656.143390 

Avg test loss: 412215.561013 

Epoch 2
-------------------------------
Avg train loss: 184245.964222 

Avg test loss: 427208.596927 

Epoch 3
-------------------------------
Avg train loss: 178819.849611 

Avg test loss: 412331.803871 

Epoch 4
-------------------------------
Avg train loss: 179098.322536 

Avg test loss: 415937.672633 

Epoch 5
-------------------------------
Avg train loss: 174251.355406 

Avg test loss: 409023.052589 

Epoch 6
-------------------------------
Avg train loss: 178918.936778 

Avg test loss: 424530.758017 

Epoch 7
-------------------------------
Avg train loss: 174700.587944 

Avg test loss: 414424.797341 

Epoch 8
-------------------------------
Avg train loss: 175454.213596 

Avg test loss: 406181.459626 

Epoch 9
-------------------------------
Avg train loss: 177387.507153 

Avg test loss: 415012.519094 

Epoch 10
-------------------------------
Avg train loss: 174423.861363 



In [513]:
run_tests(0.2, 0)

Epoch 1
-------------------------------
Avg train loss: 178638.407207 

Avg test loss: 419055.116021 

Epoch 2
-------------------------------
Avg train loss: 184578.827389 

Avg test loss: 426798.667940 

Epoch 3
-------------------------------
Avg train loss: 179000.291345 

Avg test loss: 412743.100280 

Epoch 4
-------------------------------
Avg train loss: 206190.028827 

Avg test loss: 454690.719129 

Epoch 5
-------------------------------
Avg train loss: 185151.356658 

Avg test loss: 411653.638118 

Epoch 6
-------------------------------
Avg train loss: 175497.524535 

Avg test loss: 408742.309527 

Epoch 7
-------------------------------
Avg train loss: 174955.176013 

Avg test loss: 409204.617741 

Epoch 8
-------------------------------
Avg train loss: 177168.024616 

Avg test loss: 415731.075297 

Epoch 9
-------------------------------
Avg train loss: 176564.044695 

Avg test loss: 414135.902460 

Epoch 10
-------------------------------
Avg train loss: 174923.563704 



In [514]:
run_tests(0.4, 0)

Epoch 1
-------------------------------
Avg train loss: 618861.370700 

Avg test loss: 909625.138060 

Epoch 2
-------------------------------
Avg train loss: 196532.464384 

Avg test loss: 433782.717817 

Epoch 3
-------------------------------
Avg train loss: 175405.290079 

Avg test loss: 408307.798420 

Epoch 4
-------------------------------
Avg train loss: 179244.544614 

Avg test loss: 419215.357130 

Epoch 5
-------------------------------
Avg train loss: 191556.296058 

Avg test loss: 437562.401265 

Epoch 6
-------------------------------
Avg train loss: 176239.032097 

Avg test loss: 410178.541715 

Epoch 7
-------------------------------
Avg train loss: 193317.999035 

Avg test loss: 426334.022009 

Epoch 8
-------------------------------
Avg train loss: 175768.150595 

Avg test loss: 413438.548682 

Epoch 9
-------------------------------
Avg train loss: 179736.115487 

Avg test loss: 409613.419135 

Epoch 10
-------------------------------
Avg train loss: 176794.108278 



In [516]:
run_tests(0.6, 0)

Epoch 1
-------------------------------
Avg train loss: 176888.582779 

Avg test loss: 412963.938141 

Epoch 2
-------------------------------
Avg train loss: 181508.986267 

Avg test loss: 419785.563316 

Epoch 3
-------------------------------
Avg train loss: 176554.760876 

Avg test loss: 414742.072674 

Epoch 4
-------------------------------
Avg train loss: 180262.732553 

Avg test loss: 419375.896718 

Epoch 5
-------------------------------
Avg train loss: 181527.859957 

Avg test loss: 416136.754693 

Epoch 6
-------------------------------
Avg train loss: 303610.666927 

Avg test loss: 554356.691873 

Epoch 7
-------------------------------
Avg train loss: 176042.509197 

Avg test loss: 413806.807894 

Epoch 8
-------------------------------
Avg train loss: 194731.906964 

Avg test loss: 441729.108326 

Epoch 9
-------------------------------
Avg train loss: 278638.100141 

Avg test loss: 514510.511777 

Epoch 10
-------------------------------
Avg train loss: 179336.036912 



In [616]:
# credit to https://discuss.pytorch.org/t/how-to-create-mlp-model-with-arbitrary-number-of-hidden-layers/13124/5
class Net2(nn.Module):
    """MLP regressor with a configurable number of hidden blocks.

    Layout of ``self.layers``: an input Linear, then `hidden_cnt` repetitions
    of (Linear, LeakyReLU, BatchNorm1d), then an output Linear followed by
    LeakyReLU.

    Parameters
    ----------
    hidden_cnt : number of (Linear, LeakyReLU, BatchNorm1d) blocks.
    in_features : number of input features per sample (default 68).
    hidden_size : width of every hidden layer (default 75).
    """

    def __init__(self, hidden_cnt, in_features=68, hidden_size=75):
        super().__init__()
        self.layers = nn.ModuleList()
        # NOTE(review): this input projection feeds straight into the first block's Linear with no
        # activation in between -- confirm that is intended.
        self.layers.append(nn.Linear(in_features, hidden_size))
        for _ in range(hidden_cnt):
            self.layers.append(nn.Linear(hidden_size, hidden_size))
            self.layers.append(nn.LeakyReLU())
            self.layers.append(nn.BatchNorm1d(hidden_size))
        self.layers.append(nn.Linear(hidden_size, 1))
        self.layers.append(nn.LeakyReLU())

    def forward(self, input_data):
        """Apply every layer in order and return the (batch, 1) output."""
        for layer in self.layers:
            input_data = layer(input_data)
        return input_data

In [617]:
def std_loop(dataloader, model):
    """Print and return the standard deviation of the model's predictions over `dataloader`.

    A value near zero means the model predicts (almost) the same number for
    every input. Targets in the loader are ignored.
    """
    results = []
    # Predictions are only collected, never back-propagated, so skip autograd bookkeeping.
    with torch.no_grad():
        for x, _ in dataloader:
            results.extend(model(x).flatten().tolist())
    spread = np.std(results)
    print(f"Standard Deviation: {spread}")
    return spread

In [620]:
def run_tests_2(weight_decay, momentum, n_hidden_layers, learning_rate=1e-4, batch_size=64):
    """Train a fresh `Net2` with SGD, printing train/test loss and prediction spread after every epoch.

    Parameters
    ----------
    weight_decay : L2 penalty passed to the SGD optimizer.
    momentum : momentum passed to the SGD optimizer.
    n_hidden_layers : number of hidden blocks given to `Net2`.
    learning_rate : SGD step size (default 1e-4).
    batch_size : mini-batch size. When it matches the existing `train_loader` /
        `test_loader`, those loaders are used as-is; otherwise new shuffled
        loaders over the same datasets are built with the requested size.
    """
    from torch.utils.data import DataLoader  # local import keeps this function self-contained

    model = Net2(n_hidden_layers).to(device)
    # code taken from https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html

    if batch_size == train_loader.batch_size and batch_size == test_loader.batch_size:
        train_batches, test_batches = train_loader, test_loader
    else:
        train_batches = DataLoader(train_loader.dataset, batch_size=batch_size, shuffle=True)
        test_batches = DataLoader(test_loader.dataset, batch_size=batch_size, shuffle=True)

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay, momentum=momentum)
    epochs = 10

    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loop(train_batches, model, loss_fn, optimizer)
        eval_train_loop(train_batches, model, loss_fn)
        test_loop(test_batches, model, loss_fn)
        std_loop(test_batches, model)

In [625]:
run_tests_2(0.2, 0.3, 40, learning_rate=1e-5, batch_size=64)

Epoch 1
-------------------------------
Avg train loss: 608107220281.666382 

Avg test loss: 608602295103.854126 

Standard Deviation: 741932.8741829855
Epoch 2
-------------------------------
Avg train loss: 17257327577893426954240.000000 

Avg test loss: 17083278106008678825984.000000 

Standard Deviation: 86877486317.58066
Epoch 3
-------------------------------


KeyboardInterrupt: 

In [408]:
def get_input_tensor(entry):
    """Build the float32 feature tensor for one trip row and place it on `device`.

    The tensor holds YR, MON, WK, DAY, HR followed by a 63-way one-hot encoding
    of ORIGIN_STAND; the one-hot part is all zeros when ORIGIN_STAND is NaN.
    """
    stand = entry["ORIGIN_STAND"]
    stand_is_nan = stand != stand  # NaN is the only value that differs from itself
    if not stand_is_nan:
        origin_stand = F.one_hot(torch.tensor(int(stand) - 1), num_classes=63).tolist()
    else:
        # idea to do one-hot encoding comes from https://towardsdatascience.com/deep-neural-networks-for-regression-problems-81321897ca33
        origin_stand = [0] * 63
    calendar = [entry[column] for column in ("YR", "MON", "WK", "DAY", "HR")]
    features = torch.tensor((*calendar, *origin_stand))
    return features.to(torch.float32).to(device)

I ran into a problem where the model would output exactly the same value for every input. I mitigated this by adding more layers, adding batch normalization, reducing the learning rate, and training for fewer epochs. The links below discuss the same symptom.

https://datascience.stackexchange.com/questions/58220/how-to-deal-with-a-constant-value-as-an-output-from-neural-network

https://stackoverflow.com/questions/4493554/neural-network-always-produces-same-similar-outputs-for-any-input

https://stackoverflow.com/questions/39217567/keras-neural-network-outputs-same-result-for-every-input

In [304]:
dt = datetime.fromtimestamp(1408039037)
dt.year, dt.month, dt.day, dt.hour, dt.weekday()

(2014, 8, 14, 17, 3)

In [409]:
# Build the calendar features for the public test set the same way as for the training data.
df_public_test = pd.read_csv("test_public.csv")
df_public_test[["YR", "MON", "DAY", "HR", "WK"]] = df_public_test[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

# Inference only: make sure dropout / batch-norm are in eval mode and skip autograd bookkeeping.
# NOTE(review): `model` is whatever notebook-level model is currently bound to that name -- confirm it is
# the one intended for submission.
model.eval()
pred_dict = {}
with torch.no_grad():
    for _, row in df_public_test.iterrows():
        # The network expects a batch dimension, so each row is predicted as a batch of one.
        pred_dict[row["TRIP_ID"]] = model(torch.unsqueeze(get_input_tensor(row), dim=0)).item()

In [410]:
def get_prediction(x):
    """Look up the predicted travel time stored in `pred_dict` under key `x` (raises KeyError if absent)."""
    travel_time = pred_dict[x]
    return travel_time

In [411]:
# Sample submission file that is given on kaggle
df_sample = pd.read_csv("sampleSubmission.csv")
# Fill TRAVEL_TIME with the prediction stored for each TRIP_ID, then write the frame without its index.
df_sample["TRAVEL_TIME"] = df_sample["TRIP_ID"].apply(get_prediction)
df_sample.to_csv("my_pred.csv", index=None)

In [308]:
class LinReg(nn.Module):
    """Linear-regression baseline: a single affine map from 68 input features to one output."""

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(in_features=68, out_features=1)

    def forward(self, x):
        """Return the layer's output for `x` unchanged (no activation)."""
        return self.layer1(x)

In [309]:
linreg_model = LinReg().to(device)

In [None]:
# Train the linear-regression baseline with SGD and report the test loss after every epoch.
# code taken from https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html
learning_rate = 0.001
# NOTE(review): batch_size is not referenced again in this cell; the loaders passed below are pre-built.
batch_size = 64

loss_fn = nn.MSELoss()
# NOTE(review): weight_decay=0.99 is a very strong L2 penalty -- confirm it is intentional.
optimizer = torch.optim.SGD(linreg_model.parameters(), lr=learning_rate, weight_decay=0.99)
epochs = 5

# Unlike the Net runs, only the test loss is printed here (eval_train_loop is not called).
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_loader, linreg_model, loss_fn, optimizer)
    test_loop(test_loader, linreg_model, loss_fn)
print("Done!")