## Data Loader

In [117]:
import os

import numpy as np
import pandas as pd
import torch
import torchvision
import torchvision.transforms as transforms

### Creating Custom Dataset

In [118]:
from torch.utils.data import Dataset

#### Final pre-process (creating dummy variables)

In [142]:
# TODO implement random selection

class ProjectDataset(Dataset):
    def __init__(self, data_dir, data_perc=1, is_test=False,  transform=None, target_transform=None):
        self.data_dir = data_dir
        self.data = pd.read_csv(data_dir)
        self.transform = transform
        self.target_transform = target_transform
        self.data_perc = data_perc
        self.is_test=is_test

        # make categorical
        for column in self.data.columns:
            self.data[column] = self.data[column].astype('category')
            
        if self.is_test == False:
            # take a percentage sample of total data
            self.data = self.data.iloc[0:1558116]
            self.data = self.data.sample(int(len(self.data) * data_perc //1))
        else:
            # test set, no need to sample
            self.data = self.data.iloc[1558116:1558116+320]

        self.features = self.data.iloc[:,:6]
        self.labels = self.data.iloc[:,6]

        # convert features to dummy variables
        for i in range(0,6):
            if i == 2:  # need to rework features, this bypasses taxi_id entirely
               continue
            dummies = pd.get_dummies(self.features.iloc[:,i])
            self.features = pd.concat([self.features, dummies], axis=1)
        self.features = self.features.drop(['CALL_TYPE', 'ORIGIN', 'TAXI_ID', 'MON', 'HR', 'WK'], axis=1)

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        feature = torch.Tensor((np.array(self.features.iloc(axis=0)[idx])))
        label = torch.tensor(self.labels.iloc[idx]).float()
        if self.is_test == True:
            label = torch.tensor(0).float()
        if self.transform:
            feature = self.transform(feature)
        if self.target_transform:
            label = self.target_transform(label)
            
        return feature, label

### Creating custom DataLoader

In [96]:
batch_size = 2**14

trainset = ProjectDataset(data_dir='./data/all_data.csv', data_perc=(1), is_test=False) # consider custom normalize approach for data

train_dataloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                                shuffle=True, num_workers=2)

## Define Our Neural Network

In [97]:
device = ("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.get_device_name()

'NVIDIA GeForce GTX 1080 Ti'

In [98]:
import torch.nn as nn
import torch.nn.functional as F

class ProjectNet(nn.Module):
    def __init__(self, dropout_p, feat_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(feat_dim, feat_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
            nn.Linear(feat_dim, 512),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
            nn.Linear(32, 1),
        )

    def forward(self, x):
        out = self.model(x)
        
        return out

## Training the Network

In [99]:
import torch.optim as optim

In [277]:
def run_epoch(epoch_count, net):
    criterion = nn.L1Loss()
    optimizer = optim.Adam(net.parameters(), lr=1e-3)
    prev_loss = 1e10   # arbitrarily high value so first iteration completes

    for epoch in range(epoch_count):  # loop over the dataset multiple times
        for i in range(40):
            print('-',end="")
        print()
        print("Epoch: ", epoch+1, "/", epoch_count)
        for i in range(40):
            print('-',end="")
        print()
        
        running_loss = 0.0
        total_loss = 0.0
        for i, data in enumerate(train_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].to(device), data[1].to(device)
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs.squeeze(), labels.squeeze())
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            total_loss += loss.item()
            ppe = 5     # ppe: print per epoch, number of time to print data per epoch of training
            iter_count = int(len(trainset)/batch_size/ppe)
            if i % iter_count == iter_count - ppe:    # print exactly `ppe` times per epoch of training
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / iter_count:.3f}')
                running_loss = 0.0
        if (total_loss > prev_loss):
            print(f"\nEarly Stopping!\nEpoch over epoch loss increased: {total_loss/i:.3f} > {prev_loss/i:.3f}")
            break
        prev_loss = total_loss
    print('Finished Training')

In [101]:
dropout_perc = 0.5
net=ProjectNet(dropout_perc, trainset[0][0].shape[0]).to(device)  # check modules and send to CUDA
print(f"Total parameter count: {sum(p.numel()for p in net.parameters())}")
run_epoch(epoch_count=20, net=net)

Total parameter count: 2151035
----------------------------------------
Epoch:  1 / 20
----------------------------------------
[1,    15] loss: 563.567
[1,    34] loss: 474.310
[1,    53] loss: 353.671
[1,    72] loss: 332.394
[1,    91] loss: 323.246
----------------------------------------
Epoch:  2 / 20
----------------------------------------
[2,    15] loss: 250.938
[2,    34] loss: 317.212
[2,    53] loss: 316.564
[2,    72] loss: 316.336
[2,    91] loss: 318.403
----------------------------------------
Epoch:  3 / 20
----------------------------------------
[3,    15] loss: 250.163
[3,    34] loss: 316.578
[3,    53] loss: 315.171
[3,    72] loss: 316.501
[3,    91] loss: 314.148
----------------------------------------
Epoch:  4 / 20
----------------------------------------
[4,    15] loss: 249.166
[4,    34] loss: 315.319
[4,    53] loss: 314.252
[4,    72] loss: 314.667
[4,    91] loss: 314.027
----------------------------------------
Epoch:  5 / 20
-------------------------

### Saving our model

In [104]:
# Destination file for the weights of the network trained above.
PATH = './model/proj_model_2.pth' # next save

In [105]:
torch.save(net.state_dict(), PATH)

### Loading our model

In [106]:
# Checkpoint to restore for inference. NOTE: this rebinds PATH, so it no longer
# points at the file written by the save cell above.
PATH = './model/proj_model_1.pth' # previous save

In [269]:
# NOTE: rebinds the notebook-level `dropout_perc` that the training cell set to 0.5.
dropout_perc = 0    # !! DISABLE drop out for testing
# Fresh network whose input width is read off one encoded training sample;
# the layer shapes must match the checkpoint that is loaded into it next.
net2 = ProjectNet(dropout_perc, trainset[0][0].shape[0]).to(device)

In [270]:
net2.load_state_dict(torch.load(PATH))
net2.to(device)

ProjectNet(
  (model): Sequential(
    (0): Linear(in_features=1110, out_features=1110, bias=True)
    (1): ReLU()
    (2): Dropout(p=0, inplace=False)
    (3): Linear(in_features=1110, out_features=512, bias=True)
    (4): ReLU()
    (5): Dropout(p=0, inplace=False)
    (6): Linear(in_features=512, out_features=512, bias=True)
    (7): ReLU()
    (8): Dropout(p=0, inplace=False)
    (9): Linear(in_features=512, out_features=128, bias=True)
    (10): ReLU()
    (11): Dropout(p=0, inplace=False)
    (12): Linear(in_features=128, out_features=128, bias=True)
    (13): ReLU()
    (14): Dropout(p=0, inplace=False)
    (15): Linear(in_features=128, out_features=32, bias=True)
    (16): ReLU()
    (17): Dropout(p=0, inplace=False)
    (18): Linear(in_features=32, out_features=1, bias=True)
  )
)

## Testing our Model

In [274]:
batch_size = 320

testset = ProjectDataset(data_dir='./data/all_data.csv', data_perc=(1), is_test=True) # consider custom normalize approach for data

test_dataloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                                shuffle=False, num_workers=1)

In [278]:
with torch.no_grad():
    for data in test_dataloader:
        features = data[0].to(device)
        outputs = net2(features)

outputs = outputs.detach().cpu().numpy()
outputs = (outputs / 15).round() * 15
out_data = outputs

In [276]:
# final rounding
out_data = np.array(out_data)
to_export = pd.read_csv('./data/test_public.csv')
to_export['TRAVEL_TIME'] = pd.DataFrame(out_data)
to_export = to_export.loc[:,['TRIP_ID', 'TRAVEL_TIME']]
to_export.to_csv('./data/predictions.csv', index=False)