### Acknowledgement

While building the final model used for our competition results, we consulted the [TensorFlow Transformer tutorial](https://www.tensorflow.org/text/tutorials/transformer) and PyTorch tutorials with practical examples, such as [Language Modeling](https://pytorch.org/tutorials/beginner/transformer_tutorial.html) and [Language Translation](https://pytorch.org/tutorials/beginner/translation_transformer.html).

In [32]:
import torch
from torch.utils.data import Dataset, DataLoader
import os, os.path 
import numpy 
import pickle
from glob import glob
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim.lr_scheduler import ReduceLROnPlateau
import matplotlib.pyplot as plt
import time
from tqdm.notebook import tqdm
import math

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

"""
    number of trajectories in each city
    # austin --  train: 43041 test: 6325 
    # miami -- train: 55029 test:7971
    # pittsburgh -- train: 43544 test: 6361
    # dearborn -- train: 24465 test: 3671
    # washington-dc -- train: 25744 test: 3829
    # palo-alto -- train:  11993 test:1686

    trajectories sampled at 10HZ rate, input 5 seconds, output 6 seconds
    
"""

'\n    number of trajectories in each city\n    # austin --  train: 43041 test: 6325 \n    # miami -- train: 55029 test:7971\n    # pittsburgh -- train: 43544 test: 6361\n    # dearborn -- train: 24465 test: 3671\n    # washington-dc -- train: 25744 test: 3829\n    # palo-alto -- train:  11993 test:1686\n\n    trajectories sampled at 10HZ rate, input 5 seconds, output 6 seconds\n    \n'

## Create a Torch.Dataset class for the training dataset

In [2]:
from glob import glob
import pickle
import numpy as np

# Directory prefix for the pickled data; files are opened as
# ROOT_PATH + split + "/" + city + "_inputs" (and "_outputs" for the train split).
ROOT_PATH = "./"

# City names and split names accepted by ArgoverseDataset.get_city_trajectories.
cities = ["austin", "miami", "pittsburgh", "dearborn", "washington-dc", "palo-alto"]
splits = ["train", "test"]

def transform_data(np_data, bch_id):
    """Append a per-step curvature feature to one trajectory.

    Velocities and accelerations are finite differences (``np.gradient``)
    taken per step, and the curvature is
    ``|x'' * y' - x' * y''| / speed**3``.

    Args:
        np_data: indexable collection of trajectories; ``np_data[bch_id]``
            must be an ``(n_steps, 2)`` array of (x, y) positions with at
            least two steps (required by ``np.gradient``).
        bch_id: index of the trajectory to transform.

    Returns:
        ``(n_steps, 3)`` array with columns ``x``, ``y``, ``curvature``.
    """
    traj = np.asarray(np_data[bch_id])
    x, y = traj[:, 0], traj[:, 1]

    x_vel = np.gradient(x)
    y_vel = np.gradient(y)
    speed = np.sqrt(x_vel**2 + y_vel**2)
    x_acc = np.gradient(x_vel)
    y_acc = np.gradient(y_vel)

    cross = np.abs(x_acc * y_vel - x_vel * y_acc)

    # A stationary sample (speed == 0) has no defined curvature; report 0
    # there instead of letting 0/0 or x/0 put NaN/inf into the features.
    curvature = np.zeros_like(speed)
    np.divide(cross, speed**3, out=curvature, where=speed > 0)

    return np.column_stack((x, y, curvature))


def rotate(X, startpoint, endpoint, default_angle):
    """Build one 2x2 rotation matrix per trajectory.

    The heading of each trajectory is the direction from step
    ``startpoint`` to step ``endpoint``. The returned matrix rotates by
    ``default_angle - heading`` degrees, i.e. it would turn that heading
    onto ``default_angle``.

    Args:
        X: array indexed as ``X[batch, step, coord]`` with x at coord 0
            and y at coord 1.
        startpoint: step index where the heading vector starts.
        endpoint: step index where the heading vector ends.
        default_angle: target heading in degrees.

    Returns:
        Array of shape ``(batch, 2, 2)`` holding
        ``[[cos, -sin], [sin, cos]]`` for each trajectory.
    """
    delta_x = X[:, endpoint, 0] - X[:, startpoint, 0]
    delta_y = X[:, endpoint, 1] - X[:, startpoint, 1]

    # Heading in degrees, wrapped from (-180, 180] into [0, 360).
    heading = np.degrees(np.arctan2(delta_y, delta_x))
    heading[heading < 0] += 360

    # Correction angle (back in radians) that brings the heading to default_angle.
    correction = np.deg2rad(-1 * (heading - default_angle))
    cos_c = np.cos(correction)
    sin_c = np.sin(correction)

    # (batch, 4) rows of [cos, -sin, sin, cos] folded into (batch, 2, 2).
    flat = np.stack((cos_c, -sin_c, sin_c, cos_c), axis=-1)
    return flat.reshape(-1, 2, 2)


def get_city_trajectories(city="palo-alto", split="train", normalized=False):
    """Load the raw pickled trajectories of one city.

    Args:
        city: city name used in the file names ``<city>_inputs`` and
            ``<city>_outputs``.
        split: sub-directory of ``ROOT_PATH`` to read from; the outputs
            file is only read when this is ``"train"``.
        normalized: accepted for interface compatibility; not used here.

    Returns:
        ``(inputs, outputs)`` as NumPy arrays; ``outputs`` is ``None`` for
        any split other than ``"train"``.
    """
    def _load(path):
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only point ROOT_PATH at trusted data files.
        # The context manager closes the handle (the bare open() leaked it).
        with open(path, "rb") as handle:
            return np.asarray(pickle.load(handle))

    inputs = _load(ROOT_PATH + split + "/" + city + "_inputs")

    outputs = None
    if split == "train":
        outputs = _load(ROOT_PATH + split + "/" + city + "_outputs")

    return inputs, outputs


class ArgoverseDataset(Dataset):
    """Dataset class for Argoverse (one city, one split).

    Args:
        city: one of the module-level ``cities``.
        split: one of the module-level ``splits``; targets are only loaded
            for ``"train"``.
        transform: truthy flag; when set, every input trajectory is passed
            through ``transform_data`` (adds a curvature column).
        normalized: when True, every trajectory is translated so its first
            input point is the origin and rotated with the matrix returned
            by ``rotate(inputs, 0, 49, 30)``.

    Attributes set while loading:
        inputs, outputs: the (optionally normalised) arrays; ``outputs`` is
            ``None`` unless ``split == "train"``.
        start_pos: first raw input point of every trajectory.
        rotate_matrix: per-trajectory rotation matrices; a normalised point
            ``p`` maps back to the raw frame as
            ``p @ np.linalg.inv(rotate_matrix[i].T) + start_pos[i]``.
        n_max: per-trajectory maximum of the normalised inputs (only set
            when ``normalized`` is True).
    """
    def __init__(self, city: str, split:str, transform=None, normalized=False):
        super(ArgoverseDataset, self).__init__()
        self.transform = transform
        self.normalized = normalized
        self.split = split

        self.inputs, self.outputs = self.get_city_trajectories(city=city, split=split)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return ``(input, output)`` for the train split, else ``input``."""
        if self.split == 'train':
            return (self.inputs[idx], self.outputs[idx])
        return self.inputs[idx]

    @staticmethod
    def _load_pickle(path):
        """Read one pickled file into a NumPy array, closing the handle."""
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only read trusted data files.
        with open(path, "rb") as handle:
            return np.asarray(pickle.load(handle))

    @staticmethod
    def _to_local_frame(trajectories, start_pos, rotations):
        """Translate and rotate ``trajectories`` in place.

        Writing through ``-=`` / ``[...]`` keeps the array's dtype, exactly
        as the per-sample assignments this replaces did.
        """
        trajectories -= start_pos[:, np.newaxis, :]
        trajectories[...] = trajectories @ np.swapaxes(rotations, 1, 2)

    def get_city_trajectories(self, city="palo-alto", split="train"):
        """Load, normalise and optionally feature-expand one city's data.

        Returns:
            ``(inputs, outputs)``; ``outputs`` is ``None`` unless
            ``split == "train"``.
        """
        assert city in cities and split in splits

        # get input
        inputs = self._load_pickle(ROOT_PATH + split + "/" + city + "_inputs")

        # Captured before normalisation so predictions can be mapped back.
        start_pos = inputs[:, 0, :].copy()
        rotate_factor = rotate(inputs, 0, 49, 30)

        # normalize inputs (translation + rotation)
        if self.normalized:
            self._to_local_frame(inputs, start_pos, rotate_factor)
            max_factor = inputs.max(axis=1)

        # get output (train split only), normalised with the INPUT statistics
        outputs = None
        if split == "train":
            outputs = self._load_pickle(ROOT_PATH + split + "/" + city + "_outputs")
            if self.normalized:
                self._to_local_frame(outputs, start_pos, rotate_factor)

        # Adding curvature as features. This used to sit inside the
        # train-only branch, so test inputs never received the extra column
        # and had a different feature count from the training inputs.
        if self.transform:
            inputs = np.array([transform_data(inputs, i) for i in range(len(inputs))])

        self.start_pos = start_pos
        self.rotate_matrix = rotate_factor # np.linalg.inv(rot[i].T) to reverse back

        if self.normalized:
            self.n_max = max_factor

        return inputs, outputs

In [4]:
class TotalDataset(Dataset):
    """Concatenation of the normalised ArgoverseDataset of every city.

    Args:
        split: split name forwarded to every ``ArgoverseDataset``.

    Attributes:
        datasets: one ``ArgoverseDataset`` per entry of ``cities``.
        sizes: length of each per-city dataset.
        cumu_sizes: running offsets ``[0, n0, n0 + n1, ...]``; global index
            ``k`` belongs to dataset ``i`` when
            ``cumu_sizes[i] <= k < cumu_sizes[i + 1]``.
    """
    def __init__(self, split):
        super(TotalDataset, self).__init__()
        self.cities = ["austin", "miami", "pittsburgh", "dearborn", "washington-dc", "palo-alto"]
        self.split = split
        self.datasets = [ArgoverseDataset(c, split=split, normalized=True) for c in self.cities]
        self.sizes = [len(data) for data in self.datasets]
        self.cumu_sizes = [0] + np.cumsum(self.sizes).tolist()

    def __len__(self):
        return sum(self.sizes)

    def __getitem__(self, idx):
        """Return the sample at global index ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        total = len(self)
        idx = int(idx)
        if idx < 0:
            idx += total
        if not 0 <= idx < total:
            raise IndexError(f"index {idx} out of range for dataset of size {total}")

        # Binary search over the offsets; side='right' skips empty datasets
        # that share the same offset.
        which = int(np.searchsorted(self.cumu_sizes, idx, side='right')) - 1
        dataset = self.datasets[which]
        ix = idx - self.cumu_sizes[which]

        if self.split == 'train':
            return dataset.inputs[ix], dataset.outputs[ix]
        return dataset.inputs[ix]

In [5]:
# Initialize the aggregated training dataset (all cities, 'train' split);
# the cell then displays its total length and the per-city sizes.
total_dataset = TotalDataset('train')
len(total_dataset), total_dataset.sizes

(203816, [43041, 55029, 43544, 24465, 25744, 11993])

## Transformer

In [17]:
# Use the first CUDA device when available, otherwise the CPU. This global
# is captured as the default `device` argument of positional_encoding and
# create_look_ahead_mask, so it must be defined before those cells run.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)

cuda:0


In [18]:
class MultiHeadAttention(nn.Module):
    '''Multi-head self-attention in which every head is D-dimensional.

    The input (B, S, D) is projected to H*D features for queries, keys and
    values, attended per head, then projected back from H*D to D.
    '''
    def __init__(self, D, H):
        super().__init__()
        self.H = H  # number of heads
        self.D = D  # width of the model and of each head

        inner = D * H
        self.wq = nn.Linear(D, inner)
        self.wk = nn.Linear(D, inner)
        self.wv = nn.Linear(D, inner)

        self.dense = nn.Linear(inner, D)

    def concat_heads(self, x):
        '''(B, H, S, D) -> (B, S, H*D).'''
        batch, heads, seq, width = x.shape
        return x.transpose(1, 2).reshape(batch, seq, heads * width)

    def split_heads(self, x):
        '''(B, S, H*D) -> (B, H, S, D).'''
        batch, seq, _ = x.shape
        return x.reshape(batch, seq, self.H, self.D).transpose(1, 2)

    def forward(self, x, mask):
        '''Return (output, attention_weights).

        mask, when not None, is multiplied by -1e9 and added to the scaled
        scores, so positions where mask is 1 get (numerically) zero weight.
        output is (B, S, D); attention_weights is (B, H, S, S).
        '''
        queries = self.split_heads(self.wq(x))
        keys = self.split_heads(self.wk(x))
        values = self.split_heads(self.wv(x))

        # Scaled dot-product scores between every pair of positions.
        scores = torch.matmul(queries, keys.transpose(-1, -2)) / math.sqrt(self.D)

        if mask is not None:
            scores += mask * -1e9

        weights = torch.softmax(scores, dim=-1)
        context = self.concat_heads(torch.matmul(weights, values))

        return self.dense(context), weights

In [20]:
# Positional encodings
def get_angles(pos, i, D):
    """Return the positional-encoding angles ``pos / 10000**(2*(i//2)/D)``.

    Args:
        pos: position index (or array of indices).
        i: feature index (or array of indices); consecutive even/odd pairs
            share the same rate because of ``i // 2``.
        D: encoding width.
    """
    exponent = (2 * (i // 2)) / np.float32(D)
    rate = 1 / np.power(10000, exponent)
    return pos * rate


def positional_encoding(D, position=60, dim=3, device=device):
    """Build the sinusoidal positional-encoding table as a torch tensor.

    Even feature columns hold ``sin`` of the angles from ``get_angles``,
    odd columns hold ``cos``.

    Args:
        D: encoding width (number of feature columns).
        position: number of positions (rows) to generate.
        dim: rank of the result: 3 gives ``(1, position, D)``, 4 gives
            ``(1, 1, position, D)``.
        device: torch device for the returned tensor; the default is the
            module-level ``device`` captured when this function is defined.

    Returns:
        Tensor with the NumPy-derived dtype (float64) on ``device``.

    Raises:
        ValueError: if ``dim`` is neither 3 nor 4 (previously this fell
            through to an ``UnboundLocalError``).
    """
    if dim not in (3, 4):
        raise ValueError(f"dim must be 3 or 4, got {dim!r}")

    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(D)[np.newaxis, :],
                            D)
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    if dim == 3:
        pos_encoding = angle_rads[np.newaxis, ...]
    else:
        pos_encoding = angle_rads[np.newaxis, np.newaxis, ...]
    return torch.tensor(pos_encoding, device=device)

In [22]:
def create_look_ahead_mask(size, device=device):
    """Return a ``(size, size)`` float mask with 1 strictly above the diagonal.

    Entry ``[i, j]`` is 1 when ``j > i`` and 0 otherwise. The default
    ``device`` is the module-level ``device`` captured at definition time.
    """
    return torch.triu(torch.ones((size, size), device=device), diagonal=1)

In [24]:
class TransformerLayer(nn.Module):
    """One post-norm block: self-attention, then a two-layer ReLU MLP.

    Each sub-layer's output goes through dropout, is added to the
    sub-layer's input and is then layer-normalised.
    """
    def __init__(self, D, H, hidden_mlp_dim, dropout_rate):
        super().__init__()
        self.dropout_rate = dropout_rate
        # Sub-modules are created in the same order as before so that
        # parameter registration (and seeded initialisation) is unchanged.
        self.mlp_hidden = nn.Linear(D, hidden_mlp_dim)
        self.mlp_out = nn.Linear(hidden_mlp_dim, D)
        self.layernorm1 = nn.LayerNorm(D, eps=1e-9)
        self.layernorm2 = nn.LayerNorm(D, eps=1e-9)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

        self.mha = MultiHeadAttention(D, H)

    def forward(self, x, look_ahead_mask):
        """Return ``(output, attention_weights)`` for input ``x``."""
        # Attention sub-layer with residual connection.
        attended, attn_weights = self.mha(x, look_ahead_mask)
        attended = self.layernorm1(self.dropout1(attended) + x)

        # Position-wise feed-forward sub-layer with residual connection.
        hidden = self.mlp_hidden(attended).relu()
        projected = self.dropout2(self.mlp_out(hidden))
        output = self.layernorm2(projected + attended)

        return output, attn_weights

In [26]:
class Transformer(nn.Module):
    '''
    Transformer Encoder

    A per-step MLP lifts every input step to D features, positional
    encodings are added, num_layers TransformerLayer blocks are applied and
    the flattened sequence is mapped to out_features by another MLP.

    The first output layer is nn.Linear(50*D, ...), so forward() requires
    sequences of exactly 50 steps.

    batch_size and kernel_size are stored/accepted for interface
    compatibility; kernel_size is not read anywhere in this class.
    '''
    def __init__(self, num_layers, D, H, hidden_mlp_dim, inp_features,
                 out_features, dropout_rate, batch_size, kernel_size):
        super(Transformer, self).__init__()
        self.batch_size = batch_size
        # Plain tensor attribute (not a buffer), so it does not appear in
        # the state_dict; a 0-dim tensor can still scale inputs on any device.
        self.sqrt_D = torch.tensor(math.sqrt(D))
        self.num_layers = num_layers
        self.input_projection = nn.Sequential(
            nn.Linear(inp_features, hidden_mlp_dim),
            nn.LeakyReLU(), 
            nn.Linear(hidden_mlp_dim, hidden_mlp_dim),
            nn.LeakyReLU(), 
            nn.Linear(hidden_mlp_dim, hidden_mlp_dim),
            nn.LeakyReLU(), 
            nn.Linear(hidden_mlp_dim, D),
            nn.LeakyReLU()
        )
        
        self.output_projection = nn.Sequential(
            nn.Linear(50*D, hidden_mlp_dim),
            nn.LeakyReLU(), 
            nn.Linear(hidden_mlp_dim, hidden_mlp_dim),
            nn.LeakyReLU(), 
            nn.Linear(hidden_mlp_dim, hidden_mlp_dim),
            nn.LeakyReLU(), 
            nn.Linear(hidden_mlp_dim, hidden_mlp_dim),
            nn.LeakyReLU(), 
            nn.Linear(hidden_mlp_dim, out_features)
        )
        
        self.pos_encoding = positional_encoding(D)
        self.dec_layers = nn.ModuleList([TransformerLayer(D, H, hidden_mlp_dim, 
                                        dropout_rate=dropout_rate
                                       ) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, mask):
        '''
        Map x of shape (B, 50, inp_features) to (B, out_features).

        Returns (prediction, attention_weights), where attention_weights
        maps 'decoder_layer<k>' to the weights of the k-th layer.
        '''
        B, S, _ = x.shape
        attention_weights = {}
        
        x = self.input_projection(x)
        
        x *= self.sqrt_D
        
        # In-place add keeps x in its own dtype even though the table is
        # float64; .to() makes it follow x if the model was moved.
        x += self.pos_encoding[:, :S, :].to(x.device)

        x = self.dropout(x)

        for i, layer in enumerate(self.dec_layers):
            x, block = layer(x=x, look_ahead_mask=mask)
            attention_weights['decoder_layer{}'.format(i + 1)] = block
        
        # Flatten (B, S, D) -> (B, S*D): the output MLP's first layer takes
        # 50*D features. Without this the Linear layer was applied to the
        # last axis of size D and failed with a shape mismatch.
        x = self.output_projection(x.reshape(B, -1))
        
        return x, attention_weights
    
    def auto_regressor(self, x, mask, step):
        '''
        Roll the model forward in windows of 50 steps, `step` steps apart.

        Each window is re-centred on its first point and rotated with
        rotate(window, 0, 9, 30) before prediction; the prediction is then
        mapped back to the frame of x and appended.

        NOTE(review): the window arithmetic only lines up when the model
        predicts `step` points per call (out_features == 2 * step) - confirm
        before using with a different configuration.

        Returns (flattened predictions after the first 50 steps,
        attention weights of the last forward pass).
        '''
        B, S, D = x.shape
        new_inputs = torch.clone(x)
        # Keep the first pass's weights so `attention` is defined even when
        # the loop below does not run (step >= 60).
        temp_pred, attention = self.forward(new_inputs, mask)
        temp_pred = temp_pred.reshape(B, -1, 2)
        new_inputs = torch.cat((new_inputs, temp_pred), 1)
        
        for idx in range(step, 60, step):
            train_inputs = new_inputs[:, idx:idx+50, :]
            
            starting_pos = torch.unsqueeze(train_inputs[:, 0, :], dim=1)
            Q = torch.from_numpy(
                rotate(train_inputs.cpu().detach().numpy(), 0, 9, 30)
            ).to(device=x.device, dtype=x.dtype)
            trans_inputs = torch.matmul((train_inputs - starting_pos),
                                        torch.transpose(Q, 1, 2))
            
            # Feed the re-normalised window: the inverse transform applied
            # below is only meaningful for predictions made in that frame
            # (the raw window was passed here before).
            temp_pred, attention = self.forward(trans_inputs, mask)
            temp_pred = temp_pred.reshape(B, -1, 2)
            temp_pred = (torch.matmul(temp_pred, Q) + starting_pos)
            new_inputs = torch.cat((new_inputs, temp_pred), 1)
            
        return new_inputs[:, 50:].reshape(B, -1), attention

## Train the Transformer

In [30]:
def train(batch_size, city, split, num_layers, D, H, hidden_mlp_dim, 
          inp_features, out_features, dropout_rate, n_epochs, learning_rate, factor, patience,
          step, kernel_size):
    """Train a Transformer on one city's normalised trajectories.

    The dataset is split 90/10 into training and validation subsets, the
    model is optimised with Adam on an MSE loss, and the learning rate is
    reduced by ``factor`` after ``patience`` epochs without improvement of
    the validation loss.

    Args:
        batch_size: batch size of both loaders (incomplete batches dropped).
        city, split: forwarded to ``ArgoverseDataset``; ``split`` must be
            ``"train"`` because targets are required.
        num_layers, D, H, hidden_mlp_dim, inp_features, out_features,
        dropout_rate, kernel_size: forwarded to ``Transformer``.
        n_epochs: number of passes over the training subset.
        learning_rate: initial Adam learning rate.
        factor, patience: ``ReduceLROnPlateau`` settings.
        step: unused here (only referenced by the disabled
            ``auto_regressor`` call).

    Returns:
        ``(transformer, (avg_train_loss, avg_val_loss))`` with one average
        loss per epoch in each list.
    """
    # Create the training/validation set
    train_dataset = ArgoverseDataset(city=city, split=split, transform=False, normalized=True)
    train_sz = int(len(train_dataset) * 0.9)
    val_sz = len(train_dataset) - train_sz
    train_subset, val_subset = torch.utils.data.random_split(train_dataset, [train_sz, val_sz])
    # Reshuffle the training batches every epoch, as train_total does.
    train_loader = DataLoader(train_subset, batch_size=batch_size, drop_last=True, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=batch_size, drop_last=True)
    
    # Initialize the transformer/optimizer/loss function
    transformer = Transformer(num_layers=num_layers, D=D, H=H, hidden_mlp_dim=hidden_mlp_dim,
                          inp_features=inp_features, out_features=out_features,
                          dropout_rate=dropout_rate, batch_size=batch_size,
                          kernel_size=kernel_size).to(device)

    optimizer = torch.optim.Adam(transformer.parameters(), lr=learning_rate) 
    loss_function = nn.MSELoss()
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=factor,
                                  patience=patience, verbose=True) 
    
    # Print out how many parameters to train
    param_sizes = [p.numel() for p in transformer.parameters()]
    print(f"number of weight/biases matrices: {len(param_sizes)} "
          f"for a total of {np.sum(param_sizes)} parameters ")
    
    avg_train_loss, avg_val_loss = [], []
    train_time, elapsed_time = [], []
    
    # Start training
    for epoch in tqdm(range(n_epochs)):
        print(f'Epoch {epoch+1}')
        print('Training & Validating ', end='')
        
        start_time = time.time()
        train_loss, val_loss = [], []
        
        # Training set (dropout enabled)
        transformer.train()
        for batches, (X, y) in enumerate(train_loader):
            X = X.to(device).float()
            y = y.to(device).float()
            
            # Track progress
            if (batches + 1) % 20 == 0:
                print('-', end='')
            
            # Forward pass
            optimizer.zero_grad()
            mask = create_look_ahead_mask(X.shape[1])
            out, _ = transformer(X, mask) # .auto_regressor(X, mask, step)
            
            # Backpropagation; targets are flattened per sample using the
            # batch's own leading dimension.
            loss = loss_function(out, y.reshape(y.shape[0], -1))
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
            
        print()
        avg_train = np.mean(train_loss)
        avg_train_loss.append(avg_train)
        
        # End the time
        end_train_time = time.time()
        train_time.append(end_train_time - start_time)
        
        # Evaluate on val set with dropout disabled; without eval() the
        # validation loss was computed with dropout still active.
        transformer.eval()
        with torch.no_grad():
            for batches, (X, y) in enumerate(val_loader):
                X = X.to(device).float()
                y = y.to(device).float()

                mask = create_look_ahead_mask(X.shape[1])
                out, _ = transformer(X, mask) # .auto_regressor(X, mask, step)
                loss = loss_function(out, y.reshape(y.shape[0], -1))
                val_loss.append(loss.item())

            avg_val = np.mean(val_loss)
            avg_val_loss.append(avg_val)
        
        end_time = time.time()
        elapsed_time.append(end_time - start_time)

        print(f'- Training Loss: {avg_train}\n- Validation Loss: {avg_val}')
        print(f'- Train Time: {sum(train_time)}\n- Elapsed Time: {sum(elapsed_time)}\n')
        
        scheduler.step(avg_val)
        
    return transformer, (avg_train_loss, avg_val_loss)
    

In [31]:
def train_total(total_dataset, batch_size, num_layers, D, H, hidden_mlp_dim, 
          inp_features, out_features, dropout_rate, n_epochs, learning_rate, factor, patience,
          step, kernel_size):
    """Train a Transformer on an already-built dataset.

    The dataset is split 90/10 into training and validation subsets, the
    model is optimised with Adam on an MSE loss, and the learning rate is
    reduced by ``factor`` after ``patience`` epochs without improvement of
    the validation loss. Whenever the validation loss improves, the whole
    model object is saved to ``total_model.pt``.

    Args:
        total_dataset: dataset yielding ``(input, target)`` pairs.
        batch_size: batch size of both loaders (incomplete batches dropped).
        num_layers, D, H, hidden_mlp_dim, inp_features, out_features,
        dropout_rate, kernel_size: forwarded to ``Transformer``.
        n_epochs: number of passes over the training subset.
        learning_rate: initial Adam learning rate.
        factor, patience: ``ReduceLROnPlateau`` settings.
        step: unused here (only referenced by the disabled
            ``auto_regressor`` call).

    Returns:
        ``(transformer, (avg_train_loss, avg_val_loss))``; the returned
        model holds the weights of the LAST epoch, not the best one.
    """
    # Create the training/validation set
    train_sz = int(len(total_dataset) * 0.9)
    val_sz = len(total_dataset) - train_sz
    train_subset, val_subset = torch.utils.data.random_split(total_dataset, [train_sz, val_sz])
    train_loader = DataLoader(train_subset, batch_size=batch_size, drop_last=True, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=batch_size, drop_last=True)
    
    # Initialize the transformer/optimizer/loss function
    transformer = Transformer(num_layers=num_layers, D=D, H=H, hidden_mlp_dim=hidden_mlp_dim,
                          inp_features=inp_features, out_features=out_features,
                          dropout_rate=dropout_rate, batch_size=batch_size,
                          kernel_size=kernel_size).to(device)

    optimizer = torch.optim.Adam(transformer.parameters(), lr=learning_rate) 
    loss_function = nn.MSELoss()
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=factor,
                                  patience=patience, verbose=True) 
    
    # Print out how many parameters to train
    param_sizes = [p.numel() for p in transformer.parameters()]
    print(f"number of weight/biases matrices: {len(param_sizes)} "
          f"for a total of {np.sum(param_sizes)} parameters ")
    
    avg_train_loss, avg_val_loss = [], []
    train_time, elapsed_time = [], []
    best_val_score = float('inf')
    
    # Start training
    for epoch in tqdm(range(n_epochs)):
        print(f'Epoch {epoch+1}')
        print('Training & Validating ', end='')
        
        start_time = time.time()
        train_loss, val_loss = [], []
        
        # Training set (dropout enabled)
        transformer.train()
        for batches, (X, y) in enumerate(train_loader):
            X = X.to(device).float()
            y = y.to(device).float()
            
            # Track progress
            if (batches + 1) % 120 == 0:
                print('-', end='')
            
            # Forward pass
            optimizer.zero_grad()
            mask = create_look_ahead_mask(X.shape[1])
            out, _ = transformer(X, mask) # .auto_regressor(X, mask, step)
            
            # Backpropagation; targets are flattened per sample using the
            # batch's own leading dimension.
            loss = loss_function(out, y.reshape(y.shape[0], -1))
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
            
        print()
        avg_train = np.mean(train_loss)
        avg_train_loss.append(avg_train)
        
        # End the time
        end_train_time = time.time()
        train_time.append(end_train_time - start_time)
        
        # Evaluate on val set with dropout disabled; without eval() the
        # validation loss was computed with dropout still active.
        transformer.eval()
        with torch.no_grad():
            for batches, (X, y) in enumerate(val_loader):
                X = X.to(device).float()
                y = y.to(device).float()

                mask = create_look_ahead_mask(X.shape[1])
                out, _ = transformer(X, mask) # .auto_regressor(X, mask, step)
                loss = loss_function(out, y.reshape(y.shape[0], -1))
                val_loss.append(loss.item())

            avg_val = np.mean(val_loss)
            avg_val_loss.append(avg_val)
        
        end_time = time.time()
        elapsed_time.append(end_time - start_time)

        print(f'- Training Loss: {avg_train}\n- Validation Loss: {avg_val}')
        print(f'- Train Time: {sum(train_time)}\n- Elapsed Time: {sum(elapsed_time)}\n')
        
        scheduler.step(avg_val)
        
        # save better model (whole pickled module, as before, so existing
        # loading code keeps working)
        if avg_val < best_val_score:
            best_val_score = avg_val
            torch.save(transformer, 'total_model.pt')
        
    return transformer, (avg_train_loss, avg_val_loss)

In [32]:
def vis_results(city, split, batch_size, model, idx):
    """Scatter-plot one trajectory: input, ground truth and prediction.

    Only the first batch of the (unshuffled) dataset is evaluated, so
    ``idx`` must be smaller than ``batch_size``. All three point sets are
    mapped back from the normalised frame to the original coordinates
    before plotting.

    Args:
        city, split: forwarded to ``ArgoverseDataset``; the split must
            provide targets (``"train"``).
        batch_size: number of trajectories in the evaluated batch.
        model: callable returning ``(prediction, attention)`` for
            ``(X, mask)``; switched to eval mode here.
        idx: index of the trajectory to plot, within the first batch.
    """
    train_dataset = ArgoverseDataset(city = city, split = split, transform=False, normalized=True)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    
    model.eval()  # disable dropout for the visualised prediction
    with torch.no_grad():
        for (X, y) in train_loader:
            X = X.to(device).float()
            mask = create_look_ahead_mask(X.shape[1])
            
            # Use the batch's own size so a dataset smaller than
            # batch_size still reshapes correctly.
            output = model(X, mask)[0].reshape(X.shape[0], -1, 2)
        
            break
    
    print(X.shape)
    print(y.shape)
    print(output.shape)
    
    # Convert explicitly to NumPy before mixing with the NumPy matrices.
    seed = X[idx].cpu().numpy()
    truth = y[idx].cpu().numpy()
    pred = output[idx].cpu().numpy()
    
    # Undo the normalisation: inverse rotation, then add the start point.
    undo_rot = np.linalg.inv(train_dataset.rotate_matrix[idx].T)
    origin = train_dataset.start_pos[idx]
    seed = seed @ undo_rot + origin
    truth = truth @ undo_rot + origin
    pred = pred @ undo_rot + origin
    
    plt.scatter(seed[:, 0], seed[:, 1], label='seed')
    plt.scatter(truth[:, 0], truth[:, 1], label='ground truth')
    plt.scatter(pred[:, 0], pred[:, 1], label='prediction')
    plt.title(f'Random Sample From {city}_{split} Projectile Visualization')
    plt.legend()
    plt.show()
        

In [38]:
def make_pred(test_loader, batch_sz, model):
    '''
    Predict every trajectory served by test_loader, in original coordinates.

    The normalisation statistics (rotate_matrix, start_pos) are taken from
    test_loader.dataset - i.e. the TEST dataset, NOT the training one - so
    the loader must iterate that dataset in order (no shuffling).

    Args:
        test_loader: DataLoader over a dataset exposing rotate_matrix and
            start_pos, yielding input batches only.
        batch_sz: batch size the model is run with; a shorter final batch
            is zero-padded up to this size and the padding rows discarded.
        model: callable returning (prediction, attention) for (X, mask);
            switched to eval mode here.

    Returns:
        Array of shape (len(dataset), n_outputs), one flattened
        (x, y, x, y, ...) prediction per trajectory.
    '''
    # Read the statistics from the loader's own dataset rather than from a
    # global that merely happens to be named `test_dataset`.
    dataset = test_loader.dataset
    count_row = 0
    out = []

    model.eval()  # disable dropout for inference
    with torch.no_grad():
        for X in test_loader:
            if len(X) != batch_sz:
                print(len(X))
                # Zero-pad the final batch to the full batch size.
                pad_shape = (batch_sz - len(X),) + tuple(X.shape[1:])
                X = torch.cat((X, torch.zeros(pad_shape, dtype=X.dtype)), dim=0)

            X = X.to(device).float()

            mask = create_look_ahead_mask(X.shape[1])

            # Reshape with the batch_sz parameter (the global `batch_size`
            # was used here before and only matched by coincidence).
            pred = model(X, mask)[0].reshape(batch_sz, -1, 2).cpu().numpy()

            for i in range(batch_sz):
                if count_row >= len(dataset):
                    break  # remaining rows are padding

                # Undo the normalisation: inverse rotation, then translation.
                rotation = dataset.rotate_matrix[count_row].T
                pred[i] = pred[i] @ np.linalg.inv(rotation)
                pred[i] = pred[i] + dataset.start_pos[count_row, :]

                out.append(pred[i])
                count_row += 1

    out = np.array(out).reshape(len(dataset), -1)

    return out


### Training Aggregated Dataset

In [28]:
# Hyperparameters for the aggregated (all-city) run.
# Bracketed ranges in the comments look like search ranges and the trailing
# numbers like previously tried values - TODO confirm with the authors.
batch_size = 32 
num_layers = 4 # more layers make the model more prone to overfitting
D = 32 # DIMENSION                                             8
H = 8 # NUMBER OF HEADS                                       8 
hidden_mlp_dim = 64 # [32, 128]                               32
inp_features = 2 
out_features = 120 # reshaped elsewhere to (-1, 2), i.e. 60 predicted (x, y) points
dropout_rate = 0 # [0, 0.5]
n_epochs = 100 # [50, 100]
learning_rate = 0.002 # [0.001, 0.005]
factor = 0.5 # 0.1 ~ 0.99
patience = 2
step = 20  # (20)x2 -- only used by the disabled auto_regressor call
kernel_size = 3 # accepted by Transformer but not read by it

# Train on the aggregated dataset; returns the model and the per-epoch
# (train, validation) loss lists.
total_net, total_losses = train_total(total_dataset, batch_size, num_layers, D, H, hidden_mlp_dim, 
          inp_features, out_features, dropout_rate, n_epochs, learning_rate, factor, patience,
                               step, kernel_size)

number of weight/biases matrices: 82 for a total of 284888 parameters 


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1
Training & Validating -----------------------------------------------
- Training Loss: 53.94105208743642
- Validation Loss: 34.111977682173624
- Train Time: 100.11404538154602
- Elapsed Time: 102.7796220779419

Epoch 2
Training & Validating -----------------------------------------------
- Training Loss: 29.208138301167885
- Validation Loss: 27.15102251820594
- Train Time: 201.29149627685547
- Elapsed Time: 206.69629311561584

Epoch 3
Training & Validating -----------------------------------------------
- Training Loss: 26.48135671554041
- Validation Loss: 24.872037895070683
- Train Time: 301.95279836654663
- Elapsed Time: 309.80698323249817

Epoch 4
Training & Validating -----------------------------------------------
- Training Loss: 23.52547160812133
- Validation Loss: 26.950462123882847
- Train Time: 402.87421441078186
- Elapsed Time: 413.1440258026123

Epoch 5
Training & Validating -----------------------------------------------
- Training Loss: 22.547018451867032
- Valida

Training & Validating -----------------------------------------------
- Training Loss: 16.14863223731726
- Validation Loss: 15.919437372459555
- Train Time: 3819.2717378139496
- Elapsed Time: 3912.23441696167

Epoch 39
Training & Validating -----------------------------------------------
- Training Loss: 16.071488157819388
- Validation Loss: 16.13752105775869
- Train Time: 3919.211638689041
- Elapsed Time: 4014.590362071991

Epoch 40
Training & Validating -----------------------------------------------
- Training Loss: 16.01641934348916
- Validation Loss: 16.479922321607482
- Train Time: 4019.4878692626953
- Elapsed Time: 4117.308619976044

Epoch 41
Training & Validating -----------------------------------------------
- Training Loss: 15.946612277982622
- Validation Loss: 16.895273418546473
- Train Time: 4118.601330757141
- Elapsed Time: 4218.920133590698

Epoch    41: reducing learning rate of group 0 to 1.2500e-04.
Epoch 42
Training & Validating --------------------------------------

In [30]:
# Persist the model object returned by train_total (its last-epoch state).
# The whole module is pickled, not just a state_dict, so loading the file
# needs the same class definitions to be importable.
torch.save(total_net, 'best_total.pt')

### Prediction

In [32]:
# Build the normalised test datasets for all cities (no targets are loaded
# for the 'test' split); the cell displays total and per-city sizes.
total_test_dataset = TotalDataset(split='test')
len(total_test_dataset), total_test_dataset.sizes

(29843, [6325, 7971, 6361, 3671, 3829, 1686])

In [41]:
# `city` is not read by any code in this cell; it only labels the cell.
city = 'austin' 

# datasets[0] is austin (order of TotalDataset.cities). Keep the name
# `test_dataset` bound to the dataset the loader wraps: predictions are
# mapped back with this dataset's start_pos / rotate_matrix.
test_dataset = total_test_dataset.datasets[0]
test_loader = DataLoader(test_dataset, batch_size=32)

# The batch size passed to make_pred must match the loader's (32).
austin_array = make_pred(test_loader, 32, total_net)
austin_array.shape

21


(6325, 120)

In [43]:
# `city` is not read by any code in this cell; it only labels the cell.
city = 'miami' 

# datasets[1] is miami (order of TotalDataset.cities). Keep the name
# `test_dataset` bound to the dataset the loader wraps: predictions are
# mapped back with this dataset's start_pos / rotate_matrix.
test_dataset = total_test_dataset.datasets[1]
test_loader = DataLoader(test_dataset, batch_size=32)

# The batch size passed to make_pred must match the loader's (32).
miami_array = make_pred(test_loader, 32, total_net)
miami_array.shape

3


(7971, 120)

In [49]:
# `city` is not read by any code in this cell; it only labels the cell.
city = 'pittsburgh' 

# datasets[2] is pittsburgh (order of TotalDataset.cities). Keep the name
# `test_dataset` bound to the dataset the loader wraps: predictions are
# mapped back with this dataset's start_pos / rotate_matrix.
test_dataset = total_test_dataset.datasets[2]
test_loader = DataLoader(test_dataset, batch_size=32)

# The batch size passed to make_pred must match the loader's (32).
pitts_array = make_pred(test_loader, 32, total_net)
pitts_array.shape

25


(6361, 120)

In [45]:
# `city` is not read by any code in this cell; it only labels the cell.
city = 'dearborn' 

# datasets[3] is dearborn (order of TotalDataset.cities). Keep the name
# `test_dataset` bound to the dataset the loader wraps: predictions are
# mapped back with this dataset's start_pos / rotate_matrix.
test_dataset = total_test_dataset.datasets[3]
test_loader = DataLoader(test_dataset, batch_size=32)

# The batch size passed to make_pred must match the loader's (32).
dearborn_array = make_pred(test_loader, 32, total_net)
dearborn_array.shape

23


(3671, 120)

In [46]:
# `city` is not read by any code in this cell; it only labels the cell.
city = 'washington-dc' 

# datasets[4] is washington-dc (order of TotalDataset.cities). Keep the
# name `test_dataset` bound to the dataset the loader wraps: predictions
# are mapped back with this dataset's start_pos / rotate_matrix.
test_dataset = total_test_dataset.datasets[4]
test_loader = DataLoader(test_dataset, batch_size=32)

# The batch size passed to make_pred must match the loader's (32).
wash_array = make_pred(test_loader, 32, total_net)
wash_array.shape

21


(3829, 120)

In [47]:
# `city` is not read by any code in this cell; it only labels the cell.
city = 'palo-alto' 

# datasets[5] is palo-alto (order of TotalDataset.cities). Keep the name
# `test_dataset` bound to the dataset the loader wraps: predictions are
# mapped back with this dataset's start_pos / rotate_matrix.
test_dataset = total_test_dataset.datasets[5]
test_loader = DataLoader(test_dataset, batch_size=32)

# The batch size passed to make_pred must match the loader's (32).
palo_array = make_pred(test_loader, 32, total_net)
palo_array.shape

22


(1686, 120)

## Write File

In [51]:
import csv

# Header row: an ID column followed by v0..v119.
cols = [['ID'] + ['v{}'.format(i) for i in range(120)]]

# (city suffix used in the row ID, prediction array), in submission order.
city_predictions = [
    ('austin', austin_array),
    ('miami', miami_array),
    ('pittsburgh', pitts_array),
    ('dearborn', dearborn_array),
    ('washington-dc', wash_array),
    ('palo-alto', palo_array),
]

# One pass, one file handle. newline='' is what the csv module requires;
# without it the writer's '\r\n' terminator gains an extra '\r' on
# platforms that translate newlines.
with open('output.csv', 'w', newline='') as file:
    mywriter = csv.writer(file, delimiter=',')
    mywriter.writerows(cols)

    for city_name, predictions in city_predictions:
        count = 0
        for i, row in enumerate(predictions):
            # np.append turns the floats into strings next to the ID,
            # keeping the number formatting of the original output.
            temp = [np.append(['{}_{}'.format(i, city_name)], row)]
            mywriter.writerows(temp)
            count += 1
        print(count)
    

6325
7971
6361
3671
3829
1686
