# UROP1100T (Spring 2018)

## DESIRE: Distant Future Prediction in Dynamic Scenes with Interacting Agents

[CVPR Paper](https://arxiv.org/pdf/1704.04394.pdf)

[Supplementary Notes](http://www.robots.ox.ac.uk/~namhoon/doc/DESIRE-supp.pdf)

TODO:
    
Compute the Y displacements relative to the first X position, not relative to the first Y position.

# Sample Generation Module

In [1]:
import csv
import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from tensorboardX import SummaryWriter

## Data

In [2]:
def print_line_sep():
    """Print a horizontal rule of dashes to separate sections of cell output."""
    separator = '--------------------------------------'
    print(separator)

In [3]:
# Load the raw records. The csv module documentation asks for files to be
# opened with newline='' so the reader handles line endings itself.
with open('raw_data/raw_record.csv', 'r', newline='') as csvfile:
    raw_data = list(csv.reader(csvfile))
print('Raw data extract:')
print(raw_data[0])
print_line_sep()

# Keep only the first four columns, which the rest of the notebook treats as
# (time step, car id, x, y).
filtered_data = [[int(row[0]), int(row[1]), float(row[2]), float(row[3])] for row in raw_data]
print('Filtered data extract:')
print(filtered_data[0])
print_line_sep()

# Group the samples per car, preserving file order:
# {car id: [[time, x, y], ...]}
dict_data = {}
for time, car, x, y in filtered_data:
    dict_data.setdefault(car, []).append([time, x, y])
print('Data dictionary extract:')
print(dict_data[172][0])

Raw data extract:
['1', '172', '93.939', '-55.7615', '0', '-3.0066', '5.27149', '4', '10']
--------------------------------------
Filtered data extract:
[1, 172, 93.939, -55.7615]
--------------------------------------
Data dictionary extract:
[1, 93.939, -55.7615]


In [4]:
def _first_consecutive_window(rows, length):
    """Return the first run of `length` rows whose time steps increase by exactly 1, or None."""
    window = []
    for time, x, y in rows:
        # A gap in the time stamps invalidates the run collected so far, so
        # restart the run from the current row.
        if window and time - window[-1][0] != 1:
            window = []
        window.append([time, x, y])
        if len(window) == length:
            return window
    return None


def get_random_batches(x_seq_len, y_seq_len, batch_size):
    """Sample a batch of trajectories from the module-level `dict_data`.

    Cars are drawn uniformly at random (with replacement). For each drawn car
    the first run of `x_seq_len + y_seq_len` rows with consecutive time steps
    is used, so a given car always contributes the same window. Cars without
    such a run are skipped and another car is drawn.

    Args:
        x_seq_len: number of leading time steps that form the X sequence.
        y_seq_len: number of following time steps that form the Y sequence.
        batch_size: number of trajectories to sample.

    Returns:
        A tuple `(X_position, Y_position, X_displacement, Y_displacement)` of
        float CUDA tensors. The X tensors have shape
        `(batch_size, 2, x_seq_len)` and the Y tensors
        `(batch_size, 2, y_seq_len)`; axis 1 holds the x and y coordinates.
        Both displacement tensors are measured from the first X position of
        their trajectory.

    Raises:
        ValueError: if every car has been tried and none has a long enough
            run of consecutive time steps (the original code looped forever).
    """
    total_len = x_seq_len + y_seq_len
    car_ids = list(dict_data.keys())

    batches = []
    cars_without_window = set()
    while len(batches) < batch_size:
        car = random.choice(car_ids)
        window = _first_consecutive_window(dict_data[car], total_len)
        if window is None:
            cars_without_window.add(car)
            if len(cars_without_window) == len(car_ids):
                raise ValueError(
                    'no car has %d consecutive time steps' % total_len)
            continue
        batches.append(window)

    # Drop the time column: (batch, step, [x, y]).
    sequences = np.asarray([[[x, y] for _, x, y in window] for window in batches])

    # Split into X / Y and move the coordinate axis before the time axis:
    # (batch, step, 2) -> (batch, 2, step).
    X_position_np = sequences[:, :x_seq_len].transpose(0, 2, 1)
    Y_position_np = sequences[:, x_seq_len:].transpose(0, 2, 1)

    # Convert position to displacement. Both X and Y are expressed relative
    # to the first X position; the length-1 time axis broadcasts over steps.
    origin = X_position_np[:, :, :1]
    X_displacement_np = X_position_np - origin
    Y_displacement_np = Y_position_np - origin

    return tuple(
        torch.from_numpy(array).float().cuda()
        for array in (X_position_np, Y_position_np,
                      X_displacement_np, Y_displacement_np)
    )

In [5]:
# Draw one small batch and show, for each returned tensor, its size and its
# first sample.
X_position_test, Y_position_test, X_displacement_test, Y_displacement_test = get_random_batches(20, 40, 8)

_sections = (
    ('X positions:', X_position_test),
    ('Y positions:', Y_position_test),
    ('X displacements:', X_displacement_test),
    ('Y displacements:', Y_displacement_test),
)
for _index, (_title, _tensor) in enumerate(_sections):
    # Separator goes between sections only, not before the first one.
    if _index > 0:
        print_line_sep()
    print(_title)
    print(_tensor.size())
    print(_tensor[0])

X positions:
torch.Size([8, 2, 20])
tensor([[ 334.9780,  334.9800,  334.9830,  334.9850,  334.9870,  334.9880,
          334.9890,  334.9910,  334.9920,  334.9930,  334.9940,  334.9950,
          334.9950,  334.9960,  334.9960,  334.9970,  334.9970,  334.9980,
          334.9980,  334.9980],
        [ -16.6006,  -16.9825,  -17.3609,  -17.7358,  -18.1070,  -18.4746,
          -18.8385,  -19.2001,  -19.5669,  -19.9454,  -20.3379,  -20.7343,
          -21.1297,  -21.5233,  -21.9147,  -22.3034,  -22.6893,  -23.0718,
          -23.4509,  -23.8265]], device='cuda:0')
--------------------------------------
Y positions:
torch.Size([8, 2, 40])
tensor([[ 334.9980,  334.9990,  334.9990,  334.9990,  334.9990,  334.9990,
          334.9990,  335.0000,  335.0000,  335.0000,  335.0000,  335.0000,
          335.0000,  335.0000,  335.0000,  335.0010,  335.0010,  335.0010,
          335.0010,  335.0010,  335.0010,  335.0000,  335.0000,  335.0000,
          335.0000,  335.0000,  335.0000,  335.0000,  335

## Model

### Encoder

In [6]:
class SampleEncoder(nn.Module):
    """Encode a coordinate sequence with a 1-D convolution followed by a GRU.

    Args:
        input_dim: number of input channels (2 for (x, y) coordinates).
        seq_len: length of the input sequence; stored but not used by the layers.
        num_layers: number of stacked GRU layers.
        conv_output_dim: number of output channels of the convolution.
        conv_kernel_size: width of the convolution kernel.
        gru_hidden_dim: size of the GRU hidden state.
    """

    def __init__(self,
                 input_dim, seq_len, num_layers,
                 conv_output_dim, conv_kernel_size,
                 gru_hidden_dim):
        super().__init__()

        self.seq_len = seq_len
        self.num_layers = num_layers
        self.conv_output_dim = conv_output_dim
        self.conv_kernel_size = conv_kernel_size
        self.gru_hidden_dim = gru_hidden_dim

        # C = X or Y
        # C_i, (input_dim, seq_len) -> tC_i, (conv_output_dim, conv_len).
        # No padding is applied, so conv_len = seq_len - conv_kernel_size + 1.
        self.conv = nn.Conv1d(input_dim, conv_output_dim, conv_kernel_size)

        # tC_i, (conv_output_dim, conv_len) -> H_C_i, (gru_hidden_dim)
        self.gru = nn.GRU(conv_output_dim, gru_hidden_dim, num_layers)

    def init_hidden(self, batch_size):
        """Return an all-zero initial GRU state of shape (num_layers, batch_size, gru_hidden_dim).

        The state is created on the same device as the module's parameters,
        so it works whether or not the module has been moved to the GPU.
        """
        device = next(self.parameters()).device
        return torch.zeros(self.num_layers, batch_size, self.gru_hidden_dim,
                           device=device)

    def forward(self, x, hidden):
        """Encode `x` of shape (batch_size, input_dim, seq_len).

        Returns:
            `(output, hidden)` as produced by the GRU: the per-step outputs
            and the final hidden state.
        """
        conv_output = F.relu(self.conv(x))

        # conv_output has dimensions (batch_size, dim, conv_len)
        # GRU accepts input tensor with dimensions (seq_len, batch_size, dim)
        # TODO: Pad
        conv_output = conv_output.permute(2, 0, 1)

        output, hidden = self.gru(conv_output, hidden)

        return output, hidden

### Conditional Variational Auto Encoder

In [7]:
class SampleCVAE(nn.Module):
    """Sampling head of the conditional variational auto-encoder.

    Maps an input vector to `mu` and `sigma` and returns a sample
    `mu + sigma * epsilon` with `epsilon ~ N(0, 1)`.

    Args:
        input_dim: size of the last axis of the input.
        output_dim: size of the intermediate fully connected layer.
        mu_dim: size of `mu`.
        sigma_dim: size of `sigma`; must equal `mu_dim` for the sum in
            `forward` to be valid.
    """

    def __init__(self, input_dim, output_dim, mu_dim, sigma_dim):
        super().__init__()

        self.sigma_dim = sigma_dim

        self.fc1 = nn.Linear(input_dim, output_dim)
        self.fc_mu = nn.Linear(output_dim, mu_dim)
        self.fc_sigma = nn.Linear(output_dim, sigma_dim)

    def forward(self, x):
        """Return one sample per input vector; works for any number of leading axes."""
        hidden = F.relu(self.fc1(x))

        mu = self.fc_mu(hidden)
        # NOTE(review): this is exp(s) / 2, kept as in the original. If
        # fc_sigma is meant to predict a log-variance, the usual form would
        # be exp(s / 2) -- confirm the intended parameterisation.
        sigma = torch.exp(self.fc_sigma(hidden)) / 2

        # Reparam trick. The noise must have the same shape as sigma: sizing
        # it from x.size(0) is wrong for (num_layers, batch, dim) inputs,
        # where it made every sample in the batch share one noise vector.
        epsilon = torch.randn_like(sigma)

        return mu + sigma * epsilon

### Fully Connected Softmax Layer

In [8]:
class SampleFCS(nn.Module):
    """Fully connected layer followed by a softmax over the feature axis.

    Args:
        input_dim: size of the last axis of the input.
        output_dim: size of the last axis of the output.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()

        self.fcs = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return softmax(fcs(x)), normalised along the last axis."""
        # dim=-1 equals the original dim=2 for 3-D input and also accepts
        # 2-D (batch_size, input_dim) input.
        return F.softmax(self.fcs(x), dim=-1)

### Decoder

In [9]:
class SampleDecoder(nn.Module):
    """GRU decoder followed by a linear projection applied at every time step.

    Args:
        input_dim: feature size of each decoder input step.
        seq_len: length of the decoded sequence; stored but not used by the layers.
        num_layers: number of stacked GRU layers.
        gru_hidden_dim: stored as an attribute only; the GRU itself is sized
            by `gru_output_dim`.
        gru_output_dim: hidden (and therefore output) size of the GRU.
        output_dim: feature size of each decoded step.
    """

    def __init__(self, input_dim, seq_len, num_layers, gru_hidden_dim, gru_output_dim, output_dim):
        super().__init__()

        self.seq_len = seq_len
        self.num_layers = num_layers
        self.gru_hidden_dim = gru_hidden_dim

        self.gru = nn.GRU(input_dim, gru_output_dim, num_layers)
        self.linear = nn.Linear(gru_output_dim, output_dim)

    def init_hidden(self, batch_size):
        """Return an all-zero initial state matching the GRU's hidden size and device.

        The size is read from the GRU itself so the state stays valid even if
        `gru_hidden_dim` and `gru_output_dim` were given different values.
        """
        device = next(self.parameters()).device
        return torch.zeros(self.num_layers, batch_size, self.gru.hidden_size,
                           device=device)

    def forward(self, x, hidden):
        """Decode `x` of shape (seq_len, batch_size, input_dim).

        Returns:
            `(output, hidden)`: the projected per-step outputs of shape
            (seq_len, batch_size, output_dim) and the final GRU state.
        """
        output, hidden = self.gru(x, hidden)
        output = self.linear(output)
        return output, hidden

## Train

In [23]:
# Hyperparameters
# Data shape
X_seq_len = 20        # time steps in the X sequence passed to encoder 1
Y_seq_len = 40        # time steps in the Y sequence passed to encoder 2
batch_size = 8        # trajectories sampled per batch

# Optimisation
num_epochs = 100      # iterations of the training loop (one batch each)
learning_rate = 1e-4  # learning rate handed to the optimizer

In [24]:
# Initialize model

# input_dim = 2        | input is a sequence of (x,y) coordinates (i.e. 2-dimensional)
# seq_len = X_seq_len  | time sequence length of the X sequence
# conv_output_dim = 16 | 1D convolution with 16 output channels
# conv_kernel_size = 3 | 1D convolution with kernel of width 3
# gru_hidden_dim = 48  | 48-dimensional hidden vector
#
# Input: Tensor of size (batch_size, 2, X_seq_len)
encoder1 = SampleEncoder(input_dim=2, seq_len=X_seq_len, num_layers=1,
                         conv_output_dim=16, conv_kernel_size=3,
                         gru_hidden_dim=48).cuda()

# input_dim = 2        | input is a sequence of (x,y) coordinates (i.e. 2-dimensional)
# seq_len = Y_seq_len  | time sequence length of the Y sequence
# conv_output_dim = 16 | 1D convolution with 16 output channels
# conv_kernel_size = 1 | 1D convolution with kernel of width 1
# gru_hidden_dim = 48  | 48-dimensional hidden vector
#
# Input: Tensor of size (batch_size, 2, Y_seq_len)
encoder2 = SampleEncoder(input_dim=2, seq_len=Y_seq_len, num_layers=1,
                         conv_output_dim=16, conv_kernel_size=1,
                         gru_hidden_dim=48).cuda()

# input_dim = 96       | Concatenate encoder1's and encoder2's hidden states (48-dim each) into 1 vector (96-dim)
# output_dim = 48      | Transform concatenated 96-dim vector into 48-dim vector
# mu_dim = 48          | mu is 48-dim
# sigma_dim = 48       | sigma is 48-dim
#
# Input: Tensor of size (1, batch_size, 96), the concatenated final GRU states
cvae = SampleCVAE(input_dim=96, output_dim=48, mu_dim=48, sigma_dim=48).cuda()

# input_dim = 48       | the sample z produced by the CVAE
# output_dim = 48      | softmax weights, multiplied element-wise with encoder1's hidden state
#
# Input: Tensor of size (1, batch_size, 48)
fcs = SampleFCS(input_dim=48, output_dim=48).cuda()

# input_dim = 48        | one 48-dim vector per decoding step
# seq_len = Y_seq_len   | one decoded step per step of the Y sequence
# gru_hidden_dim = 48   | 48-dimensional hidden vector
# gru_output_dim = 48   | 48-dimensional GRU output
# output_dim = 2        | each decoded step is 2-dimensional
#
# Input: Tensor of size (Y_seq_len, batch_size, 48)
decoder = SampleDecoder(input_dim=48, seq_len=Y_seq_len, num_layers=1,
                        gru_hidden_dim=48, gru_output_dim=48,
                        output_dim=2).cuda()

# Loss
kld = nn.KLDivLoss()
mse = nn.MSELoss()

# Optimizer: one Adam instance updating all five modules with the same learning rate
optimizer = optim.Adam([
    {'params': encoder1.parameters()},
    {'params': encoder2.parameters()},
    {'params': cvae.parameters()},
    {'params': fcs.parameters()},
    {'params': decoder.parameters()}
], lr=learning_rate
)

In [25]:
# TODO: TensorBoard logging (SummaryWriter) is currently disabled.

for epoch in range(num_epochs):
    # Each "epoch" is a single optimisation step on one freshly sampled batch.
    X_position, Y_position, X_displacement, Y_displacement = get_random_batches(X_seq_len, Y_seq_len, batch_size)

    optimizer.zero_grad()

    # Encoder 1: final GRU state summarising the X displacements
    e1_hidden = encoder1.init_hidden(batch_size)
    e1_output, e1_last_hidden = encoder1(X_displacement, e1_hidden)
    H_X = e1_last_hidden

    # Encoder 2: final GRU state summarising the Y displacements
    e2_hidden = encoder2.init_hidden(batch_size)
    e2_output, e2_last_hidden = encoder2(Y_displacement, e2_hidden)
    H_Y = e2_last_hidden

    # CVAE: sample z from the concatenated encodings
    H_XY = torch.cat([H_X, H_Y], 2)
    z = cvae(H_XY)

    # FCS
    beta_z = fcs(z)

    # Decoder: the first input step is H_X masked by beta_z, and the remaining
    # Y_seq_len - 1 steps are zeros, so the decoder emits Y_seq_len steps.
    xz = H_X * beta_z
    hxz = torch.cat((xz, xz.new_zeros(Y_seq_len - 1, xz.size(1), xz.size(2))), 0)
    decoder_hidden = decoder.init_hidden(batch_size)
    output, last_hidden = decoder(hxz, decoder_hidden)

    # Reconstruction: the decoder outputs are treated as per-step
    # displacements and accumulated onto the last observed X position.
    X0 = X_position.permute(2, 0, 1)[-1]
    Y_hat = X0.unsqueeze(0) + torch.cumsum(output, dim=0)
    Y_hat = Y_hat.permute(1, 2, 0)

    # Minimise loss
    # NOTE(review): the KLD term compares log(beta_z) with a fresh N(0,1)
    # sample, kept as in the original. nn.KLDivLoss expects its target to be
    # a probability distribution, and a CVAE KL term is normally computed
    # from mu and sigma -- confirm what is intended here.
    test_z = torch.randn_like(beta_z)
    kld_loss = kld(torch.log(beta_z), test_z)

    # MSE Loss compares the reconstructed absolute positions with the true ones
    Y_true = Y_position
    mse_loss = mse(Y_hat, Y_true)

    # Combine losses
    loss = kld_loss + mse_loss
    loss.backward()

    optimizer.step()

    print('(Epoch %d) Total Loss: %.3f, KLD Loss: %.3f, MSE Loss: %.3f'
          % (epoch + 1, loss.item(), kld_loss.item(), mse_loss.item()))

    # After the final step, show one generated trajectory next to its target.
    if epoch + 1 == num_epochs:
        print()

        print('Generated Y:')
        print(Y_hat[0])
        print(Y_hat.size())

        print_line_sep()

        print('True Y:')
        print(Y_true[0])

(Epoch 1) Total Loss: 21.745, KLD Loss: 1.594, MSE Loss: 20.151
(Epoch 2) Total Loss: 41.817, KLD Loss: 1.596, MSE Loss: 40.221
(Epoch 3) Total Loss: 13.515, KLD Loss: 1.244, MSE Loss: 12.272
(Epoch 4) Total Loss: 25.543, KLD Loss: 1.554, MSE Loss: 23.989
(Epoch 5) Total Loss: 19.567, KLD Loss: 1.521, MSE Loss: 18.046
(Epoch 6) Total Loss: 34.138, KLD Loss: 1.813, MSE Loss: 32.325
(Epoch 7) Total Loss: 11.286, KLD Loss: 1.550, MSE Loss: 9.736
(Epoch 8) Total Loss: 37.321, KLD Loss: 1.443, MSE Loss: 35.878
(Epoch 9) Total Loss: 31.999, KLD Loss: 1.617, MSE Loss: 30.382
(Epoch 10) Total Loss: 51.761, KLD Loss: 1.561, MSE Loss: 50.200
(Epoch 11) Total Loss: 18.997, KLD Loss: 1.282, MSE Loss: 17.714
(Epoch 12) Total Loss: 15.166, KLD Loss: 1.452, MSE Loss: 13.714
(Epoch 13) Total Loss: 32.533, KLD Loss: 1.643, MSE Loss: 30.890
(Epoch 14) Total Loss: 23.137, KLD Loss: 1.645, MSE Loss: 21.492
(Epoch 15) Total Loss: 58.935, KLD Loss: 1.443, MSE Loss: 57.492
(Epoch 16) Total Loss: 9.703, KLD L

# Ranking & Refinement Module