# Data Loading

In [2]:
from __future__ import print_function, division
import pandas as pd
import matplotlib.pyplot as plt
from skimage import io, transform
import os
import numpy as np

# defining customized Dataset class for Udacity

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

class UdacityDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None, select_camera=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            select_camera (string): 'left_ / right_ / center_camera'
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        
        camera_csv = pd.read_csv(csv_file)
        assert select_camera in ['left_camera', 'right_camera', 'center_camera'], "Invalid camera: {}".format(select_camera)
        if select_camera:
            camera_csv = camera_csv[camera_csv['frame_id']==select_camera]
        self.camera_csv = camera_csv
        self.root_dir = root_dir
        self.transform = transform
    
    def __len__(self):
        return len(self.camera_csv)
    
    def read_data_single(self, idx):
        path = os.path.join(self.root_dir, self.camera_csv['filename'].iloc[idx])
        image = io.imread(path)
        timestamp = self.camera_csv['timestamp'].iloc[idx]
        frame_id = self.camera_csv['frame_id'].iloc[idx]
        angle = self.camera_csv['angle'].iloc[idx]
        torque = self.camera_csv['torque'].iloc[idx]
        speed = self.camera_csv['speed'].iloc[idx]
        
        if self.transform:
            image_transformed = self.transform(image)
            del image
            image = image_transformed
        angle_t = torch.tensor(angle)
        torque_t = torch.tensor(torque)
        speed_t = torch.tensor(speed)
        del angle, torque, speed
            
        return image, timestamp, frame_id, angle_t, torque_t, speed_t
    
    def read_data(self, idx):
        if isinstance(idx, list):
            data = None
            for i in idx:
                new_data = self.read_data(i)
                if data is None:
                    data = [[] for _ in range(len(new_data))]
                for i, d in enumerate(new_data):
                    data[i].append(new_data[i])
                del new_data
                
            for stack_idx in [0, 3, 4, 5]: # we don't stack timestamp and frame_id since those are string data
                data[stack_idx] = torch.stack(data[stack_idx])
            
            return data
        
        else:
            return self.read_data_single(idx)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        data = self.read_data(idx)
        
        sample = {'image': data[0],
                  'timestamp': data[1],
                  'frame_id': data[2],
                  'angle': data[3],
                  'torque': data[4],
                  'speed': data[5]}
        
        del data
        
        return sample

In [3]:
# generate Batch with consecutive frames taken from input data

from torch.utils.data import Sampler
import random

class ConsecutiveBatchSampler(Sampler):
    
    def __init__(self, data_source, batch_size, seq_len, drop_last=False, shuffle=True):
        r""" Sampler to generate consecutive Batches
        
        Args:
            data_source: Source of data
            batch_size: Size of batch
            seq_len: Number of frames in each sequence (used for context for prediction)
            drop: Wether to drop the last incomplete batch
            shuffle: Wether to shuffle the data
        Return:
            List of iterators, size: [batch_size x seq_len x n_channels x height x width]
        """
        super().__init__(data_source)
        
        self.data_source = data_source
        
        assert seq_len >= 1, "Invalid batch size: {}".format(seq_len)
        self.seq_len = seq_len
        self.drop_last = drop_last
        self.shuffle = shuffle
        self.batch_size = batch_size
    
    def __iter__(self):
        data_size = len(self.data_source)
        start_indices = list(range(data_size))
        if self.shuffle:
            random.shuffle(start_indices)
        
        batch = []
        for idx, ind in enumerate(start_indices):
            if data_size - idx < self.batch_size and self.drop_last: # if last batch
                break
                
            seq = []
            if ind + 1 < self.seq_len:
                seq.extend([0]*(self.seq_len - ind - 1) + list(range(0, ind+1)))
            else:
                seq.extend(list(range(ind-self.seq_len+1, ind+1)))
            
            batch.append(seq)
            
            if len(batch) == self.batch_size or idx == data_size - 1:
                yield batch
                batch = []

    
    def __len__(self):
        length = len(self.data_source)
        batch_size = self.batch_size
        
        if length % batch_size == 0 or self.drop_last:
            return length // batch_size
        
        return length // batch_size + 1

In [4]:
def show_sample(sample):
    r""" Helper function for (batch) sample visualization
    
    Args:
        sample: Dictionary
    """
    image_dims = len(sample['image'].shape)
    assert image_dims <= 5, "Unsupported image shape: {}".format(sample['image'].shape)
    if image_dims == 3:
        plt.imshow(sample['image'])
    else:
        n0 = sample['image'].shape[0]
        n1 = sample['image'].shape[1] if image_dims == 5 else 1
        images_flattened = torch.flatten(sample['image'], end_dim=-4)
        fig, ax = plt.subplots(n0, n1, figsize=(25, 15))
        for i1 in range(n1):
            for i0 in range(n0):
                image = images_flattened[i0 * n1 + i1]
                axis = ax[i0, i1]
                axis.imshow(image.permute(1,2,0))
                axis.axis('off')
                axis.set_title("t={}".format(sample['timestamp'][i0][i1]))
                axis.text(10, 30, sample['frame_id'][i0][i1], color='red')
            
    


In [5]:
# Initializing DataLoader
# Warning: this only need to be done once to reduce system overhead (leaking memory)

from torch.utils.data import BatchSampler, SequentialSampler, RandomSampler

udacity_dataset = UdacityDataset(csv_file='/export/jupyterlab/data/udacity-challenge-2/Ch2_002_export/interpolated.csv',
                                 root_dir='/export/jupyterlab/data/udacity-challenge-2/Ch2_002_export/',
                                 transform=transforms.Compose([transforms.ToTensor()]),
                                 select_camera='center_camera')

cbs = ConsecutiveBatchSampler(data_source=udacity_dataset, batch_size=20, shuffle=True, drop_last=False, seq_len=15)
dataloader = DataLoader(udacity_dataset, sampler=cbs, collate_fn=(lambda x: x[0]))


# Build the model

## 3D CNN with residual connection

In [6]:
# Build the model

import torch.nn as nn

# helper function to determine dimension after convolution
def conv_output_shape(in_dimension, kernel_size, stride):
    output_dim = []
    for (in_dim, kern_size, strd) in zip(in_dimension, kernel_size, stride):
        len = int(float(in_dim - kern_size) / strd + 1.)
        output_dim.append(len)
    
    return output_dim

        
class TemporalCNN(nn.Module):
    
    def _conv_unit(self, in_channels, out_channels, in_shape, kernel_size, stride, dropout_prob):
        r""" Return one 3D convolution unit
        
        Args:
            in_channels: Input channels of the Conv3D module
            out_channels: Output channels of the Conv3D module
            in_shape: Shape of the input image. i.e. The last 3 dimensions of the input tensor: D x H x W
            kernel_size: Kernel size
            stride: Stride
            dropout_prob: Probability of dropout layer
                         
        Output:
            (conv_module, aux_module, out_shape)
        """
        
        conv = nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride)
        dropout = nn.Dropout3d(p=dropout_prob)
        conv_module = nn.Sequential(conv, dropout)
        out_shape = conv_output_shape(in_shape, kernel_size, stride)
        
        flatten = nn.Flatten(start_dim=2)
        aux = nn.Linear(in_features=np.prod(out_shape[-2:])*out_channels, out_features=128)
        aux_module = nn.Sequential(flatten, aux)
        
        return conv_module, aux_module, out_shape
    
    def _linear_unit(self, in_features, out_features, dropout_prob):
        linear = nn.Linear(in_features, out_features)
        dropout = nn.Dropout(p=dropout_prob)
        return nn.Sequential(linear, dropout), out_features
        
    
    def __init__(self, in_height, in_width, seq_len, dropout_prob=0.5, aux_history=10):
        r""" TemporalCNN: this model does 3D convolution on the H, W and temporal dimension
             It also includes residual connection from each Conv3D output to the final output
             
             Args:
                 in_height: image height
                 in_width: image width
                 seq_len: image sequence length
                 dropout_prob: prob for the dropout layer
                 aux_history: length of history to extract from seq_len
             
             Output:
                 nn.Module, accepts input with shape [batch_len, seq_len, C, in_height, in_width]
        """
        super().__init__()
        
        self.seq_len_ = seq_len
        self.in_width_ = in_width
        self.in_height_ = in_height
        self.aux_history_ = aux_history
        in_shape = (seq_len, in_height, in_width)
        
        conv_layers = []
        # conv1
        conv, aux, out_shape = self._conv_unit(3, 64, in_shape, (3, 12, 12), (1, 6, 6), dropout_prob)
        conv_layers.append((conv, aux))
        # conv2
        conv, aux, out_shape = self._conv_unit(64, 64, out_shape, (2, 5, 5), (1, 2, 2), dropout_prob)
        conv_layers.append((conv, aux))
        # conv3
        conv, aux, out_shape = self._conv_unit(64, 64, out_shape, (2, 5, 5), (1, 1, 1), dropout_prob)
        conv_layers.append((conv, aux))
        # conv4
        conv, aux, out_shape = self._conv_unit(64, 64, out_shape, (2, 5, 5), (1, 1, 1), dropout_prob)
        conv_layers.append((conv, aux))
        
        linear_layers = []
        # Flatten the last 3 dims
        flatten = nn.Flatten(start_dim=2)
        linear_layers.append(flatten)
        # FC 1024
        linear, out_features = self._linear_unit(64*np.prod(out_shape[-2:]), 1024, dropout_prob)
        linear_layers.append(linear)
        # FC 512
        linear, out_features = self._linear_unit(out_features, 512, dropout_prob)
        linear_layers.append(linear)
        # FC 256
        linear, out_features = self._linear_unit(out_features, 256, dropout_prob)
        linear_layers.append(linear)
        # FC 128
        linear, out_features = self._linear_unit(out_features, 128, dropout_prob)
        linear_layers.append(linear)
        
        self.conv_layers_ = conv_layers
        self.linear_layers_ = linear_layers
        self.final_elu_ = nn.ELU()
        
    def forward(self, x):
        x = x.permute([0, 2, 1, 3, 4]) # swap channel and seq_len, 3D conv over seq_len as depth channel
                                       # now: [batch_size, channel, seq_len, H, W]
        
        aux_outputs = []
        for layer in self.conv_layers_:
            x_out = layer[0](x)
            x_out_permuted = x_out.permute([0, 2, 1, 3, 4]) # swap back for calculation of aux output
            x_aux = layer[1](x_out_permuted[:,-self.aux_history_:,:,:,:])
            print(x_out.shape)
            print(x_aux.shape)
            aux_outputs.append(x_aux)
            x = x_out
        
        x = x.permute([0, 2, 1, 3, 4]) # swap back the dimensions, now: [batch_size, seq_len, channel, H, W]
        for layer in self.linear_layers_:
            x = layer(x)
        
        final_out = x
        for aux_out in aux_outputs:
            final_out = final_out + aux_out
        final_out = self.final_elu_(final_out)
        print(final_out.shape)
        
        return final_out


## Autoregressive LSTM

In [7]:
# Autoregressive LSTM Module

class AutoregressiveLSTMCell(nn.Module):
    
    def __init__(self, target_size, visual_feature_size, hidden_size):
        r""" AutoregressiveModule takes visual feature from 3D CNN module, pass it
             first into an internal LSTM cell and then into a Linear network. The final
             output of this module is of dimension output_size.
             
             Args:
                 target_dim: dimsion of target value. for this application 3 (angle, speed, torque)
                 visual_feature_dim:
                 output_size: output size after the Linear network
                 autoregressive_mode: wether this module work as autoregressive mode or
                     just pass the ground truth to output
             Output:
                 nn.Module
        """
        super().__init__()
        
        self.target_size_ = target_size
        self.visual_feature_size_ = visual_feature_size
        self.hidden_size_ = hidden_size
        
        self.lstm_cell_ = nn.LSTMCell(input_size=target_size+visual_feature_size, hidden_size=hidden_size)
        self.linear_ = nn.Linear(in_features=hidden_size+visual_feature_size+target_size, out_features=target_size)
    
    def forward(self, visual_features, prev_target, prev_states):
        r"""
            Output:
                (output, target_ground_truth) for autoregressive_mode = True
                (output, (output, ))
        """
        lstm_input = torch.cat((visual_features, prev_target), dim=-1)
        h_t, c_t = self.lstm_cell_(lstm_input, prev_states)
        linear_input = torch.cat((visual_features, prev_target, h_t), dim=-1)
        output = self.linear_(linear_input)
        new_state = (h_t, c_t)
        
        return output, new_state
        
class AutoregressiveLSTM(AutoregressiveLSTMCell):
    
    def __init__(self, target_size, visual_feature_size, hidden_size, autoregressive_mode=True):
        super().__init__(target_size, visual_feature_size, hidden_size)
        self.autoregressive_mode_ = autoregressive_mode
    
    def forward(self, visual_features, init_target=None, init_states=None, target_groundtruth=None):
        # different from LSTM in torch library, we use the second dimension for sequence!
        assert self.autoregressive_mode_ or target_groundtruth is not None
        
        seq_len = visual_features.shape[1]
        batch_len = visual_features.shape[0]

        prev_target = torch.zeros(batch_len, self.target_size_) if init_target is None else init_target
        prev_states = (torch.zeros(batch_len, self.hidden_size_), torch.zeros(batch_len, self.hidden_size_)) if init_states is None else init_states
        
        outputs = []
        states = []
        for seq_idx in range(seq_len):
            target, state = super().forward(visual_features[:, seq_idx, :], prev_target, prev_states)
            prev_target = target if self.autoregressive_mode_ else target_groundtruth[:, seq_idx, :]
            outputs.append(target)
            states.append(torch.stack(state))
            
        outputs = torch.stack(outputs)
        outputs = outputs.permute(1, 0, 2)  # dim: [batch, seq, target_size]
        states = torch.stack(states)
        states = states.permute(1, 2, 0, 3) # dim: [ [batch, seq, internal_size], [batch, seq, internal_size] ]
        
        return outputs, states
    
        

In [8]:
lstmcell = AutoregressiveLSTMCell(3, 128, 64)
vis = torch.randn(5, 128)
target = torch.randn(5, 3)
state = (torch.randn(5, 64), torch.randn(5, 64))
output, (h, c) = lstmcell(vis, target, state)

print(output.shape, h.shape, c.shape)

lstm = AutoregressiveLSTM(3, 128, 64)
vis = torch.randn(5, 3, 128)
target = torch.randn(5, 3)
target_gt = torch.randn(5, 3, 3)
state = (torch.randn(5, 64), torch.randn(5, 64))
print(vis[:, 1, :].shape)

a, b = lstm(vis, target, state)

print(a.shape)
print(b.shape)

lstm = AutoregressiveLSTM(3, 128, 64, False)
a, b = lstm(vis, target, state, target_gt)

print(a.shape)
print(b.shape)

torch.Size([5, 3]) torch.Size([5, 64]) torch.Size([5, 64])
torch.Size([5, 128])
torch.Size([5, 3, 3])
torch.Size([2, 5, 3, 64])
torch.Size([5, 3, 3])
torch.Size([2, 5, 3, 64])


In [9]:
model = TemporalCNN(480, 640, 15)

image = torch.randn(10, 15, 3, 480, 640)
print(image.shape)
out = model(image)
del image


torch.Size([10, 15, 3, 480, 640])
torch.Size([10, 64, 13, 79, 105])
torch.Size([10, 10, 128])
torch.Size([10, 64, 12, 38, 51])
torch.Size([10, 10, 128])
torch.Size([10, 64, 11, 34, 47])
torch.Size([10, 10, 128])
torch.Size([10, 64, 10, 30, 43])
torch.Size([10, 10, 128])
torch.Size([10, 10, 128])


In [7]:
with torch.no_grad():
    
    model = TemporalCNN(480, 640, 3)
    
    cbs = ConsecutiveBatchSampler(data_source=udacity_dataset, batch_size=10, shuffle=True, drop_last=False, seq_len=15)
    dataloader = DataLoader(udacity_dataset, sampler=cbs, num_workers=1, collate_fn=(lambda x: x[0]))
    for i_batch, sample_batched in tqdm(enumerate(dataloader)):
        image = sample_batched['image']
        print(image.shape)
        out = model(image)
        break


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

torch.Size([10, 3, 15, 480, 640])
torch.Size([10, 64, 13, 79, 105])
torch.Size([10, 64, 12, 38, 51])
torch.Size([10, 64, 11, 34, 47])
torch.Size([10, 64, 10, 30, 43])


In [None]:
conv_output_shape((15, 480, 640), (2, 5, 5), (1, 2, 2))