# Attention Model
## Author: Gary Corcoran
## Date: Jan. 4th, 2017

Attention label classification of dashcam videos.

### Read Input Data
Input data is stored in a numpy matrix consisting of each RGB image resized to $(100 \times 100)$

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt

# display inline figures
%matplotlib inline

# data manipulation helpers
def shuffle(X, y):
    """
    Reorder data and labels with one shared random permutation.

    A single permutation of the first axis is drawn and applied to both
    inputs, so every instance keeps its own label.

    @param X: input data matrix [num_instances,num_seqs,width,height,depth]
                @pre numpy matrix
    @param y: input labels matrix [num_instances]
                @pre numpy matrix

    @return tuple (X, y) in the new order
    """
    order = np.random.permutation(len(X))
    return X[order], y[order]

def split_normalize_data(X, y):
    """
    Partition the dataset and normalize the inputs.

    The first 80% of the instances form the training set and the rest the
    validation set. The second split index is len(X), so the third (test)
    partition never receives any instances. The shape of every partition is
    printed before normalization.

    @param X: input data matrix [num_instances,num_seqs,width,height,depth]
                @pre numpy matrix
    @param y: input labels matrix [num_instances]
                @pre numpy matrix

    @return tuple of (data, labels) pairs: training, validation and test
    """
    end = len(X)
    X_parts = np.split(X, [int(0.8 * end), end])
    y_parts = np.split(y, [int(0.8 * len(y)), end])
    tags = (('X_train:', 'y_train:'), ('X_val:', 'y_val:'), ('X_test:', 'y_test:'))
    for (x_tag, y_tag), X_part, y_part in zip(tags, X_parts, y_parts):
        print(x_tag, X_part.shape, y_tag, y_part.shape)
    # normalize input data (0 mean, unit variance) using training statistics
    X_parts = normalize(*X_parts)
    # pair each normalized partition with its labels
    return tuple(zip(X_parts, y_parts))

def normalize(X_train, X_val, X_test):
    """
    Standardize the three partitions with training-set statistics.

    The mean is taken over the first two axes of the training data (one
    value per remaining position) and the standard deviation is a single
    scalar computed over the whole centered training set. The same mean and
    standard deviation are then applied to the validation and test data.

    @param X_train: input training data
    @param X_val:   input validation data
    @param X_test:  input test data

    @return normalized (X_train, X_val, X_test) as float32 arrays
    """
    # mean over instances, then over the sequence axis
    per_instance_mean = np.mean(X_train, axis=0)
    m = np.mean(per_instance_mean, axis=0)

    # center the training data first: the std is measured on the centered copy
    X_train = (X_train - m).astype(np.float32)
    std = np.std(X_train)
    X_train /= std

    def _apply(data):
        # reuse the training statistics on a held-out partition
        return np.asarray((data - m) / std, dtype=np.float32)

    return X_train, _apply(X_val), _apply(X_test)

# directory holding the serialized datasets
data_path = '../data/'
# videos, dimensions [num_instances, num_seqs, width, height, depth]
X = np.load(data_path + 'X_videos_med.npy')
# labels, dimensions [num_instances] (0=low attention, 1=medium attention,
# 2=high attention, 3=very high attention)
y = np.load(data_path + 'y_videos_med.npy')
# randomize instance order, keeping videos and labels aligned
X, y = shuffle(X, y)
# split into partitions and normalize with training statistics
training_data, validation_data, test_data = split_normalize_data(X, y)
(X_train, y_train), (X_val, y_val) = training_data, validation_data
# the test partition is left packed in test_data; it is not used below
# number of examples
n_train, n_val = len(X_train), len(X_val)

### Display Sampled Videos
The frames shown for each video are sampled 20 frames apart.

In [None]:
# preview 5 randomly chosen videos, one per row, 5 frames per video
plt.figure(figsize=(20,20))
sampled = np.random.randint(len(X), size=5)
for i, idx in enumerate(sampled):
    vid = X[idx]
    # stored labels start at 0; the titles show levels starting at 1
    label = y[idx] + 1
    for j, frame in enumerate(range(0, 100, 20)):
        # grid position of row i, column j
        plt.subplot(5, 5, i*5+j+1)
        # frames are converted from BGR to RGB channel order for matplotlib
        rgb_frame = cv2.cvtColor(vid[frame], cv2.COLOR_BGR2RGB)
        plt.imshow(rgb_frame)
        plt.title('Level ' + str(label) + ' Frame ' + str(frame))
        # hide axis ticks
        plt.xticks([])
        plt.yticks([])
plt.show()

### Build the Model -  Vanilla RNN

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class CNN(nn.Module):
    """
    Convolutional Neural Network.

    Encodes every frame of a batch of videos into a feature vector using two
    5x5 convolutions (each followed by ReLU and 2x2 max-pooling) and one
    fully connected layer.

    @param input_size: frame shape as (channels, height, width)
    @param fc1_hidden: number of features produced for each frame
    @param seq_len: number of frames per video assumed when the input is not
                a 5-D tensor (a 5-D input supplies its own sequence length)
    """
    def __init__(self, input_size, fc1_hidden, seq_len=100):
        super().__init__()
        self.input_size = input_size
        self.fc1_hidden = fc1_hidden
        self.seq_len = seq_len
        channels, height, width = input_size
        self.conv1 = nn.Conv2d(channels, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # each stage is a 5x5 valid convolution (size - 4) followed by a
        # 2x2 max-pool (// 2); (3,100,100) gives the former 16*22*22
        feat_h = ((height - 4) // 2 - 4) // 2
        feat_w = ((width - 4) // 2 - 4) // 2
        self.flat_features = 16 * feat_h * feat_w
        self.fc1 = nn.Linear(self.flat_features, fc1_hidden)

    def forward(self, x):
        """
        Forward pass through network.

        @param x: input data, either [batch, seq, channels, height, width],
                  [batch, seq, height, width, channels] (channels-last, as
                  the data is stored on disk) or any tensor that reshapes
                  to [-1, channels, height, width]

        @return per-frame features of dimensions [seq, batch, fc1_hidden]
        """
        channels, height, width = self.input_size
        # a 5-D input carries its own sequence length
        seq_len = x.size(1) if x.dim() == 5 else self.seq_len
        frame_shape = tuple(x.shape[-3:])
        if (frame_shape != (channels, height, width)
                and frame_shape == (height, width, channels)):
            # channels-last frames: a plain view would reinterpret memory
            # and scramble the pixels, so move the channel axis explicitly
            lead = list(range(x.dim() - 3))
            last = x.dim() - 1
            x = x.permute(*lead, last, last - 2, last - 1)
        # reshape into nSamples x nChannels x Height x Width
        x = x.reshape(-1, channels, height, width)
        out = F.relu(self.conv1(x))
        out = F.max_pool2d(out, 2)
        out = F.relu(self.conv2(out))
        out = F.max_pool2d(out, 2)
        out = out.reshape(-1, self.flat_features)
        out = self.fc1(out)
        # reshape into nSequence x nSamples x nFeatures
        out = out.view(-1, seq_len, self.fc1_hidden)
        out = torch.transpose(out, 0, 1)
        return out
    
class RNN(nn.Module):
    """
    Recurrent Neural Network.

    A single recurrent cell: the input is concatenated with the previous
    hidden state and passed through one linear layer to give the next hidden
    state (no activation is applied to it); a second linear layer followed
    by log-softmax gives the class scores.

    @param input_size: number of input neurons
    @param hidden_size: number of hidden neurons
    @param output_size: number of output neurons
    """
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # input + previous hidden state -> next hidden state
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        # hidden state -> class scores
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, data, hidden):
        """
        Advance the cell by one time step.

        @param data: input for this step, concatenated with hidden on dim 1
        @param hidden: hidden state from the previous step

        @return (log-probabilities over the outputs, next hidden state)
        """
        next_hidden = self.i2h(torch.cat((data, hidden), 1))
        log_probs = self.softmax(self.h2o(next_hidden))
        return log_probs, next_hidden

    def init_hidden(self, batch_size, GPU):
        """
        Create an all-zero hidden state.

        @param batch_size: number of rows in the hidden state
        @param GPU: if True, the state is moved to the GPU

        @return Variable of dimensions [batch_size, hidden_size]
        """
        state = torch.zeros(batch_size, self.hidden_size)
        if GPU:
            state = state.cuda()
        return Variable(state)

class CRNN(nn.Module):
    """
    Convolutional Recurrent Neural Network.

    A CNN turns every frame into a 256-dimensional feature vector; an RNN
    with 64 hidden units then consumes the frames in order and its output
    after the last frame scores the 4 classes.
    """
    def __init__(self):
        super().__init__()
        feature_dim = 256
        self.cnn = CNN(input_size=(3,100,100), fc1_hidden=feature_dim)
        self.rnn = RNN(input_size=feature_dim, hidden_size=64, output_size=4)

    def forward(self, data, hidden):
        """
        Encode all frames, then run the RNN over the sequence.

        @param data: batch of videos passed to the CNN
        @param hidden: initial hidden state of the RNN

        @return (output of the final time step, final hidden state)
        """
        # the CNN returns features with the sequence on the first axis
        frame_feats = self.cnn.forward(data)
        step = 0
        total_steps = frame_feats.size()[0]
        while step < total_steps:
            # only the output of the last step survives the loop
            output, hidden = self.rnn.forward(frame_feats[step], hidden)
            step += 1

        return output, hidden

    def init_hidden(self, batch_size, GPU):
        """
        Create the initial hidden state by delegating to the RNN.

        @param batch_size: number of rows in the hidden state
        @param GPU: if True, the state is created on the GPU

        @return initial hidden state
        """
        return self.rnn.init_hidden(batch_size, GPU)

# Stand-alone instantiations of the two sub-networks, kept disabled:
# cnn = CNN(input_size=(3,100,100), fc1_hidden=256)
# print(cnn)
# rnn = RNN(input_size=100*100*3, hidden_size=8, output_size=4)
# print(rnn)
# build the combined CNN + RNN model and print its layer summary
crnn = CRNN()
print(crnn)

### Training Function

In [None]:
import time

def _run_epoch(net, data, batch_size, criterion, GPU, optimizer=None):
    """
    Run one pass over a dataset in mini-batches.

    @param net: network exposing init_hidden(batch_size, GPU) and a forward
                pass taking (inputs, hidden)
    @param data: tuple (X, y) of numpy arrays indexed by instance
    @param batch_size: maximum number of instances per mini-batch
    @param criterion: loss function applied to (output, targets)
    @param GPU: if True, mini-batches are moved to the GPU
    @param optimizer: when given, the loss is back-propagated and the
                optimizer is stepped after every mini-batch; when None the
                pass only evaluates and runs without gradient tracking

    @return (sum of the mini-batch losses, accuracy in percent) as floats
    """
    X, y = data
    training = optimizer is not None
    total_loss = 0.0
    num_correct = 0
    num_seen = 0
    for k in range(0, len(X), batch_size):
        inputs = torch.from_numpy(X[k:k+batch_size])
        # class-index targets must be int64 for the loss
        targets = torch.from_numpy(y[k:k+batch_size]).long()
        if GPU:
            inputs, targets = inputs.cuda(), targets.cuda()
        if training:
            # zero-out gradient
            net.zero_grad()
        with torch.set_grad_enabled(training):
            # size the hidden state from the actual mini-batch: the last
            # one can be shorter than batch_size
            hidden = net.init_hidden(inputs.size(0), GPU)
            output, hidden = net(inputs, hidden)
            loss = criterion(output, targets)
        if training:
            loss.backward()
            optimizer.step()
        # .item() yields plain Python numbers (indexing a 0-dim loss fails)
        total_loss += loss.item()
        _, y_pred = torch.max(output.data, 1)
        num_correct += (y_pred == targets).sum().item()
        num_seen += targets.size(0)
    accuracy = 100.0 * num_correct / num_seen if num_seen else 0.0
    return total_loss, accuracy


def train(net, num_epochs, batch_size, learning_rate, criterion, optimizer, GPU,
          train_data=None, val_data=None):
    """
    Train a network and evaluate it on the validation set after each epoch.

    Per-epoch statistics are printed and also returned.

    @param net: network exposing init_hidden(batch_size, GPU) and a forward
                pass taking (inputs, hidden)
    @param num_epochs: number of passes over the training data
    @param batch_size: maximum number of instances per mini-batch
    @param learning_rate: not used inside this function (the step size is
                whatever the optimizer was constructed with)
    @param criterion: loss function applied to (output, targets)
    @param optimizer: optimizer stepped after every training mini-batch
    @param GPU: if True, the network and the mini-batches are moved to the GPU
    @param train_data: optional tuple (X, y); defaults to the module-level
                X_train and y_train
    @param val_data: optional tuple (X, y); defaults to the module-level
                X_val and y_val

    @return lists with one entry per epoch: training losses, validation
            losses, training accuracies, validation accuracies (each loss is
            the sum of the mini-batch losses, each accuracy a percentage)
    """
    if train_data is None:
        train_data = (X_train, y_train)
    if val_data is None:
        val_data = (X_val, y_val)
    if GPU:
        net = net.cuda()
    training_losses = []
    training_accuracy = []
    validation_losses = []
    validation_accuracy = []

    for epoch in range(num_epochs):
        print('Epoch', epoch)
        training_loss, training_acc = _run_epoch(
            net, train_data, batch_size, criterion, GPU, optimizer)
        validation_loss, validation_acc = _run_epoch(
            net, val_data, batch_size, criterion, GPU)

        # print statistics
        print('\tTraining Loss:', training_loss)
        training_losses.append(training_loss)
        print('\tTraining Accuracy:', training_acc)
        training_accuracy.append(training_acc)
        print('\tValidation Loss:', validation_loss)
        validation_losses.append(validation_loss)
        print('\tValidation Accuracy:', validation_acc)
        validation_accuracy.append(validation_acc)

    return training_losses, validation_losses, training_accuracy, validation_accuracy

net = crnn
# use the GPU only when CUDA is present, so the run falls back to the CPU
# instead of failing in .cuda()
GPU = torch.cuda.is_available()
num_epochs = 10
batch_size = 10
learning_rate = 0.0001
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(net.parameters(), learning_rate)

# start timer
start = time.time()
# training function
training_losses, validation_losses, training_accuracy, validation_accuracy = train(net,
        num_epochs, batch_size, learning_rate, criterion, optimizer, GPU)
print('Elapsed Time:', time.time() - start)

# plot figures: losses on the left, accuracies on the right
plt.figure(figsize=(10,10))
plt.subplot(121)
plt.plot(training_losses, label='train')
plt.plot(validation_losses, label='validation')
plt.title('Losses')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc='lower right')
plt.subplot(122)
plt.plot(training_accuracy, label='train')
plt.plot(validation_accuracy, label='valid')
# accuracies are percentages
plt.ylim(0, 100)
plt.title('Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()

# Optical Flow
### Read Flow Input Data
Flow input data is stored in a numpy matrix of dimensions $(100 \times 100 \times 2)$ (i.e. Width, Height, and flow in X and Y directions)

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt

# display inline figures
%matplotlib inline

# data manipulation helpers
def shuffle(X, y):
    """
    Reorder data and labels with one shared random permutation.

    A single permutation of the first axis is drawn and applied to both
    inputs, so every instance keeps its own label.

    @param X: input data matrix [num_instances,num_seqs,width,height,depth]
                @pre numpy matrix
    @param y: input labels matrix [num_instances]
                @pre numpy matrix

    @return tuple (X, y) in the new order
    """
    order = np.random.permutation(len(X))
    return X[order], y[order]

def split_normalize_data(X, y):
    """
    Partition the dataset and normalize the inputs.

    The first 80% of the instances form the training set and the rest the
    validation set. The second split index is len(X), so the third (test)
    partition never receives any instances. The shape of every partition is
    printed before normalization.

    @param X: input data matrix [num_instances,num_seqs,width,height,depth]
                @pre numpy matrix
    @param y: input labels matrix [num_instances]
                @pre numpy matrix

    @return tuple of (data, labels) pairs: training, validation and test
    """
    end = len(X)
    X_parts = np.split(X, [int(0.8 * end), end])
    y_parts = np.split(y, [int(0.8 * len(y)), end])
    tags = (('X_train:', 'y_train:'), ('X_val:', 'y_val:'), ('X_test:', 'y_test:'))
    for (x_tag, y_tag), X_part, y_part in zip(tags, X_parts, y_parts):
        print(x_tag, X_part.shape, y_tag, y_part.shape)
    # normalize input data (0 mean, unit variance) using training statistics
    X_parts = normalize(*X_parts)
    # pair each normalized partition with its labels
    return tuple(zip(X_parts, y_parts))

def normalize(X_train, X_val, X_test):
    """
    Standardize the three partitions with training-set statistics.

    The mean is taken over the first two axes of the training data (one
    value per remaining position) and the standard deviation is a single
    scalar computed over the whole centered training set. The same mean and
    standard deviation are then applied to the validation and test data.

    @param X_train: input training data
    @param X_val:   input validation data
    @param X_test:  input test data

    @return normalized (X_train, X_val, X_test) as float32 arrays
    """
    # mean over instances, then over the sequence axis
    per_instance_mean = np.mean(X_train, axis=0)
    m = np.mean(per_instance_mean, axis=0)

    # center the training data first: the std is measured on the centered copy
    X_train = (X_train - m).astype(np.float32)
    std = np.std(X_train)
    X_train /= std

    def _apply(data):
        # reuse the training statistics on a held-out partition
        return np.asarray((data - m) / std, dtype=np.float32)

    return X_train, _apply(X_val), _apply(X_test)

# directory holding the serialized datasets
data_path = '../data/'
# flow fields, dimensions [num_instances, num_seqs, width, height, depth]
X = np.load(data_path + 'X_flow_big.npy')
# labels, dimensions [num_instances] (0=low attention, 1=medium attention,
# 2=high attention, 3=very high attention)
y = np.load(data_path + 'y_flow_big.npy')
# randomize instance order, keeping flow fields and labels aligned
X, y = shuffle(X, y)
# split into partitions and normalize with training statistics
training_data, validation_data, test_data = split_normalize_data(X, y)
(X_train, y_train), (X_val, y_val) = training_data, validation_data
# the test partition is left packed in test_data; it is not used below
# number of examples
n_train, n_val = len(X_train), len(X_val)

### Display Sampled Optical Flow
The flow fields shown for each video are sampled 20 frames apart.

In [None]:
# preview 5 randomly chosen flow sequences, one per row, 5 frames each
# HSV canvas reused for every frame: saturation is fixed at its maximum
hsv = np.zeros((100, 100, 3), dtype=np.uint8)
hsv[..., 1] = 255
plt.figure(figsize=(20,20))
sampled = np.random.randint(len(X), size=5)
for i, idx in enumerate(sampled):
    vid = X[idx]
    # stored labels start at 0; the titles show levels starting at 1
    label = y[idx] + 1
    for j, frame in enumerate(range(0, 100, 20)):
        # grid position of row i, column j
        plt.subplot(5, 5, i*5+j+1)
        # convert the two flow components to magnitude and angle
        mag, ang = cv2.cartToPolar(vid[frame][:, :, 0], vid[frame][:, :, 1])
        # hue encodes the flow direction (angle in degrees, halved)
        hsv[..., 0] = ang*180/np.pi/2
        # value encodes the flow magnitude rescaled to [0, 255]
        hsv[..., 2] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX)
        rgb = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB)
        plt.imshow(rgb)
        plt.title('Level ' + str(label) + ' Frame ' + str(frame))
        # hide axis ticks
        plt.xticks([])
        plt.yticks([])
plt.show()

### Build the Model -  Vanilla RNN

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class CNN(nn.Module):
    """
    Convolutional Neural Network.

    Encodes every frame of a batch of flow sequences into a feature vector
    using two 5x5 convolutions (each followed by ReLU and 2x2 max-pooling)
    and one fully connected layer.

    @param input_size: frame shape as (channels, height, width)
    @param fc1_hidden: number of features produced for each frame
    @param seq_len: number of frames per sequence assumed when the input is
                not a 5-D tensor (a 5-D input supplies its own length)
    """
    def __init__(self, input_size, fc1_hidden, seq_len=99):
        super().__init__()
        self.input_size = input_size
        self.fc1_hidden = fc1_hidden
        self.seq_len = seq_len
        channels, height, width = input_size
        self.conv1 = nn.Conv2d(channels, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # each stage is a 5x5 valid convolution (size - 4) followed by a
        # 2x2 max-pool (// 2); (2,100,100) gives the former 16*22*22
        feat_h = ((height - 4) // 2 - 4) // 2
        feat_w = ((width - 4) // 2 - 4) // 2
        self.flat_features = 16 * feat_h * feat_w
        self.fc1 = nn.Linear(self.flat_features, fc1_hidden)

    def forward(self, x):
        """
        Forward pass through network.

        @param x: input data, either [batch, seq, channels, height, width],
                  [batch, seq, height, width, channels] (channels-last, as
                  the data is stored on disk) or any tensor that reshapes
                  to [-1, channels, height, width]

        @return per-frame features of dimensions [seq, batch, fc1_hidden]
        """
        channels, height, width = self.input_size
        # a 5-D input carries its own sequence length
        seq_len = x.size(1) if x.dim() == 5 else self.seq_len
        frame_shape = tuple(x.shape[-3:])
        if (frame_shape != (channels, height, width)
                and frame_shape == (height, width, channels)):
            # channels-last frames: a plain view would reinterpret memory
            # and scramble the flow components, so move the channel axis
            lead = list(range(x.dim() - 3))
            last = x.dim() - 1
            x = x.permute(*lead, last, last - 2, last - 1)
        # reshape into nSamples x nChannels x Height x Width
        x = x.reshape(-1, channels, height, width)
        out = F.relu(self.conv1(x))
        out = F.max_pool2d(out, 2)
        out = F.relu(self.conv2(out))
        out = F.max_pool2d(out, 2)
        out = out.reshape(-1, self.flat_features)
        out = self.fc1(out)
        # reshape into nSequence x nSamples x nFeatures
        out = out.view(-1, seq_len, self.fc1_hidden)
        out = torch.transpose(out, 0, 1)
        return out
    
class RNN(nn.Module):
    """
    Recurrent Neural Network.

    A single recurrent cell: the input is concatenated with the previous
    hidden state and passed through one linear layer to give the next hidden
    state (no activation is applied to it); a second linear layer followed
    by log-softmax gives the class scores.

    @param input_size: number of input neurons
    @param hidden_size: number of hidden neurons
    @param output_size: number of output neurons
    """
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # input + previous hidden state -> next hidden state
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        # hidden state -> class scores
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, data, hidden):
        """
        Advance the cell by one time step.

        @param data: input for this step, concatenated with hidden on dim 1
        @param hidden: hidden state from the previous step

        @return (log-probabilities over the outputs, next hidden state)
        """
        next_hidden = self.i2h(torch.cat((data, hidden), 1))
        log_probs = self.softmax(self.h2o(next_hidden))
        return log_probs, next_hidden

    def init_hidden(self, batch_size, GPU):
        """
        Create an all-zero hidden state.

        @param batch_size: number of rows in the hidden state
        @param GPU: if True, the state is moved to the GPU

        @return Variable of dimensions [batch_size, hidden_size]
        """
        state = torch.zeros(batch_size, self.hidden_size)
        if GPU:
            state = state.cuda()
        return Variable(state)

class CRNN(nn.Module):
    """
    Convolutional Recurrent Neural Network.

    A CNN turns every two-channel flow frame into a feature vector; an RNN
    then consumes the frames in order and its output after the last frame
    scores the 4 classes.

    @param cnn_hidden: number of features the CNN produces for each frame
    @param rnn_hidden: number of hidden neurons in the RNN
    """
    def __init__(self, cnn_hidden, rnn_hidden):
        super().__init__()
        frame_shape = (2,100,100)
        self.cnn = CNN(input_size=frame_shape, fc1_hidden=cnn_hidden)
        self.rnn = RNN(input_size=cnn_hidden, hidden_size=rnn_hidden, output_size=4)

    def forward(self, data, hidden):
        """
        Encode all frames, then run the RNN over the sequence.

        @param data: batch of flow sequences passed to the CNN
        @param hidden: initial hidden state of the RNN

        @return (output of the final time step, final hidden state)
        """
        # the CNN returns features with the sequence on the first axis
        frame_feats = self.cnn.forward(data)
        step = 0
        total_steps = frame_feats.size()[0]
        while step < total_steps:
            # only the output of the last step survives the loop
            output, hidden = self.rnn.forward(frame_feats[step], hidden)
            step += 1

        return output, hidden

    def init_hidden(self, batch_size, GPU):
        """
        Create the initial hidden state by delegating to the RNN.

        @param batch_size: number of rows in the hidden state
        @param GPU: if True, the state is created on the GPU

        @return initial hidden state
        """
        return self.rnn.init_hidden(batch_size, GPU)

### Training Function

In [None]:
import time

def _run_epoch(net, data, batch_size, criterion, GPU, optimizer=None):
    """
    Run one pass over a dataset in mini-batches.

    @param net: network exposing init_hidden(batch_size, GPU) and a forward
                pass taking (inputs, hidden)
    @param data: tuple (X, y) of numpy arrays indexed by instance
    @param batch_size: maximum number of instances per mini-batch
    @param criterion: loss function applied to (output, targets)
    @param GPU: if True, mini-batches are moved to the GPU
    @param optimizer: when given, the loss is back-propagated and the
                optimizer is stepped after every mini-batch; when None the
                pass only evaluates and runs without gradient tracking

    @return (sum of the mini-batch losses, accuracy in percent) as floats
    """
    X, y = data
    training = optimizer is not None
    total_loss = 0.0
    num_correct = 0
    num_seen = 0
    for k in range(0, len(X), batch_size):
        inputs = torch.from_numpy(X[k:k+batch_size])
        # class-index targets must be int64 for the loss
        targets = torch.from_numpy(y[k:k+batch_size]).long()
        if GPU:
            inputs, targets = inputs.cuda(), targets.cuda()
        if training:
            # zero-out gradient
            net.zero_grad()
        with torch.set_grad_enabled(training):
            # size the hidden state from the actual mini-batch: the last
            # one can be shorter than batch_size
            hidden = net.init_hidden(inputs.size(0), GPU)
            output, hidden = net(inputs, hidden)
            loss = criterion(output, targets)
        if training:
            loss.backward()
            optimizer.step()
        # .item() yields plain Python numbers (indexing a 0-dim loss fails)
        total_loss += loss.item()
        _, y_pred = torch.max(output.data, 1)
        num_correct += (y_pred == targets).sum().item()
        num_seen += targets.size(0)
    accuracy = 100.0 * num_correct / num_seen if num_seen else 0.0
    return total_loss, accuracy


def train(net, num_epochs, batch_size, learning_rate, criterion, optimizer, GPU,
          train_data=None, val_data=None):
    """
    Train a network and evaluate it on the validation set after each epoch.

    Per-epoch statistics are printed and also returned.

    @param net: network exposing init_hidden(batch_size, GPU) and a forward
                pass taking (inputs, hidden)
    @param num_epochs: number of passes over the training data
    @param batch_size: maximum number of instances per mini-batch
    @param learning_rate: not used inside this function (the step size is
                whatever the optimizer was constructed with)
    @param criterion: loss function applied to (output, targets)
    @param optimizer: optimizer stepped after every training mini-batch
    @param GPU: if True, the network and the mini-batches are moved to the GPU
    @param train_data: optional tuple (X, y); defaults to the module-level
                X_train and y_train
    @param val_data: optional tuple (X, y); defaults to the module-level
                X_val and y_val

    @return lists with one entry per epoch: training losses, validation
            losses, training accuracies, validation accuracies (each loss is
            the sum of the mini-batch losses, each accuracy a percentage)
    """
    if train_data is None:
        train_data = (X_train, y_train)
    if val_data is None:
        val_data = (X_val, y_val)
    if GPU:
        net = net.cuda()
    training_losses = []
    training_accuracy = []
    validation_losses = []
    validation_accuracy = []

    for epoch in range(num_epochs):
        print('Epoch', epoch)
        training_loss, training_acc = _run_epoch(
            net, train_data, batch_size, criterion, GPU, optimizer)
        validation_loss, validation_acc = _run_epoch(
            net, val_data, batch_size, criterion, GPU)

        # print statistics
        print('\tTraining Loss:', training_loss)
        training_losses.append(training_loss)
        print('\tTraining Accuracy:', training_acc)
        training_accuracy.append(training_acc)
        print('\tValidation Loss:', validation_loss)
        validation_losses.append(validation_loss)
        print('\tValidation Accuracy:', validation_acc)
        validation_accuracy.append(validation_acc)

    return training_losses, validation_losses, training_accuracy, validation_accuracy

# CNN feature sizes to compare; the RNN hidden size is half of each
sizes = [16, 32, 64, 128, 256, 512, 1024, 2048]
# run configuration is identical for every model size, so it is set once
# use the GPU only when CUDA is present, so the run falls back to the CPU
# instead of failing in .cuda()
GPU = torch.cuda.is_available()
num_epochs = 30
batch_size = 10
learning_rate = 0.0001
criterion = nn.NLLLoss()
for s in sizes:
    crnn = CRNN(cnn_hidden=s, rnn_hidden=s//2)
    print(crnn)

    net = crnn
    # a fresh optimizer is bound to the parameters of each new model
    optimizer = torch.optim.Adam(net.parameters(), learning_rate)

    # start timer
    start = time.time()
    # training function
    training_losses, validation_losses, training_accuracy, validation_accuracy = train(net,
            num_epochs, batch_size, learning_rate, criterion, optimizer, GPU)
    print('Elapsed Time:', time.time() - start)

    # plot figures: losses on the left, accuracies on the right
    plt.figure(figsize=(10,10))
    plt.subplot(121)
    plt.plot(training_losses, label='train')
    plt.plot(validation_losses, label='validation')
    plt.title('Losses')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(loc='lower right')
    plt.subplot(122)
    plt.plot(training_accuracy, label='train')
    plt.plot(validation_accuracy, label='valid')
    # accuracies are percentages
    plt.ylim(0, 100)
    plt.title('Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.show()