In [1]:
import argparse
import os
import random
import shutil
import time
import warnings
import sys
import matplotlib.pyplot as plt
import numpy as np
import cv2
import math

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import torch.utils.model_zoo as model_zoo
import torchvision
import torch.optim.lr_scheduler
from torch.utils.data.sampler import SubsetRandomSampler
from torch.autograd import Variable

from PIL import Image

In [2]:
BATCH_SIZE = 20
EPOCH = 50
GAMMA = 0.9
STEP_SIZE = 200
LR = 0.001
USE_GPU = True
decoder = ['buoy', 'dock', 'light_buoy', 'totem']
data_transform = transforms.Compose([
            transforms.Resize(227),
            #transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
])

In [3]:
list_path = '/media/arg_ws3/TOSHIBA EXT/data/trajectory/root.txt'
img_path = '/media/arg_ws3/TOSHIBA EXT/data/trajectory/images/'
ann_path = '/media/arg_ws3/TOSHIBA EXT/data/trajectory/annotations/'
model_path = '../model'
if not os.path.exists(model_path):
    os.makedirs(model_path)
data_list_file = open(list_path,'r')
raw_data_list = data_list_file.read().splitlines()
data_list = []
for data in raw_data_list:
    data_split = data.split(',')
    first_frame = data_split[0]
    data_len = int(data_split[1])
    if data_len >= 10:
        data_list.append([first_frame, data_len])

## Define Layer Modules

In [4]:
class LRN(nn.Module):
    def __init__(self, local_size=1, alpha=1.0, beta=0.75, ACROSS_CHANNELS=True):
        super(LRN, self).__init__()
        self.ACROSS_CHANNELS = ACROSS_CHANNELS
        if ACROSS_CHANNELS:
            self.average=nn.AvgPool3d(kernel_size=(local_size, 1, 1),
                    stride=1,
                    padding=(int((local_size-1.0)/2), 0, 0))
        else:
            self.average=nn.AvgPool2d(kernel_size=local_size,
                    stride=1,
                    padding=int((local_size-1.0)/2))
        self.alpha = alpha
        self.beta = beta


    def forward(self, x):
        if self.ACROSS_CHANNELS:
            div = x.pow(2).unsqueeze(1)
            div = self.average(div).squeeze(1)
            div = div.mul(self.alpha).add(1.0).pow(self.beta)
        else:
            div = x.pow(2)
            div = self.average(div)
            div = div.mul(self.alpha).add(1.0).pow(self.beta)
        x = x.div(div)
        return x
    
class Flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)
    
class alexnet_conv_layers(nn.Module):
    def __init__(self):
        super(alexnet_conv_layers, self).__init__()
        self.base_features = torchvision.models.alexnet(pretrained = True).features
        self.skip1 = nn.Sequential(
            nn.Conv2d(64, out_channels=16, kernel_size=1, stride=1),
            nn.PReLU(),
            Flatten()
        )
        self.skip2 = nn.Sequential(
            nn.Conv2d(192, out_channels=32, kernel_size=1, stride=1),
            nn.PReLU(),
            Flatten()
        )
        self.skip5 = nn.Sequential(
            nn.Conv2d(256, out_channels=64, kernel_size=1, stride=1),
            nn.PReLU(),
            Flatten()
        )
        self.conv6 = nn.Sequential(
            nn.Linear(37104 * 2, 2048),
            nn.ReLU()
        )
        
        # Freeze those weights
        for p in self.base_features.parameters():
            p.requires_grad = False
            
    def forward(self, x, y):
        layer_extractor_x = []
        layer_extractor_y = []
        for idx, model in enumerate(self.base_features):
            x = model(x)
            y = model(y)
            if idx in {2, 5, 11}: # layer output of conv1, conv2 , conv5(before pooling layer)
                layer_extractor_x.append(x)
                layer_extractor_y.append(y)
                
        x_out_flat = x.view(1, -1) #(1, 256, 6, 6) --> (1, 9216)
        x_out_skip1 = self.skip1(layer_extractor_x[0]) #(1, 64, 27, 27) -> (11664)
        x_out_skip2 = self.skip2(layer_extractor_x[1]) #(1, 192, 13, 13) -> (5408)
        x_out_skip5 = self.skip5(layer_extractor_x[2]) #(1, 256, 13, 13) -> (10816)
        x_out = torch.cat((x_out_skip1, x_out_skip2, x_out_skip5, x_out_flat), dim=1)
        
        y_out_flat = y.view(1, -1) #(1, 256, 6, 6) --> (1, 9216)
        y_out_skip1 = self.skip1(layer_extractor_y[0]) #(1, 64, 27, 27) -> (11664)
        y_out_skip2 = self.skip2(layer_extractor_y[1]) #(1, 192, 13, 13) -> (5408)
        y_out_skip5 = self.skip5(layer_extractor_y[2]) #(1, 256, 13, 13) -> (10816)
        y_out = torch.cat((y_out_skip1, y_out_skip2, y_out_skip5, y_out_flat), dim=1)
        
        final_out = torch.cat((x_out, y_out), dim=1)
        conv_out = self.conv6(final_out) # (1, 2048)
        return conv_out

In [5]:
a = Variable(torch.rand(1, 51)).cuda()
print(a.shape)
b = torch.zeros(1, 51).cuda()
print(b.shape)

torch.Size([1, 51])
torch.Size([1, 51])


In [6]:
class PredictNet(nn.Module):
    def __init__(self):
        super(PredictNet,self).__init__()
        self.LSTM_SIZE = 256
        alexnet = torchvision.models.alexnet(pretrained = True)
        self.feature = alexnet.features
        classifier = list(alexnet.classifier.children())
        self.classifier = nn.Sequential(*classifier[:-1])
        self.lstm1_x =nn.LSTMCell(4096 + 1, self.LSTM_SIZE)
        self.lstm2_x = nn.LSTMCell(self.LSTM_SIZE, self.LSTM_SIZE)
        self.lstm1_y =nn.LSTMCell(4096 + 1, self.LSTM_SIZE)
        self.lstm2_y = nn.LSTMCell(self.LSTM_SIZE, self.LSTM_SIZE)
        self.linear_x = nn.Linear(self.LSTM_SIZE, 1)
        self.linear_y = nn.Linear(self.LSTM_SIZE, 1)
        
        self.h_x1 = self.get_hidden(self.LSTM_SIZE)
        self.h_y1 = self.get_hidden(self.LSTM_SIZE)
        self.h_x2 = self.get_hidden(self.LSTM_SIZE)
        self.h_y2 = self.get_hidden(self.LSTM_SIZE)
        self.c_x1 = self.get_hidden(self.LSTM_SIZE)
        self.c_y1 = self.get_hidden(self.LSTM_SIZE)
        self.c_x2 = self.get_hidden(self.LSTM_SIZE)
        self.c_y2 = self.get_hidden(self.LSTM_SIZE)

    def init_hidden(self, num):
        self.h_x1 = self.h_x1.detach()
        self.h_y1 = self.h_y1.detach()
        self.h_x2 = self.h_x2.detach()
        self.h_y2 = self.h_y2.detach()
        self.c_x1 = self.c_x1.detach()
        self.c_y1 = self.c_y1.detach()
        self.c_x2 = self.c_x2.detach()
        self.c_y2 = self.c_y2.detach()
        
        self.h_x1 = self.get_hidden(num)
        self.h_y1 = self.get_hidden(num)
        self.h_x2 = self.get_hidden(num)
        self.h_y2 = self.get_hidden(num)
        self.c_x1 = self.get_hidden(num)
        self.c_y1 = self.get_hidden(num)
        self.c_x2 = self.get_hidden(num)
        self.c_y2 = self.get_hidden(num)

    def get_hidden(self, num):
        if USE_GPU:
            #return (Variable(torch.rand(1, num)).cuda(), Variable(torch.rand(1, num)).cuda())
            return Variable(torch.rand(1, num)).cuda()
            #return torch.zeros(1, num).cuda()
        else:
            #return (Variable(torch.rand(1, num)), Variable(torch.rand(1, num)))
            return Variable(torch.rand(1, num))
            #return torch.zeros(t1, num)
        
    def forward(self, img, input_x, input_y):
        '''h_x1 = torch.zeros(input.size(0), self.LSTM_SIZE, dtype=torch.double)
        h_y1 = torch.zeros(input.size(0), self.LSTM_SIZE, dtype=torch.double)
        h_x2 = torch.zeros(input.size(0), self.LSTM_SIZE, dtype=torch.double)
        h_y2 = torch.zeros(input.size(0), self.LSTM_SIZE, dtype=torch.double)
        c_x1 = torch.zeros(input.size(0), self.LSTM_SIZE, dtype=torch.double)
        c_y1 = torch.zeros(input.size(0), self.LSTM_SIZE, dtype=torch.double)
        c_x2 = torch.zeros(input.size(0), self.LSTM_SIZE, dtype=torch.double)
        c_y2 = torch.zeros(input.size(0), self.LSTM_SIZE, dtype=torch.double)'''
        
        img_features = self.feature(img.unsqueeze(0))
        img_features = img_features.view(img_features.size(0), -1)
        img_features = self.classifier(img_features).view(-1)
        x = input_x
        y = input_y
        
        cat_x = torch.cat((img_features, x), dim=0).view(1, -1)
        self.h_x1, self.c_x1 = self.lstm1_x(cat_x, (self.h_x1, self.c_x1))
        self.h_x2, self.c_x2 = self.lstm2_y(self.h_x1, (self.h_x2, self.c_x2))
        output_x = self.linear_x(self.h_x2)
        
        cat_y = torch.cat((img_features, y), dim=0).view(1, -1)
        self.h_y1, self.c_y1 = self.lstm1_y(cat_y, (self.h_y1, self.c_y1))
        self.h_y2, self.c_y2 = self.lstm2_y(self.h_y1, (self.h_y2, self.c_y2))
        output_y = self.linear_y(self.h_y2)
        
        return output_x, output_y

In [7]:
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
img_transform = transforms.Compose([
                    transforms.RandomResizedCrop(224),
                    transforms.RandomHorizontalFlip(),
                    transforms.ToTensor(),
                    normalize,
                    ])
def train_model(optimizer, criterion, net, num_epochs):
    for epoch in range(num_epochs):
        if epoch != 0 and epoch % 5 == 0:
            adjust_learning_rate(optim)
        curr_loss = 0.0
        dataset_size = len(data_list)
        sample_list = random.sample(range(0, dataset_size), dataset_size)
        for idx in sample_list:
            print(idx)
            frame_name = data_list[idx][0]
            data_len = data_list[idx][1]
            target_x = []
            target_y = []
            target = []
            out_x = []
            out_y = []
            out = []
            net.init_hidden(net.LSTM_SIZE)
            for frame_num in range(data_len-1):
                file = open(ann_path + frame_name + '.txt','r')
                file_split = file.read().splitlines()
                next_frame = file_split[1]
                img = Image.open(img_path + next_frame + '.jpg')
                img = img.convert('RGB')
                img = img_transform(img)
                x = torch.tensor([float(file_split[4]) - float(file_split[2])])
                y = torch.tensor([float(file_split[5]) - float(file_split[3])])
                if USE_GPU:
                    img = img.cuda()
                    x = x.cuda()
                    y = y.cuda()
                if frame_num != 0:
                    target_x = target_x + [x]
                    target_y = target_y + [y]
                output_x, output_y = net(img, x, y)
                out_x = out_x + [output_x]
                out_y = out_y + [output_y]
                frame_name = next_frame
            file = open(ann_path + frame_name + '.txt','r')
            file_split = file.read().splitlines()
            x = torch.tensor([float(file_split[4]) - float(file_split[2])])
            y = torch.tensor([float(file_split[5]) - float(file_split[3])])
            if USE_GPU:
                x = x.cuda()
                y = y.cuda()
            target_x = target_x + [x]
            target_y = target_y + [y]
                
            target_x = torch.stack(target_x, 1).squeeze(1).view(-1)
            target_y = torch.stack(target_y, 1).squeeze(1).view(-1)
            target = torch.cat((target_x, target_y))
            out_x = torch.stack(out_x, 1).squeeze(2).view(-1)
            out_y = torch.stack(out_y, 1).squeeze(2).view(-1)
            out = torch.cat((out_x, out_y)) #torch.Size([n])
            
            def clousure():
                optimizer.zero_grad()
                loss = criterion(out, target)
                print('loss:', loss.item())
                loss.backward(retain_graph = True)
                return loss
            optimizer.step(clousure)
            
    return net

np.random.seed(0)
torch.manual_seed(0)
net = PredictNet()
#net.double()
criterion = nn.MSELoss()
optimizer = torch.optim.LBFGS(net.parameters(), lr=0.1)
if USE_GPU:
    net = net.cuda()
    criterion = criterion.cuda()
net = train_model(optimizer, criterion, net, 1)

137
loss: 2.383277654647827
loss: 2.383277654647827
60
loss: 2.0790534019470215
loss: 2.0790534019470215
41
loss: 1.5399657487869263
loss: 1.5399657487869263
106
loss: 3.0443363189697266
loss: 3.0443363189697266
154
loss: 1.6094096899032593
loss: 1.6094096899032593
23
loss: 1.543656587600708
loss: 1.543656587600708
75
loss: 1.1274205446243286
loss: 1.1274205446243286
111
loss: 1.3210667371749878
loss: 1.3210667371749878
96
loss: 2.489140033721924
loss: 2.489140033721924
58
loss: 1.3314810991287231
loss: 1.3314810991287231
85
loss: 0.8806560635566711
loss: 0.8806560635566711
24
loss: 2.145580530166626
loss: 2.145580530166626
43
loss: 1.0515984296798706
loss: 1.0515984296798706
167
loss: 0.9372762441635132
loss: 0.9372762441635132
170
loss: 2.144508123397827
loss: 2.144508123397827
149
loss: 2.5602657794952393
loss: 2.5602657794952393
147
loss: 4.489132881164551
loss: 4.489132881164551
108
loss: 0.7426080107688904
loss: 0.7426080107688904
49
loss: 1.4295904636383057
loss: 1.4295904636383

RuntimeError: CUDA out of memory. Tried to allocate 144.00 MiB (GPU 0; 11.91 GiB total capacity; 10.69 GiB already allocated; 39.75 MiB free; 194.98 MiB cached)