In [1]:
from __future__ import print_function
import argparse
import torch
import torch.utils.data
from torch import nn, optim
from torch.autograd import Variable
import torch.nn as nn
from torch.nn import functional as F
from torchvision import datasets, transforms
from torchvision.utils import save_image

from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
import cv2
from os import path as osp
import numpy as np

In [4]:
#args = {"batch-size":128, "epochs": 10, "no-cuda": False, "seed":1, "log_interval": 10, "hidden_size":20, "intermediate_size":128, "widen-factor":1}

# Select the compute device: the first CUDA GPU when one is visible, else the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
#cudnn.benchmark = True


In [6]:
class MomentsDataset(Dataset):
    """Map-style dataset over the Moments in Time Mini sound clips.

    Each sample is addressed by a clip ID. The clip is read from
    ``<root>/<split dir>/<label>/<ID>.wav`` where the split directory is
    ``training-sound`` for ``mode='train'`` and ``validation-sound`` for
    ``mode='test'``.

    :param list_IDs: Sequence of clip IDs that make up this dataset.
    :param labels: Mapping from clip ID to its label (also the name of the
        directory the clip lives in).
    :param mode: ``'train'`` or ``'test'``; selects the split directory.
    :param root: Dataset root directory. Defaults to the original hard-coded
        location.
    :raises ValueError: If ``mode`` is not one of the supported values.
    """

    # Sub-directory of ``root`` holding the clips for each supported mode.
    _SPLIT_DIRS = {'train': 'training-sound', 'test': 'validation-sound'}

    def __init__(self, list_IDs, labels, mode, root='/hdd/datasets/Moments_in_Time_Mini'):
        'Initialization'
        # Fail at construction time rather than on the first __getitem__ call.
        if mode not in self._SPLIT_DIRS:
            raise ValueError(
                "mode must be one of %s, got %r" % (sorted(self._SPLIT_DIRS), mode))
        self.labels = labels
        self.list_IDs = list_IDs
        self.mode = mode
        self.root = root

    def __len__(self):
        'Number of clips in this dataset'
        return len(self.list_IDs)

    def __getitem__(self, index):
        'Generates one sample of data: (loaded clip, label)'
        # Select sample
        ID = self.list_IDs[index]
        # Use the instance's own label map and mode, not module-level globals.
        y = self.labels[ID]
        clip_path = os.path.join(self.root, self._SPLIT_DIRS[self.mode], y, ID + '.wav')

        # NOTE(review): torch.load only reads files written by torch.save; if the
        # .wav files are real audio they need an audio decoder instead - confirm.
        X = torch.load(clip_path)

        return X, y


In [10]:
# Parameters
params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 6}
max_epochs = 100

# Dictionaries
# partition['train'] / partition['validation'] list the clip IDs of each split:
#     partition['train'] = [ID1, ID2, ...]
#     partition['validation'] = [ID1, ID2, ...]
# labels maps every clip ID (from either split) to its label: labels[ID]
partition = {}
partition['train'] = []
partition['validation'] = []
labels = {}


def _index_split(split_dir, ids, labels):
    """Read ``<split_dir>/file_list.txt`` and record every clip that exists on disk.

    Each non-blank line is expected to look like ``<action>/<file name>``.
    For every listed file that is present under ``split_dir``, the file name
    without its extension is appended to ``ids`` and mapped to ``action`` in
    ``labels``. Both containers are modified in place.

    :param split_dir: Directory containing ``file_list.txt`` and the per-action
        sub-directories.
    :param ids: List that receives the clip IDs.
    :param labels: Dict that receives the ``clip ID -> action`` entries.
    """
    with open(os.path.join(split_dir, "file_list.txt")) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing empty lines in the list file.
                continue

            # split line by '/' to get the action and the file name
            parts = line.split("/")
            action = parts[0]
            clip_file = parts[1]

            if os.path.isfile(os.path.join(split_dir, action, clip_file)):
                # Drop only the final extension so names containing dots stay intact.
                clip_id = os.path.splitext(clip_file)[0]
                ids.append(clip_id)
                labels[clip_id] = action


# iterate through training-sound/file_list.txt
_index_split("/hdd/datasets/Moments_in_Time_Mini/training-sound", partition['train'], labels)

# iterate through validation-sound/file_list.txt
_index_split("/hdd/datasets/Moments_in_Time_Mini/validation-sound", partition['validation'], labels)

In [15]:
#print(partition['train'])
#print(partition['validation'])
#print(labels.values())
#print(labels)

In [None]:
# Create dataset and dataloader
# MomentsDataset needs the mode to know which split directory to read from:
# 'train' -> training-sound, 'test' -> validation-sound.
train_dataset = MomentsDataset(partition['train'], labels, 'train')
test_dataset = MomentsDataset(partition['validation'], labels, 'test')

# DataLoader takes in:
# batch_size - represents number of samples contained in each generated batch
# shuffle - if true, denotes that we get a new order of exploration at each pass
#           if false, denotes a linear exploration scheme
# num_workers - denotes the number of processes that generate batches in parallel
#               high number of workers means that CPU computations are efficiently managed
# The batch size comes from `params`; the old `args` dict is commented out and
# therefore undefined.
train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True, num_workers=1, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False, num_workers=1, pin_memory=True)

In [None]:
class EncoderRNN(nn.Module):
    """
    The encoder generates a single output vector that embodies the input sequence meaning.
    The general procedure is as follows:
        1. In each step, a token is fed to the network, which produces
         an output and a hidden state.
        2. For the next step, the hidden state and the next token are
         fed to the same network.
        3. In the end, the last output is the representative of the input sequence (called the "context vector").

    When ``bidirectional`` is True, two independent LSTMs are kept (one per
    direction) and each is stepped with its own token and state.
    """
    def __init__(self, hidden_size, input_size, batch_size, num_layers=1, bidirectional=False):
        """
        * For nn.LSTM, same input_size & hidden_size is chosen.
        :param input_size: The size of the input vocabulary
        :param hidden_size: The hidden size of the RNN.
        :param batch_size: The batch_size for mini-batch optimization (stored only;
            forward() reshapes its input to a single step of a single sequence).
        :param num_layers: Number of RNN layers. Default: 1
        :param bidirectional: If the encoder is a bi-directional LSTM. Default: False
        """
        super(EncoderRNN, self).__init__()
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.hidden_size = hidden_size

        # The input should be transformed to a vector that can be fed to the network.
        self.embedding = nn.Embedding(input_size, embedding_dim=hidden_size)

        # The LSTM layer(s) for the input. The constructor flag decides the layout;
        # no module-level `args` object is consulted.
        if self.bidirectional:
            self.lstm_forward = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers)
            self.lstm_backward = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers)
        else:
            self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers)

    @staticmethod
    def _as_state(state):
        """Return ``state`` as the (h, c) tuple nn.LSTM expects; None passes through."""
        return None if state is None else tuple(state)

    def forward(self, input, hidden):
        """
        Run one encoder step.

        :param input: Token index tensor. In bidirectional mode, a pair
            ``(input_forward, input_backward)``.
        :param hidden: ``(h, c)`` state. In bidirectional mode, either the dict
            returned by initHidden() (keys "forward"/"backward") or a pair
            ``(hidden_forward, hidden_backward)``.
        :return: Unidirectional: ``output, (h_n, c_n)``.
            Bidirectional: ``((h_n_fwd, c_n_fwd), (h_n_bwd, c_n_bwd))``.
        """
        if self.bidirectional:
            input_forward, input_backward = input
            # Unpacking a dict would yield its keys, so read the states by key.
            if isinstance(hidden, dict):
                hidden_forward, hidden_backward = hidden["forward"], hidden["backward"]
            else:
                hidden_forward, hidden_backward = hidden
            input_forward = self.embedding(input_forward).view(1, 1, -1)
            input_backward = self.embedding(input_backward).view(1, 1, -1)

            out_forward, (h_n_forward, c_n_forward) = self.lstm_forward(
                input_forward, self._as_state(hidden_forward))
            out_backward, (h_n_backward, c_n_backward) = self.lstm_backward(
                input_backward, self._as_state(hidden_backward))

            forward_state = (h_n_forward, c_n_forward)
            backward_state = (h_n_backward, c_n_backward)
            output_state = (forward_state, backward_state)

            return output_state
        else:
            # Make the data in the correct format as the RNN input: (seq_len=1, batch=1, hidden_size).
            rnn_input = self.embedding(input).view(1, 1, -1)
            # The following descriptions of shapes and tensors are extracted from the official Pytorch documentation:
            # output-shape: (seq_len, batch, num_directions * hidden_size): tensor containing the output features (h_t) from the last layer of the LSTM
            # h_n of shape (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state
            # c_n of shape (num_layers * num_directions, batch, hidden_size): tensor containing the cell state
            output, (h_n, c_n) = self.lstm(rnn_input, self._as_state(hidden))
            return output, (h_n, c_n)

    def _zero_state(self):
        """Build a fresh all-zero [h, c] pair on the same device as the module's weights."""
        module_device = next(self.parameters()).device
        return [torch.zeros(self.num_layers, 1, self.hidden_size, device=module_device),
                torch.zeros(self.num_layers, 1, self.hidden_size, device=module_device)]

    def initHidden(self):
        """
        :return: All-zero initial state. Unidirectional: ``[h_0, c_0]``.
            Bidirectional: ``{"forward": [h_0, c_0], "backward": [h_0, c_0]}``
            with independent tensors for each direction.
        """
        if self.bidirectional:
            return {"forward": self._zero_state(), "backward": self._zero_state()}
        else:
            return self._zero_state()

In [None]:
class DecoderRNN(nn.Module):
    """
    This context vector, generated by the encoder, will be used as the initial hidden state of the decoder.
    Decoding is as follows:
    1. At each step, an input token and a hidden state is fed to the decoder.
        * The initial input token is the <SOS>.
        * The first hidden state is the context vector generated by the encoder (the encoder's
    last hidden state).
    2. The first output should be the first token of the output sequence, and so on.
    3. The output token generation ends with <EOS> being generated or the predefined max_length of the output sentence.
    """
    def __init__(self, hidden_size, output_size, batch_size, num_layers=1):
        """
        :param hidden_size: Embedding size and LSTM hidden size.
        :param output_size: Size of the output vocabulary (embedding rows and
            width of the final linear projection).
        :param batch_size: The batch_size for mini-batch optimization (stored only;
            forward() reshapes its input to a single step of a single sequence).
        :param num_layers: Number of stacked LSTM layers. Default: 1
        """
        super(DecoderRNN, self).__init__()
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        # Honour num_layers so the LSTM matches the state shape built by initHidden().
        self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """
        Run one decoder step.

        :param input: Token index tensor for the current step.
        :param hidden: ``(h, c)`` LSTM state (tuple or list), or None for zeros.
        :return: ``output, (h_n, c_n)`` where output has shape (1, output_size).
        """
        # Shape the embedded token as (seq_len=1, batch=1, hidden_size).
        output = self.embedding(input).view(1, 1, -1)
        state = None if hidden is None else tuple(hidden)
        output, (h_n, c_n) = self.lstm(output, state)
        # output[0] drops the seq_len axis before projecting to the vocabulary.
        output = self.out(output[0])
        return output, (h_n, c_n)

    def initHidden(self):
        """
        The specific type of the hidden layer for the RNN type that is used (LSTM).
        The tensors are created on the same device as the module's weights.
        :return: All zero hidden state ``[h_0, c_0]``.
        """
        module_device = next(self.parameters()).device
        return [torch.zeros(self.num_layers, 1, self.hidden_size, device=module_device),
                torch.zeros(self.num_layers, 1, self.hidden_size, device=module_device)]

In [None]:
class Linear(nn.Module):
    """
    Bridge between the encoder's context vector and the decoder's initial state.

    The encoder output has width ``num_directions * hidden_size_encoder``
    (num_directions is 2 when ``bidirectional`` is true, otherwise 1). When that
    width already equals ``hidden_size_decoder`` the input is passed through
    unchanged; otherwise it is projected with a linear layer. The widths differ when:
    1. The hidden sizes of encoder and decoder are the same BUT the encoder is bidirectional.
    2. The hidden sizes of encoder and decoder are NOT the same.
    """

    def __init__(self, bidirectional, hidden_size_encoder, hidden_size_decoder):
        """
        :param bidirectional: Whether the encoder is bidirectional.
        :param hidden_size_encoder: Hidden size of the encoder (per direction).
        :param hidden_size_decoder: Hidden size of the decoder.
        """
        super(Linear, self).__init__()
        self.bidirectional = bidirectional
        # Total width of the encoder output across all directions.
        encoder_width = (int(bidirectional) + 1) * hidden_size_encoder
        self.linear_connection_op = nn.Linear(encoder_width, hidden_size_decoder)
        # True when encoder and decoder widths already agree (no projection needed).
        self.connection_possibility_status = encoder_width == hidden_size_decoder

    def forward(self, input):
        """Return ``input`` as-is when the widths agree, else its linear projection."""
        if not self.connection_possibility_status:
            return self.linear_connection_op(input)
        return input