## Mount Google Drive For Saving Results

In [None]:
# Mount Google Drive at /content/drive. args.weights_dir ('/content/drive/My Drive')
# lives under this mount, so checkpoints written there end up in Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Download Datasets

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
!kaggle competitions download -c 11-785-fall-20-homework-4-part-2

Downloading train.npy.zip to /content
100% 3.35G/3.36G [00:45<00:00, 87.9MB/s]
100% 3.36G/3.36G [00:45<00:00, 79.3MB/s]
Downloading dev.npy.zip to /content
 95% 173M/182M [00:03<00:00, 46.3MB/s]
100% 182M/182M [00:03<00:00, 47.9MB/s]
Downloading test.npy.zip to /content
 93% 170M/183M [00:04<00:00, 26.2MB/s]
100% 183M/183M [00:04<00:00, 44.8MB/s]
Downloading train_transcripts.npy.zip to /content
  0% 0.00/3.76M [00:00<?, ?B/s]
100% 3.76M/3.76M [00:00<00:00, 124MB/s]
Downloading sample.csv to /content
  0% 0.00/16.8k [00:00<?, ?B/s]
100% 16.8k/16.8k [00:00<00:00, 18.6MB/s]
Downloading dev_transcripts.npy to /content
  0% 0.00/784k [00:00<?, ?B/s]
100% 784k/784k [00:00<00:00, 50.8MB/s]


In [None]:
# Extract the downloaded archives into the working directory.
# dev_transcripts.npy and sample.csv were downloaded uncompressed, so they need no unzip.
!unzip dev.npy.zip
!unzip train.npy.zip
!unzip train_transcripts.npy.zip
!unzip test.npy.zip

Archive:  dev.npy.zip
  inflating: dev.npy                 
Archive:  train.npy.zip
  inflating: train.npy               
Archive:  train_transcripts.npy.zip
  inflating: train_transcripts.npy   
Archive:  test.npy.zip
  inflating: test.npy                


## Import Libraries

In [None]:
import time
import os
import csv
from datetime import datetime
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils as utils
from torch.utils.data import DataLoader, Dataset, TensorDataset
import torch.nn.utils.rnn as rnn
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
import torch.optim as optim
import torch.optim.lr_scheduler
from torchvision import datasets, transforms
from torch.autograd import Variable


from matplotlib.lines import Line2D
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.image as mpimg
from torch.distributions.gumbel import Gumbel
import pickle as pk

## Set Arguments

In [None]:
class Args():
  """Container for the paths and hyper-parameters read by the cells below."""

  def __init__(self):
    defaults = dict(
        # Paths. weights_dir is where checkpoints are written and read back.
        data_dir='./',
        weights_dir='/content/drive/My Drive',
        # Optimisation: Adam learning rate, gradient-clipping value, epoch count.
        lr=1e-3,
        clip=0.25,
        epochs=30,
        # Batch sizes handed to the train / dev DataLoaders.
        train_batch_size=20,
        dev_batch_size=1,
        # Model sizes passed to Encoder / Attention / Decoder.
        hidden_dim=256,
        context_dim=128,
        key_query_dim=128,
        listener_feature_dim=512,
        # Number of decoding steps when no ground truth is supplied.
        max_iters=600,
        # Locked-dropout rates passed to the Encoder.
        dropout=0.2,
        dropouth=0.1,
        dropouti=0.3,
        # Reproducibility, device selection, logging cadence, Adam weight decay.
        seed=11785,
        cuda=True,
        log_interval=40,
        wdecay=1.2e-6,
    )
    for name, value in defaults.items():
      setattr(self, name, value)


args = Args()

## Utility Functions

In [None]:
def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between two sequences.

    Insertions, deletions and substitutions each cost 1. The result is read
    from a float NumPy table, so it is returned as a NumPy float.
    """
    n_rows, n_cols = len(seq1) + 1, len(seq2) + 1
    table = np.zeros((n_rows, n_cols))
    # Distance from/to the empty prefix is the prefix length.
    table[:, 0] = np.arange(n_rows)
    table[0, :] = np.arange(n_cols)

    for i, left in enumerate(seq1, start=1):
        for j, right in enumerate(seq2, start=1):
            substitution_cost = 0 if left == right else 1
            table[i, j] = min(
                table[i - 1, j] + 1,                      # deletion
                table[i - 1, j - 1] + substitution_cost,  # match / substitution
                table[i, j - 1] + 1,                      # insertion
            )
    return table[-1, -1]

In [None]:
def to_float_tensor(numpy_array):
    """Wrap a NumPy array as a torch tensor and cast it to float32."""
    tensor = torch.from_numpy(numpy_array)
    return tensor.to(torch.float32)

def to_long_tensor(numpy_array):
    """Wrap a NumPy array as a torch tensor and cast it to int64."""
    tensor = torch.from_numpy(numpy_array)
    return tensor.to(torch.int64)

def to_int_tensor(numpy_array):
    """Wrap a NumPy array as a torch tensor and cast it to int32."""
    tensor = torch.from_numpy(numpy_array)
    return tensor.to(torch.int32)

def to_tensor(numpy_array):
    """Wrap a NumPy array as a torch tensor, keeping its dtype."""
    tensor = torch.from_numpy(numpy_array)
    return tensor

def to_np(x):
    """Return the tensor's values as a NumPy array on the CPU, detached from autograd."""
    detached = x.detach()
    return detached.cpu().numpy()

def to_variable(tensor):
    """Move the tensor to the GPU when one is available and wrap it in a Variable."""
    placed = tensor.cuda() if torch.cuda.is_available() else tensor
    return torch.autograd.Variable(placed)

In [None]:
def plot_grad_flow(named_parameters, path):
    """Plot mean and max absolute gradient per non-bias parameter and save the figure.

    Call this after ``backward()`` and before the gradients are cleared.

    Args:
        named_parameters: iterable of ``(name, parameter)`` pairs, e.g.
            ``model.named_parameters()``.
        path: file path the figure is written to.

    Returns:
        ``(plt, max_grads)`` where ``max_grads`` is the list of per-parameter
        max-abs-gradient tensors, in plotting order.
    """
    ave_grads = []
    max_grads = []
    layers = []
    for n, p in named_parameters:
        # Skip frozen parameters, biases, and parameters that never received a
        # gradient (p.grad is None) - dereferencing those would raise.
        if not p.requires_grad or "bias" in n or p.grad is None:
            continue
        grad_abs = p.grad.detach().abs()
        layers.append(n)
        ave_grads.append(grad_abs.mean())
        max_grads.append(grad_abs.max())

    # Matplotlib cannot consume CUDA tensors, so plot plain Python floats.
    ave_heights = [g.item() for g in ave_grads]
    max_heights = [g.item() for g in max_grads]

    plt.bar(np.arange(len(max_heights)), max_heights, alpha=0.1, lw=1, color="c")
    plt.bar(np.arange(len(max_heights)), ave_heights, alpha=0.1, lw=1, color="b")
    plt.hlines(0, 0, len(ave_heights) + 1, lw=2, color="k")
    plt.xticks(range(0, len(ave_heights), 1), layers, rotation="vertical")
    plt.xlim(left=0, right=len(ave_heights))
    plt.ylim(bottom=-0.001, top=0.02)  # zoom in on the lower gradient regions
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
    plt.legend([Line2D([0], [0], color="c", lw=4),
                Line2D([0], [0], color="b", lw=4),
                Line2D([0], [0], color="k", lw=4)], ['max-gradient', 'mean-gradient', 'zero-gradient'])
    # Save before show(): once the figure has been shown, savefig() writes an empty canvas.
    plt.savefig(path)
    plt.show()
    return plt, max_grads

In [None]:
def cer(s1, s2):
    """Edit distance between two strings after removing every space from both."""
    without_spaces = [text.replace(' ', '') for text in (s1, s2)]
    return levenshtein(*without_spaces)

def labels2str(labels, label_sizes):
    """Turn each label row into a string, using only its first ``size`` entries.

    Indices are mapped to symbols through the module-level ``idx2chr`` table.
    """
    return [
        "".join(idx2chr[index] for index in row[:size])
        for row, size in zip(labels, label_sizes)
    ]

def label_list_to_str(labels):
    """Turn each full label sequence into a string via the module-level ``idx2chr`` table."""
    return ["".join(idx2chr[index] for index in row) for row in labels]

In [None]:
def greedy_decode(probs):
    """Pick the highest-scoring symbol at every step of every utterance.

    Decoding of an utterance stops after the first ``'<eos>'``; unlike
    ``greedySearch`` the ``'<eos>'`` marker itself is kept in the string.
    """
    decoded = []
    for utterance in probs:
        symbols = []
        for index in torch.argmax(utterance, dim=-1).tolist():
            symbol = idx2chr[index]
            symbols.append(symbol)
            if symbol == '<eos>':
                break
        decoded.append("".join(symbols))
    return decoded

## Dataloaders

In [None]:
class Speech2Text_Dataset(Dataset):
    """Pairs each utterance's frames with its character-level label sequence.

    Args:
        speech: indexable collection of per-utterance frame arrays; must expose
            ``.shape[0]`` as the number of utterances.
        text: optional collection of transcripts, each an array of byte-string
            words. ``None`` for unlabeled (test) data.

    Attributes:
        total_labels: total number of labels across the dataset (characters of
            the space-joined transcript plus one ``<eos>`` per utterance), or
            ``-1`` when there are no transcripts.
    """

    def __init__(self, speech, text=None):
        self.speech = speech
        self.text = text
        if text is not None:
            # Count labels exactly as __getitem__ produces them. Counting
            # len(transcript) would count *words*, making per-label metrics
            # inconsistent with the label counts used during training.
            self.total_labels = sum(len(self._decode_transcript(y)) + 1 for y in text)
        else:
            self.total_labels = -1

    @staticmethod
    def _decode_transcript(words):
        """Join an array of byte-string words with single spaces and decode as UTF-8."""
        return b' '.join(words.tolist()).decode("utf-8")

    def __getitem__(self, idx):
        """Return ``(frames, labels)``: float frames and int labels ending in ``<eos>``.

        Without transcripts the labels tensor is the single placeholder ``[-1]``.
        """
        frames = self.speech[idx]
        if self.text is None:
            labels = [-1]
        else:
            sentence = self._decode_transcript(self.text[idx])
            labels = [chr2idx[c] for c in sentence]
            labels.append(chr2idx['<eos>'])
        return to_float_tensor(frames), to_int_tensor(np.array(labels))

    def __len__(self):
        return self.speech.shape[0]


def Speech2Text_collate(batch):
    """Pad a list of ``(frames, label)`` samples into batch tensors.

    Samples are ordered by decreasing frame count. Returns
    ``(pack, seq_sizes, all_labels, label_sizes)`` where ``pack`` is the
    zero-padded frame tensor laid out as (time, batch, channels), ``seq_sizes``
    is a list of frame counts, ``all_labels`` is the zero-padded int64 label
    matrix (batch, max_label_len) and ``label_sizes`` holds the int32 label lengths.
    """
    ordered = sorted(batch, key=lambda sample: sample[0].size(0), reverse=True)
    n_items = len(ordered)
    longest_frames, n_channels = ordered[0][0].size(0), ordered[0][0].size(1)
    longest_label = max(label.size(0) for _, label in ordered)

    pack = torch.zeros(longest_frames, n_items, n_channels)
    all_labels = torch.zeros(n_items, longest_label).long()
    label_sizes = torch.zeros(n_items).int()
    seq_sizes = []

    for row, (frames, label) in enumerate(ordered):
        n_frames, n_labels = frames.size(0), label.size(0)
        seq_sizes.append(n_frames)
        label_sizes[row] = n_labels
        pack[:n_frames, row, :] = frames
        all_labels[row, :n_labels] = label
    return pack, seq_sizes, all_labels, label_sizes

In [None]:
# Load the extracted arrays from the working directory.
# allow_pickle=True is needed to load object arrays but unpickles arbitrary
# data - only use it on files from a trusted source.
# encoding='bytes' keeps pickled Python 2 strings as bytes; the transcripts are
# decoded with UTF-8 later on.
speech_train = np.load('train.npy', allow_pickle=True, encoding='bytes')
speech_valid = np.load('dev.npy', allow_pickle=True, encoding='bytes')
speech_test = np.load('test.npy', allow_pickle=True, encoding='bytes')

# Transcripts exist for the train and dev splits only.
transcript_train = np.load('train_transcripts.npy', allow_pickle=True,encoding='bytes')
transcript_valid = np.load('dev_transcripts.npy', allow_pickle=True,encoding='bytes')

In [None]:
# Sorted set of every character that occurs in the training transcripts.
# Words are concatenated without separators here, so the space character is not included.
vocab = sorted(list(set(''.join([''.join([p.decode("utf-8") for p in transcript_train[i]]) for i in range(transcript_train.shape[0])]))))
# Bare expression: display the result as the cell output.
vocab

["'",
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [None]:
# Output symbol table; a symbol's position in this list is its class index.
# Index 0 ('<sos>') doubles as the padding value: the collate function zero-fills
# label tensors and the decoder embedding is built with padding_idx=0.
# '<eos>' is last; the dataset appends it to every label sequence.
letter_list = ['<sos>',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z', "'", '"', ' ','<eos>']

In [None]:
def create_dictionaries(letter_list):
    """Build the two lookup tables for a symbol list.

    Returns ``(letter2index, index2letter)``: symbol -> position and
    position -> symbol. If a symbol is repeated, its last position wins in
    ``letter2index``.
    """
    letter2index, index2letter = {}, {}
    for index, letter in enumerate(letter_list):
        letter2index[letter] = index
        index2letter[index] = letter
    return letter2index, index2letter

In [None]:
# Module-level lookup tables (symbol -> index, index -> symbol) used by the
# dataset, decoding helpers, decoder and loss defined in this notebook.
chr2idx, idx2chr = create_dictionaries(letter_list)

In [None]:
# Worker processes and pinned host memory are only requested when CUDA is enabled in args.
kwargs = {'num_workers': 3, 'pin_memory': True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
    Speech2Text_Dataset(speech_train, transcript_train),
    batch_size=args.train_batch_size, shuffle=True, collate_fn=Speech2Text_collate, **kwargs)
# NOTE(review): the dev loader is shuffled too; the aggregated metrics do not
# depend on order, but which utterances get their attention plotted does.
dev_loader = torch.utils.data.DataLoader(
    Speech2Text_Dataset(speech_valid, transcript_valid),
    batch_size=args.dev_batch_size, shuffle=True, collate_fn=Speech2Text_collate, **kwargs)
# The test loader is unshuffled with batch_size=1: predict() uses a running
# counter as the submission id, so iteration order must match dataset order.
test_loader = torch.utils.data.DataLoader(
    Speech2Text_Dataset(speech_test, None),
    batch_size=1, shuffle=False, collate_fn=Speech2Text_collate, **kwargs)

## Model

In [None]:
class LockedDropout(nn.Module):
    """Dropout that reuses one mask along dimension 1 of a 3-D input.

    A single Bernoulli mask of shape ``(x.size(0), 1, x.size(2))`` is drawn per
    call and broadcast over dimension 1; kept entries are scaled by
    ``1 / (1 - dropout)``. In eval mode, or with ``dropout == 0``, the input is
    returned unchanged.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, dropout=0.5):
        if not self.training or dropout == 0:
            return x
        keep_prob = 1 - dropout
        mask = x.new_empty(x.size(0), 1, x.size(2)).bernoulli_(keep_prob)
        scaled_mask = mask / keep_prob
        return scaled_mask.expand_as(x) * x

In [None]:
class pBLSTMLayer(nn.Module):
    """Bidirectional LSTM that halves the time resolution of its input.

    Consecutive pairs of time steps are concatenated along the feature axis
    (a trailing unpaired step is dropped) before the BLSTM runs, so an input of
    shape (batch, T, F) yields an output of shape (batch, T // 2, 2 * hidden_dim).
    """

    def __init__(self, input_feature_dim, hidden_dim, dropout_rate=0.0):
        super(pBLSTMLayer, self).__init__()
        self.BLSTM = nn.LSTM(
            input_feature_dim * 2,
            hidden_dim,
            1,
            bidirectional=True,
            dropout=dropout_rate,
            batch_first=True,
        )

    def forward(self, input_x):
        batch_size, timestep, feature_dim = input_x.size(0), input_x.size(1), input_x.size(2)
        # Largest even number of steps; an odd final step has no partner and is discarded.
        paired_steps = timestep - (timestep % 2)
        trimmed = input_x[:, :paired_steps, :]
        # Reduce time resolution: merge every two adjacent steps into one wider step.
        merged = trimmed.contiguous().view(batch_size, paired_steps // 2, feature_dim * 2)
        return self.BLSTM(merged)

In [None]:
class Encoder(nn.Module):
    """Listener: batch norm, one BLSTM, then three pyramidal BLSTM layers.

    Locked dropout is applied after the first BLSTM (``dropouti``), after the
    first two pyramidal layers (``dropouth``) and after the last one
    (``dropout``). ``dropout_rate`` is forwarded to the pyramidal layers.
    """

    def __init__(self, input_feature_dim, hidden_dim, dropout_rate=0.0, dropout=0, dropouth=0, dropouti=0):
        super(Encoder, self).__init__()
        self.bn1 = nn.BatchNorm1d(input_feature_dim)

        self.lstm = nn.LSTM(
            input_size=input_feature_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )

        # Each pyramidal layer receives the 2 * hidden_dim output of a bidirectional layer.
        pyramid_in = hidden_dim * 2
        self.pBLSTM1 = pBLSTMLayer(pyramid_in, hidden_dim, dropout_rate=dropout_rate)
        self.pBLSTM2 = pBLSTMLayer(pyramid_in, hidden_dim, dropout_rate=dropout_rate)
        self.pBLSTM3 = pBLSTMLayer(pyramid_in, hidden_dim, dropout_rate=dropout_rate)

        self.lockdrop = LockedDropout()
        self.dropouti = dropouti
        self.dropouth = dropouth
        self.dropout = dropout

    def forward(self, frames, seq_sizes):
        """Encode padded frames.

        Args:
            frames: tensor laid out as (time, batch, channels).
            seq_sizes: per-utterance frame counts.

        Returns:
            ``(output, out_seq_sizes)``: batch-first encoder features and the
            frame counts integer-divided by 8.
        """
        # (time, batch, channels) -> (batch, channels, time) for BatchNorm1d,
        # then -> (batch, time, channels) for the batch-first LSTMs.
        normalised = self.bn1(frames.permute(1, 2, 0).contiguous())
        output = normalised.permute(0, 2, 1)

        output, _ = self.lstm(output)
        output = self.lockdrop(output, self.dropouti)

        pyramid = (
            (self.pBLSTM1, self.dropouth),
            (self.pBLSTM2, self.dropouth),
            (self.pBLSTM3, self.dropout),
        )
        for layer, rate in pyramid:
            output, _ = layer(output)
            output = self.lockdrop(output, rate)

        # Three halvings shorten every sequence 8x.
        out_seq_sizes = [size // 8 for size in seq_sizes]

        return output, out_seq_sizes

In [None]:
def SequenceWise(input_module, input_x):
    """Apply ``input_module`` to every time step of a (batch, time, feature) tensor.

    The first two axes are flattened into one, the module is applied once, and
    the result is reshaped back to (batch, time, -1).
    """
    batch_size, time_steps = input_x.shape[0], input_x.shape[1]
    flattened = input_x.contiguous().view(-1, input_x.shape[-1])
    return input_module(flattened).view(batch_size, time_steps, -1)


class Attention(nn.Module):
    """Dot-product attention between a decoder state and the listener features.

    Query, key and value are linear projections followed by LeakyReLU(0.2).
    Scores are soft-maxed over time, positions beyond each utterance's valid
    length are zeroed, and the remaining scores are renormalised to sum to one.
    """

    def __init__(self, key_query_dim=128, speller_query_dim=256, listener_feature_dim=512, context_dim=128):
        super(Attention, self).__init__()
        self.softmax = nn.Softmax(dim=1)
        self.fc_query = nn.Linear(speller_query_dim, key_query_dim)
        self.fc_key = nn.Linear(listener_feature_dim, key_query_dim)
        self.fc_value = nn.Linear(listener_feature_dim, context_dim)
        self.activate = torch.nn.LeakyReLU(negative_slope=0.2)

    def forward(self, decoder_state, listener_feature, seq_sizes):
        """Return ``(attention_score, context)`` for one decoding step.

        Args:
            decoder_state: (batch, speller_query_dim) query source.
            listener_feature: (batch, time, listener_feature_dim) encoder output.
            seq_sizes: valid number of time steps per utterance.
        """
        query = self.activate(self.fc_query(decoder_state)).unsqueeze(1)
        key = self.activate(SequenceWise(self.fc_key, listener_feature))
        energy = torch.bmm(query, key.transpose(1, 2)).squeeze(dim=1)

        # 1 for valid time steps, 0 for padding.
        mask = torch.zeros_like(energy)
        for row, valid_steps in enumerate(seq_sizes):
            mask[row, :valid_steps] = 1

        masked_score = mask * self.softmax(energy)
        attention_score = masked_score / masked_score.sum(dim=1, keepdim=True)

        value = self.activate(self.fc_value(listener_feature))
        context = torch.bmm(attention_score.unsqueeze(1), value).squeeze(dim=1)

        return attention_score, context

In [None]:
class Decoder(nn.Module):
    """Speller: attention-driven character decoder built from stacked LSTM cells.

    Each step embeds the previous symbol, attends over the listener features
    using the top layer's previous hidden state, runs the LSTM-cell stack, and
    maps ``[top hidden ; context]`` to class logits. The output projection
    shares its weight matrix with the input embedding.

    Args:
        n_classes: size of the output vocabulary.
        hidden_dim: embedding size and hidden size of every LSTM cell.
        layer_size: number of stacked LSTM cells.
        attention: module called as ``attention(state, listener_feature, seq_sizes)``
            returning ``(attention_score, context)``.
        context_dim: size of the context vector returned by ``attention``.
    """

    def __init__(self, n_classes, hidden_dim, layer_size, attention, context_dim):
        super(Decoder, self).__init__()
        self.n_classes = n_classes
        self.rnn_unit = nn.LSTMCell
        self.rnn_layer = torch.nn.ModuleList()

        # The first cell consumes [symbol embedding ; context]; deeper cells
        # consume the hidden state of the cell below. Every layer has learnable
        # initial hidden and cell states.
        self.rnn_layer.append(self.rnn_unit(hidden_dim + context_dim, hidden_dim))
        rnn_inith = [torch.nn.Parameter(torch.rand(1, hidden_dim))]
        rnn_initc = [torch.nn.Parameter(torch.rand(1, hidden_dim))]

        for i in range(1, layer_size):
            self.rnn_layer.append(self.rnn_unit(hidden_dim, hidden_dim))
            rnn_inith.append(torch.nn.Parameter(torch.rand(1, hidden_dim)))
            rnn_initc.append(torch.nn.Parameter(torch.rand(1, hidden_dim)))

        self.rnn_inith = torch.nn.ParameterList(rnn_inith)
        self.rnn_initc = torch.nn.ParameterList(rnn_initc)

        self.attention = attention
        self.embed = nn.Embedding(n_classes, hidden_dim, padding_idx=0)
        self.fc = nn.Linear(hidden_dim + context_dim, hidden_dim)
        self.activate = torch.nn.LeakyReLU(negative_slope=0.2)
        self.unembed = nn.Linear(hidden_dim, n_classes)
        self.unembed.weight = self.embed.weight  # Weight tying
        self.character_distribution = nn.Sequential(self.fc, self.activate, self.unembed)

    def forward(self, listener_feature, seq_sizes, max_iters, ground_truth=None, teacher_force_rate=0.9, dropout=None):
        """Decode a batch of utterances.

        Args:
            listener_feature: batch-first encoder output (batch, time, feature).
            seq_sizes: valid encoder steps per utterance, forwarded to attention.
            max_iters: number of decoding steps when ``ground_truth`` is None.
            ground_truth: optional (batch, steps) label indices; when given, one
                step is decoded per label column.
            teacher_force_rate: per-step probability of feeding the ground-truth
                symbol as the next input; forced to 0 without ground truth.
            dropout: optional per-layer dropout rates for the hidden states
                passed between stacked cells (training mode only); needs one
                entry per layer.

        Returns:
            ``(logits, attentions)``: logits of shape (batch, steps, n_classes)
            and a NumPy array of attention scores indexed (step, batch, time).
        """
        if ground_truth is None:
            teacher_force_rate = 0

        batch_size = listener_feature.size(0)
        state, output_word = self.get_initial_state(batch_size)

        # One dropout mask per layer, sampled once and reused at every step.
        dropout_masks = []
        if dropout and self.training:
            h = state[0][0]
            n_layers = len(state[0])
            for i in range(n_layers):
                keep_prob = 1 - dropout[i]
                mask = h.new_empty(h.size(0), h.size(1)).bernoulli_(keep_prob) / keep_prob
                dropout_masks.append(mask)

        n_steps = ground_truth.size(1) if ground_truth is not None else max_iters
        raw_pred_seq = []
        attention_record = []
        for step in range(n_steps):
            attention_score, raw_pred, state = self.get_prediction(
                listener_feature, seq_sizes, output_word, state, dropout_masks=dropout_masks)

            attention_record.append(attention_score.cpu().detach().numpy())
            raw_pred_seq.append(raw_pred)

            if np.random.random_sample() < teacher_force_rate:
                output_word = ground_truth[:, step]
            else:
                next_scores = raw_pred
                if self.training:
                    # Gumbel noise explores alternative symbols during training only.
                    # In eval mode the fed-back symbol is the plain argmax, so
                    # inference is deterministic and matches the emitted prediction.
                    # The sample goes back to the logits' own device instead of
                    # an unconditional .cuda(), which failed on CPU-only runs.
                    noisy = Gumbel(raw_pred.detach().cpu(), torch.tensor([1.0])).sample()
                    next_scores = noisy.to(raw_pred.device)
                output_word = torch.max(next_scores, dim=1)[1]

        return torch.stack(raw_pred_seq, dim=1), np.array(attention_record)

    def get_prediction(self, listener_feature, seq_sizes, last_output_word, state, dropout_masks=None):
        """Run one decoding step.

        Returns ``(attention_score, raw_pred, [new_hidden, new_cell])`` where
        ``raw_pred`` holds the class logits for this step.
        """
        output_word_emb = self.embed(last_output_word)
        hidden, cell = state[0], state[1]
        # The attention query is the top layer's hidden state from the previous step.
        last_rnn_output = hidden[-1]
        attention_score, context = self.attention(last_rnn_output, listener_feature, seq_sizes)
        rnn_input = torch.cat([output_word_emb, context], dim=1)
        new_hidden, new_cell = [None] * len(self.rnn_layer), [None] * len(self.rnn_layer)
        for l, rnn in enumerate(self.rnn_layer):
            new_hidden[l], new_cell[l] = rnn(rnn_input, (hidden[l], cell[l]))
            # Dropout only affects what is passed upward, not the stored state.
            if dropout_masks:
                rnn_input = new_hidden[l] * dropout_masks[l]
            else:
                rnn_input = new_hidden[l]
        rnn_output = new_hidden[-1]
        concat_feature = torch.cat([rnn_output, context], dim=1)
        raw_pred = self.character_distribution(concat_feature)
        return attention_score, raw_pred, [new_hidden, new_cell]

    def get_initial_state(self, batch_size):
        """Return ``([hidden, cell], first_input)`` for a batch.

        The learnable initial states are repeated across the batch. The first
        input symbol is ``<eos>``, which this model uses as its start token.
        """
        hidden = [h.repeat(batch_size, 1) for h in self.rnn_inith]
        cell = [c.repeat(batch_size, 1) for c in self.rnn_initc]
        output_word = torch.full((batch_size,), chr2idx['<eos>'], dtype=torch.long, device=hidden[0].device)
        return [hidden, cell], output_word

In [None]:
class Seq2Seq(nn.Module):
    """Full model: listener (Encoder) + attention + speller (Decoder).

    Sizes and encoder dropout rates come from the module-level ``args``. The
    attention module is shared: it is registered here and also handed to the speller.
    """

    def __init__(self):
        super(Seq2Seq, self).__init__()
        self.listener = Encoder(
            40,
            args.hidden_dim,
            dropouti=args.dropouti,
            dropouth=args.dropouth,
            dropout=args.dropout,
        )
        self.attention = Attention(
            key_query_dim=args.key_query_dim,
            speller_query_dim=args.hidden_dim,
            listener_feature_dim=args.listener_feature_dim,
            context_dim=args.context_dim,
        )
        self.speller = Decoder(len(idx2chr), args.hidden_dim, 3, self.attention, context_dim=args.context_dim)

    def forward(self, frames, seq_sizes, labels, max_iters=args.max_iters):
        """Return ``(outputs, attentions)`` from the speller.

        ``labels`` may be None, in which case the speller decodes ``max_iters``
        steps without teacher forcing.
        """
        encoded, encoded_sizes = self.listener(frames, seq_sizes)
        return self.speller(
            encoded,
            encoded_sizes,
            max_iters,
            labels,
            teacher_force_rate=0.9,
            dropout=[0.2, 0.2, 0.3],
        )

## Sequence to Sequence Cross Entropy Loss

In [None]:
def find_first_eos_in_pred(pred):
    """Return the first step whose highest-scoring class is ``<eos>``.

    ``pred`` holds one row of class scores per step. When no step predicts
    ``<eos>`` the number of steps is returned instead.
    """
    predicted = pred.max(1)[1].data.cpu().numpy()
    eos_steps = np.flatnonzero(predicted == chr2idx['<eos>'])
    if eos_steps.size:
        return int(eos_steps[0])
    return len(predicted)

class SequenceCrossEntropyLoss(torch.nn.Module):
    """Summed cross-entropy between decoder logits and padded label sequences.

    For every utterance the predictions and labels are brought to a common
    length that depends on where the model first predicts ``<eos>`` (see the
    branch comments in ``forward``). All utterances are then concatenated and
    scored with a single ``cross_entropy(..., reduction='sum')`` call.
    """
    def __init__(self):
        super().__init__()

    def forward(self, preds, label_sizes, labels):
        """Return the summed loss for one batch.

        Args:
            preds: logits indexed as (utterance, step, class).
            label_sizes: number of valid labels for each utterance.
            labels: label indices per utterance; entries beyond the valid
                length are never read.

        Returns:
            Scalar tensor: cross-entropy summed over all aligned positions.
        """
        pred_list = []
        label_list = []
        # Number of decoding steps; find_first_eos_in_pred returns this value
        # when no step predicts <eos>.
        max_iter = preds.size(1)
        for (pred, label, num_iter) in zip(preds, labels, label_sizes):
            pred_for_loss = []
            label_for_loss = []
            eos_idx = find_first_eos_in_pred(pred)

            if eos_idx < num_iter:
                # <eos> predicted before the reference ends: keep the steps
                # before it, then repeat the <eos> step's logits so that every
                # remaining reference label is scored against that step.
                if eos_idx != 0:
                    pred_for_loss.append(pred[:eos_idx])
                pred_for_loss += [pred[eos_idx:eos_idx + 1]] * (num_iter - eos_idx)
                label_for_loss.append(label[:num_iter])

            elif eos_idx == max_iter:
                # No <eos> predicted at all: score every step, extending the
                # reference by repeating its last valid label.
                pred_for_loss.append(pred[:eos_idx])
                label_for_loss.append(label[:num_iter])
                label_for_loss += [label[num_iter - 1:num_iter]] * (eos_idx - num_iter)

            else:
                # <eos> predicted at or after the end of the reference: score
                # the steps up to and including it, extending the reference by
                # repeating its last valid label.
                pred_for_loss.append(pred[:eos_idx + 1])
                label_for_loss.append(label[:num_iter])
                label_for_loss += [label[num_iter - 1:num_iter]] * (eos_idx + 1 - num_iter)

            pred_list.append(torch.cat(pred_for_loss))
            label_list.append(torch.cat(label_for_loss))

        preds_batch = torch.cat(pred_list)
        labels_batch = torch.cat(label_list)
        loss = torch.nn.functional.cross_entropy(preds_batch, labels_batch, reduction='sum')
        return loss

## Train Method

In [None]:
def train(epoch, model, optimizer, criterion, loader):
    """Train the model for one epoch, logging every ``args.log_interval`` batches.

    Args:
        epoch: epoch number, used only in the log line.
        model: network called as ``model(frames, seq_sizes, labels)``.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(outputs, label_sizes, labels)``.
        loader: iterable yielding ``(frames, seq_sizes, labels, label_sizes)``.
    """
    model.train()
    sum_loss, sum_labels = 0, 0
    start_time = time.time()
    optimizer.zero_grad()

    for batch, (frames, seq_sizes, labels, label_sizes) in enumerate(loader, start=1):
        # Keep the running label count a plain int rather than a tensor.
        sum_labels += int(sum(label_sizes))

        if torch.cuda.is_available():
            frames, labels = frames.cuda(), labels.cuda()

        outputs, attentions = model(frames, seq_sizes, labels)
        loss = criterion(outputs, label_sizes, labels)
        loss.backward()

        sum_loss += loss.item()

        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)

        optimizer.step()

        if batch % args.log_interval == 0:
            # Plot while the gradients of this batch still exist, i.e. before
            # zero_grad() clears them.
            plot_grad_flow(model.named_parameters(), 'grad_flow.png')
            elapsed = time.time() - start_time
            avg_loss = sum_loss / sum_labels
            print('| epoch {:3d} | {:5d}/{:5d} batches ({:5.2f}%) | lr {:.2e} | {:3.0f} ms/utter | loss/utter {:5.2f} | loss/label {:5.4f}'
                    .format(epoch, batch, len(loader), (100.0 * batch / len(loader)),
                            optimizer.param_groups[0]['lr'],
                            elapsed * 1000.0 / (args.log_interval * args.train_batch_size),
                            sum_loss / (args.log_interval * args.train_batch_size),
                            avg_loss))
            sum_loss, sum_labels = 0, 0
            start_time = time.time()

        optimizer.zero_grad()




## Validation Method

In [None]:
def evaluate(model, criterion, loader):
    """Evaluate the model on a labelled loader.

    Every ``args.log_interval`` batches the attention map of the first
    utterance in the batch is displayed.

    Args:
        model: network called as ``model(frames, seq_sizes, labels)``.
        criterion: loss called as ``criterion(outputs, label_sizes, labels)``.
        loader: iterable yielding ``(frames, seq_sizes, labels, label_sizes)``
            whose ``dataset`` exposes ``total_labels``.

    Returns:
        ``(loss_per_utterance, loss_per_label, cer)`` where ``cer`` is the
        summed edit distance times 100 divided by ``total_labels``.
    """
    model.eval()
    total_loss = 0
    total_cer = 0

    with torch.no_grad():
        for batch, (frames, seq_sizes, labels, label_sizes) in enumerate(loader):
            if args.cuda:
                frames, labels = frames.cuda(), labels.cuda()

            outputs, attentions = model(frames, seq_sizes, labels)
            loss = criterion(outputs, label_sizes, labels)

            total_loss += loss.item()

            decoded = greedy_decode(outputs)
            labels_str = labels2str(to_np(labels), label_sizes)
            for reference, hypothesis in zip(labels_str, decoded):
                total_cer += cer(reference, hypothesis)

            if batch % args.log_interval == 0:
                print(attentions.shape)
                # attentions is indexed (step, utterance, time). Selecting the
                # first utterance works for any batch size, whereas
                # np.squeeze(axis=1) raised unless the batch size was exactly 1.
                plt.matshow(attentions[:, 0, :])
                plt.show()

    total_labels = loader.dataset.total_labels
    return total_loss / (len(loader) * args.dev_batch_size), total_loss / total_labels, total_cer * 100.0 / total_labels

## Training & Saving Model

In [None]:
# Device string for the current runtime.
# NOTE(review): the visible cells select the device through args.cuda and
# .cuda() calls and do not read this variable.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Seed every RNG that training draws from.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
  torch.cuda.manual_seed(args.seed)

best_cer = 9999.9999
model = Seq2Seq()
# To resume from a saved checkpoint, uncomment and point at the desired file:
#model.load_state_dict(torch.load(args.weights_dir+"/009_48.8567.w"))
if args.cuda:
  model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
criterion = SequenceCrossEntropyLoss()
# Lower the learning rate when the validation loss per label stops improving.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, threshold=0.01, verbose=True)

# Make sure the checkpoint directory exists once, before training starts.
os.makedirs(args.weights_dir, exist_ok=True)

# Epochs are numbered from 1; the upper bound is inclusive so that exactly
# args.epochs epochs run (range(1, args.epochs) stopped one epoch short).
for epoch in range(1, args.epochs + 1):
    epoch_start_time = time.time()
    train(epoch, model, optimizer, criterion, train_loader)

    val_loss_utter, val_loss_label, val_cer = evaluate(model, criterion, dev_loader)
    scheduler.step(val_loss_label)
    print('=' * 100)
    print('| end of epoch {:3d} | time: {:5.2f}s | loss/utter {:5.2f} | loss/label {:5.4f} | valid cer {:5.4f}'
          .format(epoch, (time.time() - epoch_start_time), val_loss_utter, val_loss_label, val_cer))
    print('=' * 100)

    # Checkpoint only when the validation CER improves; the file name encodes
    # the epoch and the CER.
    if val_cer < best_cer:
        best_cer = val_cer
        weight_fname = "{}/{:03d}_{}.w".format(args.weights_dir, epoch, "{:.4f}".format(val_cer))
        print("saving as", weight_fname)
        torch.save(model.state_dict(), weight_fname)
    

## Predict & Create Submission

In [None]:
def greedySearch(probs):
    """Decode each utterance by taking the highest-scoring symbol at every step.

    Decoding of an utterance stops at the first ``'<eos>'``, which is not
    included in the returned string.
    """
    transcripts = []
    for utterance in probs:
        letters = []
        for index in torch.argmax(utterance, dim=-1).tolist():
            letter = idx2chr[index]
            if letter == '<eos>':
                break
            letters.append(letter)
        transcripts.append("".join(letters))
    return transcripts

In [None]:
def predict(args, csv_fpath, weights_fpath):
    """Decode the test set with saved weights and write a submission CSV.

    Args:
        args: settings object; ``args.cuda`` selects the GPU and
            ``args.log_interval`` sets how often a decoded line is printed.
        csv_fpath: output path of the CSV (columns ``id`` and ``label``).
        weights_fpath: path of a state dict saved with ``torch.save``.

    Reads utterances from the module-level ``test_loader`` and writes the first
    decoded utterance of each batch, numbered in iteration order.
    """
    model = Seq2Seq()
    # Load onto the CPU first so a checkpoint saved from a GPU can be restored
    # on any machine; the model is moved to the GPU afterwards if requested.
    model.load_state_dict(torch.load(weights_fpath, map_location='cpu'))
    if args.cuda:
        model.cuda()

    model.eval()

    # newline='' is what the csv module expects; it prevents blank rows on
    # platforms that translate line endings.
    with open(csv_fpath, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['id', 'label'])
        writer.writeheader()
        with torch.no_grad():
            for cnt, (frames, seq_sizes, _, _) in enumerate(test_loader):
                if args.cuda:
                    frames = frames.cuda()
                outputs, attentions = model(frames, seq_sizes, None)
                decoded = greedySearch(outputs)
                s = decoded[0]
                # Collapse runs of spaces into a single space.
                while s.find('  ') != -1:
                    s = s.replace('  ', ' ')
                if cnt % args.log_interval == 0:
                    print(cnt, s)
                writer.writerow({"id": cnt, "label": s})

    print("done")

In [None]:
# Write submission.csv from a saved checkpoint. Checkpoints are named
# "<epoch>_<valid cer>.w" by the training cell, so this hard-coded file name
# must be updated to match a file that actually exists in args.weights_dir.
predict(args, "submission.csv", args.weights_dir+"/009_28.1937.w")