In [None]:
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
import torch.optim as optim
import torch.nn.utils as utils
import seaborn as sns
import matplotlib.pyplot as plt
import time
import random
from torch.utils import data
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import *

from tqdm import tqdm


# Seed NumPy and PyTorch so that runs are repeatable.
np.random.seed(5111785)
torch.manual_seed(5111785)

# Pick the GPU when one is visible, otherwise fall back to the CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")
print(cuda, sys.version)

# Output vocabulary: <sos>, the 26 lowercase letters, punctuation/space, <eos>.
# The position of a symbol in this list is its integer index.
LETTER_LIST = [
    '<sos>',
    *'abcdefghijklmnopqrstuvwxyz',
    '-', "'", '.', '_', '+', ' ',
    '<eos>',
]

True 3.7.10 (default, Feb 20 2021, 21:17:23) 
[GCC 7.5.0]


Dictionaries: `letter2index` and `index2letter`

In [None]:
def create_dictionaries(letter_list):
    '''
    Build the two lookup tables used to move between symbols and indices.

    Args:
        letter_list: ordered list of symbols; a symbol's position is its index.

    Return:
        (letter2index, index2letter): symbol -> index and index -> symbol dicts.
    '''
    letter2index = {letter: position for position, letter in enumerate(letter_list)}
    index2letter = dict(enumerate(letter_list))
    return letter2index, index2letter

def transform_letter_to_index(raw_transcripts):
    '''
    Convert byte-encoded word transcripts into per-character index tensors.

    Each transcript is a sequence of byte strings (one per word). The words are
    decoded as UTF-8, joined with single spaces, mapped character by character
    through the module-level `letter2index`, and terminated with '<eos>'.
    No '<sos>' token is prepended.

    Args:
        raw_transcripts: array of transcripts with shape (N, )

    Return:
        transcripts: list of N 1-D tensors (created with torch.Tensor, i.e.
                     the default float dtype) holding the character indices.
    '''
    transcripts = []
    num_utterances = raw_transcripts.shape[0]
    for utt in range(num_utterances):
        sentence = ' '.join(word.decode("utf-8") for word in raw_transcripts[utt])
        indices = [letter2index[char] for char in sentence]
        indices.append(letter2index['<eos>'])
        transcripts.append(torch.Tensor(indices))

    return transcripts


def transform_letter_to_index_simple(raw_transcripts):
    '''
    Convert already-decoded (str) word transcripts into per-character index tensors.

    Same as `transform_letter_to_index` except that the words are used as-is
    instead of being decoded from bytes: they are joined with single spaces,
    mapped through the module-level `letter2index`, and terminated with '<eos>'.

    Args:
        raw_transcripts: array of transcripts with shape (N, )

    Return:
        transcripts: list of N 1-D tensors (created with torch.Tensor, i.e.
                     the default float dtype) holding the character indices.
    '''
    transcripts = []
    num_utterances = raw_transcripts.shape[0]
    for utt in range(num_utterances):
        sentence = ' '.join(word for word in raw_transcripts[utt])
        indices = [letter2index[char] for char in sentence]
        indices.append(letter2index['<eos>'])
        transcripts.append(torch.Tensor(indices))

    return transcripts

       
# Create the letter2index and index2letter dictionaries.
# These are module-level globals: the transcript transforms, the collate
# function and the decoder embedding all look symbols up through them.
letter2index, index2letter = create_dictionaries(LETTER_LIST)

In [None]:
class MyDataset(data.Dataset):
    '''
    Dataset of utterances with optional transcripts.

    Args:
        X: indexable collection of per-utterance feature arrays; must expose
           `.shape[0]` (the number of utterances) and items with `.astype`.
        Y: indexable collection of per-utterance label sequences, or None for
           a test set that has no labels.
    '''
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, index):
        features = torch.tensor(self.X[index].astype(np.float32))

        # For testing set, return only x.
        # `is None` (not `== None`): comparing a NumPy array with `==` is
        # element-wise and cannot be used as a boolean.
        if self.Y is None:
            return features

        # For training and validation set, return x and y.
        label = self.Y[index]
        if isinstance(label, torch.Tensor):
            # Copy an existing tensor the recommended way; torch.tensor(tensor)
            # emits a UserWarning on every item.
            label = label.clone().detach()
        else:
            label = torch.tensor(label)
        return features, label


def collate_train_val(data):
    """
    Collate a list of (x, y) samples into padded batch tensors.

    Speech features are zero-padded; transcripts are padded with the index of
    '<eos>' taken from the module-level `letter2index`.

    Return:
        pad_x: the padded x (training/validation speech data), batch first
        pad_y: the padded y (text labels - transcripts), batch first
        x_len: the length of each x before padding
        y_len: the length of each y before padding
    """
    utterances = [sample[0] for sample in data]
    transcripts = [sample[1] for sample in data]

    x_len = torch.LongTensor([len(utt) for utt in utterances])
    y_len = torch.LongTensor([len(txt) for txt in transcripts])

    pad_x = pad_sequence(utterances, batch_first=True)
    pad_y = pad_sequence(transcripts, batch_first=True, padding_value=letter2index['<eos>'])

    return pad_x, pad_y, x_len, y_len

def collate_test(data):
    """
    Collate a list of unlabeled utterances into one zero-padded batch.

    Return:
        pad_x: the padded x (testing speech data), batch first
        x_len: the length of each x before padding
    """
    utterances = list(data)
    lengths = [len(utt) for utt in utterances]
    x_len = torch.LongTensor(lengths)
    pad_x = pad_sequence(utterances, batch_first=True)
    return pad_x, x_len

In [None]:
def transform_index_to_letter(index,startindex, stopindex):
    '''
    Decode batches of index sequences back into strings.

    Args:
        index: iterable of index sequences (one per utterance)
        startindex: collection of indices to skip silently (e.g. <sos>)
        stopindex: collection of indices that end decoding of a sequence (e.g. <eos>)

    Return:
        list of decoded strings, one per sequence, looked up through the
        module-level `index2letter`.
    '''
    decoded = []
    for sequence in index:
        letters = []
        for token in sequence:
            if token in stopindex:
                # Everything after the first stop symbol is discarded.
                break
            if token in startindex:
                continue
            letters.append(index2letter[token])
        decoded.append("".join(letters))
    return decoded

# Loading dataset

In [None]:
%cd /content/gdrive/MyDrive/competitions/HW4P2
# NOTE(review): the absolute Google Drive path above ties this notebook to one
# Colab account; all file names below are resolved relative to it.
# NOTE(review): allow_pickle=True unpickles arbitrary objects, so only load
# .npy files from a trusted source.

# Load the training, validation and testing data
train_data = np.load('train.npy', allow_pickle=True, encoding='bytes')
valid_data = np.load('dev.npy', allow_pickle=True, encoding='bytes')
test_data = np.load('test.npy', allow_pickle=True, encoding='bytes')

# Load the training, validation raw text transcripts
# (loaded with encoding='bytes'; transform_letter_to_index decodes each word as UTF-8)
raw_train_transcript = np.load('train_transcripts.npy', allow_pickle=True,encoding='bytes')
raw_valid_transcript = np.load('dev_transcripts.npy', allow_pickle=True,encoding='bytes')


/content/gdrive/MyDrive/competitions/HW4P2


In [None]:
# Convert the raw text transcripts into per-character index tensors
# (each transcript is terminated with <eos>; no <sos> is prepended).
train_transcript = transform_letter_to_index(raw_train_transcript)
valid_transcript = transform_letter_to_index(raw_valid_transcript)

# Create datasets (the test set has no transcripts, hence Y=None)
train_dataset = MyDataset(train_data,train_transcript)
valid_dataset = MyDataset(valid_data,valid_transcript)
test_dataset = MyDataset(test_data,None)

# Create data loaders.
# Only the training loader shuffles; validation and test keep the file order,
# which the submission CSV relies on (row id == position in test.npy).
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64, collate_fn = collate_train_val,num_workers=0, pin_memory=True)
valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=128, collate_fn = collate_train_val,num_workers=0, pin_memory=True)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=128, collate_fn = collate_test,num_workers=0, pin_memory=True)

# Model

In [None]:
# Locked dropout
# Code from TA and https://pytorchnlp.readthedocs.io/en/latest/_modules/torchnlp/nn/lock_dropout.html 
from torch.autograd import Variable
# NOTE(review): `cuda` and `device` are recomputed here exactly as in the first
# cell; the values are the same, this only lets the cell run on its own.
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

class LockedDropout(nn.Module):
  '''
  Dropout that samples one mask per sequence and reuses it at every time step.

  The mask has shape (B, 1, C) and is broadcast along the length dimension,
  so a dropped feature channel stays dropped for the whole sequence.
  '''
  def __init__(self):
    super().__init__()

  def forward(self, x, dropout=0.5):
    '''
    Args:
        x: tensor of shape (B, L, C)
        dropout: probability of zeroing a channel; 0 disables dropout
    Return:
        x unchanged in eval mode or when dropout == 0, otherwise x multiplied
        by the broadcast mask scaled by 1 / (1 - dropout).
    '''
    if dropout == 0 or not self.training:
      return x
    keep_prob = 1 - dropout
    # new_empty gives a tensor with x's dtype/device that does not track
    # gradients, replacing the deprecated `x.data.new(...)` + `Variable(...)`.
    mask = x.new_empty(x.size(0), 1, x.size(2)).bernoulli_(keep_prob)
    mask = mask / keep_prob
    return mask.expand_as(x) * x

Pyramidal Bi-LSTM

In [None]:
class pBLSTM(nn.Module):
    '''
    Pyramidal BiLSTM layer.

    Halves the time resolution by concatenating every pair of consecutive
    frames along the feature axis, applies locked dropout, and runs a
    single-layer bidirectional LSTM on the shortened sequence.

    Args:
        input_dim: feature size of ONE incoming frame (the LSTM itself sees
                   2 * input_dim because two frames are concatenated)
        hidden_dim: hidden size of each LSTM direction
    '''
    def __init__(self, input_dim, hidden_dim):
        super(pBLSTM, self).__init__()
        self.blstm = nn.LSTM(
            input_size=input_dim * 2,
            hidden_size=hidden_dim,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        self.dropout = LockedDropout()

    def forward(self, x):
        '''
        Args:
            x: PackedSequence of frames
        Return:
            PackedSequence produced by the BiLSTM, with lengths `x_lens // 2`
        '''
        padded, lengths = pad_packed_sequence(x, batch_first=True)
        batch_size, num_steps, feat_dim = padded.size(0), padded.size(1), padded.size(2)

        # Drop a trailing odd frame so the time axis splits into pairs.
        paired_steps = num_steps // 2
        padded = padded[:, :paired_steps * 2, :]
        padded = padded.reshape(batch_size, paired_steps, feat_dim * 2)

        padded = self.dropout(padded)  # locked dropout on the merged frames

        # NOTE(review): a sequence shorter than 2 frames yields length 0 here,
        # which pack_padded_sequence is expected to reject - confirm inputs are long enough.
        repacked = pack_padded_sequence(padded, lengths=lengths // 2, batch_first=True, enforce_sorted=False)
        out, _ = self.blstm(repacked)
        return out

In [None]:
class Encoder(nn.Module):
    '''
    Listener: turns padded utterances into attention keys and values.

    Pipeline: Conv1d + BatchNorm1d over time -> BiLSTM -> three pBLSTM layers
    (each halves the time axis) -> two linear projections producing the key
    and the value for every remaining time step.

    Args:
        input_dim: feature size of an input frame
        encoder_hidden_dim: hidden size of each LSTM direction
        key_value_size: size of the projected key / value vectors
    '''
    def __init__(self, input_dim, encoder_hidden_dim, key_value_size=128):
        super(Encoder, self).__init__()

        # Layers are created in a fixed order so that seeded initialisation is stable.
        self.cnn_layer = nn.Sequential(
            nn.Conv1d(input_dim, encoder_hidden_dim, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm1d(encoder_hidden_dim),
        )

        # The first LSTM at the very bottom
        self.lstm = nn.LSTM(
            input_size=encoder_hidden_dim,
            hidden_size=encoder_hidden_dim,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )

        # The pyramid: every block receives the 2*hidden bidirectional output
        # of the layer below it.
        bidirectional_dim = 2 * encoder_hidden_dim
        self.pBLSTM1 = pBLSTM(bidirectional_dim, encoder_hidden_dim)
        self.pBLSTM2 = pBLSTM(bidirectional_dim, encoder_hidden_dim)
        self.pBLSTM3 = pBLSTM(bidirectional_dim, encoder_hidden_dim)

        # Linear projections producing Key and Value for attention; the input
        # is 2*hidden because the LSTMs are bidirectional.
        self.key_network = nn.Linear(bidirectional_dim, key_value_size)
        self.value_network = nn.Linear(bidirectional_dim, key_value_size)

    def forward(self, x, x_len):
        '''
        Args:
            x: padded utterances, (B, T, input_dim)
            x_len: length of every utterance before padding
        Return:
            keys, value: projections of the top pBLSTM output
            unpacked_x_len: per-utterance length after the pyramid
        '''
        # Conv1d works on (B, C, T), so swap the time and feature axes around it.
        x = self.cnn_layer(x.transpose(1, 2)).transpose(1, 2)

        # Bottom BiLSTM on the packed sequence (lengths must live on the CPU).
        packed = rnn_utils.pack_padded_sequence(x, x_len.cpu(), batch_first=True, enforce_sorted=False)
        outputs, _ = self.lstm(packed)

        # Pyramid of pBLSTM blocks.
        for pyramid_layer in (self.pBLSTM1, self.pBLSTM2, self.pBLSTM3):
            outputs = pyramid_layer(outputs)

        # Unpack the sequence and get the Key and Value for attention
        linear_input, unpacked_x_len = rnn_utils.pad_packed_sequence(outputs, batch_first=True)

        keys = self.key_network(linear_input)
        value = self.value_network(linear_input)
        return keys, value, unpacked_x_len

In [None]:
# Reference: recitation code

def plot_attention(attention):
    '''Clear the current figure and display `attention` as a seaborn heatmap.'''
    plt.clf()
    sns.heatmap(data=attention, cmap='GnBu')
    plt.show()

class Attention(nn.Module):
    '''
    Dot-product attention between one decoder query and the encoder keys.

        energy    = bmm(key, query)
        attention = softmax(energy with padded positions masked out)
        context   = bmm(attention, value)
    '''
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, key, value, mask):
        '''
        Args:
            query: (B, key_value_size) decoder state for the current step
            key:   (B, T, key_value_size) encoder keys
            value: (B, T, key_value_size) encoder values
            mask:  (B, T) boolean tensor, True at padded encoder positions
        Return:
            out:       (B, key_value_size) context vector
            attention: (B, T) attention weights (returned for visualization)
        '''
        # Follow the query's device instead of a module-level global so the
        # layer works wherever the model lives.
        target_device = query.device
        key = key.to(target_device)
        value = value.to(target_device)
        mask = mask.to(target_device)

        energy = torch.bmm(key, query.unsqueeze(2)).squeeze(2)

        # Out-of-place fill: padded positions get a large negative energy so
        # their softmax weight is effectively zero.
        energy = energy.masked_fill(mask, -1e9)

        attention = nn.functional.softmax(energy, dim=1)

        out = torch.bmm(attention.unsqueeze(1), value).squeeze(1)

        # attention vectors are returned for visualization
        return out, attention

In [None]:
class Decoder(nn.Module):
    '''
    Speller: an attention decoder that emits one character per step.

    Each step feeds [character embedding, previous context] through two
    LSTMCells; the output of the second cell is the attention query. The query
    and the new context are passed through a small MLP (fc1 -> tanh -> fc2 ->
    tanh) down to `embed_dim`, and then through an output projection whose
    weight is tied to the embedding matrix.

    Args:
        vocab_size: number of output symbols
        decoder_hidden_dim: hidden size of the first LSTMCell
        embed_dim: character embedding size (also the input size of the tied
                   output projection)
        key_value_size: size of the attention keys / values / queries
    '''
    def __init__(self, vocab_size, decoder_hidden_dim, embed_dim, key_value_size=128):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=letter2index['<eos>'])
        self.lstm1 = nn.LSTMCell(input_size=embed_dim + key_value_size, hidden_size=decoder_hidden_dim)
        self.lstm2 = nn.LSTMCell(input_size=decoder_hidden_dim, hidden_size=key_value_size)

        # Not used in forward(); kept so parameter names and the order of
        # random initialisation stay compatible with existing checkpoints.
        self.linear = nn.Linear(key_value_size*2, decoder_hidden_dim)

        self.attention = Attention()
        self.vocab_size = vocab_size
        self.fc1 = nn.Linear(2 * key_value_size, 4 * key_value_size)
        self.tanh1 = nn.Tanh()
        self.fc2 = nn.Linear(4 * key_value_size, embed_dim)
        self.tanh2 = nn.Tanh()

        # The projection consumes the embed_dim-sized output of fc2 and shares
        # its (vocab_size, embed_dim) weight with the embedding, so its input
        # size is embed_dim (previously declared as 2 * key_value_size, which
        # only matched when embed_dim == 2 * key_value_size).
        self.character_prob = nn.Linear(embed_dim, vocab_size)
        self.key_value_size = key_value_size
        self.hidden_dim = decoder_hidden_dim

        # weight tying
        self.character_prob.weight = self.embedding.weight


    def forward(self, key, value, encoder_len, y=None, mode='train',batch_idx=30,Teacher_forcing_rate=0.1):
        '''
        Args:
            key: (B, T, key_value_size) - Output of the Encoder Key projection layer
            value: (B, T, key_value_size) - Output of the Encoder Value projection layer
            encoder_len: (B,) - valid encoder length of every utterance
            y: (B, text_len) - padded ground-truth indices; required when mode == 'train'
            mode: 'train' decodes y.shape[1] steps and may feed ground truth;
                  any other value decodes a fixed 600 steps greedily
            batch_idx: attention of one utterance is plotted when batch_idx % 64 == 22
            Teacher_forcing_rate: probability of feeding the ground-truth
                  previous character instead of the model's own prediction
        Return:
            predictions: (B, steps, vocab_size) unnormalised character scores
        Raises:
            ValueError: if mode == 'train' and y is None
        '''
        B, key_seq_max_len, key_value_size = key.shape
        dev = key.device

        if mode == 'train':
            if y is None:
                raise ValueError("y must be provided when mode == 'train'")
            max_len = y.shape[1]
            char_embeddings = self.embedding(y.long())  # ground truth
        else:
            max_len = 600  # fixed decoding budget at inference time

        # Attention mask, built once outside the loop and moved to the device
        # once: True marks padded encoder positions. (1, T) >= (B, 1) -> (B, T)
        positions = torch.arange(key_seq_max_len, device=encoder_len.device)
        mask = (positions.unsqueeze(0) >= encoder_len.unsqueeze(1)).to(dev)

        predictions = []
        # A single zero column: argmax over it is 0, i.e. the <sos> index.
        prediction = torch.zeros(B, 1, device=dev)
        hidden_states = [None, None]

        # Initial context: the value of the first encoder time step.
        context = value[:, 0, :]

        # Attention of one utterance is collected for plotting; clamp the row
        # so a batch with fewer than 6 utterances does not raise IndexError.
        plot_this_batch = batch_idx % 64 == 22
        plot_row = min(5, B - 1)
        attentionPlot = []

        for i in range(max_len):
            use_ground_truth = (
                mode == 'train'
                and np.random.random_sample() < Teacher_forcing_rate
                and i > 0
            )
            if use_ground_truth:
                # y holds no <sos>, so the input for step i is y[:, i-1].
                char_embed = char_embeddings[:, i-1]
            else:
                char_embed = self.embedding(prediction.argmax(dim=-1))

            y_context = torch.cat([char_embed, context], dim=1)
            hidden_states[0] = self.lstm1(y_context, hidden_states[0])

            lstm1_hidden = hidden_states[0][0]
            hidden_states[1] = self.lstm2(lstm1_hidden, hidden_states[1])
            output = hidden_states[1][0]

            # Attention uses the output of the second LSTM cell as the query.
            context, attention = self.attention(output, key, value, mask)
            if plot_this_batch:
                attentionPlot.append(attention[plot_row].detach().cpu())

            output_context = torch.cat([output, context], dim=1)
            output_context = self.tanh1(self.fc1(output_context))
            output_context = self.tanh2(self.fc2(output_context))
            prediction = self.character_prob(output_context)
            predictions.append(prediction.unsqueeze(1))

        if plot_this_batch:
            attentions_plot = torch.stack(attentionPlot, dim=1)
            plot_attention(attentions_plot)

        return torch.cat(predictions, dim=1)


In [None]:
class Seq2Seq(nn.Module):
    '''
    End-to-end sequence-to-sequence model: a thin wrapper that chains the
    Encoder (listener) and the Decoder (speller).
    '''
    def __init__(self, input_dim, vocab_size, encoder_hidden_dim, decoder_hidden_dim, embed_dim, key_value_size=128):
        super(Seq2Seq,self).__init__()
        self.encoder = Encoder(input_dim, encoder_hidden_dim, key_value_size=key_value_size)
        self.decoder = Decoder(vocab_size, decoder_hidden_dim, embed_dim, key_value_size=key_value_size)

    def forward(self, x, x_len, y=None, mode='train',batch_num=0,Teacher_forcing_rate=0.5):
        '''
        Encode the utterances, then decode them into character predictions.

        `batch_num` is forwarded to the decoder as `batch_idx`; the remaining
        arguments are passed through unchanged.
        '''
        enc_keys, enc_values, enc_lengths = self.encoder(x, x_len)
        return self.decoder(
            enc_keys,
            enc_values,
            enc_lengths,
            y=y,
            mode=mode,
            batch_idx=batch_num,
            Teacher_forcing_rate=Teacher_forcing_rate,
        )

In [None]:
def train(model, train_loader, criterion, optimizer, mode,Teacher_forcing_rate):
    '''
    Run one training epoch and print the average loss and perplexity.

    Args:
        model: the Seq2Seq model
        train_loader: yields (x, y, x_len, y_len) batches
        criterion: per-token loss (must use reduction='none' so padding can be masked)
        optimizer: optimizer stepping the model parameters
        mode: unused; kept for interface compatibility
        Teacher_forcing_rate: forwarded to the model
    '''
    model.train()
    model.to(device)
    runningLoss = 0
    perplexity = 0

    # 0) Iterate through your data loader
    for batch_idx, (x,y,x_len,y_len) in tqdm(enumerate(train_loader),position=0, leave=True):
        optimizer.zero_grad()
        # 1) Set the inputs to the device.
        x,y,x_len,y_len = x.to(device),y.long().to(device),x_len.to(device),y_len.to(device)

        # 2) Pass your inputs, and length of speech into the model.
        predictions = model(x, x_len, y=y, mode='train',batch_num=batch_idx,Teacher_forcing_rate=Teacher_forcing_rate)

        # 3) Build the padding mask in the SAME (B, T) layout as y, so that
        #    flattening it lines up element-for-element with the flattened
        #    per-token loss. (A (T, B) mask flattens in time-major order and
        #    would mask the wrong tokens.)
        steps = torch.arange(y.size(1), device=y.device).unsqueeze(0)   # (1, T)
        mask = (steps < y_len.unsqueeze(1)).float()                     # (B, T)

        # 4. Calculate the loss and mask it to remove the padding part
        loss = criterion(predictions.reshape(-1, predictions.size(2)), y.reshape(-1))
        masked_loss = torch.sum(loss * mask.reshape(-1)) / torch.sum(mask)
        curr_Loss = masked_loss.item()
        curr_Perplex = torch.exp(masked_loss).item() # exponential of the loss-per-character
        runningLoss += curr_Loss
        perplexity += curr_Perplex

        # 5. Backward on the masked loss
        masked_loss.backward()
        # 6. Clip the gradient norm to 2
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
        # 7. Take a step with your optimizer
        optimizer.step()

    # 8. Print the epoch statistics; max(..., 1) avoids dividing by zero on an empty loader.
    num_batches = max(len(train_loader), 1)
    print('Training:')
    print('Avg-Loss: {:.5f}\tAvg-perplexity: {:.5f}'.format(runningLoss / num_batches, perplexity / num_batches))
    # Batch tensors are locals and are released on return; just free cached GPU memory.
    torch.cuda.empty_cache()

def _edit_distance(source, target):
    '''Pure-Python Levenshtein distance (unit-cost insert/delete/substitute).'''
    previous = list(range(len(target) + 1))
    for i, source_char in enumerate(source, 1):
        current = [i]
        for j, target_char in enumerate(target, 1):
            current.append(min(
                previous[j] + 1,                                # deletion
                current[j - 1] + 1,                             # insertion
                previous[j - 1] + (source_char != target_char)  # substitution
            ))
        previous = current
    return previous[-1]


def val(model, valid_loader):
    '''
    Greedy-decode the validation set and report the mean edit distance.

    Args:
        model: the Seq2Seq model
        valid_loader: yields (x, y, x_len, y_len) batches
    Return:
        average Levenshtein distance between predicted and target strings
        (0.0 when the loader is empty)
    '''
    # The notebook never imports Levenshtein at the top level; resolve it here
    # so the function works on a fresh kernel, and fall back to a pure-Python
    # implementation when the package is not installed.
    try:
        import Levenshtein
        distance_fn = Levenshtein.distance
    except ImportError:
        distance_fn = _edit_distance

    model.eval()
    model.to(device)
    runningDist = 0
    tot_sec = 0
    sos = [letter2index['<sos>']]
    eos = [letter2index['<eos>']]
    with torch.no_grad():
        for batch_idx, (x,y,x_len,y_len) in tqdm(enumerate(valid_loader),position=0, leave=True):
            x,y,x_len,y_len = x.to(device),y.long().to(device),x_len.to(device),y_len.to(device)
            predictions = model(x, x_len, y=y, mode='eval')
            predText = transform_index_to_letter(predictions.argmax(-1).detach().cpu().numpy(),startindex = sos,stopindex = eos)
            targetText = transform_index_to_letter(y.detach().cpu().numpy(),startindex = sos,stopindex = eos)
            for pred, target in zip(predText, targetText):
                runningDist += distance_fn(pred, target)
                tot_sec += 1

    # Guard against an empty loader instead of raising ZeroDivisionError.
    avg_dist = runningDist / tot_sec if tot_sec else 0.0
    print('eval:')
    print('Avg-distance: {:.5f}'.format(avg_dist))
    torch.cuda.empty_cache()
    return avg_dist

In [None]:
# Build the LAS model: 40-dimensional input frames, one output per symbol in
# LETTER_LIST. embed_dim (256) equals 2 * key_value_size (128), which matches
# the size of the decoder's weight-tied output projection.
model = Seq2Seq(input_dim=40, vocab_size=len(LETTER_LIST), encoder_hidden_dim=256, decoder_hidden_dim=512, embed_dim=256, key_value_size=128)
model = model.to(device)

# Training (LAS with teacher forcing)

In [None]:
model

Seq2Seq(
  (encoder): Encoder(
    (cnn_layer): Sequential(
      (0): Conv1d(40, 256, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
      (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (lstm): LSTM(256, 256, batch_first=True, bidirectional=True)
    (pBLSTM1): pBLSTM(
      (blstm): LSTM(1024, 256, batch_first=True, bidirectional=True)
      (dropout): LockedDropout()
    )
    (pBLSTM2): pBLSTM(
      (blstm): LSTM(1024, 256, batch_first=True, bidirectional=True)
      (dropout): LockedDropout()
    )
    (pBLSTM3): pBLSTM(
      (blstm): LSTM(1024, 256, batch_first=True, bidirectional=True)
      (dropout): LockedDropout()
    )
    (key_network): Linear(in_features=512, out_features=128, bias=True)
    (value_network): Linear(in_features=512, out_features=128, bias=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(34, 256, padding_idx=33)
    (lstm1): LSTMCell(384, 512)
    (lstm2): LSTMCell(512, 128)
    (linear)

In [None]:
# Schedule:
#   - every 15 epochs the teacher forcing rate drops by 0.1 (never below 0.1)
#   - the LR scheduler is disabled for the first 20 epochs
n_epochs = 150
Teacher_forcing_rate = 0.9
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# reduction='none' keeps per-token losses so train() can mask out the padding
criterion = nn.CrossEntropyLoss(reduction='none').to(device)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.75, patience=5)
mode = 'train'

# Keep the best distance from an earlier run of this cell (so resuming does
# not overwrite a better checkpoint), but start from +inf on a fresh kernel
# instead of failing with NameError.
try:
    best_dist
except NameError:
    best_dist = float('inf')

print('best dist',best_dist)
for epoch in range(n_epochs):
    print('epoch: ',epoch+1, 'learning rate:',optimizer.param_groups[0]['lr'],'teacher forcing rate', Teacher_forcing_rate )
    train(model, train_loader, criterion, optimizer, mode,Teacher_forcing_rate)
    val_dist = val(model, valid_loader)
    if epoch > 19:
        scheduler.step(val_dist)
    if (epoch % 15 == 14):
        Teacher_forcing_rate -= 0.1
        Teacher_forcing_rate = max(0.1, Teacher_forcing_rate)
    if best_dist > val_dist:
        # New best validation distance: checkpoint the whole model.
        best_dist = val_dist
        print('val Dist',best_dist)
        torch.save(model, 'model_24.pt') 

# Testing dataset

In [None]:
def testing(model, test_loader):
    '''
    Greedy-decode every batch of the test loader.

    Args:
        model: the Seq2Seq model
        test_loader: yields (x, x_len) batches
    Return:
        list of predicted strings in loader order
    '''
    model.eval()
    model.to(device)
    sos_ids = [letter2index['<sos>']]
    eos_ids = [letter2index['<eos>']]
    decoded_texts = []
    with torch.no_grad():
        for batch_idx, (feats, feat_lens) in tqdm(enumerate(test_loader),position=0, leave=True):
            feats = feats.to(device)
            feat_lens = feat_lens.to(device)
            scores = model(feats, feat_lens, mode='eval')
            best_ids = scores.argmax(-1).detach().cpu().numpy()
            decoded_texts.extend(
                transform_index_to_letter(best_ids, startindex=sos_ids, stopindex=eos_ids)
            )
    del feats, feat_lens
    torch.cuda.empty_cache()
    return decoded_texts

# Reload the best checkpoint saved during training and decode the test set.
model = torch.load('model_24.pt')
test_text = testing(model, test_loader)

# Write the submission file: a header row, then one "<row id>,<prediction>" line per utterance.
with open("trial.csv", 'w') as fh:
    fh.write('id,label\n')
    for row_id, label in enumerate(test_text):
        fh.write(str(row_id) + ',' + label + "\n")

21it [00:16,  1.26it/s]
