<a href="https://colab.research.google.com/github/cristinakuo/thesis-pre-research/blob/master/CTC_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
import torch
import torch.nn as nn
import tqdm

print("CUDA available: ", torch.cuda.is_available())
dev = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)  

# ------------------------------ Loading of raw data ------------------------------

# Load files: out_lab_fea_xx.pkl
# Saves the dictionary total_set in 3 files 
# that should contain train, dev and test data (in any order)

import pickle
import numpy as np

out_folder = '/content/gdrive/My Drive/Tesis/data/'
xi = 3 # Name of the 3 files (train, dev y test) 
xf = 6

total_set = dict()
for i in range(xi,xf):
    idf = str(i)
    out_file = 'out_lab_fea_' + idf + '.pkl'

    with open(out_folder+out_file,'rb') as f:
        total = pickle.load(f)
    total_set[total[5]] = total
    
data_name_train = total_set['train'][0]
data_end_index_train = total_set['train'][1]
fea_dict_train = total_set['train'][2]
lab_dict_train = total_set['train'][3]
data_set_train = total_set['train'][4]
todo_train = total_set['train'][5]

data_name_dev = total_set['dev'][0]
data_end_index_dev = total_set['dev'][1]
fea_dict_dev = total_set['dev'][2]
lab_dict_dev = total_set['dev'][3]
data_set_dev = total_set['dev'][4]
todo_dev = total_set['dev'][5]

data_name_test = total_set['test'][0]
data_end_index_test = total_set['test'][1]
fea_dict_test = total_set['test'][2]
lab_dict_test = total_set['test'][3]
data_set_test = total_set['test'][4]
todo_test = total_set['test'][5]

# Check number of phrases in transcripts
nr_phrases_train = len(data_name_train)
nr_phrases_test = len(data_name_test)

print('Nr of phrases (train): ', nr_phrases_train)
print('Nr of phrases (test): ', nr_phrases_test)

# Split into sequences according to end indices
train_sequences = np.split(data_set_train, data_end_index_train)[:-1] # Last item is empty
test_sequences = np.split(data_set_test, data_end_index_test)[:-1]

# Conver to tensor
train_sequences = [torch.from_numpy(seq) for seq in train_sequences]
test_sequences = [torch.from_numpy(seq) for seq in test_sequences]
print('Train: split data into {} phrases.'.format(len(train_sequences)))
print('Test: split data into {} phrases.'.format(len(test_sequences)))


CUDA available:  True
Nr of phrases (train):  3696
Nr of phrases (test):  192
Train: split data into 3696 phrases.
Test: split data into 192 phrases.


In [13]:
import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
import torch.optim as optim
from torch.nn import CTCLoss, LogSoftmax

FEATURE_DIM = 13 
NR_PHONEMES = 61
BLANK_LABEL = 61  # 0 to 60 are used 

# ******************************** Hyper parameters ******************************** #

BATCH_SIZE = 3
NUM_EPOCHS = 5
learning_rate = 0.9
momentum = 0.9
hidden_size = 93
layer_size = 2 # Number of layers


# *********************************** DataSet ************************************** #

class SpeechDataset(Dataset):
  def __init__(self, list_of_sequences):
    self.data = list_of_sequences
    self.n_samples = len(list_of_sequences)
    self.seq_lens = [len(seq) for seq in list_of_sequences]
    self.n_frames = sum(self.seq_lens)

  def __getitem__(self, index):
    return self.data[index]

  def __len__(self):
    return self.n_samples


train_dataset = SpeechDataset(train_sequences)
test_dataset = SpeechDataset(test_sequences)

print("Train Samples: {}".format(len(train_dataset)))
print("Train frames: ", train_dataset.n_frames)
print("Test Samples: {}".format(len(test_dataset)))
print("Test frames: ", test_dataset.n_frames)


def collate_timit(batch):
    # Get length for each sequence
    lens = [sequence.shape[0] for sequence in batch]
    
    # Pad sequence to max length in batch
    batch_padded = pad_sequence(batch, batch_first=True, padding_value=-1)
    batch_padded = torch.as_tensor(batch_padded)
    
    # Separate input (x) and target (y)
    x = batch_padded[:,:,:-1]
    y = batch_padded[:,:,-1]  
    
    return x,y,lens


# *********************************** Model ************************************** #

class BLSTMModel(nn.Module):
  def __init__(self, input_size, hidden_size, nr_layers, output_size):
    super(BLSTMModel, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.nr_layers = nr_layers

    # Hidden layer
    # batch_first=True means input/output tensors to be of shape
    # (batch_dim, seq_dim, feature_dim)
    self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.nr_layers, 
                        batch_first=True, bidirectional=True, bias=True)
    
    # Output Layer
    # Input is hidden_size*2 because it has a bidirectional LSTM as previous layer
    self.fc = torch.nn.Linear(self.hidden_size*2, self.output_size, bias=True)

    self.log_softmax = LogSoftmax(dim=2)

  def forward(self, x, seq_lens): # x has shape [batch_size, max_seq_len, nr_coef]

    x_packed = pack_padded_sequence(x, lengths=seq_lens, batch_first=True, enforce_sorted=False) # x_packed.data has shape [nr_batch_frames, nr_coefs]

    out_lstm, _ = self.lstm(x_packed) # out_lstm.data has shape [nr_batch_frames, nr_coefs]

    out_lstm_padded, out_lens = pad_packed_sequence(out_lstm, batch_first=True, padding_value=-1) # [batch_size, sequences_max_len, nr_coefs]

    out_linear = self.fc(out_lstm_padded) # [batch_size, sequences_max_len, nr_classes] 
    # CTCLoss will ignore the padding values as sequences lengths from input and target are passed to it

    out = self.log_softmax(out_linear) # [batch_size, sequences_max_len, nr_classes]
    return out, out_lens


def train_model(model, dataset):
  
  data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collate_timit)

  model.train()
  
  epoch_loss = 0
  # nr_correct_frames = 0
  # total_frames = 0

  for batch_idx, (input_data, target_data, seq_lens) in enumerate(tqdm.notebook.tqdm((data_loader))):
    
    # Save to device
    input_data = input_data.to(torch.float)
    target_data = target_data.to(torch.long)
    input_data, target_data = input_data.to(device), target_data.to(device)
    
    optimizer.zero_grad() # Clear grads before doing backward
    
    # Forward pass
    model_out, out_lens = model(input_data, seq_lens) # [batch_size, seq_len, classes]
    
    # Compute loss
    #target_packed = pack_padded_sequence(target_data, lengths=seq_lens, batch_first=True, enforce_sorted=False)


    # Prepare CTC input
    model_out = model_out.permute(1,0,2) # [seq_len, batch_size, classes] 
    
    #print("model shape: ", model_out)
    #print("target data: ", target_data)
    #print("out_lens ", out_lens)
    #print("seq_lens ", seq_lens)

    batch_loss = cost_function(model_out.cpu(), target_data.cpu(), list(out_lens), seq_lens)  # Sequences lens is equal I think?
    
    epoch_loss += batch_loss.item()
    
    # Backward pass
    batch_loss.backward()
    optimizer.step()

    # TODO: decode

    # See frame accuracy
    # y_pred_labels = torch.argmax(model_out, dim=1)
    # nr_correct_frames += int((y_pred_labels == target_packed.data).sum())
    
    #total_frames += len(y_pred_labels)

  # Epoch training results
  epoch_loss = epoch_loss/len(dataset)
  # total_frames = dataset.n_frames
  # accuracy = nr_correct_frames/total_frames*100

  return epoch_loss #, nr_correct_frames, total_frames, accuracy

def test_model(model, dataset):

  data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collate_timit)
  # This will notify all the layers that it's eval mode, that way, 
  # batchnorm or dropout layers will work in eval mode instead of training mode.
  model.eval()

  total_loss = 0
  nr_correct_frames = 0
  total_frames = 0

  # no_grad impacts the autograd engine and deactivate it. 
  # It will reduce memory usage and speed up computations 
  # and it won’t be able to backprop (which is not desired 
  # in an eval script).
  with torch.no_grad(): 
    for batch_idx, (input_data, target_data, seq_lens) in enumerate(data_loader):
      input_data = input_data.to(torch.float)
      target_data = target_data.to(torch.long)
      
      model_out = model(input_data, seq_lens)
      
      # Compute loss
      target_packed = pack_padded_sequence(target_data, lengths=seq_lens, batch_first=True, enforce_sorted=False)
      batch_loss = cost_function(model_out, target_packed.data)
  
      # Calculate number of correct frames
      y_pred_labels = torch.argmax(model_out.data, dim=1)
      nr_correct_frames += int((y_pred_labels == target_packed.data).sum())
      nr_total_frames += sum(seq_lens)
      total_loss += batch_loss.item()
      
    # Arrange return values
    average_loss = total_loss/len(data_loader.dataset)
  
  accuracy = nr_correct_frames/total_frames*100
  
  return average_loss, nr_correct_frames, total_frames, accuracy


# **************************** Instantiate model & train ******************************** #

input_size = FEATURE_DIM
output_size = NR_PHONEMES + 1  # Number of phonemes + 1 for blank label

model = BLSTMModel(input_size, hidden_size, layer_size, output_size)
model = model.to(device)

cost_function = nn.CTCLoss(blank=BLANK_LABEL)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

print("Created BLSTM model with parameters:")
print("Input size: ", input_size)
print("Hidden size: ", hidden_size)
print("Layer size: ", layer_size)
print("Output size: ", output_size)

for epoch in range(NUM_EPOCHS):
  print("Epoch nr: ", epoch)

  #train_loss, train_correct_frames, train_total_frames, train_accuracy = train_model(model, train_dataset)
  train_loss = train_model(model, train_dataset)
  #test_loss, test_correct_frames, test_total_frames, test_accuracy = test_model(model, test_dataset)
  
  print("Training results: ")
  print("Loss: ", train_loss)
  #print("Loss: {} | Accuracy: {}/{} %{}".format(train_loss, train_correct_frames, train_total_frames, train_accuracy))
  #print("Test results: ")
  #print("Loss: {} | Accuracy: {}/{} %{}".format(test_loss, test_correct_frames, test_total_frames, test_accuracy))

Train Samples: 3696
Train frames:  1124823
Test Samples: 192
Test frames:  57919
Created BLSTM model with parameters:
Input size:  13
Hidden size:  93
Layer size:  2
Output size:  62
Epoch nr:  0


HBox(children=(FloatProgress(value=0.0, max=1232.0), HTML(value='')))


Training results: 
Loss:  nan
Epoch nr:  1


HBox(children=(FloatProgress(value=0.0, max=1232.0), HTML(value='')))


Training results: 
Loss:  nan
Epoch nr:  2


HBox(children=(FloatProgress(value=0.0, max=1232.0), HTML(value='')))


Training results: 
Loss:  nan
Epoch nr:  3


HBox(children=(FloatProgress(value=0.0, max=1232.0), HTML(value='')))


Training results: 
Loss:  nan
Epoch nr:  4


HBox(children=(FloatProgress(value=0.0, max=1232.0), HTML(value='')))


Training results: 
Loss:  nan
