In [322]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

In [323]:
##Paths
# NOTE(review): both values are absolute, machine-specific paths; the notebook
# only runs unchanged on a machine with this exact directory layout.
# Directory prefix that is joined with the file names from the TSV's 'path' column.
train_aud = '/Users/tacha/iu_research/speech_recognition/asr_workshop/data/target-segments/ru/clips/'
# Tab-separated metadata file; later cells read its 'path' and 'sentence' columns.
train_df = '/Users/tacha/iu_research/speech_recognition/asr_workshop/data/target-segments/ru/train.tsv'

In [564]:
#Alphabet
# Read the symbol inventory and append the extra Latin characters.
# The encoding is given explicitly so that decoding does not depend on the
# platform default (presumably the file holds non-ASCII symbols -- TODO confirm).
with open('/Users/tacha/iu_research/speech_recognition/asr_workshop/iu-hse-asr-workshop/data/alphabet.txt', 'r', encoding='utf-8') as fo:
    alphabet = fo.readlines() + ['f', 'i', 'r', 'e', 'o', 'x']
# Map every symbol (surrounding whitespace/newline stripped) to its position in
# `alphabet`. If two entries strip to the same key, the later position wins.
char2ind = {symbol.strip(): index for index, symbol in enumerate(alphabet)}

**Transform functions**

Transform functions handle the necessary input transformations, e.g. feature extraction. They are fed directly into the data loader. This helps to speed up data manipulation compared with reading all the files from the hard drive.

In [575]:
def find_maxlen(path, train_df):
    '''
    Scans every clip listed in the TSV and finds the largest MFCC frame count.
    Args:
        path: directory that contains the audio files
        train_df: path to the tab-separated file whose 'path' column lists the file names
    Returns:
        The largest size of the last MFCC dimension over all listed clips
        (0 when the TSV lists no files). The value is also printed.
    '''
    # NOTE(review): torchaudio is used here but is not imported in the visible import cell.
    fnames = pd.read_csv(train_df, sep='\t')['path']
    # Build the transform once instead of once per file; it is created with
    # the same (default) arguments for every clip.
    mfcc_transform = torchaudio.transforms.MFCC()
    maxlen = 0
    for n in tqdm(fnames):
        waveform, _ = torchaudio.load(os.path.join(path, n))
        maxlen = max(maxlen, mfcc_transform(waveform).shape[2])
    print("Maxlen:", maxlen)
    # Return the value so callers can pass it on (e.g. as a padding length)
    # instead of copying it from the printed output.
    return maxlen


def extract_feats(path, maxlen=1083):
    '''
    Reads and processes one file at a time.
    Args:
        path: path to the file
        maxlen: maximum length of the spectrogram for padding
    Returns:
        padded_norm: MFCC + delta + double-delta features, row-normalized and
            padded with zeros along the last (frame) dimension up to maxlen
        padded_mask: tensor with one row holding 1 for real frames and 0 for
            padded frames, also of length maxlen
    '''
    # NOTE(review): torchaudio is used here but is not imported in the visible import cell.
    waveform, sample_rate = torchaudio.load(path)
    #Calculate MFCC
    # Use the sample rate of the file itself; the transform otherwise assumes
    # its own default rate regardless of the actual recording.
    mfcc = torchaudio.transforms.MFCC(sample_rate=sample_rate)(waveform)
    #Calculate delta and double-delta
    compute_deltas = torchaudio.transforms.ComputeDeltas()
    deltas = compute_deltas(mfcc)
    ddeltas = compute_deltas(deltas)
    # Stack the three feature groups along dim 1 and drop the leading dim
    # (assumes a single-channel waveform -- TODO confirm for all clips).
    res = torch.cat((mfcc, deltas, ddeltas), dim=1).squeeze(0)
    #Normalize rows
    s = torch.sum(res, dim=1, keepdim=True)
    # A row whose values sum to exactly zero would produce NaN/inf below;
    # leave such rows unscaled instead.
    s = torch.where(s == 0, torch.ones_like(s), s)
    norm = torch.div(res, s)
    mask = torch.ones(1, norm.shape[1])
    # Both tensors are padded on the right of the last dimension by the same
    # amount, so mask positions line up with feature frames.
    pad_width = maxlen - norm.shape[1]
    padded_norm = nn.functional.pad(norm, pad=(0, pad_width, 0, 0),
                                    mode="constant", value=0)
    padded_mask = nn.functional.pad(mask, pad=(0, pad_width, 0, 0),
                                    mode="constant", value=0)
    return padded_norm, padded_mask

In [567]:
class TrainData(data.Dataset):
    '''
    Dataset over the rows of a tab-separated metadata file.
    Args:
        csv_path: path to the TSV; its 'path' and 'sentence' columns are used
        aud_path: prefix joined with each 'path' entry to locate the audio file
        transform: callable that takes the audio file name and returns a
            (features, mask) pair
    '''
    def __init__(self, csv_path, aud_path, transform):
        self.df = pd.read_csv(csv_path, sep='\t')
        self.aud_path = aud_path
        self.transform = transform

    def __len__(self):
        # One sample per row of the metadata table
        return self.df.shape[0]

    def __getitem__(self, idx):
        # Tensor indices are converted to plain Python values before the lookup
        row = idx.tolist() if torch.is_tensor(idx) else idx

        audio_file = os.path.join(self.aud_path, self.df['path'][row])
        text = self.df['sentence'][row].lower()

        features, frame_mask = self.transform(audio_file)

        return {'aud': features, 'trans': text, 'mask': frame_mask}
    
def weights(m):
    '''
    Initialize the parameters of linear layers; other module types are left untouched.
    Meant to be passed to `Module.apply`, which calls it once per submodule.
    Args:
        m: a module; only instances of nn.Linear are modified
    '''
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        # A layer created with bias=False has `bias` set to None
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.1)

**Proposed Architecture**

Attention-based Sequence-to-Sequence model:

![](/img/arch.png)

In [582]:
class Encoder(nn.Module):
    '''
    Frame-wise linear projection followed by a 3-layer bidirectional LSTM.
    Args:
        batch_size: kept for interface compatibility; the initial LSTM state is
            no longer pre-allocated, so any batch size can be fed to forward
    '''
    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.input_layer = nn.Linear(120, 512)
        self.blstm = nn.LSTM(input_size=512,
                             hidden_size=256,
                             num_layers=3,
                             bidirectional=True)

    def forward(self, x):
        '''
        Args:
            x: features of shape (batch, 120, time)
        Returns:
            output: LSTM outputs of shape (time, batch, 512)
            (hn, cn): final hidden and cell states, each (6, batch, 256)
        '''
        #Pass through the first linear layer
        # Move time to the front so the linear layer is applied to all frames
        # in one call: (batch, 120, time) -> (time, batch, 120)
        frames = x.permute(2, 0, 1)
        projected = nn.functional.leaky_relu(self.input_layer(frames))
        #Pass through LSTM layers
        # No initial state is passed: nn.LSTM then starts from zeros that match
        # the batch size, dtype and device of the input, so a shorter final
        # batch and a model moved to another device both work.
        output, (hn, cn) = self.blstm(projected)
        return output, (hn, cn)
    
class Attention(nn.Module):
    '''
    Dot-product attention of one decoder state over encoder states.
    The module has no learnable parameters.
    '''
    def __init__(self):
        super().__init__()

    def forward(self, h_e, h_d):
        '''
        Args:
            h_e: encoder states, either (time, batch, hidden) or a single
                time step of shape (batch, hidden)
            h_d: decoder state of shape (batch, hidden)
        Returns:
            c_t: context vector of shape (batch, hidden) -- the encoder states
                averaged over time with softmax weights. For a single time
                step the weight is 1, so the result equals h_e.
        '''
        # Treat a single step as a sequence of length one
        states = h_e.unsqueeze(0) if h_e.dim() == 2 else h_e
        # One score per (time step, example): dot product over the hidden dim,
        # so examples in a batch never mix.
        score = torch.sum(states * h_d.unsqueeze(0), dim=-1)
        # Softmax over time: exp(score) normalized by the sum of the
        # exponentials (not by the sum of the raw scores).
        a_t = torch.softmax(score, dim=0)
        # Weighted sum of encoder states; created on the inputs' device
        c_t = torch.sum(a_t.unsqueeze(-1) * states, dim=0)
        return c_t
        
    
class Decoder(nn.Module):
    '''
    LSTM-cell decoder that emits one symbol index per encoder time step.
    Args:
        batch_size: kept for interface compatibility; the recurrent state is
            created inside forward from the input, so any batch size works
    '''
    def __init__(self, batch_size):
        super().__init__()
        self.char2ind = char2ind
        self.batch_size = batch_size
        self.embed_layer = nn.Linear(33, 128)
        self.lstm_cell = nn.LSTMCell(128, 512)
        self.softmax = nn.Softmax(dim=1)
        self.output = nn.Linear(512, 33)
        self.attention = Attention()

    def forward(self, enc_h):
        '''
        Args:
            enc_h: encoder outputs of shape (time, batch, 512)
        Returns:
            Tensor of shape (batch, time) with the argmax symbol index per step.
        '''
        # Start every call from a fresh zero state with the batch size, dtype
        # and device of the input. Keeping the state on `self` would carry
        # hidden state (and its autograd graph) from one batch into the next
        # and would break on a batch of a different size.
        batch = enc_h.shape[1]
        dec_h = enc_h.new_zeros(batch, 512)
        c = enc_h.new_zeros(batch, 512)
        y = enc_h.new_zeros(batch, 33)

        preds = []
        for hidden in enc_h:
            # NOTE(review): the context vector is computed but not used by any
            # later step of the decoder yet.
            c_t = self.attention(hidden, dec_h)
            # The previous step's output scores are fed back as the next input
            dec_h, c = self.lstm_cell(self.embed_layer(y), (dec_h, c))
            y = self.output(dec_h)
            y_hat = torch.argmax(self.softmax(y), dim=1)
            preds.append(y_hat)
        return torch.stack(preds).T
    
class Seq2Seq(nn.Module):
    '''
    Encoder-decoder wrapper: runs the encoder and feeds its outputs to the decoder.
    Args:
        batch_size: forwarded to both the Encoder and the Decoder constructors
    '''
    def __init__(self, batch_size):
        super().__init__()
        self.encoder = Encoder(batch_size)
        self.decoder = Decoder(batch_size)

    def forward(self, batch):
        '''
        Args:
            batch: input passed unchanged to the encoder
        Returns:
            Whatever the decoder returns for the encoder outputs.
        '''
        # Use this module's own submodules (not global names), so the registered
        # parameters -- the ones moved by .to(device) and seen by the
        # optimizer -- are the ones that actually run.
        enc_out, _ = self.encoder(batch)
        return self.decoder(enc_out)
    
    
def encode_trans(trans, char2ind, maxlen_t=7):
    '''
    Encodes true transcriptions as a padded matrix of symbol indices.
    Args:
        trans: iterable of transcription strings
        char2ind: mapping from a character to its integer index
        maxlen_t: number of columns of the result; pass None to use the length
            of the longest transcription in `trans`
    Returns:
        int64 tensor of shape (len(trans), maxlen_t); positions after the end
        of a transcription hold -1.
    Raises:
        KeyError: a character is missing from char2ind
        ValueError: a transcription is longer than maxlen_t
    '''
    encoded = [[char2ind[char] for char in sent] for sent in trans]
    longest = max((len(ids) for ids in encoded), default=0)
    if maxlen_t is None:
        maxlen_t = longest
    elif longest > maxlen_t:
        raise ValueError(
            "transcription of length %d exceeds maxlen_t=%d" % (longest, maxlen_t))
    # Pre-fill with the padding value and use a fixed integer dtype, so an
    # empty transcription cannot turn the whole result into floats.
    sent_ind = np.full((len(encoded), maxlen_t), -1, dtype=np.int64)
    for row, ids in enumerate(encoded):
        sent_ind[row, :len(ids)] = ids
    return torch.from_numpy(sent_ind)
    
def collapse_fn(preds, masks):
    '''
    Collapses runs of repeated predictions and pads the results to equal length.
    Args:
        preds: tensor of predicted indices, one row per example
        masks: tensor of the same layout; a non-zero entry marks a position
            that should be considered
    Returns:
        Tensor with one row per example holding the collapsed sequence, padded
        on the right with -1 up to the longest collapsed sequence in the batch.
    '''
    preds = preds.detach().cpu().numpy()
    masks = masks.detach().cpu().numpy()
    collapsed = []
    for pred, mask in zip(preds, masks):
        # The first prediction (if any) is always kept
        kept = list(pred[:1])
        for pos in range(1, len(pred)):
            # Keep a symbol only at an unmasked position and only when it
            # differs from the prediction right before it. Mask and
            # predecessor are both looked up at the position under test.
            if mask[pos] and pred[pos] != pred[pos - 1]:
                kept.append(pred[pos])
        collapsed.append(kept)

    maxlen_t = max((len(kept) for kept in collapsed), default=0)
    res = np.full((len(collapsed), maxlen_t), -1, dtype=preds.dtype)
    for row, kept in enumerate(collapsed):
        res[row, :len(kept)] = kept

    return torch.from_numpy(res)

In [583]:
# One constant for the batch size, so the value handed to the model and the
# value handed to the DataLoader cannot drift apart.
BATCH_SIZE = 32

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Seq2Seq(BATCH_SIZE)
# Apply the custom initialization to every submodule, then move to the device
model.apply(weights)
model = model.to(device)

criterion = nn.CTCLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-4)

# Features are extracted lazily, one file per __getitem__ call
cv_dataset = TrainData(train_df, train_aud, extract_feats)
loader = data.DataLoader(cv_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [585]:
# Shape check of the forward pass: encode the targets, run the model and
# collapse the predictions. Nothing is optimized here, so gradient tracking
# is switched off to avoid building autograd graphs for every batch.
with torch.no_grad():
    for batch in loader:
        x = batch['aud'].to(device)
        t = encode_trans(batch['trans'], char2ind).to(device)
        print(t.shape)
        # Drop the singleton row dimension of each example's mask
        mask = batch['mask'].squeeze(1)
        preds = model(x)
        preds = collapse_fn(preds, mask)
        print(preds.shape)

torch.Size([32, 7])
torch.Size([32, 1])
torch.Size([32, 7])


KeyboardInterrupt: 