In [322]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchaudio
from tqdm import tqdm

In [323]:
##Paths
# Tab-separated training metadata (read with sep='\t' further down)
train_df = '/Users/tacha/iu_research/speech_recognition/asr_workshop/data/target-segments/ru/train.tsv'
# Folder that the 'path' column of the metadata is joined onto
train_aud = '/Users/tacha/iu_research/speech_recognition/asr_workshop/data/target-segments/ru/clips/'

In [324]:
#Alphabet
# One entry per line of the alphabet file; the raw lines (newline included)
# stay in `alphabet`, the stripped text becomes the lookup key.
with open('/Users/tacha/iu_research/speech_recognition/asr_workshop/iu-hse-asr-workshop/data/alphabet.txt', 'r') as fo:
    alphabet = fo.readlines()
# Map every stripped line to its 0-based line number
char2ind = {line.strip(): position for position, line in enumerate(alphabet)}

**Transform functions**

Transform functions handle the necessary input transformations, e.g. feature extraction. They are fed directly into the data loader. This helps speed up data manipulation compared with reading all the files from the hard drive.

In [325]:
def find_maxlen(path, train_df):
    '''
    Finds the largest number of MFCC frames over all clips listed in the metadata.
    Args:
        path: directory that the values of the 'path' column are joined onto
        train_df: path to the tab-separated metadata file with a 'path' column
    Returns:
        The maximum size of the last MFCC dimension over all clips
        (0 when the metadata lists no clips). The value is also printed.
    '''
    fnames = pd.read_csv(train_df, sep='\t')['path']
    # The transform does not depend on the file, so build it once
    mfcc_transform = torchaudio.transforms.MFCC()
    maxlen = 0
    for n in tqdm(fnames):
        waveform, _ = torchaudio.load(os.path.join(path, n))
        # Dimension 2 of the MFCC output is the one padded in extract_feats
        maxlen = max(maxlen, mfcc_transform(waveform).shape[2])
    print("Maxlen:", maxlen)
    return maxlen


def extract_feats(path, maxlen=1083):
    '''
    Reads and processes one file at a time.
    Args:
        path: path to the file
        maxlen: number of frames every output is padded (or cut) to
    Returns:
        padded_norm: MFCC + delta + double-delta features, one row per
            coefficient, each row divided by its own sum, zero-padded on the
            right to `maxlen` columns
        padded_mask: same shape; 1 where padded_norm holds real frames,
            0 where it holds padding
    '''
    waveform, sample_rate = torchaudio.load(path)
    # Mix multi-channel audio down to one channel so that squeeze(0) below
    # always removes the channel axis; single-channel input is left untouched.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    #Calculate MFCC using the sample rate of the file itself instead of the
    #transform's built-in default
    mfcc = torchaudio.transforms.MFCC(sample_rate=sample_rate)(waveform)
    #Calculate delta and double-delta
    compute_deltas = torchaudio.transforms.ComputeDeltas()
    deltas = compute_deltas(mfcc)
    ddeltas = compute_deltas(deltas)
    res = torch.cat((mfcc, deltas, ddeltas), dim=1).squeeze(0)
    #Normalize rows
    s = torch.sum(res, dim=1, keepdim=True)
    # A row that sums to exactly zero would turn into NaN/inf; divide it by 1
    s = torch.where(s == 0, torch.ones_like(s), s)
    norm = torch.div(res, s)
    # Clips longer than maxlen are cut on the right (the original negative
    # padding did the same implicitly)
    norm = norm[:, :maxlen]
    mask = torch.ones_like(norm)
    pad_cols = maxlen - norm.shape[1]
    padded_norm = nn.functional.pad(norm, pad=(0, pad_cols, 0, 0),
                                    mode="constant", value=0)
    padded_mask = nn.functional.pad(mask, pad=(0, pad_cols, 0, 0),
                                    mode="constant", value=0)
    return padded_norm, padded_mask

def encode_trans(trans, char2ind, maxlen_t=7):
    '''
    Placeholder for encoding the true transcription; not implemented yet.
    Args:
        trans: transcription to encode (currently ignored)
        char2ind: character-to-index mapping (currently ignored)
        maxlen_t: currently ignored
    Returns:
        Always None.
    '''
    return None

In [327]:
class TrainData(data.Dataset):
    '''
    Dataset backed by a tab-separated metadata file.

    Each item is a dict with the transformed audio ('aud'), the text of the
    'sentence' column ('trans') and the second value returned by the
    transform ('mask').
    '''
    def __init__(self, csv_path, aud_path, transform):
        # transform: callable taking a file path and returning a pair
        self.transform = transform
        # aud_path: directory the 'path' column is joined onto
        self.aud_path = aud_path
        self.df = pd.read_csv(csv_path, sep='\t')

    def __len__(self):
        # One item per metadata row
        return self.df.shape[0]

    def __getitem__(self, idx):
        # Tensor indices are converted to plain Python values before lookup
        idx = idx.tolist() if torch.is_tensor(idx) else idx

        audio_file = os.path.join(self.aud_path, self.df['path'][idx])
        text = self.df['sentence'][idx]

        features, frame_mask = self.transform(audio_file)
        return {'aud': features, 'trans': text, 'mask': frame_mask}
    
def weights(m):
    '''
    Initialize the weights of linear layers randomly.

    Meant to be passed to `nn.Module.apply`. Only `nn.Linear` modules are
    touched: the weight matrix gets Xavier-normal values and the bias, when
    the layer has one, is set to the constant 0.1. Every other module type is
    left as it is.
    Args:
        m: a module visited by `apply`
    '''
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        # Layers created with bias=False have no bias tensor to fill
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.1)

**Proposed Architecture**

Attention-based Sequence-to-Sequence model:

![](/img/arch.png)

In [430]:
class Encoder(nn.Module):
    '''
    Frame-wise linear projection followed by a 3-layer bidirectional LSTM.

    Args:
        batch_size: kept for backward compatibility and stored on the module;
            the forward pass takes the batch size from its input, so batches
            of any size (e.g. a smaller last batch) are accepted.
    '''
    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.input_layer = nn.Linear(120, 512)
        self.blstm = nn.LSTM(input_size=512,
                             hidden_size=256,
                             num_layers=3,
                             bidirectional=True)

    def forward(self, x):
        '''
        Args:
            x: 3-D tensor whose second dimension has size 120 and whose last
               dimension is iterated as the sequence axis,
               i.e. (batch, 120, frames)
        Returns:
            output: LSTM outputs, (frames, batch, 512)
            (hn, cn): final hidden and cell states, each (6, batch, 256)
        '''
        #Pass through the first linear layer
        # Put the frame axis first so one call projects every frame at once:
        # (batch, 120, frames) -> (frames, batch, 120)
        frames = x.permute(2, 0, 1)
        projected = nn.functional.leaky_relu(self.input_layer(frames))
        #Pass through LSTM layers
        # No initial state is passed: the LSTM then starts from zeros that
        # match the input's batch size, device and dtype.
        output, (hn, cn) = self.blstm(projected)
        return output, (hn, cn)
    
    
class Decoder(nn.Module):
    '''
    Greedy LSTM-cell decoder over a 33-symbol output layer.

    Args:
        batch_size: accepted for interface compatibility; not used
        char2ind: character-to-index mapping, stored on the module

    NOTE(review): the forward pass does not consume the transcription or any
    encoder output yet, and the raw output scores (not a one-hot prediction)
    are fed back as the next input. This mirrors the existing behaviour.
    '''
    def __init__(self, batch_size, char2ind):
        super().__init__()
        self.char2ind = char2ind
        self.embed_layer = nn.Linear(33, 128)
        self.lstm_cell = nn.LSTMCell(128, 512)
        # Explicit dim: without it PyTorch emits a deprecation warning
        self.softmax = nn.Softmax(dim=-1)
        self.output = nn.Linear(512, 33)
        # Initial states are buffers so .to(device) moves them together with
        # the parameters; non-persistent keeps them out of the state_dict.
        self.register_buffer('h', torch.zeros(1, 512), persistent=False)
        self.register_buffer('c', torch.zeros(1, 512), persistent=False)
        self.register_buffer('y', torch.zeros(1, 33), persistent=False)

    def forward(self, t, maxlen_t=7, verbose=True):
        '''
        Args:
            t: transcription batch (currently unused)
            maxlen_t: number of decoding steps
            verbose: when True (default) the predicted indices are printed,
                as before
        Returns:
            1-D integer tensor with `maxlen_t` predicted symbol indices.
        '''
        # Work on local copies so every call starts from the same initial
        # state instead of continuing where the previous call stopped.
        y, h, c = self.y, self.h, self.c
        pred_seq = []
        for _ in range(maxlen_t):
            embedded = self.embed_layer(y)
            h, c = self.lstm_cell(embedded, (h, c))
            out = self.output(h)
            # The output scores become the next step's input
            y = out
            pred_seq.append(torch.argmax(self.softmax(out), dim=-1))
        if pred_seq:
            preds = torch.cat(pred_seq)
        else:
            preds = torch.empty(0, dtype=torch.long, device=self.y.device)
        if verbose:
            print(preds)
        return preds

In [431]:
# Use the first CUDA device when one is available, otherwise the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# Build the encoder on that device, then re-initialise its linear layers
encoder = Encoder(32).to(device)
encoder.apply(weights)

# Dataset over the training metadata; features are extracted per item
cv_dataset = TrainData(csv_path=train_df, aud_path=train_aud, transform=extract_feats)
loader = data.DataLoader(dataset=cv_dataset, batch_size=32, shuffle=True)

In [432]:
# Decoder built with the same size argument (32) as the encoder and the
# loader's batch size. Unlike the encoder it is not moved to `device` here.
decoder = Decoder(32, char2ind)

In [433]:
# Smoke test: call the decoder once for every mini-batch of the loader.
for batch in loader:
    # The audio features are moved to the compute device but are not passed
    # to any model in this cell; only the transcriptions go to the decoder.
    x, t = batch['aud'].to(device), batch['trans']
    decoder(t)

  pred = torch.argmax(self.softmax(out))


tensor([13, 13, 13, 13, 13, 13, 13])
tensor([13, 13, 13, 13, 13, 13, 13])
tensor([13, 13, 13, 13, 13, 13, 13])
tensor([13, 13, 13, 13, 13, 13, 13])
tensor([13, 13, 13, 13, 13, 13, 13])
tensor([13, 13, 13, 13, 13, 13, 13])
tensor([13, 13, 13, 13, 13, 13, 13])
tensor([13, 13, 13, 13, 13, 13, 13])
tensor([13, 13, 13, 13, 13, 13, 13])
tensor([13, 13, 13, 13, 13, 13, 13])
tensor([13, 13, 13, 13, 13, 13, 13])
tensor([13, 13, 13, 13, 13, 13, 13])
tensor([13, 13, 13, 13, 13, 13, 13])


KeyboardInterrupt: 