In [9]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchaudio
from tqdm import tqdm

In [13]:
## Paths
# Common root of the Russian target-segments data; both locations below hang off it.
data_root = '/Users/tacha/iu_research/speech_recognition/asr_workshop/data/target-segments/ru'
# Directory with the audio clips (trailing slash kept on purpose).
train_aud = data_root + '/clips/'
# Tab-separated manifest of the training clips.
train_df = data_root + '/train.tsv'

**Transform functions**

Transform functions handle the necessary input transformations, e.g. feature extraction. They are fed directly to the data loader. This helps speed up data manipulation compared with reading all the files from the hard drive.

In [129]:
def find_maxlen(path, train_df):
    '''
    Finds the largest MFCC length among the clips listed in a manifest.
    Args:
        path: directory that contains the audio clips
        train_df: path to the tab-separated file whose 'path' column lists
            the clip file names
    Returns:
        The largest size of the last MFCC dimension over all listed clips
        (0 if the manifest lists no clips). The value is also printed.
    '''
    fnames = pd.read_csv(train_df, sep='\t')['path']
    # The transform is identical for every file, so build it once
    # instead of once per clip.
    to_mfcc = torchaudio.transforms.MFCC()
    maxlen = 0
    for n in tqdm(fnames):
        waveform, _ = torchaudio.load(os.path.join(path, n))
        maxlen = max(maxlen, to_mfcc(waveform).shape[2])
    print("Maxlen:", maxlen)
    # Return the value so it can be passed on (e.g. as a padding length)
    # instead of being copied by hand from the printed output.
    return maxlen


def extract_feats(path, maxlen=1083):
    '''
    Reads and processes one file at a time.
    Args:
        path: path to the file
        maxlen: maximum length of the spectrogram for padding
    Returns:
        (padded_norm, padded_mask): the row-normalized features (MFCC,
        deltas and double-deltas stacked along the coefficient axis) and a
        mask of the same shape holding 1 on real frames and 0 on padding.
        Both are padded with zeros on the right of the last axis up to maxlen.
    '''
    waveform, sample_rate = torchaudio.load(path)
    #Calculate MFCC
    # Use the file's own sample rate; the transform otherwise falls back to
    # its built-in default, which need not match the audio.
    mfcc = torchaudio.transforms.MFCC(sample_rate=sample_rate)(waveform)
    #Calculate delta and double-delta
    compute_deltas = torchaudio.transforms.ComputeDeltas()
    deltas = compute_deltas(mfcc)
    ddeltas = compute_deltas(deltas)
    # squeeze(0) drops the leading axis only when it has size 1
    # (assumes single-channel audio -- TODO confirm).
    res = torch.cat((mfcc, deltas, ddeltas), dim=1).squeeze(0)
    #Normalize rows
    s = torch.sum(res, dim=1, keepdim=True)
    # A row that sums to zero (e.g. an all-zero row) would turn into NaN/inf;
    # divide such rows by 1 so they stay finite.
    s = torch.where(s == 0, torch.ones_like(s), s)
    norm = torch.div(res, s)
    mask = torch.ones_like(norm)
    # Pad only the right side of the last axis.
    # NOTE(review): for clips longer than maxlen the amount is negative --
    # confirm that the resulting cropping is the intended behavior.
    pad = (0, maxlen - norm.shape[1], 0, 0)
    padded_norm = nn.functional.pad(norm, pad=pad, mode="constant", value=0)
    padded_mask = nn.functional.pad(mask, pad=pad, mode="constant", value=0)
    return padded_norm, padded_mask

In [137]:
class TrainData(data.Dataset):
    '''
    Dataset backed by a tab-separated manifest.

    Every item is a dict with the transformed audio ('aud'), the value of
    the 'sentence' column ('trans') and the mask returned by the
    transform ('mask').
    '''

    def __init__(self, csv_path, aud_path, transform):
        '''
        Args:
            csv_path: path to the tab-separated manifest; the 'path' and
                'sentence' columns are read from it
            aud_path: directory that the 'path' entries are joined onto
            transform: callable taking a file name and returning (feat, mask)
        '''
        self.transform = transform
        self.aud_path = aud_path
        self.df = pd.read_csv(csv_path, sep='\t')

    def __len__(self):
        '''Number of rows in the manifest.'''
        return len(self.df)

    def __getitem__(self, idx):
        '''Builds the sample for the manifest row labelled idx.'''
        # Tensor indices are converted to plain Python values first.
        key = idx.tolist() if torch.is_tensor(idx) else idx

        audio_file = os.path.join(self.aud_path, self.df.loc[key, 'path'])
        sentence = self.df.loc[key, 'sentence']

        features, feature_mask = self.transform(audio_file)
        return {'aud': features, 'trans': sentence, 'mask': feature_mask}
    
def weights(m):
    '''
    Initialize the parameters of a linear layer.

    The weight matrix gets Xavier-normal values and the bias, when the
    layer has one, is set to the constant 0.1. Modules that are not
    nn.Linear are left untouched.
    Args:
        m: the module to initialize
    '''
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        # nn.Linear(..., bias=False) has bias set to None; skip it
        # instead of failing on it.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.1)

**Proposed Architecture**

Attention-based Sequence-to-Sequence model:

![](/img/arch.png)

In [150]:
class Encoder(nn.Module):
    '''
    Encoder: a linear input projection (120 -> 512) with a LeakyReLU
    activation, followed by a 3-layer bidirectional LSTM with 256 hidden
    units per direction.
    '''

    def __init__(self):
        super().__init__()
        self.input_layer = nn.Linear(120, 512)
        # batch_first is left at its default, so the LSTM reads its input
        # as (seq_len, batch, features).
        self.blstm = nn.LSTM(512, hidden_size=256, num_layers=3, bidirectional=True)

    def init_lstm(self, x):
        '''Placeholder; currently does nothing.'''
        pass

    def forward(self, x):
        '''
        Args:
            x: tensor whose last dimension has size 120
        Returns:
            The LSTM result unchanged: a tuple (output, (h_n, c_n)).
        '''
        x = self.input_layer(x)
        # Apply the activation to the tensor. Calling nn.LeakyReLU(x) would
        # only construct a module (with x as its slope) and not transform x.
        x = nn.functional.leaky_relu(x)
        return self.blstm(x)

In [133]:
# Dataset over the training manifest, using extract_feats as the transform.
cv_dataset = TrainData(csv_path=train_df, aud_path=train_aud, transform=extract_feats)
# Shuffled mini-batches of 32 samples.
loader = data.DataLoader(dataset=cv_dataset, shuffle=True, batch_size=32)

In [149]:
# Go through every batch and show its feature tensor.
for batch in loader:
    print(batch['aud'])

tensor([[[ 2.8605e-03,  2.8605e-03,  2.8605e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 7.3242e-10,  7.3242e-10,  7.3242e-10,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 1.5318e-08,  1.5318e-08,  1.5318e-08,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 1.2503e-19,  1.2503e-19,  1.2503e-19,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 1.9429e-03,  1.9429e-03,  1.9429e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-4.6756e-10, -4.6756e-10, -4.6756e-10,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-1.0797e-09, -1.0797e-09, -1.0797e-09,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [-2.0172e-20,  9

tensor([[[ 5.7029e-03,  5.7029e-03,  5.7029e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 1.2305e-09,  1.2305e-09,  1.2305e-09,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 3.3278e-10,  3.3278e-10,  3.3278e-10,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [ 2.7273e-20,  2.7273e-20,  2.7273e-20,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-0.0000e+00, -0.0000e+00, -0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-0.0000e+00, -0.0000e+00, -0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 2.5131e-03,  2.5131e-03,  2.5131e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 8.3676e-10,  8.3676e-10,  8.3676e-10,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 1.8234e-09,  1.8234e-09,  1.8234e-09,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [ 1.0149e-20,  1

tensor([[[ 2.3195e-03,  2.3195e-03,  2.3195e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 1.1654e-10,  1.1654e-10,  1.1654e-10,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 2.8820e-09,  2.8820e-09,  2.8820e-09,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [-1.5190e-19, -1.5190e-19, -1.5190e-19,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 7.5846e-19,  7.5846e-19,  7.5846e-19,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-9.6110e-19, -9.6110e-19, -9.6110e-19,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 2.0430e-03,  2.0430e-03,  2.0430e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 3.4061e-10,  3.4061e-10,  3.4061e-10,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 1.4236e-09,  1.4236e-09,  1.4236e-09,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [-9.7728e-20, -9

tensor([[[ 2.3398e-03,  2.3398e-03,  2.3398e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 2.0824e-10,  2.0824e-10,  2.0824e-10,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 4.9397e-09,  4.9397e-09,  4.9397e-09,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [ 5.0670e-20, -9.3386e-02,  4.2450e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00, -2.3454e-02, -7.4873e-02,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-1.6520e-20,  2.1396e-02,  1.8903e-02,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 1.8311e-03,  1.8311e-03,  1.8311e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-4.1931e-10, -4.1931e-10, -4.1931e-10,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-5.6420e-09, -5.6420e-09, -5.6420e-09,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [-2.6803e-19, -2

tensor([[[ 2.6920e-03,  2.6920e-03,  2.6920e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 8.0042e-10,  8.0042e-10,  8.0042e-10,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-1.2374e-08, -1.2374e-08, -1.2374e-08,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-3.2161e-18, -3.2161e-18, -3.2161e-18,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 3.3965e-03,  3.3965e-03,  3.3965e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 2.3172e-09,  2.3172e-09,  2.3172e-09,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 5.0734e-09,  5.0734e-09,  5.0734e-09,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [ 1.1929e-19,  1

tensor([[[ 2.8243e-03,  2.8243e-03,  2.8243e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 1.3623e-09,  1.3623e-09,  1.3623e-09,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 2.3002e-09,  2.3002e-09,  2.3002e-09,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [ 2.1400e-21,  2.1400e-21, -4.6177e-04,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-0.0000e+00, -0.0000e+00,  5.7669e-04,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  6.0622e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 2.1554e-03,  2.1554e-03,  2.1554e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-7.1885e-10, -7.1885e-10, -7.1885e-10,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 2.1477e-08,  2.1477e-08,  2.1477e-08,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [ 3.3625e-19,  3