In [2]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchaudio



In [3]:
## Paths
# Root directory of the data set. Set the ASR_DATA_DIR environment variable to
# point the notebook at another location without editing this cell; when the
# variable is not set the original location is used.
DATA_DIR = os.environ.get(
    'ASR_DATA_DIR',
    '/Users/tacha/iu_research/speech_recognition/asr_workshop/data/target-segments/ru',
)
# Directory holding the audio clips (the trailing separator is kept on purpose).
train_aud = os.path.join(DATA_DIR, 'clips', '')
# Tab-separated metadata file; its `path` and `sentence` columns are read below.
train_df = os.path.join(DATA_DIR, 'train.tsv')

In [4]:
# Alphabet
# Build the symbol -> index mapping: one symbol per line of the alphabet file,
# followed by six extra Latin letters that are appended after the file's symbols.
# The encoding is passed explicitly so that reading does not depend on the
# platform's default locale (assumes the file is UTF-8 encoded -- TODO confirm).
with open('/Users/tacha/iu_research/speech_recognition/asr_workshop/iu-hse-asr-workshop/data/alphabet.txt', 'r', encoding='utf-8') as fo:
    alphabet = fo.readlines() + ['f', 'i', 'r', 'e', 'o', 'x']
# Surrounding whitespace (including the newline kept by readlines) is stripped from every symbol.
char2ind = {symbol.strip(): index for index, symbol in enumerate(alphabet)}

**Transform functions**

Transform functions handle the necessary input transformations, e.g. feature extraction. They are fed directly into the data loader. This helps to speed up data manipulation compared with reading all the files from the hard drive.

In [22]:
def find_maxlen(path, train_df):
    '''
    Finds the largest MFCC length among the clips listed in a metadata file.
    Args:
        path: directory that contains the audio clips
        train_df: path to the tab-separated file whose `path` column lists
            the clip file names
    Returns:
        The largest size of the last MFCC dimension over all clips
        (0 when no clips are listed). The value is also printed.
    '''
    fnames = pd.read_csv(train_df, sep='\t')['path']
    # Build the transform once and reuse it for every clip.
    mfcc_transform = torchaudio.transforms.MFCC()
    maxlen = 0
    for n in tqdm(fnames):
        waveform, _ = torchaudio.load(os.path.join(path, n))
        maxlen = max(maxlen, mfcc_transform(waveform).shape[2])
    print("Maxlen:", maxlen)
    # Return the value as well, so callers can use it instead of copying the printed number.
    return maxlen

def extract_feats(path, maxlen=1083):
    '''
    Reads and processes one file at a time.
    Args:
        path: path to the file
        maxlen: number of frames the features and the mask are padded to
    Returns:
        padded_norm: MFCC, delta and double-delta features stacked along the
            feature axis, every row divided by its sum over frames and
            zero-padded along the frame axis up to `maxlen`
            (assumes single-channel audio, so that the leading channel
            axis is removed by squeeze(0) -- TODO confirm)
        padded_mask: tensor of shape (1, maxlen) with ones on real frames
            and zeros on padding
    '''
    waveform, sample_rate = torchaudio.load(path)
    # Calculate MFCC with the sample rate of the file itself; without the
    # argument the transform falls back to its own default rate.
    mfcc = torchaudio.transforms.MFCC(sample_rate=sample_rate)(waveform)
    # Calculate delta and double-delta with a single transform instance
    compute_deltas = torchaudio.transforms.ComputeDeltas()
    deltas = compute_deltas(mfcc)
    ddeltas = compute_deltas(deltas)
    res = torch.cat((mfcc, deltas, ddeltas), dim=1).squeeze(0)
    # Normalize rows. A row whose sum is zero produces inf/NaN values here;
    # that behaviour is kept unchanged.
    s = torch.sum(res, dim=1, keepdim=True)
    norm = torch.div(res, s)
    n_frames = norm.shape[1]
    mask = torch.ones(1, n_frames)
    # Pad only the right side of the last (frame) axis.
    pad = (0, maxlen - n_frames, 0, 0)
    padded_norm = nn.functional.pad(norm, pad=pad, mode="constant", value=0)
    padded_mask = nn.functional.pad(mask, pad=pad, mode="constant", value=0)
    return padded_norm, padded_mask


def encode_trans(trans, char2ind, maxlen_t=7):
    '''
    Encodes true transcription as a fixed-length sequence of symbol indices.
    Args:
        trans: transcription string; every character must be a key of char2ind
        char2ind: mapping from character to integer index
        maxlen_t: length the encoded transcription is padded to
    Returns:
        (indices, mask): two int64 tensors of length maxlen_t. `indices` holds
        the symbol indices followed by -1 padding; `mask` is 1 where the
        index is >= 0 and 0 elsewhere.
    Raises:
        KeyError: if a character of trans is missing from char2ind
        ValueError: if trans is longer than maxlen_t
    '''
    # An explicit integer dtype keeps the result integral for an empty
    # transcription (np.array([]) would otherwise be float64).
    res = np.array([char2ind[char] for char in trans], dtype=np.int64)
    if len(res) > maxlen_t:
        raise ValueError(
            'transcription of length {} does not fit into maxlen_t={}'.format(len(res), maxlen_t))
    res = np.pad(res, (0, maxlen_t - len(res)), 'constant', constant_values=-1)
    mask = (res >= 0).astype(np.int64)
    return torch.tensor(res), torch.tensor(mask)


def collapse_fn(preds, masks):
    '''
    Collapses runs of repeated predictions inside the valid frames of every sequence.
    Args:
        preds: tensor with one row of per-frame predicted indices per sequence
        masks: tensor with the same layout; a non-zero entry marks a valid frame
    Returns:
        Tensor with one row per sequence that holds the collapsed predictions,
        right-padded with -1 up to the longest collapsed sequence.
    '''
    preds = preds.detach().cpu().numpy()
    masks = masks.detach().cpu().numpy()
    collapsed = []
    for pred, mask in zip(preds, masks):
        kept = []
        previous = None
        # Walk over frame/mask pairs together so that every frame is checked
        # against its own mask entry and against the valid frame right before it.
        for char, valid in zip(pred, mask):
            if not valid:
                continue
            if previous is None or char != previous:
                kept.append(char)
            previous = char
        collapsed.append(kept)

    maxlen_t = max((len(sent) for sent in collapsed), default=0)
    # Pre-filled with the padding value; every row is then overwritten from the left.
    res = np.full((len(collapsed), maxlen_t), -1, dtype=preds.dtype)
    for row, sent in zip(res, collapsed):
        row[:len(sent)] = sent

    return torch.tensor(res)

In [23]:
class TrainData(data.Dataset):
    '''
    Dataset that pairs every row of a tab-separated metadata file with the
    features of its audio clip and its encoded transcription.
    '''
    def __init__(self, csv_path, aud_path, char2ind, transforms):
        '''
        Args:
            csv_path: tab-separated file with `path` and `sentence` columns
            aud_path: directory that contains the audio clips
            char2ind: mapping handed to the transcription transform
            transforms: pair of callables; the first one receives the clip
                path, the second one the lower-cased sentence and char2ind
        '''
        self.df = pd.read_csv(csv_path, sep='\t')
        self.aud_path = aud_path
        self.char2ind = char2ind
        self.transforms = transforms

    def __len__(self):
        '''Number of rows in the metadata file.'''
        return len(self.df)

    def __getitem__(self, idx):
        '''
        Builds one sample: a dict with the keys `aud`, `trans`, `fmask` and `tmask`.
        '''
        index = idx.tolist() if torch.is_tensor(idx) else idx

        clip_path = os.path.join(self.aud_path, self.df['path'][index])
        sentence = self.df['sentence'][index].lower()

        features, feature_mask = self.transforms[0](clip_path)
        targets, target_mask = self.transforms[1](sentence, self.char2ind)
        # Non-finite feature values are replaced before the sample is handed out.
        return {
            'aud': nan_to_num(features),
            'trans': targets,
            'fmask': feature_mask,
            'tmask': target_mask,
        }
    
def weights(m):
    '''
    Initialize the weights of linear layers randomly.
    Meant to be passed to `Module.apply`; modules that are not nn.Linear
    are left untouched.
    Args:
        m: module to initialize in place
    '''
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        # A layer created with bias=False has no bias tensor to fill.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.1)

def nan_to_num(t,mynan=0.):
    '''
    Replaces every non-finite entry (NaN, +inf, -inf) of a tensor.
    Args:
        t: input tensor
        mynan: value written in place of the non-finite entries
    Returns:
        `t` itself when all entries are finite, otherwise a new tensor of the
        same shape with the non-finite entries replaced by `mynan`.
    '''
    finite = torch.isfinite(t)
    if torch.all(finite):
        return t
    # A single vectorised fill replaces all bad entries at any depth, so
    # `mynan` is honoured for every element.
    return t.masked_fill(~finite, mynan)

In [24]:
class Encoder(nn.Module):
    '''
    Frame-wise linear projection with LeakyReLU followed by a 3-layer
    bidirectional LSTM.
    '''
    def __init__(self):
        super().__init__()
        self.input_layer = nn.Linear(120, 512)
        self.blstm = nn.LSTM(input_size=512, 
                             hidden_size=256, 
                             num_layers=3, 
                             bidirectional=True)
        
    def forward(self, x, mask):
        '''
        Args:
            x: features of shape (batch, 120, frames)
            mask: tensor of shape (batch, frames); its row sums are used as
                the number of real frames of every sequence
        Returns:
            output: LSTM outputs of shape (frames, batch, 512), padded back
                to mask.shape[1] frames
            (hn, cn): final hidden and cell states of the LSTM
        '''
        # (batch, features, frames) -> (frames, batch, features): the linear
        # layer acts on the last axis, so all frames are projected in one call.
        projected = nn.functional.leaky_relu(self.input_layer(x.permute(2, 0, 1)))
        # Packing needs integer lengths on the CPU; the mask may be a float tensor.
        lengths = torch.sum(mask, dim=1).detach().long().cpu()
        packed = nn.utils.rnn.pack_padded_sequence(projected, lengths, enforce_sorted=False)
        output, (hn, cn) = self.blstm(packed)
        output, _ = nn.utils.rnn.pad_packed_sequence(output, total_length=mask.shape[1])
        return output, (hn, cn)
    
class Attention(nn.Module):
    '''
    Parameter-free weighting of an encoder state by its products with a decoder state.
    '''
    def forward(self, h_e, h_d):
        '''
        Args:
            h_e: 2-D encoder state
            h_d: 2-D decoder state with the same number of rows as h_e
        Returns:
            Tensor that broadcasts to the shape of h_e: h_e scaled by the
            attention weights summed over their first axis.

        NOTE(review): the softmax and the following sum run over the same
        axis, so the summed weights are (numerically) all ones and the result
        is approximately h_e itself -- confirm that this is intended.
        '''
        similarity = h_e.T @ h_d
        attn = torch.softmax(similarity, dim=0)
        return attn.sum(dim=0) * h_e
    
    
class Decoder(nn.Module):
    '''
    LSTM-cell decoder that emits one vector of 33 log-probabilities per encoder step.
    '''
    def __init__(self):
        super().__init__()
        self.embed_layer = nn.Linear(33, 128)
        self.lstm_cell = nn.LSTMCell(128, 512)
        self.output = nn.Linear(1024, 33)
        self.attention = Attention()
        # Most recent hidden and cell state; overwritten on every step of forward().
        self.dec_h = None 
        self.dec_c = None

    def forward(self, enc_h, y):
        '''
        Args:
            enc_h: encoder outputs, iterated over their first axis
            y: input of the LSTM cell for the first step
        Returns:
            Log-probabilities of all steps stacked along a new first axis.
        '''
        step_log_probs = []
        # No state is passed on the first step, so the cell starts from its default state.
        state = None
        for enc_step in enc_h:
            self.dec_h, self.dec_c = self.lstm_cell(y, state)
            state = (self.dec_h, self.dec_c)
            context = self.attention(enc_step, self.dec_h)
            logits = self.output(torch.cat([self.dec_h, context], 1))
            step_log_probs.append(nn.functional.log_softmax(logits, dim=1))
            # The raw logits of this step are embedded and fed to the next step.
            y = self.embed_layer(logits)
        return torch.stack(step_log_probs)
    
class Seq2Seq(nn.Module):
    '''
    Encoder-decoder model: the encoder outputs are handed straight to the decoder.
    '''
    def __init__(self):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()

    def forward(self, x, mask, dec_input):
        '''
        Args:
            x: input features, passed to the encoder
            mask: frame mask, passed to the encoder
            dec_input: first input of the decoder
        Returns:
            Whatever the decoder returns for the encoder outputs and dec_input.
        '''
        # The final encoder states are unpacked but not used by the decoder.
        enc_out, (_h_n, _c_n) = self.encoder(x, mask)
        return self.decoder(enc_out, dec_input)

In [13]:
# Use the first GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Seq2Seq.__init__ takes no arguments, so the model is built without any.
model = Seq2Seq()
# Re-initialise the linear layers, then move the model to the chosen device.
model.apply(weights)
model = model.to(device)

criterion = nn.CTCLoss(zero_infinity=True)
optimizer = optim.Adam(model.parameters(), lr=5e-4)

# The dataset computes features and encodes transcriptions on the fly.
cv_dataset = TrainData(train_df, train_aud, char2ind, [extract_feats, encode_trans])
loader = data.DataLoader(cv_dataset, batch_size=32, shuffle=True)

In [25]:
def train(csv_path, aud_path, alphabet_path, num_epochs=10,  batch_size=32, enc_hidden_size=256):
    '''
    Trains a Seq2Seq model with CTC loss and returns it.
    Args:
        csv_path: tab-separated metadata file with `path` and `sentence` columns
        aud_path: directory that contains the audio clips
        alphabet_path: text file with one symbol per line
            (assumed to be UTF-8 encoded -- TODO confirm)
        num_epochs: number of passes over the data
        batch_size: number of samples per batch
        enc_hidden_size: currently unused; Seq2Seq is built without size arguments
    Returns:
        The trained model, located on the device used for training.
    '''
    with open(alphabet_path, 'r', encoding='utf-8') as fo:
        alphabet = fo.readlines() + ['f', 'i', 'r', 'e', 'o', 'x']
    char2ind = {symbol.strip(): index for index, symbol in enumerate(alphabet)}
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Seq2Seq()
    model.apply(weights)
    model = model.to(device)

    criterion = nn.CTCLoss(zero_infinity=True)
    optimizer = optim.Adam(model.parameters(), lr=5e-4)

    cv_dataset = TrainData(csv_path, aud_path, char2ind, [extract_feats, encode_trans])
    # One loader serves all epochs: with shuffle=True it draws a new order on
    # every pass. It honours the batch_size argument.
    loader = data.DataLoader(cv_dataset, batch_size=batch_size, shuffle=True)
    # Guards the average below against an empty data set.
    n_batches = max(len(loader), 1)
    print("Start training...")
    for epoch in range(1, num_epochs+1):
        epoch_loss = 0.0

        for batch in loader:
            x = batch['aud'].to(device)
            t = batch['trans'].to(device)
            fmask = batch['fmask'].squeeze(1).to(device)
            tmask = batch['tmask'].squeeze(1).to(device)
            # Random first decoder input; it is not optimised, so it needs no gradient.
            dec_input = torch.randn(x.shape[0], 128).to(device)

            preds = model(x, fmask, dec_input)
            # The mask sums give the number of real frames / target symbols per sample.
            input_length = torch.sum(fmask, dim =1).long().to(device)
            target_length = torch.sum(tmask, dim=1).long().to(device)
            optimizer.zero_grad()
            loss = criterion(preds, t, input_length, target_length)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print('Epoch:{:3}/{:3} Training loss:{:>4f}'.format(epoch, num_epochs, epoch_loss/n_batches))
    return model