In [None]:
!cp /content/drive/MyDrive/poc_data/data.zip /content

In [None]:
!unzip /content/data.zip
!clear
!rm /content/data.zip

In [1]:
!cp -r /content/drive/MyDrive/data /content

In [3]:
!nvidia-smi

Tue Nov 22 04:15:29 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    44W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
assert os.environ['COLAB_TPU_ADDR']

In [None]:
!pip install cloud-tpu-client==0.10 torch==1.12.0 https://storage.googleapis.com/tpu-pytorch/wheels/cuda/112/torch_xla-1.12-cp37-cp37m-linux_x86_64.whl --force-reinstall 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch-xla==1.12
  Downloading https://storage.googleapis.com/tpu-pytorch/wheels/cuda/112/torch_xla-1.12-cp37-cp37m-linux_x86_64.whl (393.5 MB)
[K     |████████████████████████████████| 393.5 MB 46 kB/s 
[?25hCollecting cloud-tpu-client==0.10
  Downloading cloud_tpu_client-0.10-py3-none-any.whl (7.4 kB)
Collecting torch==1.12.0
  Downloading torch-1.12.0-cp37-cp37m-manylinux1_x86_64.whl (776.3 MB)
[K     |████████████████████████████████| 776.3 MB 19 kB/s 
[?25hCollecting absl-py>=1.0.0
  Downloading absl_py-1.3.0-py3-none-any.whl (124 kB)
[K     |████████████████████████████████| 124 kB 90.6 MB/s 
[?25hCollecting oauth2client
  Downloading oauth2client-4.1.3-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 9.7 MB/s 
[?25hCollecting google-api-python-client==1.8.0
  Downloading google_api_python_client-1.8.0-py3-none-any.whl (57 kB)
[K     |█

In [None]:
# !pip install cloud-tpu-client==0.10 torch==1.10.0 torchvision==0.11.1 torchaudio==0.10.0 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.12-cp37-cp37m-linux_x86_64.whl

In [5]:
# !pip install -q torchaudio==0.12.0 -f https://download.pytorch.org/whl/cu111/torch_stable.html  
!pip install requests

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
import torch
import torch_xla
import torch_xla.core.xla_model as xm

In [2]:
t = torch.randn(2, 2, device=xm.xla_device())
print(t.device)
print(t)

xla:1
tensor([[-0.6989, -0.0987],
        [ 0.7337, -0.9071]], device='xla:1')


## ASR Model

In [3]:
# import torchaudio
import torch

from torch import nn
import torch.nn.functional as F

In [4]:
class CNNLayerNorm(nn.Module):
    """Layer normalization built for cnns input"""
    def __init__(self, n_feats):
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # x (batch, channel, feature, time)
        x = x.transpose(2, 3).contiguous() # (batch, channel, time, feature)
        x = self.layer_norm(x)
        return x.transpose(2, 3).contiguous() # (batch, channel, feature, time) 


class ResidualCNN(nn.Module):
    """Residual CNN inspired by https://arxiv.org/pdf/1603.05027.pdf
        except with layer norm instead of batch norm
    """
    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()

        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel//2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel//2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x  # (batch, channel, feature, time)
        x = self.layer_norm1(x)
        x = F.gelu(x)
        x = self.dropout1(x)
        x = self.cnn1(x)
        x = self.layer_norm2(x)
        x = F.gelu(x)
        x = self.dropout2(x)
        x = self.cnn2(x)
        x += residual
        return x # (batch, channel, feature, time)


class BidirectionalGRU(nn.Module):

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BidirectionalGRU, self).__init__()

        self.BiGRU = nn.GRU(
            input_size=rnn_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=batch_first, bidirectional=True)
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layer_norm(x)
        x = F.gelu(x)
        x, _ = self.BiGRU(x)
        x = self.dropout(x)
        return x


class SpeechRecognitionModel(nn.Module):
    
    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        n_feats = n_feats//2
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3//2)  # cnn for extracting heirachal features

        # n residual cnn layers with filter size of 32
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats) 
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats*32, rnn_dim)
        self.birnn_layers = nn.Sequential(*[
            BidirectionalGRU(rnn_dim=rnn_dim if i==0 else rnn_dim*2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=i==0)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim*2, rnn_dim),  # birnn returns rnn_dim*2
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class)
        )

    def forward(self, x):
        print('model diye jachi')
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, feature, time)
        x = x.transpose(1, 2) # (batch, time, feature)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        print('model diye ekhono jachi')
        x = self.classifier(x)
        return x


In [5]:
import torchaudio
import torch
import requests
import json
import re

from requests.packages.urllib3.exceptions import InsecureRequestWarning

In [6]:
class TextTransform:
    """Maps phonemes to integers and vice versa"""
    def __init__(self):
        self.url = "https://dev.revesoft.com:6790/phonemizer"
        self.headers = {
                        'Content-Type': 'application/json'
                        }
        self.phonome_map_str = """
                                a 1
                                ã 2
                                b 3
                                bʰ 4
                                c 5
                                cʰ 6
                                d 7
                                dʰ 8
                                d̪ 9
                                d̪ʰ 10
                                e 11
                                ẽ 12
                                g 13
                                gʰ 14
                                h 15
                                i 16
                                ĩ 17
                                i̯ 18
                                k 19
                                kʰ 20
                                l 21
                                m 22
                                n 23
                                o 24
                                õ 25
                                o̯ 26
                                p 27
                                pʰ 28
                                r 29
                                s 30
                                t 31
                                tʰ 32
                                t̪ 33
                                t̪ʰ 34
                                u 35
                                ũ 36
                                u̯ 37
                                æ 38
                                æ̃ 39
                                ŋ 40
                                ɔ 41
                                ɔ̃ 42
                                ɟ 43
                                ɟʰ 44
                                ɽ 45
                                ɽʰ 46
                                ʃ 47
                                ʲ 48
                                ʷ 49
                                
                                """
        self.phone_map = {}
        self.index_map = {}
        for line in self.phonome_map_str.strip().split('\n'):
            ch, index = line.split()
            self.phone_map[ch] = int(index)
            self.index_map[int(index)] = ch
        self.index_map[50] = '@'
        self.phone_map['@'] = 50
        
    
    def remove_punctuations(self, text):
        # define punctuation
        regex = r"[!\"#\$%।\'\(\)\*\+,-\./:‘’;<=>\?@\[\\\]\^_`{\|}~]"
        
        subst = ""

        result = re.sub(regex, subst, text, 0, re.MULTILINE)
        return result
    
    def remove_english_letters_and_numbers(self, text):
        return re.sub(r'[A-Za-z0-9]+[ \t]*', r'', text)
        
    def text_to_int(self, text):
        """ Use a character map and convert text to an integer sequence """
        # print(text)
        text = self.remove_english_letters_and_numbers(text)
        text = self.remove_punctuations(text)
        
        payload = json.dumps({
                            "text": text
                            })
        
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
        response = requests.request("POST", self.url, headers=self.headers, data=payload, verify=False)
        phone_list = response.json()['output']
          
        print(phone_list)
        int_sequence = []
        for phone_per_word in phone_list:
            phone_per_word = phone_per_word.replace("_1", "")
            phone_per_word = phone_per_word.replace("-", "")
            phone_per_word = phone_per_word.replace("_2", "")
            phone = ""
            for i in range(len(phone_per_word)):
                if i == len(phone_per_word) - 1:
                    phone += phone_per_word[i]
                    # print(phone)
                        # exit()
                    ch = self.phone_map[phone]
                    int_sequence.append(ch)
                elif phone_per_word[i] != " ":
                    phone += phone_per_word[i]
                else:
                    ch = self.phone_map[phone]
                    int_sequence.append(ch)
                    phone = ""
            int_sequence.append(self.phone_map['@'])
        
        return int_sequence

    def int_to_text(self, labels):
        """ Use a character map and convert integer labels to an text sequence """
        string = []
        for i in labels:
            string.append(self.index_map[i])
        return ''.join(string).replace('', ' ')


In [None]:
demo = TextTransform()
int_seq = demo.text_to_int('যুক্তরাষ্ট্র ও রাশিয়ার মধ্যে নতুন START8,2,234,, frOm Th চুক্তির ভবিষ্যৎ নিয়েও কিছু বলবেন আশা করি')
print(int_seq)

[43, 35, 19, 33, 24, 29, 1, 47, 31, 29, 24, 50, 24, 50, 29, 1, 47, 16, 48, 1, 29, 50, 22, 24, 9, 10, 11, 50, 23, 24, 33, 35, 23, 50, 5, 35, 19, 33, 16, 29, 50, 4, 24, 3, 16, 47, 47, 41, 33, 50, 23, 16, 48, 11, 26, 50, 19, 16, 6, 35, 50, 3, 24, 21, 3, 11, 23, 50, 1, 47, 1, 50, 19, 24, 29, 16, 50]


## Dataloader

In [7]:
import os
import torchaudio
import glob
import csv

from typing import Tuple
from torch import Tensor
from torch.utils.data import Dataset

In [8]:
class BanglaData(Dataset):
    
    def __init__(self, data_folder: str = '/content/data'):
        
        self.audio_file_paths = glob.glob(data_folder+'/**/**/*.wav')
        self.text_file_paths = glob.glob(data_folder+'/**/**/*.txt')
    
    def load_item(self, n: int) -> Tuple[Tensor, int, str]:
        audio_path = self.audio_file_paths[n]
        text_path = self.text_file_paths[n]
        # Load audio
        waveform, sample_rate = torchaudio.load(audio_path)
        # Load text
        utterance = '। '.join(open(text_path, 'r', encoding='utf-8').readlines())
        # print(len(utterance))
        
        return (
            waveform,
            sample_rate,
            utterance,
        )
    
    def __getitem__(self, n: int) -> Tuple[Tensor, int, str]:
        
        return self.load_item(n)

    def __len__(self) -> int:
        return len(self.audio_file_paths)


In [9]:
import torchaudio
import torch
import torch.nn.functional as F
import os
import torch
import torch.utils.data as data
import torch.optim as optim
# import asr_model
# import mlflow
# import mlflow.pytorch

# from dataset import BanglaData
from platform import python_branch
# from data_utils import TextTransform
from cmath import nan
from torch import nn

torch.autograd.set_detect_anomaly(True)

phone_cache = {} 

In [11]:
def save_ckp(state, checkpoint_dir = './saved_model'):
    f_path = checkpoint_dir + '/best_model_checkpoint.pt'
    torch.save(state, f_path)

def data_processing(train_audio_transforms, valid_audio_transforms,text_transform, data, data_type="train"):
    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []
    for (waveform, _, utterance) in data:
        if data_type == 'train':
            spec = train_audio_transforms(waveform).squeeze(0).transpose(0, 1)
        else:
            spec = valid_audio_transforms(waveform).squeeze(0).transpose(0, 1)
        spectrograms.append(spec)
        
        
        # if phone_cache.get(utterance) is not None:
        if utterance not in phone_cache.keys():
            phone_cache[utterance] = torch.Tensor(text_transform.text_to_int(utterance))
            label = phone_cache[utterance]
        else:
            label = phone_cache[utterance]
            
        # label = torch.Tensor(text_transform.text_to_int(utterance))
        labels.append(label)
        input_lengths.append(spec.shape[0]//2)
        label_lengths.append(len(label))

    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return spectrograms, labels, input_lengths, label_lengths

def train(model, device, train_loader, test_loader, criterion, optimizer, scheduler, epoch):
    model.train()
    data_len = len(train_loader.dataset)
    count_train = 0
    tot_train_loss = []
    batch_count = 1
    for batch_idx, _data in enumerate(train_loader):
        print('batch: ', batch_count)
        spectrograms, labels, input_lengths, label_lengths = _data 
        spectrograms, labels = spectrograms.to(device), labels.to(device)

        optimizer.zero_grad()

        output = model(spectrograms)  # (batch, time, n_class)
        print('output paisi')
        output = F.log_softmax(output, dim=2)
        print('output process kortesi', output.shape)
        output = output.transpose(0, 1) # (time, batch, n_class)
        print('output process kortesi')
        loss = criterion(output, labels, input_lengths, label_lengths)
        print('output process kortesi', loss)
        tot_train_loss.append(loss.item())
        if loss == nan:
            print('loss for this batch is NaN')
        loss.backward()
        print('output process kortesi')

        optimizer.step()
        scheduler.step()
        print('output process kortesi nice')
        count_train += 1
        batch_count += 1
        print('notun batch dao')
        if batch_idx % 100 == 0 or batch_idx == data_len:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(spectrograms), data_len,
                100. * batch_idx / len(train_loader), loss.item()))
    
    model.eval()
    tot_val_loss = []
    count_val = 0
    with torch.no_grad():
        for batch_idx, _data in enumerate(test_loader):
            print('validation hoche')
            spectrograms, labels, input_lengths, label_lengths = _data 
            spectrograms, labels = spectrograms.to(device), labels.to(device)

            output = model(spectrograms)  # (batch, time, n_class)
            output = F.log_softmax(output, dim=2)
            output = output.transpose(0, 1) # (time, batch, n_class)

            loss = criterion(output, labels, input_lengths, label_lengths)
            count_val += 1
            tot_val_loss.append(loss.item())
    
           
    return float(sum(tot_val_loss)/count_val), model, optimizer
    


def main(train_audio_transforms, valid_audio_transforms, text_transform, learning_rate=5e-4, batch_size=20, epochs=10, device = 'cuda'):

    # model config for training with smaller dataset(backup_data) 
    hparams = {
        "n_cnn_layers": 3,
        "n_rnn_layers": 5,
        "rnn_dim": 512,
        "n_class": 51,
        "n_feats": 80,
        "stride":2,
        "dropout": 0.1,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs
    }

    # model config for training with larger dataset(all_data) 
    # hparams = {
    #     "n_cnn_layers": 5,
    #     "n_rnn_layers": 7,
    #     "rnn_dim": 512,
    #     "n_class": 51,
    #     "n_feats": 80,
    #     "stride":2,
    #     "dropout": 0.1,
    #     "learning_rate": learning_rate,
    #     "batch_size": batch_size,
    #     "epochs": epochs
    # }

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(7)
    # device = torch.device("cuda:1" if use_cuda else "cpu")


    train_dataset = BanglaData(data_folder = '/content/data/train')
    test_dataset = BanglaData(data_folder = '/content/data/valid')
    
    # kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = data.DataLoader(dataset=train_dataset,
                                batch_size=hparams['batch_size'],
                                shuffle=True,
                                collate_fn=lambda x: data_processing(train_audio_transforms, valid_audio_transforms, text_transform, x, 'train'))
    test_loader = data.DataLoader(dataset=test_dataset,
                                batch_size=hparams['batch_size'],
                                shuffle=False,
                                collate_fn=lambda x: data_processing(train_audio_transforms, valid_audio_transforms, text_transform, x, 'valid'))
    # for __data in train_loader:
    #     print(__data)
    model = SpeechRecognitionModel(
        hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
        hparams['n_class'], hparams['n_feats'], hparams['stride'], hparams['dropout']
        ).to(device)

    # print(model)
    # print('Num Model Parameters', sum([param.nelement() for param in model.parameters()]))

    optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
    criterion = nn.CTCLoss(blank=0, zero_infinity=True).to(device)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'], 
                                            steps_per_epoch=int(len(train_loader)),
                                            epochs=hparams['epochs'],
                                            anneal_strategy='linear')

    
    best_loss = 100000000000000000
    path = './saved_model'
    save_path = os.path.join(path,'phoneme_prediction_model.pt')
    isExist = os.path.exists(path)
    if not isExist:
        os.makedirs(path)
    
    for epoch in range(1, epochs + 1):
        print('epoch: ', epoch)
        temp_loss, model, optimizer = train(model, device, train_loader, test_loader, criterion, optimizer, scheduler, epoch)
        print('epoch: ', epoch, ' complete')
        if temp_loss < best_loss:
            checkpoint = {
                        'epoch': epoch,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict()
                        }
            print(f'validation loss decreased from {best_loss} to {temp_loss}, model being saved')
            best_loss = temp_loss
            torch.save(model.state_dict(), save_path)
            save_ckp(checkpoint)


In [None]:
import time

train_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=400, n_mels=80),
    # torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=80),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35)
)

valid_audio_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=400, n_mels=80)
# valid_audio_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=80),

text_transform = TextTransform()

learning_rate = 2e-5
batch_size = 32
epochs = 2

start = time.time()
main(train_audio_transforms, valid_audio_transforms, text_transform, learning_rate, batch_size, epochs)
end = time.time()
print("The time of execution of above program is :",
      (end-start))


epoch:  1
['e_1 r_2', 'b_1 a i̯ r e_2', 'ɟ_1 e l a ʲ_2', 'b_1 æ p o k_2', 'a_1 k a r e_2', 't̪_1 o i̯ r i_2', 'p_1 o ʃ a k_2', 's_1 r o m i k ʃ ɔ h o_2', 'o_1 n n æ n n o_2', 'ʃ_1 r o m i k r a_2', 'r_1 o ʲ e cʰ e_2']
['p_1 u l i ʃ e r_2', 'p_1 o ʃ a k d̪ʰ a r i_2', 'c_1 ɔ o̯ ɽ a_2', 'g_1 õ pʰ o ʷ a l a_2', 'l_1 o k t i k e_2', 'r_1 u p k ɔ t̪ʰ a r_2', 'ɟ_1 ɔ l l a d̪ e r_2', 'm_1 ɔ t̪ o i̯_2', 'd̪_1 æ kʰ a c cʰ i l o_2']
['n_1 a m_2', 'n_1 a_2', 'ɟ_1 a n a_2', 'k_1 ɔ t̪ o_2', 'g_1 a cʰ_2', 'p_1 ɔ t̪ʰ e r_2', 'd̪_1 u p a ʃ e_2']
['k_1 i cʰ u t a_2', 'h_1 ã pʰ_2', 'cʰ_1 e ɽ e_2', 'b_1 a c l a m_2']
['æ_1 g a r o_2', 'd̪_1 ɔ pʰ a_2', 'ɟ_1 o u̯ k t̪ i k_2', 'd̪_1 a b i t̪ e_2', 'a_1 m a d̪ e r_2', 'a_1 ɟ k e r_2', 'e_1 i̯_2', 'a_1 n d̪ o l o n_2']
['t̪_1 a r_2', 'm_1 a_2', 'e_1 ʃ e cʰ i l e n_2', 'e', 'b_1 i ʃ ɔ ʲ e_2', 'k_1 ɔ t̪ʰ a_2', 'b_1 o l t̪ e_2']
['a_1 r o_2', 'k_1 ɔ t̪ o_2', 'k_1 i_2']
['g_1 r a m a n c o l e_2', 'p_1 u r b e r_2', 'ʃ_1 ɔ b_2', 'ʃ_1 u d̪ kʰ o r_2', 'ɟ_1 o t̪ d̪

In [None]:
print(phone_cache )