# Практика №4

Теперь мы построим и обучим простую end-to-end модель. Будем работать с пропатченной версией уже готового [пайплайна](https://www.assemblyai.com/blog/end-to-end-speech-recognition-pytorch). Также нам пригодится [ESPnet](https://github.com/espnet/espnet) для использования модели [Transformer](http://jalammar.github.io/illustrated-transformer/) в качестве энкодера.

### Bootstrap

In [None]:
# Install the third-party packages used later in this notebook:
# torchaudio (dataset + audio transforms), sentencepiece (BPE tokenizer)
# and gdown (Google Drive download helper).
!pip install torchaudio
!pip install sentencepiece
!pip install gdown

Collecting torchaudio
[?25l  Downloading https://files.pythonhosted.org/packages/aa/55/01ad9244bcd595e39cea5ce30726a7fe02fd963d07daeb136bfe7e23f0a5/torchaudio-0.8.1-cp37-cp37m-manylinux1_x86_64.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 4.3MB/s 
Installing collected packages: torchaudio
Successfully installed torchaudio-0.8.1
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 4.3MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.95


In [None]:
# Download the lab archive from Google Drive by file id
# (presumably saved as lab4.zip, which is what the next commands expect).
!gdown --id '1skrVbNyrhBLeceGS9CV9uIw_gvo1JiA6'

# Unpack quietly, remove the archive and the sample_data directory,
# then switch the working directory to the extracted lab4 folder.
!unzip -q lab4.zip
!rm -rf lab4.zip sample_data
%cd lab4

/home/dm/MADE-22/asr-n-tts/asr_lab4/lab4


In [None]:
import os
import gc
import torch
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import torchaudio
import numpy as np
import math

from utils import TextTransform
from utils import cer
from utils import wer

import espnet
from espnet.nets.pytorch_backend.conformer.convolution import ConvolutionModule
from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
from espnet.nets.pytorch_backend.transformer.encoder_layer import EncoderLayer
from espnet.nets.pytorch_backend.transformer.repeat import repeat
from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import PositionwiseFeedForward
from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
from espnet.nets.pytorch_backend.nets_utils import make_pad_mask
from espnet.nets.pytorch_backend.conformer.encoder_layer import EncoderLayer as ConformerEncoderLayer

import sentencepiece as spm

In [None]:
# Show the installed torch and espnet versions; the last expression reports
# whether a CUDA device is available to torch.
display(torch.__version__)
display(espnet.__version__)
torch.cuda.is_available()

'1.8.1'

'0.9.7'

True

In [None]:
# Training features: 80-bin mel spectrogram of 16 kHz audio
# (n_fft=400 samples = 25 ms window, hop_length=160 samples = 10 ms step),
# followed by frequency and time masking as data augmentation.
train_audio_transforms = torch.nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=400, hop_length=160, n_mels=80),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
    torchaudio.transforms.TimeMasking(time_mask_param=100)
)

# Validation features: the same mel spectrogram settings, without any masking.
valid_audio_transforms = (
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=400, hop_length=160, n_mels=80)
)

In [None]:
#-----------------------------TODO №2-----------------------------------
# Заменить графемный токенайзер на сабвордовый TextTransformBPE
#-----------------------------------------------------------------------
class TextTransformBPE:
    """Subword (BPE) tokenizer wrapper around SentencePiece.

    Constructing an instance trains a BPE model on ``train_data``, writes it
    under ``model_prefix`` and loads it back for encoding/decoding. The
    defaults reproduce the original hard-coded setup (4000 units trained on
    ``train_clean_100_text_clean.txt``, model stored as ``m_bpe.model``).

    Args:
        train_data: Path to the plain-text file used to train the BPE model.
        vocab_size: Number of subword units to learn.
        model_prefix: Prefix of the model files written by the trainer; the
            model itself is loaded from ``f"{model_prefix}.model"``.
    """

    def __init__(self, train_data="train_clean_100_text_clean.txt",
                 vocab_size=4000, model_prefix="m_bpe"):
        self.train_data = train_data
        self.vocab_size = vocab_size
        self.model_prefix = model_prefix
        train_cmd = (
            f"--input={self.train_data} --model_prefix={self.model_prefix} "
            f"--vocab_size={self.vocab_size} --model_type=bpe"
        )
        spm.SentencePieceTrainer.train(train_cmd)
        self.sp_bpe = spm.SentencePieceProcessor()
        self.sp_bpe.load(f"{self.model_prefix}.model")

    def text_to_int(self, text):
        """Encode ``text`` into a list of subword ids of the BPE model."""
        return self.sp_bpe.encode_as_ids(text)

    def int_to_text(self, labels):
        """Decode a flat sequence of subword ids back into text."""
        return self.sp_bpe.decode_ids(labels)

# Grapheme-level tokenizer from utils, kept as the alternative configuration:
#text_transform = TextTransform()
# Active tokenizer: subword BPE. Note that instantiating it trains the
# SentencePiece model before loading it.
text_transform = TextTransformBPE()


In [None]:
def data_processing(data, data_type="train"):
    """Collate a list of dataset samples into padded batch tensors.

    Each sample is a 6-tuple whose first element is the waveform and whose
    third element is the transcript. The waveform goes through the train or
    validation feature pipeline; the upper-cased transcript is tokenized with
    the global ``text_transform``.

    Args:
        data: Iterable of 6-tuples ``(waveform, _, utterance, _, _, _)``.
        data_type: ``'train'`` or ``'valid'`` - selects the feature pipeline.

    Returns:
        ``(spectrograms, labels, input_lengths, label_lengths)`` where
        ``spectrograms`` is the zero-padded feature batch with an extra
        channel axis, ``labels`` is the zero-padded batch of token ids, and
        the two lists hold the unpadded frame and token counts per sample.

    Raises:
        Exception: If ``data_type`` is neither ``'train'`` nor ``'valid'``.
    """
    features = []
    targets = []
    feature_lengths = []
    target_lengths = []

    for waveform, _, utterance, _, _, _ in data:
        if data_type == 'train':
            transform = train_audio_transforms
        elif data_type == 'valid':
            transform = valid_audio_transforms
        else:
            raise Exception('data_type should be train or valid')

        # Drop the leading axis and put time first so pad_sequence can pad
        # along it (assumes a single-channel waveform - TODO confirm).
        mel = transform(waveform).squeeze(0).transpose(0, 1)
        token_ids = text_transform.text_to_int(utterance.upper())
        target = torch.Tensor(token_ids)

        features.append(mel)
        targets.append(target)
        feature_lengths.append(mel.shape[0])
        target_lengths.append(len(target))

    pad = torch.nn.utils.rnn.pad_sequence
    # After padding: add a channel axis and swap the last two axes back.
    padded_features = pad(features, batch_first=True).unsqueeze(1).transpose(2, 3)
    padded_targets = pad(targets, batch_first=True)

    return padded_features, padded_targets, feature_lengths, target_lengths


def GreedyDecoder(output, labels, label_lengths, blank_label=4000, collapse_repeated=True):
    """Greedy (best-path) CTC decoding of a batch, plus reference texts.

    For every utterance the arg-max class is taken at each frame, blank
    frames are dropped and - when ``collapse_repeated`` is set - a frame that
    repeats the class of the immediately preceding frame is skipped. The
    remaining ids and the reference ids are turned into text with the global
    ``text_transform``.

    Args:
        output: Per-frame class scores indexed as (batch, time, class).
        labels: Padded reference ids, one row per utterance.
        label_lengths: Number of valid ids in each row of ``labels``.
        blank_label: Index of the CTC blank class (4000 for the BPE setup;
            28 was used with the grapheme tokenizer).
        collapse_repeated: Whether to merge consecutive identical predictions.

    Returns:
        ``(decodes, targets)`` - lists of hypothesis and reference strings.
    """
    # One device-to-host transfer for the whole batch.
    best_paths = torch.argmax(output, dim=2).tolist()

    decodes, targets = [], []
    for i, path in enumerate(best_paths):
        reference = labels[i][:label_lengths[i]].tolist()
        targets.append(text_transform.int_to_text([int(v) for v in reference]))

        hypothesis = []
        previous = None  # class of the previous frame; None on the first frame
        for token in path:
            repeated = collapse_repeated and previous is not None and token == previous
            if token != blank_label and not repeated:
                hypothesis.append(token)
            previous = token
        decodes.append(text_transform.int_to_text(hypothesis))

    return decodes, targets

In [None]:
class TransformerModel(torch.nn.Module):
    """Transformer encoder with a per-frame linear classifier (used with CTC here).

    Layout, as built in ``__init__`` and applied in ``forward``:
      1. two 3x3, stride-2, padding-1 Conv2d + ReLU layers, which reduce both
         the time and the feature axis by a factor of 4 (rounded up);
      2. a Linear projection of the flattened (channels * features) vector to
         ``attention_dim`` followed by ESPnet ``PositionalEncoding``;
      3. ``num_layers`` pre-norm ESPnet ``EncoderLayer`` blocks (multi-head
         self-attention + position-wise feed-forward);
      4. a final ``LayerNorm`` and a Linear layer to ``output_size`` classes.

    Args:
        input_size: Number of features per input frame (mel bins).
        output_size: Number of output classes (4001 = 4000 BPE units + blank;
            29 was used with the grapheme tokenizer).
        conv2d_filters: Channels of the two subsampling convolutions.
        attention_dim: Model width of the encoder.
        attention_heads: Number of self-attention heads.
        feedforward_dim: Hidden width of the position-wise feed-forward block.
        num_layers: Number of encoder layers.
        dropout: Dropout rate used in the positional encoding, attention,
            feed-forward and residual paths.
    """

    def __init__(
        self,
        input_size=80,
        output_size=4001,
        conv2d_filters=32,
        attention_dim=360,
        attention_heads=8,
        feedforward_dim=1024,
        num_layers=10,
        dropout=0.1,
    ):
        super().__init__()

        self.conv_in = torch.nn.Sequential(
            torch.nn.Conv2d(1, conv2d_filters, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
            torch.nn.ReLU(),
            torch.nn.Conv2d(conv2d_filters, conv2d_filters, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
            torch.nn.ReLU(),
        )
        # Each kernel-3 / stride-2 / padding-1 conv maps n features to
        # ceil(n / 2), so use the ceil form; it equals (input_size // 2) // 2
        # whenever input_size is divisible by 4 (e.g. the default 80).
        subsampled_features = ((input_size + 1) // 2 + 1) // 2
        self.conv_out = torch.nn.Sequential(
            torch.nn.Linear(conv2d_filters * subsampled_features, attention_dim),
            # Honour the configured dropout instead of a fixed 0.1.
            PositionalEncoding(attention_dim, dropout),
        )
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (attention_dim, feedforward_dim, dropout)
        self.encoder_layer = repeat(
            num_layers,
            lambda lnum: EncoderLayer(
                attention_dim,
                MultiHeadedAttention(
                    attention_heads, attention_dim, dropout
                ),
                positionwise_layer(*positionwise_layer_args),
                dropout,
                normalize_before=True,
                concat_after=False,
            ),
        )
        self.after_norm = LayerNorm(attention_dim)
        self.final_layer = torch.nn.Linear(attention_dim, output_size)

    def forward(self, x, ilens):
        """Compute unnormalized class scores for every subsampled frame.

        Args:
            x: Padded feature batch indexed as (batch, time, features).
            ilens: Unpadded number of frames of every utterance in ``x``.

        Returns:
            Tensor indexed as (batch, subsampled_time, output_size).
        """
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv_in(x)
        b, c, t, f = x.size()
        # Fold channels into the feature axis: (b, t, c * f).
        x = self.conv_out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        # Invert the padding mask to mark valid frames and keep every 4th
        # position so the mask follows the 4x time subsampling of conv_in.
        masks = (~make_pad_mask(ilens)[:, None, :])[:, :, ::4].to(x.device)
        x, _ = self.encoder_layer(x, masks)
        x = self.after_norm(x)
        x = self.final_layer(x)
        return x

In [None]:
def train(model, device, train_loader, criterion, optimizer, scheduler, num_batches, epoch):
    """Run one training epoch (optionally truncated) with CTC loss.

    Args:
        model: Module called as ``model(spectrograms, input_lengths)`` that
            returns scores indexed as (batch, time, n_classes).
        device: Device the batch tensors are moved to.
        train_loader: Loader yielding
            ``(spectrograms, labels, input_lengths, label_lengths)`` batches;
            must expose ``.dataset`` and ``len()``.
        criterion: Loss called as
            ``criterion(log_probs, labels, input_lengths, label_lengths)``
            with ``log_probs`` indexed as (time, batch, n_classes).
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch.
        num_batches: Despite the name, a budget in *samples*: the epoch stops
            after the first batch for which ``batch_idx * batch_size``
            exceeds it. Falsy means "use the whole loader".
        epoch: Epoch number, used only in the progress message.
    """
    model.train()
    data_len = len(train_loader.dataset)
    last_batch_idx = len(train_loader) - 1

    for batch_idx, _data in enumerate(train_loader):
        spectrograms, labels, input_lengths, label_lengths = _data
        # Trim time padding to the longest utterance of the batch.
        spectrograms, labels = spectrograms[:, :, :, :max(input_lengths)].to(device), labels.to(device)  # (batch, 1, feat_dim, time)
        spectrograms = spectrograms.squeeze(1).transpose(1, 2)  # (batch, time, feat_dim)
        optimizer.zero_grad()

        output = model(spectrograms, input_lengths)  # (batch, time, n_classes)
        output = F.log_softmax(output, dim=2)
        output = output.transpose(0, 1)  # (time, batch, n_class)
        # Frame counts after the model's 4x time subsampling (floor division,
        # so it never exceeds the number of frames the model emits).
        input_lengths = [x // 4 for x in input_lengths]

        loss = criterion(output, labels, input_lengths, label_lengths)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()
        scheduler.step()

        samples_seen = batch_idx * len(spectrograms)
        # Log every 500 batches and on the loader's final batch. The share of
        # the dataset processed so far is samples_seen / data_len.
        if batch_idx % 500 == 0 or batch_idx == last_batch_idx:
            print('Train Epoch: {} [{: 5d}/{} ({:.0f}%)]\tLoss: {:.6f}\tLR: {:.6f}'.format(
                epoch, samples_seen, data_len,
                100. * samples_seen / data_len, loss.item(), scheduler.get_last_lr()[0]))

        batches_exceed = bool(num_batches) and samples_seen > num_batches

        # Release per-batch tensors eagerly to keep peak GPU memory low.
        del spectrograms
        del labels
        del input_lengths
        del label_lengths
        del _data
        del output
        gc.collect()
        torch.cuda.empty_cache()

        if batches_exceed:
            break


def test(model, device, test_loader, criterion, epoch):
    """Evaluate ``model`` on ``test_loader`` and print loss, CER and WER.

    The loss is averaged over batches; CER and WER are computed per utterance
    from greedy CTC decodes and averaged over utterances.

    Args:
        model: Module called as ``model(spectrograms, input_lengths)`` that
            returns scores indexed as (batch, time, n_class).
        device: Device the batch tensors are moved to.
        test_loader: Loader yielding
            ``(spectrograms, labels, input_lengths, label_lengths)`` batches.
        criterion: Loss called as
            ``criterion(log_probs, labels, input_lengths, label_lengths)``.
        epoch: Unused; kept for interface compatibility.
    """
    print('\nevaluating...')
    model.eval()
    test_loss = 0
    test_cer, test_wer = [], []
    num_eval_batches = len(test_loader)
    with torch.no_grad():
        for _data in test_loader:
            spectrograms, labels, input_lengths, label_lengths = _data
            spectrograms, labels = spectrograms.to(device), labels.to(device)
            spectrograms = spectrograms.squeeze(1).transpose(1, 2)  # (batch, time, feat_dim)

            output = model(spectrograms, input_lengths)  # (batch, time, n_class)
            output = F.log_softmax(output, dim=2)
            output = output.transpose(0, 1)  # (time, batch, n_class)
            # Frame counts after the model's 4x time subsampling.
            input_lengths = [x // 4 for x in input_lengths]

            loss = criterion(output, labels, input_lengths, label_lengths)
            test_loss += loss.item() / num_eval_batches

            decoded_preds, decoded_targets = GreedyDecoder(output.transpose(0, 1), labels, label_lengths)

            for target, pred in zip(decoded_targets, decoded_preds):
                test_cer.append(cer(target, pred))
                test_wer.append(wer(target, pred))

            # Release per-batch tensors eagerly to keep peak GPU memory low.
            del spectrograms
            del labels
            del input_lengths
            del label_lengths
            del _data
            del output
            gc.collect()
            torch.cuda.empty_cache()

    # An empty loader yields no utterances; report NaN instead of dividing by zero.
    avg_cer = sum(test_cer) / len(test_cer) if test_cer else float('nan')
    avg_wer = sum(test_wer) / len(test_wer) if test_wer else float('nan')

    print('Test set: Average loss: {:.4f}, Average CER: {:.4f} Average WER: {:.4f}\n'.format(test_loss, avg_cer, avg_wer))


In [None]:
def main(EncoderModel, learning_rate=1e-5,
         batch_size=20, test_batch_size=7, num_batches=None, epochs=10,
         train_url="train-clean-100", test_url="test-clean"
        ):
    
    hparams = {
        "input_size": 80,
         "output_size": 4001,
#        "output_size": 29,
        "conv2d_filters": 32,
        "attention_dim": 360,
        "attention_heads": 8,
        "feedforward_dim": 1024,
        "num_layers": 10,
        "dropout": 0.1,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs
    }

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(7)
    device = torch.device("cuda" if use_cuda else "cpu")

    if not os.path.isdir("./data"):
        os.makedirs("./data")

    train_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=train_url, download=True)
    test_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=test_url, download=True)

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = data.DataLoader(dataset=train_dataset,
                                batch_size=hparams['batch_size'],
                                shuffle=True,
                                collate_fn=lambda x: data_processing(x, 'train'),
                                **kwargs)
    test_loader = data.DataLoader(dataset=test_dataset,
                                batch_size=test_batch_size,
                                shuffle=False,
                                collate_fn=lambda x: data_processing(x, 'valid'),
                                **kwargs)

    model = EncoderModel(
        hparams['input_size'],
        hparams['output_size'],
        hparams['conv2d_filters'],
        hparams['attention_dim'],
        hparams['attention_heads'],
        hparams['feedforward_dim'],
        hparams['num_layers'],
        hparams['dropout']).to(device)

    #print(model)
    print('Num Model Parameters', sum([param.nelement() for param in model.parameters()]))
    
    steps_per_epoch = int(len(train_loader))
    if num_batches:
        steps_per_epoch = num_batches // batch_size + batch_size

    optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
#     criterion = torch.nn.CTCLoss(blank=28, zero_infinity=False).to(device)
    criterion = torch.nn.CTCLoss(blank=4000, zero_infinity=False).to(device)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'], 
                                            steps_per_epoch=steps_per_epoch,
                                            epochs=hparams['epochs'],
                                            anneal_strategy='linear')
    
    for epoch in range(1, epochs + 1):
        !date
        train(model, device, train_loader, criterion, optimizer, scheduler, num_batches, epoch)
        !date
        test(model, device, test_loader, criterion, epoch)

### <b>Задание №1</b> (5 баллов):
На данный момент практически все E2E SOTA решения используют [сабворды](https://dyakonov.org/2019/11/29/%D1%82%D0%BE%D0%BA%D0%B5%D0%BD%D0%B8%D0%B7%D0%B0%D1%86%D0%B8%D1%8F-%D0%BD%D0%B0-%D0%BF%D0%BE%D0%B4%D1%81%D0%BB%D0%BE%D0%B2%D0%B0-subword-tokenization/) (subwords/wordpieces) в качестве таргетов нейронки для распознавания. Нам бы тоже не мешало перейти от графем к сабвордам. Теперь вместо букв (графем) будем распознавать кусочки слов. В качестве такого токенайзера предлагается использовать [Sentencepiece](https://github.com/google/sentencepiece). Главное — правильно обернуть его в наш класс TextTransform. Текстовый файл (train_clean_100_text_clean.txt) для обучения токенайзера уже подготовлен и лежит в корневой папке проекта. 

In [None]:
# Experiment configuration for the Transformer encoder run.
learning_rate = 1e-3
batch_size = test_batch_size = 3
num_batches = 15000  # per-epoch budget of training samples
epochs = 10
libri_train_set = "train-clean-100"
libri_test_set = "test-clean"

main(
    TransformerModel,
    learning_rate=learning_rate,
    batch_size=batch_size,
    test_batch_size=test_batch_size,
    num_batches=num_batches,
    epochs=epochs,
    train_url=libri_train_set,
    test_url=libri_test_set,
)

Num Model Parameters 14284849
Вт 11 мая 2021 19:31:33 +05
Вт 11 мая 2021 19:45:00 +05

evaluating...
Test set: Average loss: 6.7421, Average CER: 0.998781 Average WER: 0.9991

Вт 11 мая 2021 19:46:10 +05
Вт 11 мая 2021 19:59:21 +05

evaluating...
Test set: Average loss: 5.6619, Average CER: 0.920685 Average WER: 0.9618

Вт 11 мая 2021 20:00:59 +05
Вт 11 мая 2021 20:14:04 +05

evaluating...
Test set: Average loss: 5.1809, Average CER: 0.860070 Average WER: 0.9065

Вт 11 мая 2021 20:16:07 +05
Вт 11 мая 2021 20:29:07 +05

evaluating...
Test set: Average loss: 4.7414, Average CER: 0.804029 Average WER: 0.8694

Вт 11 мая 2021 20:31:37 +05
Вт 11 мая 2021 20:44:38 +05

evaluating...
Test set: Average loss: 4.5124, Average CER: 0.825354 Average WER: 0.8666

Вт 11 мая 2021 20:46:53 +05
Вт 11 мая 2021 20:59:56 +05

evaluating...
Test set: Average loss: 4.3183, Average CER: 0.762821 Average WER: 0.8340

Вт 11 мая 2021 21:02:39 +05
Вт 11 мая 2021 21:15:48 +05

evaluating...
Test set: Average loss:

Вт 11 мая 2021 22:04:13 +05

evaluating...
Test set: Average loss: 3.8841, Average CER: 0.691785 Average WER: 0.7755



### <b>Задание №2</b> (5 баллов):
Импровизация по улучшению качества распознавания.

In [None]:
class ConformerModel(torch.nn.Module):
    """Conformer encoder with a per-frame linear classifier (used with CTC here).

    Layout, as built in ``__init__`` and applied in ``forward``:
      1. two 3x3, stride-2, padding-1 Conv2d + ReLU layers, which reduce both
         the time and the feature axis by a factor of 4 (rounded up);
      2. a Linear projection of the flattened (channels * features) vector to
         ``attention_dim`` followed by ESPnet ``PositionalEncoding``;
      3. ``num_layers`` pre-norm ESPnet Conformer encoder layers (multi-head
         self-attention, convolution module, position-wise feed-forward and,
         optionally, a second "macaron" feed-forward);
      4. a final ``LayerNorm`` and a Linear layer to ``output_size`` classes.

    Args:
        input_size: Number of features per input frame (mel bins).
        output_size: Number of output classes (4000 BPE units + blank).
        conv2d_filters: Channels of the two subsampling convolutions.
        attention_dim: Model width of the encoder.
        attention_heads: Number of self-attention heads.
        feedforward_dim: Hidden width of the position-wise feed-forward block.
        num_layers: Number of encoder layers.
        dropout: Dropout rate used in the positional encoding, attention,
            feed-forward and residual paths.
        cnn_module_kernel: Kernel size of the Conformer convolution module.
        macaron_style: If True, each layer gets an additional feed-forward
            module; otherwise ``None`` is passed in its place.
    """

    def __init__(
        self,
        input_size=80,
        output_size=4001,
        conv2d_filters=32,
        attention_dim=360,
        attention_heads=8,
        feedforward_dim=1024,
        num_layers=10,
        dropout=0.1,
        cnn_module_kernel=7,
        macaron_style=False,
    ):
        super().__init__()

        self.conv_in = torch.nn.Sequential(
            torch.nn.Conv2d(1, conv2d_filters, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
            torch.nn.ReLU(),
            torch.nn.Conv2d(conv2d_filters, conv2d_filters, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
            torch.nn.ReLU(),
        )
        # Each kernel-3 / stride-2 / padding-1 conv maps n features to
        # ceil(n / 2), so use the ceil form; it equals (input_size // 2) // 2
        # whenever input_size is divisible by 4 (e.g. the default 80).
        subsampled_features = ((input_size + 1) // 2 + 1) // 2
        self.conv_out = torch.nn.Sequential(
            torch.nn.Linear(conv2d_filters * subsampled_features, attention_dim),
            # Honour the configured dropout instead of a fixed 0.1.
            PositionalEncoding(attention_dim, dropout),
        )

        # feed-forward module definition
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (attention_dim, feedforward_dim, dropout)

        # self-attention module definition
        encoder_selfattn_layer = MultiHeadedAttention
        encoder_selfattn_layer_args = (attention_heads, attention_dim, dropout)

        # convolution module definition
        convolution_layer = ConvolutionModule

        self.encoder_layer = repeat(
            num_layers,
            lambda lnum: ConformerEncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
                # "swish" activation, x * sigmoid(x). SiLU is the built-in
                # module form of it, which (unlike a lambda) can be pickled
                # and is shown by print(model).
                convolution_layer(attention_dim, cnn_module_kernel, torch.nn.SiLU()),
                dropout,
                normalize_before=True,
                concat_after=False,
            ),
        )

        self.after_norm = LayerNorm(attention_dim)
        self.final_layer = torch.nn.Linear(attention_dim, output_size)

    def forward(self, x, ilens):
        """Compute unnormalized class scores for every subsampled frame.

        Args:
            x: Padded feature batch indexed as (batch, time, features).
            ilens: Unpadded number of frames of every utterance in ``x``.

        Returns:
            Tensor indexed as (batch, subsampled_time, output_size).
        """
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv_in(x)
        b, c, t, f = x.size()
        # Fold channels into the feature axis: (b, t, c * f).
        x = self.conv_out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        # Invert the padding mask to mark valid frames and keep every 4th
        # position so the mask follows the 4x time subsampling of conv_in.
        masks = (~make_pad_mask(ilens)[:, None, :])[:, :, ::4].to(x.device)
        # The layer stack returns a pair; only its first element is used.
        x, _ = self.encoder_layer(x, masks)
        x = self.after_norm(x)
        x = self.final_layer(x)
        return x

In [None]:
# Experiment configuration for the Conformer encoder run.
learning_rate = 1e-3
batch_size = test_batch_size = 3
num_batches = 12000  # per-epoch budget of training samples
epochs = 10
libri_train_set = "train-clean-100"
libri_test_set = "test-clean"

main(
    ConformerModel,
    learning_rate=learning_rate,
    batch_size=batch_size,
    test_batch_size=test_batch_size,
    num_batches=num_batches,
    epochs=epochs,
    train_url=libri_train_set,
    test_url=libri_test_set,
)

Num Model Parameters 18234049
Вт 11 мая 2021 22:40:05 +05
Вт 11 мая 2021 22:52:47 +05

evaluating...
Test set: Average loss: 7.5704, Average CER: 1.000000 Average WER: 0.9997

Вт 11 мая 2021 22:53:51 +05


RuntimeError: CUDA out of memory. Tried to allocate 30.00 MiB (GPU 0; 3.82 GiB total capacity; 2.48 GiB already allocated; 32.19 MiB free; 2.60 GiB reserved in total by PyTorch)

__Замечания__:
- модель тяжелая, GPU колаб не дает, обучил, насколько хватило ноута
- batch_size=3 для ConformerModel явно маловато; в процессе экспериментов модель хорошо себя показывала на 20-30% тренировочных данных с обучением в 2 эпохи
- по дальнейшей оптимизации обеих моделей: первое, что даст хороший прирост, — это увеличить batch_size
- для TransformerModel пробовал несколько вариантов lr_scheduler (StepLR, CyclicLR, OneCycleLR, ReduceLROnPlateau), OneCycleLR показал меньший лосс, оставил его

Студент: Черников Дмитрий, MADE-ML-22