In [None]:
# Install dependencies.
# apt-get: sox, libsndfile1, ffmpeg and the sox MP3 format plugin (system packages).
# pip: wget, unidecode, matplotlib, omegaconf, nemo_toolkit (all extras) and wandb.
# NOTE(review): no versions are pinned, so each fresh run installs whatever is current.
!pip install wget
!apt-get install sox libsndfile1 ffmpeg libsox-fmt-mp3 -y
!pip install unidecode
!pip install matplotlib
!pip install omegaconf
!pip install nemo_toolkit[all]
!pip install wandb

In [None]:
import glob
import os
from pathlib import Path
import subprocess
import tarfile
import librosa
import wget
from torch.utils.data import Dataset
from collections import defaultdict

import json
import copy
from omegaconf import open_dict, DictConfig
import torch.nn as nn
from pytorch_lightning import Trainer

import nemo.collections.asr as nemo_asr
from ruamel.yaml import YAML
import wandb
from pytorch_lightning.loggers import WandbLogger
import torch

In [None]:
class LibriDataset():
    """Locate, or build when missing, a JSON-lines manifest for one dataset split.

    Attributes:
        data_dir: Root directory that contains the extracted split.
        option: Split name, e.g. ``"dev-clean"``.
        path: Split directory relative to ``data_dir`` (``<option>/<option>``).
        manifest_path: Location of the manifest file for this split.
    """

    def __init__(self, option: str, data_dir: str = '/kaggle/input/libritts/') -> None:
        """Record the split locations and build the manifest if it does not exist yet.

        Args:
            option: Split name, used for both the directory and the manifest file name.
            data_dir: Root directory holding the extracted split.
        """
        super().__init__()

        self.data_dir = data_dir
        self.option = option
        self.path = self.option + '/' + self.option

        # prepare_data() is not called automatically; call it explicitly when the
        # split still has to be downloaded and converted.

        # Building Manifests
        print('Bulding Manifests for dataset...')

        self.manifest_path = '/kaggle/input/libritts-manifest/' + self.option + '-manifest-kaggle.json'

        if not os.path.isfile(self.manifest_path):
            # The transcript list used to be referenced without ever being defined,
            # which raised NameError whenever the manifest was missing.
            for transcript_dir in self._find_transcripts():
                transcripts_path = os.path.join(self.data_dir, transcript_dir)
                self.build_manifest(transcripts_path, self.manifest_path, self.path)
        print("***Done***")

    def _find_transcripts(self):
        """Return the ``.txt`` transcript files of this split, relative to ``data_dir``.

        Files are searched exactly two directory levels below the split directory
        (``<data_dir>/<path>/*/*/*.txt``). A missing split directory yields an empty list.
        """
        split_root = os.path.join(self.data_dir, self.path)
        pattern = os.path.join(glob.escape(split_root), '*', '*', '*.txt')
        return sorted(os.path.relpath(found, self.data_dir) for found in glob.glob(pattern))

    def prepare_data(self):
        """Download the split archive if needed, extract it, and convert .flac files to .wav."""
        mirror = self.option + ".tar.gz"
        archive_path = os.path.join(self.data_dir, mirror)
        if not os.path.exists(archive_path):
            print(f"Downloading {self.option} dataset...")
            libri_url = "https://www.openslr.org/resources/12/" + mirror
            libri_path = wget.download(libri_url, self.data_dir)
            print(f"Dataset downloaded at: {libri_path}")
        else:
            print("Tarfile already exists.")
            # Open the very file whose existence was just checked; the previous code
            # swapped "-" for "_" in the name and so pointed at a different path.
            libri_path = archive_path

        if not os.path.exists(os.path.join(self.data_dir, self.path)):
            # The context manager closes the archive even if extraction fails.
            with tarfile.open(libri_path) as tar:
                tar.extractall(path=self.data_dir)

            print("Converting .flac to .wav...")
            flac_pattern = os.path.join(self.data_dir, 'LibriSpeech', '**', '*.flac')
            for flac_path in glob.glob(flac_pattern, recursive=True):
                wav_path = os.path.splitext(flac_path)[0] + '.wav'
                subprocess.run(["sox", flac_path, wav_path])
        print("Finished conversion.\n******")

    def build_manifest(self, transcripts_path, manifest_path, wav_path):
        """Append one manifest record per transcript line.

        Each non-empty line of ``transcripts_path`` is read as ``"<file_id> <text>"``.
        The audio file is expected at
        ``<data_dir>/<wav_path>/<first id field>/<middle id fields>/<file_id>.wav``.

        Args:
            transcripts_path: Transcript file to read.
            manifest_path: Manifest file to append to (opened in append mode).
            wav_path: Split directory relative to ``data_dir``.
        """
        with open(transcripts_path, 'r', encoding='utf-8') as fin, \
                open(manifest_path, 'a', encoding='utf-8') as fout:
            for line in fin:
                # Drop the line terminator so it does not end up inside "text",
                # and skip blank lines, which carry no utterance.
                line = line.strip()
                if not line:
                    continue

                file_id, _, transcript = line.partition(' ')
                transcript = transcript.lower()

                first_dash = file_id.find('-')
                last_dash = file_id.rfind('-')
                audio_path = os.path.join(
                    self.data_dir,
                    wav_path,
                    file_id[:first_dash],
                    file_id[first_dash + 1:last_dash],
                    file_id + '.wav')

                duration = librosa.core.get_duration(filename=audio_path)

                metadata = {
                    "audio_filepath": audio_path,
                    "duration": duration,
                    "text": transcript
                }
                json.dump(metadata, fout)
                fout.write('\n')

In [None]:
# One LibriDataset per split; each constructor locates or builds that split's manifest.
dev_clean, dev_other, test_clean, test_other = [
    LibriDataset(option=split_name)
    for split_name in ("dev-clean", "dev-other", "test-clean", "test-other")
]

In [None]:
def concatenate_files(file1_path, file2_path, file3_path):
    """Write the contents of two UTF-8 text files, back to back, into a third file.

    If the first file is non-empty and does not end with a newline, one is inserted
    before the second file's contents so that the last line of the first file and the
    first line of the second are not fused into a single line.

    Args:
        file1_path: File whose contents come first.
        file2_path: File whose contents come second.
        file3_path: Output file; overwritten if it already exists.

    Returns:
        None. A missing file is reported with a printed message rather than raised.
    """
    try:
        with open(file1_path, 'r', encoding='utf8') as file1:
            content1 = file1.read()

        with open(file2_path, 'r', encoding='utf8') as file2:
            content2 = file2.read()

        # Keep line-oriented records intact across the seam between the two files.
        if content1 and content2 and not content1.endswith('\n'):
            content1 += '\n'

        with open(file3_path, 'w', encoding='utf8') as file3:
            file3.write(content1 + content2)

        print(f"Contents of {file1_path} and {file2_path} have been concatenated and saved to {file3_path}")

    except FileNotFoundError as exc:
        # Name the offending path so the failure can be traced.
        print(f"One or more files not found: {exc.filename}")

In [None]:
# Merge the split manifests pairwise: dev-* into train.json, test-* into test.json.
# Output paths are relative, so the files land in the current working directory.
for first_split, second_split, merged_name in (
    (dev_clean, dev_other, 'train.json'),
    (test_clean, test_other, 'test.json'),
):
    concatenate_files(first_split.manifest_path, second_split.manifest_path, merged_name)

In [None]:
# Branch of the NVIDIA/NeMo GitHub repository the tokenizer script is fetched from.
BRANCH = 'main'

# Download process_asr_text_tokenizer.py into scripts/ unless it is already there.
if not os.path.exists("scripts/process_asr_text_tokenizer.py"):
    !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tokenizers/process_asr_text_tokenizer.py

In [None]:
# Passed as --spe_type to the tokenizer script and embedded in the tokenizer directory name.
TOKENIZER_TYPE = "bpe"

In [None]:
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest
from collections import defaultdict
import tqdm

def get_charset(manifest_data):
    """Count character occurrences over the ``'text'`` field of every manifest row.

    Args:
        manifest_data: Iterable of mappings, each with a ``'text'`` entry.

    Returns:
        A ``defaultdict(int)`` mapping each character to the number of times it occurs.
    """
    char_counts = defaultdict(int)
    for entry in manifest_data:
        for ch in entry['text']:
            char_counts[ch] = char_counts[ch] + 1
    return char_counts


# Load the train / test manifests from /kaggle/working.
train_manifest_data = read_manifest('/kaggle/working/train.json')
test_manifest_data = read_manifest('/kaggle/working/test.json')

# Per-character frequency table for each manifest.
train_charset, test_charset = (
    get_charset(train_manifest_data),
    get_charset(test_manifest_data),
)

# Distinct characters that occur in the training text.
train_set = set(train_charset)

# Requested tokenizer vocabulary size and the root directory for tokenizer output.
VOCAB_SIZE = 512
tokenizer_dir = os.path.join('tokenizers', 'en')

In [None]:
# Run the downloaded tokenizer script on the training manifest.
# $VOCAB_SIZE, $tokenizer_dir and $TOKENIZER_TYPE are filled in from the notebook's Python variables.
!python scripts/process_asr_text_tokenizer.py \
  --manifest='/kaggle/working/train.json' \
  --vocab_size=$VOCAB_SIZE \
  --data_root=$tokenizer_dir \
  --tokenizer="spe" \
  --spe_type=$TOKENIZER_TYPE \
  --spe_character_coverage=1.0 \
  --no_lower_case \
  --log
     

In [None]:
# Directory of the tokenizer for the selected type and vocabulary size.
TOKENIZER_DIR = f"{tokenizer_dir}/tokenizer_spe_{TOKENIZER_TYPE}_v{VOCAB_SIZE}/"

# Each line of tokenizer.vocab is counted as one token.
vocab_file_path = os.path.join(TOKENIZER_DIR, 'tokenizer.vocab')
with open(vocab_file_path) as f:
    tokens = list(f)
num_tokens = len(tokens)

# Report when fewer tokens were produced than were requested.
if not num_tokens >= VOCAB_SIZE:
    too_small_message = (
        f"The text in this dataset is too small to construct a tokenizer "
        f"with vocab size = {VOCAB_SIZE}. Current number of tokens = {num_tokens}. "
        f"Please reconstruct the tokenizer with fewer tokens"
    )
    print(too_small_message)

In [None]:
# Load the pretrained "stt_en_citrinet_512" checkpoint, mapped onto the CPU.
model = nemo_asr.models.ASRModel.from_pretrained("stt_en_citrinet_512", map_location='cpu')

# Grab the decoder's state dict before change_vocabulary switches the model to the new tokenizer.
pretrained_decoder = model.decoder.state_dict()
model.change_vocabulary(new_tokenizer_dir=TOKENIZER_DIR, new_tokenizer_type="bpe")

# Insert preserved model weights if shapes match
# (only the first decoder layer's weight shape is compared before loading the whole state dict).
if model.decoder.decoder_layers[0].weight.shape == pretrained_decoder['decoder_layers.0.weight'].shape:
    model.decoder.load_state_dict(pretrained_decoder)

In [None]:
def enable_bn_se(model):
    """Switch a BatchNorm1d or SqueezeExcite module to train mode with trainable parameters.

    A module qualifies when its exact type is ``nn.BatchNorm1d`` or when its class name
    contains ``'SqueezeExcite'``; every other module is left untouched.

    Args:
        model: A single module, e.g. as handed over by ``Module.apply``.
    """
    module_type = type(model)
    is_batch_norm = module_type is nn.BatchNorm1d
    is_squeeze_excite = 'SqueezeExcite' in module_type.__name__
    if not (is_batch_norm or is_squeeze_excite):
        return

    model.train()
    for weight in model.parameters():
        weight.requires_grad_(True)
            
# Notebook form field: when True the encoder is frozen, then enable_bn_se is applied
# to every encoder sub-module so BatchNorm1d / SqueezeExcite layers stay trainable.
freeze_encoder = True #@param ["False", "True"] {type:"raw"}
freeze_encoder = bool(freeze_encoder)

if freeze_encoder:
    model.encoder.freeze()
    model.encoder.apply(enable_bn_se)
else:
    # Otherwise the whole encoder is unfrozen.
    model.encoder.unfreeze()

In [None]:
# Work on a deep copy of the model config; only the parts assigned back below reach model.cfg.
cfg = copy.deepcopy(model.cfg)

# Setup new tokenizer
cfg.tokenizer.dir = TOKENIZER_DIR
cfg.tokenizer.type = "bpe"

# Set tokenizer config
model.cfg.tokenizer = cfg.tokenizer

In [None]:
# Setup train, validation, test configs
with open_dict(cfg):
    # Train dataset
    cfg.train_ds.manifest_filepath = "/kaggle/working/train.json"
    cfg.train_ds.batch_size = 16
    cfg.train_ds.num_workers = 8
    cfg.train_ds.pin_memory = True
    cfg.train_ds.use_start_end_token = True
    cfg.train_ds.trim_silence = True

    # Validation dataset
    cfg.validation_ds.manifest_filepath = "/kaggle/working/test.json"
    cfg.validation_ds.batch_size = 16
    cfg.validation_ds.num_workers = 8
    cfg.validation_ds.pin_memory = True
    cfg.validation_ds.use_start_end_token = True
    cfg.validation_ds.trim_silence = True

model.setup_training_data(cfg.train_ds)
model.setup_multiple_validation_data(cfg.validation_ds)

In [None]:
def analyse_ctc_failures_in_model(model):
    """Run the model over its training dataloader and compare output and target lengths.

    The model is moved to CUDA when available (otherwise CPU) and evaluated under
    ``torch.no_grad()``. If it was in training mode on entry, it is put back into
    training mode before returning.

    Args:
        model: Object providing ``to``, ``training``, ``eval``, ``train``,
            ``train_dataloader()`` and a call signature
            ``model(input_signal=..., input_signal_length=...)`` returning three values,
            the second being the per-sample output lengths.

    Returns:
        Tuple ``(count, am_lengths, target_lengths)``: the number of samples whose
        output length is less than or equal to the target length, the list of output
        lengths, and the list of target lengths.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    was_training = model.training

    failure_total = 0
    acoustic_lengths = []
    label_lengths = []

    loader = model.train_dataloader()

    with torch.no_grad():
        model = model.eval()
        for signal, signal_len, labels, labels_len in loader:
            signal = signal.to(device)
            signal_len = signal_len.to(device)
            log_probs, encoded_len, predictions = model(
                input_signal=signal, input_signal_length=signal_len
            )

            # A sample counts as a failure when its output length does not exceed its target length.
            failure_total += sum(
                1 for enc_len, lab_len in zip(encoded_len, labels_len) if enc_len <= lab_len
            )

            acoustic_lengths.extend(encoded_len.to('cpu').numpy().tolist())
            label_lengths.extend(labels_len.to('cpu').numpy().tolist())

            # Release the batch tensors before the next iteration.
            del signal, signal_len, labels, labels_len, log_probs, encoded_len, predictions

    if was_training:
        model = model.train()

    return failure_total, acoustic_lengths, label_lengths

# Run the length analysis once over the training dataloader and unpack its three results.
results = analyse_ctc_failures_in_model(model)
num_ctc_failures, am_seq_lengths, target_seq_lengths = results

In [None]:
# Compute average ratio of T / U
avg_T = sum(am_seq_lengths) / float(len(am_seq_lengths))
avg_U = sum(target_seq_lengths) / float(len(target_seq_lengths))

avg_length_ratio = 0
for am_len, tgt_len in zip(am_seq_lengths, target_seq_lengths):
    avg_length_ratio += (am_len / float(tgt_len))
avg_length_ratio = avg_length_ratio / len(am_seq_lengths)

print(f"Average Acoustic model sequence length = {avg_T}")
print(f"Average Target sequence length = {avg_U}")
print()
print(f"Ratio of Average AM sequence length to target sequence length = {avg_length_ratio}")

In [None]:
# Optimizer and learning-rate schedule overrides.
with open_dict(model.cfg.optim):
    model.cfg.optim.lr = 0.025
    model.cfg.optim.weight_decay = 0.001
    model.cfg.optim.sched.warmup_steps = None  # Remove default number of steps of warmup
    model.cfg.optim.sched.warmup_ratio = 0.10  # 10 % warmup
    model.cfg.optim.sched.min_lr = 1e-9
    
# Spec-augment overrides: count and width of the frequency and time masks.
with open_dict(model.cfg.spec_augment):
    model.cfg.spec_augment.freq_masks = 2
    model.cfg.spec_augment.freq_width = 25
    model.cfg.spec_augment.time_masks = 10
    model.cfg.spec_augment.time_width = 0.05

# Rebuild the spec-augmentation module from the updated config.
model.spec_augmentation = model.from_config_dict(model.cfg.spec_augment)

In [None]:
# Notebook form fields copied onto the model's metric object below.
use_cer = True #@param ["False", "True"] {type:"raw"}
log_prediction = True #@param ["False", "True"] {type:"raw"}

# NOTE(review): _wer is a private attribute of the model; confirm it exists in the installed NeMo version.
model._wer.use_cer = use_cer
model._wer.log_prediction = log_prediction

In [None]:
# Authenticate with Weights & Biases. The API key is taken from the WANDB_API_KEY
# environment variable instead of being hard-coded, so the secret is never stored in
# the notebook. When the variable is unset, key=None lets wandb use its own lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Logger for the "asr" project, constructed with log_model='all'.
wandb_logger = WandbLogger(project="asr", log_model='all')

# Copy every top-level entry of the model config into the W&B run config.
for k,v in model._cfg.items():
    wandb_logger.experiment.config[k]=v

In [None]:
import pytorch_lightning as ptl
from omegaconf import OmegaConf

# Train on a GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    accelerator = 'gpu'
else:
    accelerator = 'cpu'

EPOCHS = 100  # 100 epochs would provide better results

# Single-device trainer: logs to W&B every 5 steps, validates every 5 epochs,
# no gradient accumulation, and checkpointing disabled.
trainer = ptl.Trainer(devices=1, 
                      accelerator=accelerator, 
                      max_epochs=EPOCHS, 
                      accumulate_grad_batches=1,
                      enable_checkpointing=False,
                      logger=wandb_logger,
                      log_every_n_steps=5,
                      check_val_every_n_epoch=5)

# Attach the trainer to the model.
model.set_trainer(trainer)
# Round-trip the config through a plain container and back into a DictConfig.
model.cfg = DictConfig(OmegaConf.to_container(model._cfg))

In [None]:
# Start training with the trainer configured above.
trainer.fit(model)

In [None]:
# Save the model to 'bpe_libri.nemo' (relative to the current working directory).
model.save_to('bpe_libri.nemo')