In [None]:
# Install dependencies
# `wget` is the Python helper LibriDataset.prepare_data uses to download the archive.
!pip install wget
# System audio tooling; `sox` is the command invoked for the .flac -> .wav conversion.
!apt-get install sox libsndfile1 ffmpeg libsox-fmt-mp3 -y
!pip install unidecode
!pip install matplotlib
!pip install omegaconf
# NeMo toolkit with all optional extras (imported below as nemo.collections.asr).
!pip install nemo_toolkit[all]
# Weights & Biases client used for experiment logging.
!pip install wandb

In [None]:
import glob
import os
from pathlib import Path
import subprocess
import tarfile
import librosa
import wget
from torch.utils.data import Dataset
from collections import defaultdict

import json
import copy
from omegaconf import open_dict, DictConfig
import torch.nn as nn
from pytorch_lightning import Trainer

import nemo.collections.asr as nemo_asr
from ruamel.yaml import YAML
import wandb
from pytorch_lightning.loggers import WandbLogger
import torch

In [None]:
class LibriDataset():
    """Locate, or build on demand, a JSON-lines manifest for one dataset split.

    The manifest holds one JSON object per line with the keys
    ``audio_filepath``, ``duration`` and ``text``. If the manifest file
    already exists it is reused as-is; otherwise it is built from the
    ``*.txt`` transcript files found two directory levels below
    ``<data_dir>/<option>/<option>``.

    Args:
        option: Split name, e.g. ``"dev-clean"``. Used for the data
            sub-directory (``<option>/<option>``), the archive name and the
            manifest file name.
        data_dir: Root directory of the audio data. ``prepare_data``
            concatenates strings onto it, so it should end with a path
            separator.
        manifest_dir: Directory that holds (or will receive)
            ``<option>-manifest-kaggle.json``. Defaults to the original
            hard-coded Kaggle location.

    Raises:
        FileNotFoundError: If the manifest is missing and no transcript
            files can be found to build it from.
    """

    def __init__(self, option: str, data_dir: str = '/kaggle/input/libritts/',
                 manifest_dir: str = '/kaggle/input/libritts-manifest/') -> None:
        super().__init__()

        self.data_dir = data_dir
        self.option = option
        self.path = self.option + '/' + self.option

        # Download/extraction is opt-in: call prepare_data() explicitly when
        # the audio is not already available under data_dir.

        # Building Manifests
        print('Bulding Manifests for dataset...')

        self.manifest_path = os.path.join(manifest_dir, self.option + '-manifest-kaggle.json')

        if not os.path.isfile(self.manifest_path):
            # Transcripts are only scanned when the manifest really has to be
            # built (previously this branch referenced an undefined variable).
            transcript_path_lst = self._find_transcripts()
            if not transcript_path_lst:
                raise FileNotFoundError(
                    f"Manifest {self.manifest_path} does not exist and no transcript "
                    f"files were found under {os.path.join(self.data_dir, self.path)}"
                )
            for transcript_dir in transcript_path_lst:
                transcripts_path = os.path.join(self.data_dir, transcript_dir)
                self.build_manifest(transcripts_path, self.manifest_path, self.path)
        print("***Done***")

    def _find_transcripts(self):
        """Return transcript paths (relative to ``data_dir``) for this split.

        Looks exactly two directory levels below ``<data_dir>/<path>`` for
        files whose name ends in ``.txt``. Entries are visited in sorted
        order so the manifest is built deterministically. Returns an empty
        list when the split directory does not exist.
        """
        split_root = os.path.join(self.data_dir, self.path)
        found = []
        if not os.path.isdir(split_root):
            return found

        for level1 in sorted(os.listdir(split_root)):
            level1_dir = os.path.join(split_root, level1)
            if not os.path.isdir(level1_dir):
                continue
            for level2 in sorted(os.listdir(level1_dir)):
                level2_dir = os.path.join(level1_dir, level2)
                if not os.path.isdir(level2_dir):
                    continue
                for name in sorted(os.listdir(level2_dir)):
                    if name.endswith('.txt'):
                        found.append(os.path.join(self.path, level1, level2, name))
        return found

    def prepare_data(self):
        """Download and extract the split archive, then convert .flac to .wav.

        The archive ``<option>.tar.gz`` is downloaded into ``data_dir`` unless
        it is already there. Extraction and conversion only run when
        ``<data_dir><option>/<option>`` does not exist yet. Conversion shells
        out to ``sox`` for every ``.flac`` file under ``<data_dir>LibriSpeech``.
        """
        mirror = self.option + ".tar.gz"
        archive_path = self.data_dir + mirror
        if not os.path.exists(archive_path):
            print(f"Downloading {self.option} dataset...")
            libri_url = "https://www.openslr.org/resources/12/" + mirror
            libri_path = wget.download(libri_url, self.data_dir)
            print(f"Dataset downloaded at: {libri_path}")
        else:
            print("Tarfile already exists.")
            # Reuse the exact path whose existence was just checked.
            libri_path = archive_path

        if not os.path.exists(self.data_dir + self.path):
            # Context manager guarantees the archive handle is closed.
            with tarfile.open(libri_path) as tar:
                tar.extractall(path=self.data_dir)

            print("Converting .flac to .wav...")
            flac_list = glob.glob(self.data_dir + 'LibriSpeech/**/*.flac', recursive=True)
            for flac_path in flac_list:
                wav_path = flac_path[:-5] + '.wav'
                cmd = ["sox", flac_path, wav_path]
                # Best effort, as before: a failing conversion does not abort the loop.
                subprocess.run(cmd)
        print("Finished conversion.\n******")

    def build_manifest(self, transcripts_path, manifest_path, wav_path):
        """Append one manifest entry per transcript line.

        Each non-empty line of ``transcripts_path`` is read as
        ``"<file_id> <text>"``. The audio path is derived from the file id as
        ``<data_dir>/<wav_path>/<a>/<b>/<file_id>.wav`` where ``a`` is the part
        of the id before the first ``-`` and ``b`` the part between the first
        and last ``-``.

        Args:
            transcripts_path: Path of the transcript file to read.
            manifest_path: Manifest file to append to (opened in ``'a'`` mode).
            wav_path: Sub-directory of ``data_dir`` that contains the audio.
        """
        with open(transcripts_path, 'r', encoding='utf-8') as fin, \
                open(manifest_path, 'a', encoding='utf-8') as fout:
            for line in fin:
                line = line.strip()
                if not line:
                    # Skip blank lines instead of emitting a bogus entry.
                    continue

                # Everything after the first space is the transcript; stripping
                # first keeps the trailing newline out of the "text" field.
                file_id, _, text = line.partition(' ')
                transcript = text.lower()

                first_dash = file_id.find('-')
                last_dash = file_id.rfind('-')
                audio_path = os.path.join(
                    self.data_dir,
                    wav_path,
                    file_id[:first_dash],
                    file_id[first_dash + 1:last_dash],
                    file_id + '.wav')

                # NOTE(review): newer librosa releases appear to prefer the
                # `path=` keyword over `filename=` - confirm against the
                # installed version.
                duration = librosa.core.get_duration(filename=audio_path)

                metadata = {
                    "audio_filepath": audio_path,
                    "duration": duration,
                    "text": transcript
                }
                json.dump(metadata, fout)
                fout.write('\n')

In [None]:
# One LibriDataset per split, constructed in the same order as the names on
# the left-hand side.
dev_clean, dev_other, test_clean, test_other = [
    LibriDataset(option=split_name)
    for split_name in ("dev-clean", "dev-other", "test-clean", "test-other")
]

In [None]:
def concatenate_files(file1_path, file2_path, file3_path):
    """Write the contents of two UTF-8 text files, back to back, to a third.

    The inputs are treated as line-oriented files (e.g. JSON-lines
    manifests): if the first file is non-empty and does not end with a
    newline, one is inserted so that its last line cannot fuse with the
    first line of the second file.

    Args:
        file1_path: Path of the file whose contents come first.
        file2_path: Path of the file whose contents come second.
        file3_path: Destination path; overwritten if it already exists.

    A missing file does not raise: a message naming the missing path is
    printed and nothing is written.
    """
    try:
        with open(file1_path, 'r', encoding='utf8') as file1:
            content1 = file1.read()

        with open(file2_path, 'r', encoding='utf8') as file2:
            content2 = file2.read()

        # Keep records on separate lines when file 1 lacks a final newline.
        if content1 and content2 and not content1.endswith('\n'):
            content1 += '\n'

        with open(file3_path, 'w', encoding='utf8') as file3:
            file3.write(content1 + content2)

        print(f"Contents of {file1_path} and {file2_path} have been concatenated and saved to {file3_path}")

    except FileNotFoundError as err:
        # Still non-fatal, but say which path was missing.
        print(f"One or more files not found: {err.filename}")

In [None]:
# Merge the per-split manifests pairwise: the two dev splits become the
# training manifest, the two test splits become the evaluation manifest.
for first_manifest, second_manifest, merged_name in (
    (dev_clean.manifest_path, dev_other.manifest_path, 'train.json'),
    (test_clean.manifest_path, test_other.manifest_path, 'test.json'),
):
    concatenate_files(first_manifest, second_manifest, merged_name)

In [None]:
# Name of the pretrained checkpoint to start fine-tuning from.
pretrained_name = "stt_en_conformer_ctc_medium"
model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name=pretrained_name)

In [None]:
# --- Config Information ---#
# Try the `ruamel.yaml` module first and fall back to `ruamel_yaml` if it is
# not installed under that name.
try:
    from ruamel.yaml import YAML
except ModuleNotFoundError:
    from ruamel_yaml import YAML
# Local path where the Conformer-CTC BPE config file is expected.
config_path = './configs/conformer_ctc_bpe.yaml'

if not os.path.exists(config_path):
    # Grab the config we'll use in this example
    # (fetched from the NeMo GitHub repository into configs/; skipped when the
    # file is already present).
    BRANCH = 'main'
    !wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/conformer/conformer_ctc_bpe.yaml

# Parse the file with ruamel's "safe" loader; the result is kept in `params`
# and indexed like nested dicts by the cells below.
yaml = YAML(typ='safe')
with open(config_path) as f:
    params = yaml.load(f)

In [None]:
# All audio-facing sections share the same sample rate.
model_cfg = params['model']
for section in ('train_ds', 'validation_ds', 'test_ds', 'preprocessor'):
    model_cfg[section]['sample_rate'] = 16000

# Encoder input feature size and the d_model value used by the LR scheduler.
model_cfg['encoder']['feat_in'] = 80
model_cfg['optim']['sched']['d_model'] = 512

In [None]:
# Git branch of the NeMo repository the helper script is downloaded from.
BRANCH = "main"

# Fetch the tokenizer-building script into the relative scripts/ directory,
# unless a copy is already there.
if not os.path.exists("scripts/process_asr_text_tokenizer.py"):
    !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tokenizers/process_asr_text_tokenizer.py

In [None]:
# Run the downloaded tokenizer script on the training manifest, requesting an
# "spe" tokenizer of type "bpe" with a vocabulary size of 512; output goes
# under tokenizers/.
# NOTE(review): the script is downloaded to the relative scripts/ directory but
# invoked through /kaggle/working/scripts/ - this assumes the notebook's
# working directory is /kaggle/working; confirm when running elsewhere.
!python /kaggle/working/scripts/process_asr_text_tokenizer.py \
  --manifest="/kaggle/working/train.json" \
  --data_root="tokenizers/" \
  --vocab_size=512 \
  --tokenizer="spe" \
  --no_lower_case \
  --spe_type="bpe" \
  --log

In [None]:
# Snapshot of the decoder vocabulary before the tokenizer is swapped out.
pre_train_set = {token for token in model.decoder.vocabulary}

In [None]:
# Directory of the tokenizer to swap in for the model's current vocabulary.
new_tokenizer_dir = "/kaggle/working/tokenizers/tokenizer_spe_bpe_v512/"
model.change_vocabulary(new_tokenizer_dir=new_tokenizer_dir, new_tokenizer_type="bpe")

In [None]:
# Switch read by the freeze/unfreeze cell below; coerced to a strict bool.
requested_freeze = False
freeze_encoder = bool(requested_freeze)

In [None]:
import torch
import torch.nn as nn

def enable_bn_se(m):
    """Put a module back into training mode if it is a BatchNorm1d or SE layer.

    Intended for use with ``Module.apply``: modules whose exact type is
    ``nn.BatchNorm1d``, or whose class name contains ``'SqueezeExcite'``,
    are switched to ``train()`` and all their parameters are made trainable.
    Every other module is left untouched.
    """
    is_batchnorm = type(m) == nn.BatchNorm1d
    is_squeeze_excite = 'SqueezeExcite' in type(m).__name__
    if not (is_batchnorm or is_squeeze_excite):
        return

    m.train()
    for weight in m.parameters():
        weight.requires_grad_(True)

In [None]:
import logging

# Either train the whole encoder, or freeze it and re-enable only the layers
# selected by enable_bn_se.
if not freeze_encoder:
    model.encoder.unfreeze()
    logging.info("Model encoder has been un-frozen")
else:
    model.encoder.freeze()
    model.encoder.apply(enable_bn_se)
    logging.info("Model encoder has been frozen, and batch normalization has been unfrozen")

In [None]:
import copy

# Work on an independent copy so the loaded config's optimizer section stays
# untouched, then override only the learning rate.
optim_template = params['model']['optim']
new_opt = copy.deepcopy(optim_template)
new_opt.update(lr=5.0)

In [None]:
# Experiment-manager settings: keep the best model and name the experiment.
exp_cfg = params['exp_manager']
exp_cfg['checkpoint_callback_params']['save_best_model'] = True
exp_cfg['name'] = 'Conformer-CTC-BPE'

In [None]:
from omegaconf import DictConfig
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

# Manifests written earlier by concatenate_files (relative 'train.json' /
# 'test.json'); the absolute paths assume the working directory is
# /kaggle/working - TODO confirm when running elsewhere.
train_manifest = '/kaggle/working/train.json'
test_manifest = '/kaggle/working/test.json'

# Apply the optimizer config prepared earlier (`new_opt`, a copy of the
# config's optim section with `lr` overridden).
# NOTE(review): this was described as a "smaller learning rate", but the value
# set in `new_opt` is 5.0 - confirm that is the intended setting.
model.setup_optimization(optim_config=DictConfig(new_opt))

# Point to the data we'll use for fine-tuning as the training set
params['model']['train_ds']['batch_size'] = 16
params['model']['train_ds']['manifest_filepath'] = train_manifest
params['model']['train_ds']['is_tarred'] = False
model.setup_training_data(train_data_config=params['model']['train_ds'])

# Point to the new validation data for fine-tuning
# (validation runs on the merged test manifest).
params['model']['validation_ds']['manifest_filepath'] = test_manifest
model.setup_validation_data(val_data_config=params['model']['validation_ds'])

In [None]:
# Never embed the API key in the notebook: read it from the environment.
# When WANDB_API_KEY is unset, `key=None` leaves credential discovery to
# wandb.login itself.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Logger handed to the Trainer below; log_model='all' is passed through to
# WandbLogger unchanged.
wandb_logger = WandbLogger(project="asr", log_model='all')

# Record every top-level section of the loaded config on the run.
for k, v in params.items():
    wandb_logger.experiment.config[k] = v

In [None]:
from omegaconf import OmegaConf
# Trainer options gathered in one place so they are easy to scan and tweak.
trainer_options = dict(
    devices=1,
    accelerator='gpu',
    max_epochs=80,
    accumulate_grad_batches=1,
    enable_checkpointing=False,
    logger=wandb_logger,
    log_every_n_steps=100,
    check_val_every_n_epoch=5,
)
trainer = pl.Trainer(**trainer_options)

# Attach the trainer, then rebuild the model config from a plain-container
# copy of the current internal config.
model.set_trainer(trainer)
plain_cfg = OmegaConf.to_container(model._cfg)
model.cfg = DictConfig(plain_cfg)

In [None]:
# Run fine-tuning with the trainer and the data/optimizer configured above.
trainer.fit(model)

In [None]:
# Close the Weights & Biases run once training is done.
wandb.finish()

In [None]:
# File name (relative to the working directory) for the saved model.
checkpoint_name = 'conformer_libri.nemo'
model.save_to(checkpoint_name)