In [1]:
!pip install torchcodec jiwer
!sudo apt-get update && sudo apt-get install sox -y

Collecting torchcodec
  Downloading torchcodec-0.6.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading torchcodec-0.6.0-cp312-cp312-manylinux_2_28_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m110.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchcodec, rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.0 torchcodec-0.6.0
Hit:1 https://cli.github.com/packages stable InRelease
Get:2 https

In [2]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/ASR/Chunkformer

Mounted at /content/drive
/content/drive/MyDrive/ASR/Chunkformer


In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
The token `asr-nemo` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authentic

In [4]:
import os
import sys
import math
import argparse
import shutil
from typing import List, Tuple, Optional

import torch
import torchaudio
import yaml
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import sentencepiece as spm

# Add the current directory to the Python path
sys.path.append('.')

from model.utils.init_model import init_model
from model.utils.checkpoint import load_checkpoint
from model.fixed_tokenizer import TextTokenizer

In [5]:
class AudioDataset(Dataset):
    def __init__(self, tsv_path: str, tokenizer: TextTokenizer, use_speed_perturb: bool = True,
                 return_texts: bool = False):
        df = pd.read_csv(tsv_path, sep="\t")
        assert "wav" in df.columns and "txt" in df.columns, "TSV must contain 'wav' and 'txt' columns"
        self.paths = df["wav"].tolist()
        self.texts = df["txt"].tolist()
        self.use_speed_perturb = use_speed_perturb
        self.tokenizer = tokenizer
        self.return_texts = return_texts

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx: int):
        path = self.paths[idx]
        text = str(self.texts[idx])
        waveform, sr = torchaudio.load(path)
        if waveform.size(0) > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        if self.use_speed_perturb:
            waveform = maybe_speed_perturb(waveform, sr)

        feats = compute_fbank(waveform, sr)
        token_ids = self.tokenizer.encode(text)
        if self.return_texts:
            return feats, torch.tensor(token_ids, dtype=torch.long), text
        return feats, torch.tensor(token_ids, dtype=torch.long)

In [6]:
def collate_fn(batch, apply_specaug: bool = True) -> Tuple[List[torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor]:
    xs = []
    ys = []
    for sample in batch:
        feats, tgt = sample[0], sample[1]
        if apply_specaug:
            feats = spec_augment(feats)
        xs.append(feats)
        ys.append(tgt)

    xs_lens = torch.tensor([x.size(0) for x in xs], dtype=torch.long)
    ys_cat = torch.cat([y for y in ys]) if len(ys) > 0 else torch.tensor([], dtype=torch.long)
    ys_lens = torch.tensor([len(y) for y in ys], dtype=torch.long)

    return xs, xs_lens, ys_cat, ys_lens

In [7]:
class NoamLR(torch.optim.lr_scheduler._LRScheduler):
    """Noam scheduler with explicit peak learning rate at warmup_steps.

    lr(step) = scale * min(step^-0.5, step * warmup^-1.5)
    where scale is chosen so lr(warmup_steps) == peak_lr.
    """

    def __init__(self, optimizer: torch.optim.Optimizer, warmup_steps: int, peak_lr: float, last_epoch: int = -1):
        self.warmup_steps = max(1, warmup_steps)
        self.peak_lr = peak_lr
        # Determine scale so that lr at warmup equals peak_lr
        self.scale = peak_lr * (self.warmup_steps ** 0.5)
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        step = max(1, self.last_epoch + 1)
        factor = min(step ** -0.5, step * (self.warmup_steps ** -1.5))
        lr = self.scale * factor
        return [lr for _ in self.base_lrs]

In [8]:
def build_or_load_tokenizer(train_tsv: str, out_dir: str, vocab_size: int) -> TextTokenizer:
    spm_path = os.path.join(out_dir, 'spm.model')
    if os.path.exists(spm_path):
        return TextTokenizer(spm_path)
    texts = pd.read_csv(train_tsv, sep='\t')['txt'].astype(str).tolist()
    spm_path = TextTokenizer.train_from_corpus(texts, out_dir, vocab_size)
    return TextTokenizer(spm_path)

In [9]:
def build_config_yaml(output_dir: str, vocab_size: int, d_model: int, num_blocks: int,
                      attention_heads: int, linear_units: int, dropout_rate: float) -> str:
    config = {
        "cmvn_file": None,
        "is_json_cmvn": False,
        "input_dim": 80,
        "output_dim": vocab_size,
        "encoder_conf": {
            "output_size": d_model,
            "attention_heads": attention_heads,
            "linear_units": linear_units,
            "num_blocks": num_blocks,
            "dropout_rate": dropout_rate,
            "positional_dropout_rate": dropout_rate,
            "attention_dropout_rate": 0.0,
            "input_layer": "conv2d",
            "pos_enc_layer_type": "abs_pos",
            "normalize_before": True,
            "static_chunk_size": 0,
            "use_dynamic_chunk": False,
            "positionwise_conv_kernel_size": 1,
            "macaron_style": True,
            "selfattention_layer_type": "rel_selfattn",
            "activation_type": "swish",
            "use_cnn_module": True,
            "cnn_module_kernel": 15,
            "causal": False,
            "cnn_module_norm": "batch_norm",
            "use_limited_chunk": False,
            "limited_decoding_chunk_sizes": [],
            "limited_left_chunk_sizes": [],
            "use_dynamic_conv": False,
            "use_context_hint_chunk": False,
            "right_context_sizes": [],
            "right_context_probs": [],
            "freeze_subsampling_layer": False,
        }
    }
    config_path = os.path.join(output_dir, "config.yaml")
    with open(config_path, "w", encoding="utf-8") as f:
        yaml.safe_dump(config, f, sort_keys=False)
    return config_path

In [11]:
import os
import shutil
from sentencepiece import SentencePieceProcessor

def save_vocab_and_config(tokenizer: TextTokenizer, output_dir: str, config_path: str):
    os.makedirs(output_dir, exist_ok=True)
    vocab_txt = os.path.join(output_dir, "vocab.txt")
    tokenizer.save_vocab_txt(vocab_txt)
    # Save SentencePiece model next to vocab for consistent inference
    spm_path = None
    try:
        spm_path = tokenizer.sp.model_file()
    except Exception:
        spm_path = None
    if spm_path and isinstance(spm_path, str) and os.path.exists(spm_path):
        shutil.copy2(spm_path, os.path.join(output_dir, "spm.model"))
    else:
        # Fallback: write serialized model bytes if available
        try:
            blob = tokenizer.sp.serialized_model_proto()
        except Exception:
            blob = None
        if blob:
            with open(os.path.join(output_dir, "spm.model"), "wb") as f:
                f.write(blob)
    return vocab_txt

In [12]:
def set_seed(seed: int):
    import random
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [30]:
def train_one_epoch_full(model, dataloader, optimizer, scheduler, device, scaler,
                         grad_accum_steps: int = 1, max_grad_norm: float = 5.0):
    model.train()
    total_loss = 0.0
    total_tokens = 0
    optimizer.zero_grad(set_to_none=True)

    for step, (xs, xs_lens, ys_cat, ys_lens) in enumerate(dataloader):
        xs = [x.to(device) for x in xs]
        ys_cat = ys_cat.to(device)
        ys_lens = ys_lens.to(device)

        xs_origin_lens = xs_lens.to(device)

        with torch.cuda.amp.autocast(enabled=scaler is not None):
            subsampling = getattr(model.encoder.embed, "subsampling_factor", 1)
            max_len = int(xs_origin_lens.max().item())
            chunk_size_fc = ((max_len + subsampling - 1) // subsampling) * subsampling
            left_ctx = 0
            right_ctx = 0

            offset = torch.zeros(len(xs), dtype=torch.int, device=device)
            encoder_outs, encoder_lens, n_chunks, _, _, _ = model.encoder.forward_parallel_chunk(
                xs=xs,
                xs_origin_lens=xs_origin_lens,
                chunk_size=chunk_size_fc,
                left_context_size=left_ctx,
                right_context_size=right_ctx,
                offset=offset,
            )
            enc_padded, enc_masks = model.encoder.rearrange(encoder_outs, xs_origin_lens, n_chunks)
            input_lengths = enc_masks.squeeze(1).sum(dim=1).to(torch.int)
            log_probs = model.ctc.log_softmax(enc_padded).transpose(0, 1)
            loss = torch.nn.functional.ctc_loss(
                log_probs, ys_cat, input_lengths, ys_lens,
                blank=0, reduction="sum", zero_infinity=True
            )
            norm = ys_lens.sum().clamp_min(1)
            loss = loss / norm

        if scaler is not None:
            scaler.scale(loss).backward()
        else:
            loss.backward()

        if (step + 1) % grad_accum_steps == 0:
            if scaler is not None:
                scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            if scaler is not None:
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()
            scheduler.step()
            optimizer.zero_grad(set_to_none=True)

        total_loss += loss.detach().item() * norm.item()
        total_tokens += norm.item()

        if device.type == "cuda":
            torch.cuda.empty_cache()

    avg_loss = total_loss / max(1, total_tokens)
    return avg_loss, total_tokens

In [15]:
# Define configuration parameters
train_tsv = "train.tsv"
valid_tsv = None
output_dir = "output"
epochs = 50
batch_size = 4
num_workers = 2
peak_lr = 1e-3
warmup_steps = 15000
weight_decay = 0.0
seed = 42
device = "cuda" if torch.cuda.is_available() else "cpu"
amp = True
vocab_size = 5000
d_model = 256
num_blocks = 12
attention_heads = 4
linear_units = 2048
dropout = 0.1
disable_specaug = True
disable_speed_perturb = True

# Create a simple object to hold the configuration
class Args:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

args = Args(
    train_tsv=train_tsv,
    valid_tsv=valid_tsv,
    output_dir=output_dir,
    epochs=epochs,
    batch_size=batch_size,
    num_workers=num_workers,
    peak_lr=peak_lr,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    seed=seed,
    device=device,
    amp=amp,
    vocab_size=vocab_size,
    d_model=d_model,
    num_blocks=num_blocks,
    attention_heads=attention_heads,
    linear_units=linear_units,
    dropout=dropout,
    disable_specaug=disable_specaug,
    disable_speed_perturb=disable_speed_perturb
)

In [16]:
os.makedirs(args.output_dir, exist_ok=True)
set_seed(args.seed)

In [18]:
tokenizer = build_or_load_tokenizer(args.train_tsv, args.output_dir, args.vocab_size)
vocab_size = tokenizer.vocab_size()
config_path = build_config_yaml(
    args.output_dir,
    vocab_size=vocab_size,
    d_model=args.d_model,
    num_blocks=args.num_blocks,
    attention_heads=args.attention_heads,
    linear_units=args.linear_units,
    dropout_rate=args.dropout,
)
save_vocab_and_config(tokenizer, args.output_dir, config_path)

'output/vocab.txt'

In [19]:
with open(config_path, "r") as f:
    configs = yaml.load(f, Loader=yaml.FullLoader)
model = init_model(configs, config_path)

device = torch.device(args.device)
model = model.to(device)

In [22]:
train_ds = AudioDataset(
        args.train_tsv,
        tokenizer,
        use_speed_perturb=(not args.disable_speed_perturb),
    )
train_loader = DataLoader(
    train_ds,
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=args.num_workers,
    collate_fn=lambda b: collate_fn(b, apply_specaug=(not args.disable_specaug)),
    pin_memory=(device.type == "cuda"),
)

optimizer = torch.optim.Adam(model.parameters(), lr=args.peak_lr, betas=(0.9, 0.98), weight_decay=args.weight_decay)
scheduler = NoamLR(optimizer, warmup_steps=args.warmup_steps, peak_lr=args.peak_lr)
scaler = torch.cuda.amp.GradScaler() if (args.amp and device.type == "cuda") else None

  scaler = torch.cuda.amp.GradScaler() if (args.amp and device.type == "cuda") else None


In [28]:
import os, torch, torchaudio, yaml
import torchaudio.compliance.kaldi as kaldi
from datasets import Audio
from model.utils.init_model import init_model
from model.utils.ctc_utils import remove_duplicates_and_blank

def compute_fbank(waveform: torch.Tensor, sample_rate: int = 16000) -> torch.Tensor:
    if waveform.dtype != torch.float32:
        waveform = waveform.to(torch.float32)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        sample_rate = 16000
    feats = kaldi.fbank(
        waveform,
        num_mel_bins=80,
        frame_length=25,
        frame_shift=10,
        dither=0.0,
        energy_floor=0.0,
        sample_frequency=sample_rate,
    )
    return feats

In [None]:
for epoch in range(1, args.epochs + 1):
    print(f"Epoch {epoch}/{args.epochs}")
    train_loss, _ = train_one_epoch_full(
        model,
        train_loader,
        optimizer,
        scheduler,
        device,
        scaler,
        grad_accum_steps=1,
    )
    print(f"Train loss (CTC): {train_loss:.4f}")

torch.save(model.state_dict(), os.path.join(args.output_dir, "pytorch_model.bin"))
print("Saved full-context checkpoint.")

Epoch 1/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=scaler is not None):


Train loss (CTC): 10.0476
Epoch 2/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 6.0883
Epoch 3/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.9701
Epoch 4/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.9468
Epoch 5/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.9059
Epoch 6/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.9008
Epoch 7/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.9787
Epoch 8/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.9730
Epoch 9/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.9788
Epoch 10/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.9799
Epoch 11/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.9757
Epoch 12/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.9412
Epoch 13/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.8707
Epoch 14/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.8438
Epoch 15/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.8239
Epoch 16/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.8181
Epoch 17/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.8234
Epoch 18/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.8093
Epoch 19/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7993
Epoch 20/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7932
Epoch 21/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7940
Epoch 22/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.8063
Epoch 23/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.8127
Epoch 24/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7992
Epoch 25/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.8162
Epoch 26/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.8212
Epoch 27/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.8251
Epoch 28/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.8040
Epoch 29/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.8148
Epoch 30/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7837
Epoch 31/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.8072
Epoch 32/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7980
Epoch 33/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7823
Epoch 34/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7735
Epoch 35/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7676
Epoch 36/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7589
Epoch 37/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7559
Epoch 38/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7546
Epoch 39/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7649
Epoch 40/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7693
Epoch 41/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7836
Epoch 42/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7567
Epoch 43/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Train loss (CTC): 5.7716
Epoch 44/50


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
