In [1]:
import torch
from torchfsdd import TorchFSDDGenerator, TrimSilence
from torchaudio.transforms import MFCC, TimeMasking, FrequencyMasking
from torchvision.transforms import Compose
from torch.utils.data import DataLoader
from lightning import Trainer
from lightning.pytorch.loggers import MLFlowLogger
from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor
from matplotlib import pyplot as plt
from pathlib import Path
import sys

# quick hacky way to get the package visible in the notebook
base_path = Path(".").resolve().parent
sys.path.append(str(base_path ))

from sdc.rnn import RNN
from sdc.trainer import RNNTrainer
USE_GPU = torch.cuda.is_available() and 1
N_CLASSES = 10 # 0-9 digits
SAMPLING_RATE = 8e3  # 8kHz
N_MFCC_CHANNELS = 13  # Number of MFCC channels
MAX_EPOCHS = 10
DEVICE = torch.device("cuda") if USE_GPU else torch.device("cpu")
CHECKPOINT_PATH = Path(".") / "saved_models"
CHECKPOINT_PATH.mkdir(parents=True, exist_ok=True)
EXPERIMENT_NAME = "base_rnn_model"
BATCH_SIZE = 4

# torch.set_default_device(DEVICE)
# torch.set_default_dtype(torch.float16)
torch.set_float32_matmul_precision("medium")
# torch.set_float32_matmul_precision("high")


# chop off silence from the beginning and end of the audio
trimmer = TrimSilence(threshold=1e-6)

mfcc = MFCC(sample_rate=SAMPLING_RATE, n_mfcc=N_MFCC_CHANNELS)

time_masking = TimeMasking(time_mask_param=2, p=0.3)
freq_masking = FrequencyMasking(freq_mask_param=2)
freq_masking.p = 0.3

# Fetch the latest version of FSDD and initialize a generator with those files
fsdd = TorchFSDDGenerator(version='master', transforms=None)

# Create three Torch datasets for a train-validation-test split from the generator
train_set, val_set, test_set = fsdd.train_val_test_split(test_size=0.15, val_size=0.15)



In [2]:
default_root_dir = CHECKPOINT_PATH / EXPERIMENT_NAME
default_root_dir.mkdir(parents=True, exist_ok=True)


In [3]:
mlf_logger = MLFlowLogger(
    experiment_name=EXPERIMENT_NAME,
    # tracking_uri=f"http://localhost:5000",
)

In [4]:
# Create a transformation pipeline to apply to the recordings
train_transforms = Compose([
    trimmer,
    mfcc,
    time_masking,
    freq_masking,
])

val_transforms = Compose([
    trimmer,
    mfcc,
])

train_set.transforms = train_transforms
val_set.transforms = val_transforms
test_set.transforms = val_transforms

In [5]:
def collate_fn(data):
    """Batch and pad wakeword data"""
    mfccs = []
    labels = []
    for d in data:
        mfcc, label = d
        print(mfcc.shape, label)
        mfccs.append(mfcc.squeeze(0).transpose(0, 1))
        labels.append(label)

    # pad mfccs to ensure all tensors are same size in the time dim
    mfccs = torch.nn.utils.rnn.pad_sequence(mfccs, batch_first=True)  # batch, seq_len, feature

    # print(mfccs.shape)
    labels = torch.asarray(labels)
    return mfccs, labels

train_loader = DataLoader(
    train_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
    collate_fn=collate_fn,
)

val_loader = DataLoader(
    val_set,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=1,
    pin_memory=True,
    collate_fn=collate_fn,
)

In [6]:
rnn = RNN(
    lstm_config={
        'input_size': N_MFCC_CHANNELS,
        'hidden_size': 128,
        'num_layers': 1,
    },
    linear_sizes=[N_CLASSES],
)



In [7]:
rnn_trainer = RNNTrainer(
    model=rnn,

)

In [8]:
trainer = Trainer(
    default_root_dir=default_root_dir,
    accelerator="cuda" if USE_GPU else "cpu",
    # amp_type="apex",
    # max_epochs=30 if i == 0 else 10,
    max_epochs=MAX_EPOCHS,
    # strategy="ddp",
    callbacks=[
        ModelCheckpoint(),
        # RichProgressBar(),
        LearningRateMonitor("epoch"),
        # PlotWeightsOnEpochEndCallback(),
        # PlotWeightsOnTrainStartCallback(),
        # EarlyStopping(monitor="val_acc", patience=15, mode="max"),
    ],
    logger=mlf_logger,
)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model=rnn_trainer, train_dataloaders=train_loader, val_dataloaders=val_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type               | Params | Mode 
--------------------------------------------------------
0 | model    | RNN                | 74.5 K | train
1 | loss_fn  | CrossEntropyLoss   | 0      | train
2 | accuracy | MulticlassAccuracy | 0      | train
--------------------------------------------------------
74.5 K    Trainable params
0         Non-trainable params
74.5 K    Total params
0.298     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/chanokin/innatera/audio_digit_classifier/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


torch.Size([13, 47]) 0
torch.Size([13, 26]) 6
torch.Size([13, 12]) 4
torch.Size([13, 21]) 7
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]torch.Size([13, 21]) 1
torch.Size([13, 18]) 5
torch.Size([13, 10]) 3
torch.Size([13, 12]) 4
torch.Size([13, 18]) torch.Size([13, 18])7
 7
torch.Size([13, 11]) 1
torch.Size([13, 27]) 1
Training step 0
Batch x shape: torch.Size([4, 47, 13])
Batch y shape: torch.Size([4])
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  8.13it/s]Training step 1
Batch x shape: torch.Size([4, 21, 13])
Batch y shape: torch.Size([4])
                                                                           

/home/chanokin/innatera/audio_digit_classifier/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


torch.Size([13, 22]) 7
Training: |          | 0/? [00:00<?, ?it/s]torch.Size([13, 26]) 9
torch.Size([13, 19]) 1
Epoch 0:   0%|          | 0/540 [00:00<?, ?it/s] torch.Size([13, 22]) 5
torch.Size([13, 13]) 7
torch.Size([13, 8]) 6
torch.Size([13, 16]) 8
torch.Size([13, 13]) 3
torch.Size([13, 11]) 1
torch.Size([13, 23]) 3
torch.Size([13, 18]) 7
torch.Size([13, 11]) 5
Training step 0
Batch x shape: torch.Size([4, 16, 13])
Batch y shape: torch.Size([4])
Epoch 0:   0%|          | 1/540 [00:00<00:58,  9.24it/s, v_num=e486]13, 17]) 5
torch.Size([13, 16]) 8
torch.Size([13, 15]) 0Training step 1
Batch x shape: torch.Size([4, 23, 13])
Batch y shape: torch.Size([4])
Epoch 0:   0%|          | 2/540 [00:00<00:30, 17.56it/s, v_num=e486]
torch.Size([13, 11]) 3
Training step 2
Batch x shape: torch.Size([4, 17, 13])
Batch y shape: torch.Size([4])
Epoch 0:   1%|          | 3/540 [00:00<00:21, 24.74it/s, v_num=e486]torch.Size([13, 19]) 3
torch.Size([13, 17]) 5
torch.Size([13, 12]) 5
torch.Size([13, 7]) 6
