## Dependencies

In [1]:
import sys
import importlib
sys.path.append('/home/maxim/VisibleSpeech/runtime_speech_recognition/data')
sys.path.append('/home/maxim/VisibleSpeech/runtime_speech_recognition')

In [2]:
import torch
from torch import nn

import pytorch_lightning as pl
import torchaudio.transforms as T
from torch.utils.data import DataLoader

from data import datasets
from IPython.display import Audio, display
from utils.utils import provide_reproducibility
from pytorch_lightning.loggers import WandbLogger

In [3]:
import models
from models import phoneme_recognizer
from models import mixins

## CUDA device

In [4]:
provide_reproducibility(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## Datasets

In [5]:
dir_name = '/media/maxim/Programming/voice_datasets/timit/TIMIT_2/data'  # ubuntu

In [6]:
importlib.reload(datasets)

vowel_labels = ['IY', 'IH', 'EH', 'EY', 'AE', 'AA', 'AW', 'AY', 'AH', 'AO', 'OY', 'OW', 'UH', 'UW', 'UX', 'ER', 'AX',
                'IX', 'AXR', 'AH-H']
consonant_labels = ['B', 'D', 'G', 'P', 'T', 'K', 'DX', 'Q', 'JH', 'CH', 'S', 'SH', 'Z', 'ZH', 'F', 'TH', 'V', 'M', 'N',
                    'NG', 'EM', 'EN', 'ENG', 'NX']

phoneme_classes = {
    'vowels': vowel_labels,
    'consonants': consonant_labels
}
phone_labels = vowel_labels + consonant_labels

timit_dataset_train = datasets.TIMITDataset(
    usage='train',
    percentage=0.05,
    root_dir=dir_name,
    phone_codes=phoneme_classes,
    padding=16000,
    phoneme_labeler=datasets.PhonemeLabeler(phoneme_classes, '.'),
    description_file_path='../data/timit_description.csv'
)

timit_dataset_test = datasets.TIMITDataset(
    usage='test',
    percentage=0.05,
    root_dir=dir_name,
    phone_codes=phoneme_classes,
    padding=16000,
    phoneme_labeler=datasets.PhonemeLabeler(phoneme_classes, '.'),
    description_file_path='../data/timit_description.csv'
)

In [7]:
timit_framerate = timit_dataset_train[0].frame_rate

In [8]:
timit_dataset_train[0]

AudioData(data=tensor([[ 0.0000e+00,  3.0518e-05, -3.0518e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]]), label='consonants', frame_rate=16000, sample_width=16)

In [9]:
labels = ['consonants', 'vowels']
num_of_classes = len(labels)

## Transform

In [10]:
transform = T.Resample(orig_freq=timit_framerate, new_freq=8000)
transform_cpu = T.Resample(orig_freq=timit_framerate, new_freq=8000)

chunk_size = 1024

## Dataloaders

In [11]:
def label_to_index(phone):
    if phone == 'consonants':
        return torch.tensor([1, 0])
    else:
        return torch.tensor([0, 1])


def index_to_label(index):
    return labels[index]


def pad_sequence(batch):
    batch = [item.t() for item in batch]
    batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.)
    return batch.permute(0, 2, 1)


def collate_fn(batch):
    tensors, targets = [], []

    for waveform, label, *_ in batch:
        tensors += [waveform[..., :chunk_size]]
        targets += [label_to_index(label)]

    tensors = pad_sequence(tensors)
    targets = torch.stack(targets)

    return tensors, targets


batch_size = 256

if device == "cuda":
    num_workers = 12
    pin_memory = True
else:
    num_workers = 12
    pin_memory = False

train_loader = torch.utils.data.DataLoader(
    timit_dataset_train,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=num_workers,
    pin_memory=pin_memory,
)
test_loader = torch.utils.data.DataLoader(
    timit_dataset_test,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=num_workers,
    pin_memory=pin_memory,
)

train_features, train_labels = next(iter(train_loader))

In [12]:
label_to_index('consonants').shape

torch.Size([2])

In [13]:
train_features[0].shape, train_labels.shape

(torch.Size([1, 1024]), torch.Size([256, 2]))

## Model training

In [14]:
importlib.reload(models)
importlib.reload(phoneme_recognizer)
importlib.reload(mixins)

<module 'models.mixins' from '/home/maxim/VisibleSpeech/runtime_speech_recognition/models/mixins.py'>

In [15]:
from pathlib import Path
from models.phoneme_recognizer import PhonemeRecognizer, AudioPreprocessorCallback
from pytorch_lightning.callbacks import ModelCheckpoint

### Init model

In [16]:
import torch.nn.functional as F


class M3(nn.Module):
    def __init__(self, n_input=1, n_output=35, stride=4, n_channel=256):
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
        #
        self.conv2 = self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)

        self.fc1 = nn.Linear(n_channel, n_output)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)

        x = F.avg_pool1d(x, x.shape[-1])
        x = x.permute(0, 2, 1)
        x = self.fc1(x)
        # return F.log_softmax(x, dim=2)
        return self.sigmoid(x)

In [17]:
class MBlock(nn.Module):
    def __init__(self, kernel_size, in_chn, out_chn, count):
        super().__init__()
        self.layers = self._init_layers(kernel_size, in_chn, out_chn, count)

    def _init_layers(self, kernel_size, in_chn, out_chn, count):
        layers = list()
        for _ in range(count):
            layers += [
                nn.Conv1d(in_channels=in_chn, out_channels=out_chn, kernel_size=kernel_size),
                nn.BatchNorm1d(out_chn),
                nn.ReLU()
            ]
            in_chn = out_chn

        return nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)


class M8(nn.Module):
    def __init__(self, n_input=1, n_output=20, stride=4, conv_kernel=80, n_channel=64, maxpool_kernel=4):
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=conv_kernel, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(maxpool_kernel)
        ##
        self.conv_layers_2 = MBlock(kernel_size=3, in_chn=n_channel, out_chn=n_channel, count=2)
        self.pool2 = nn.MaxPool1d(maxpool_kernel)
        self.conv_layers_3 = MBlock(kernel_size=3, in_chn=n_channel, out_chn=n_channel * 2, count=2)
        self.pool3 = nn.MaxPool1d(maxpool_kernel)
        self.conv_layers_4 = MBlock(kernel_size=3, in_chn=n_channel * 2, out_chn=n_channel * 4, count=3)
        self.pool4 = nn.MaxPool1d(maxpool_kernel)

        self.fc1 = nn.Linear(n_channel * 4, n_output)

    def forward(self, x):
        x = self.conv1(x)
        # print(x.shape)
        x = F.relu(self.bn1(x))
        # print(x.shape)
        x = self.pool1(x)
        # print(x.shape)

        x = self.conv_layers_2(x)
        # print(x.shape)
        x = self.pool2(x)
        # print(x.shape)

        x = self.conv_layers_3(x)
        # print(x.shape)
        x = self.pool3(x)
        # print(x.shape)

        x = self.conv_layers_4(x)
        # print(x.shape)
        x = self.pool4(x)
        # print(x.shape)

        x = F.avg_pool1d(x, x.shape[-1])
        x = x.permute(0, 2, 1)
        x = self.fc1(x)

        return F.log_softmax(x, dim=2)

In [18]:
m8 = M8(
    conv_kernel=8,
    stride=2,
    maxpool_kernel=2,
    n_output=2
)
example = torch.rand(1, 1, 1024)
m8(example)

tensor([[[-0.9211, -0.5077]]], grad_fn=<LogSoftmaxBackward0>)

In [19]:
m3 = M3(
    n_input = 1,
    n_output = num_of_classes,
    stride = 4,
    n_channel = 256
)

m3(example)

tensor([[[0.5756, 0.5375]]], grad_fn=<SigmoidBackward0>)

### Model params

In [20]:

model_name = 'M8'
experiment_name = 'TEST_2023'
project_name = 'Vowel&Consonants'

model_path = Path(f'/home/maxim/VisibleSpeech/PhonemeRecognizer/models/{model_name}')
experiment_path_model = model_path / experiment_name

logs_path = experiment_path_model / 'logs'
cp_path = experiment_path_model / 'cp'
logs_path.mkdir(parents=True, exist_ok=True)
cp_path.mkdir(parents=True, exist_ok=True)

In [21]:

# model params
n_input = 1
n_output = num_of_classes
conv_kernel = 8
maxpool_kernel = 2
stride = 2
n_channel = 64
optimizer = 'adadelta'
lr = 3e-1


# n_input = 1
# n_output = num_of_classes
# stride = 4
# n_channel = 256
# optimizer = 'adadelta'
# lr = 3e-1

### Run model training

In [22]:
model = PhonemeRecognizer(
    acoustic_model=M8,
    num_of_classes=num_of_classes,
    model_params=dict(
        n_input=n_input,
        n_output=num_of_classes,
        conv_kernel=conv_kernel,
        maxpool_kernel=maxpool_kernel,
        stride=stride,
        n_channel=n_channel,

    ),
    loss_criterion=nn.NLLLoss(),
    lr=3e-2
)
config_params = dict(
    n_input=n_input,
    n_output=num_of_classes,
    stride=stride,
    n_channel=n_channel,
    optimizer=optimizer,
    lr=lr
)

model_checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath=cp_path,
    filename=model_name + '_' + experiment_name + '{epoch:02d}-{val_loss:.2f}',
    save_top_k=-1,
    mode='min'
)
preprocessor_callback = AudioPreprocessorCallback(transform=transform, device=device)
callbacks = [preprocessor_callback, model_checkpoint_callback]

In [23]:
n_epochs = 50

logger = WandbLogger(
    project=project_name,
    name=f'{model_name}_{experiment_name}',
    save_dir=logs_path,
    log_model='all'
)

trainer = pl.Trainer(
    logger=logger,
    default_root_dir=logs_path,
    accelerator='gpu',
    devices=1,
    callbacks=callbacks,
    max_epochs=n_epochs,
    log_every_n_steps=10)

logger.watch(model, log='all', log_graph=True)
trainer.fit(model, train_loader, test_loader)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcrazy_historian[0m. Use [1m`wandb login --relogin`[0m to force relogin


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type             | Params
--------------------------------------------------------
0 | loss_criterion     | NLLLoss          | 0     
1 | train_metrics      | MetricCollection | 0     
2 | val_metrics        | MetricCollection | 0     
3 | test_metrics       | MetricCollection | 0     
4 | checkpoint_metrics | MetricCollection | 0     
5 | acoustic_model     | M8               | 594 K 
--------------------------------------------------------
594 K     Trainable params
0         Non-trainable params
594 K     Total params
2.378     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: 0D or 1D target tensor expected, multi-target not supported