## Dependencies

In [6]:
import torch
from torch import nn

import pytorch_lightning as pl

import torchaudio.transforms as T
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt

from IPython.display import Audio, display
from data.datasets import TIMITDataset, PhonemeLabeler
from utils.utils import provide_reproducibility

## CUDA device

In [7]:
provide_reproducibility(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## Datasets

In [8]:
# dir_name = r'C:\Data\TIMIT\data'
dir_name = '/media/maxim/Programming/voice_datasets/timit/TIMIT_2/data'  # ubuntu

In [9]:
vowel_labels = ['IY', 'IH', 'EH', 'EY', 'AE', 'AA', 'AW', 'AY', 'AH', 'AO', 'OY', 'OW', 'UH', 'UW', 'UX', 'ER', 'AX',
                'IX', 'AXR', 'AH-H']
consonant_labels = ['B', 'D', 'G', 'P', 'T', 'K', 'DX', 'Q', 'JH', 'CH', 'S', 'SH', 'Z', 'ZH', 'F', 'TH', 'V', 'M', 'N',
                    'NG', 'EM', 'EN', 'ENG', 'NX']

other_labels = ['H#', 'PAU', 'EPI']

phoneme_classes = {
    'vowels': vowel_labels,
    'consonants': consonant_labels,
    'other': other_labels
}
phone_labels = vowel_labels + consonant_labels + other_labels

timit_dataset_train = TIMITDataset(usage='train', root_dir=dir_name,
                                   phone_codes=phone_labels, padding=16000,
                                   phoneme_labeler=PhonemeLabeler(phoneme_classes, '.'),
                                   description_file_path='../../data/timit_description.csv')

timit_dataset_test = TIMITDataset(usage='test', root_dir=dir_name,
                                  phone_codes=phone_labels, padding=16000,
                                  phoneme_labeler=PhonemeLabeler(phoneme_classes, '.'),
                                  description_file_path='../../data/timit_description.csv')

timit_framerate = timit_dataset_train[0].frame_rate

In [5]:
labels = ['consonants', 'vowels', 'other']
num_of_classes = 3

## Transform

In [10]:
# transform = T.Resample(orig_freq=timit_framerate, new_freq=8000)
# transform_cpu = T.Resample(orig_freq=timit_framerate, new_freq=8000)

chunk_size = 1024

In [11]:
def label_to_index(phone):
   return torch.tensor(labels.index(phone))


def index_to_label(index):
    return labels[index]


def pad_sequence(batch):
    batch = [item.t() for item in batch]
    batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.)
    return batch.permute(0, 2, 1)


def collate_fn(batch):
    tensors, targets = [], []

    for waveform, label, *_ in batch:
        tensors += [waveform[..., :chunk_size]]
        targets += [label_to_index(label)]

    tensors = pad_sequence(tensors)
    targets = torch.stack(targets)

    return tensors, targets


batch_size = 256

if device == "cuda":
    num_workers = 12
    pin_memory = True
else:
    num_workers = 12
    pin_memory = False

train_loader = torch.utils.data.DataLoader(
    timit_dataset_train,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=num_workers,
    pin_memory=pin_memory,
)
test_loader = torch.utils.data.DataLoader(
    timit_dataset_test,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=num_workers,
    pin_memory=pin_memory,
)

train_features, train_labels = next(iter(train_loader))

In [12]:
train_features[0].shape

torch.Size([1, 1024])

In [19]:
import torch.nn.functional as F


class M3(nn.Module):
    def __init__(self, n_input=1, n_output=35, stride=4, kernel_size=80, n_channel=256):
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=kernel_size, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
        #
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)

        self.fc1 = nn.Linear(n_channel, n_output)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)

        x = F.avg_pool1d(x, int(x.shape[-1]))
        x = x.permute(0, 2, 1)
        x = self.fc1(x)
        return F.log_softmax(x, dim=2)
        # return self.sigmoid(x)

In [20]:
from pathlib import Path

# model params
n_input = 1
n_output = num_of_classes
stride = 4
kernel_size = 80
n_channel = 256
optimizer = 'adadelta'
lr = 3e-2

model_name = 'M3 with other'
experiment_name = f'no resample, chunk_size={chunk_size}, kernel={kernel_size}, stride={stride}, n_channels={n_channel}, optimizer={optimizer}, lr={lr}'
project_name = 'Vowel&Consonants'

model_path = Path(f'/home/maxim/VisibleSpeech/PhonemeRecognizer/models/{model_name}')
experiment_path_model = model_path / experiment_name

logs_path = experiment_path_model / 'logs'
cp_path = experiment_path_model / 'cp'
logs_path.mkdir(parents=True, exist_ok=True)
cp_path.mkdir(parents=True, exist_ok=True)

In [21]:
from pathlib import Path
from models.phoneme_recognizer import PhonemeRecognizer, AudioPreprocessorCallback
from pytorch_lightning.callbacks import ModelCheckpoint


model = PhonemeRecognizer(
    acoustic_model=M3,
    model_params=dict(
        n_input=n_input,
        n_output=num_of_classes,
        stride=stride,
        kernel_size=kernel_size,
        n_channel=n_channel
    ),
    loss_criterion=nn.NLLLoss(),
    lr=lr,
    target_type=None
)
config_params = dict(
    n_input=n_input,
    n_output=num_of_classes,
    stride=stride,
    n_channel=n_channel,
    optimizer=optimizer,
    lr=lr
)

model_checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath=cp_path,
    filename=model_name + '_' + experiment_name + '{epoch:02d}-{val_loss:.2f}',
    save_top_k=-1,
    mode='min'
)
# preprocessor_callback = AudioPreprocessorCallback(transform=transform, device=device)
callbacks = [model_checkpoint_callback]

                not been set for this class (LossMetric). The property determines if `update` by
                default needs access to the full metric state. If this is not the case, significant speedups can be
                achieved and we recommend setting this to `False`.
                We provide an checking function
                `from torchmetrics.utilities import check_forward_full_state_property`
                that can be used to check if the `full_state_update=True` (old and potential slower behaviour,
                default for now) or if `full_state_update=False` can be used safely.
                


In [22]:
filepath = 'model_test.onnx'
input_sample = train_features[0].unsqueeze(0)

model.to_onnx(filepath, input_sample, export_params=True)

  x = F.avg_pool1d(x, int(x.shape[-1]))


In [12]:
from pytorch_lightning.loggers import WandbLogger

n_epochs = 10

logger = WandbLogger(
    project=project_name,
    name=f'{model_name}:{experiment_name}',
    save_dir=logs_path,
    log_model='all'
)

trainer = pl.Trainer(
    logger=logger,
    default_root_dir=logs_path,
    accelerator='gpu',
    devices=1,
    callbacks=callbacks,
    max_epochs=n_epochs,
    log_every_n_steps=10)

logger.watch(model, log='all', log_graph=True)
trainer.fit(model, train_loader, test_loader)


[34m[1mwandb[0m: Currently logged in as: [33mcrazy_historian[0m. Use [1m`wandb login --relogin`[0m to force relogin


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type             | Params
--------------------------------------------------------
0 | loss_criterion     | NLLLoss          | 0     
1 | train_metrics      | MetricCollection | 0     
2 | val_metrics        | MetricCollection | 0     
3 | test_metrics       | MetricCollection | 0     
4 | checkpoint_metrics | MetricCollection | 0     
5 | acoustic_model     | M3               | 219 K 
--------------------------------------------------------
219 K     Trainable params
0         Non-trainable params
219 K     Total params
0.878     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [16]:
model = PhonemeRecognizer.load_from_checkpoint('/home/maxim/VisibleSpeech/PhonemeRecognizer/models/M3 with other/no resample, chunk_size=1024, kernel=80, stride=4, n_channels=256, optimizer=adadelta, lr=0.03/cp/M3 with other_no resample, chunk_size=1024, kernel=80, stride=4, n_channels=256, optimizer=adadelta, lr=0.03epoch=09-val_loss=0.17.ckpt')

model

PhonemeRecognizer(
  (loss_criterion): NLLLoss()
  (train_metrics): MetricCollection(
    (accuracy): Accuracy()
    (f1): F1Score()
    (loss): LossMetric(
      (loss): NLLLoss()
    )
    (precision): Precision()
    (recall): Recall(),
    prefix=train/
  )
  (val_metrics): MetricCollection(
    (accuracy): Accuracy()
    (f1): F1Score()
    (loss): LossMetric(
      (loss): NLLLoss()
    )
    (precision): Precision()
    (recall): Recall(),
    prefix=val/
  )
  (test_metrics): MetricCollection(
    (accuracy): Accuracy()
    (f1): F1Score()
    (loss): LossMetric(
      (loss): NLLLoss()
    )
    (precision): Precision()
    (recall): Recall(),
    prefix=test/
  )
  (checkpoint_metrics): MetricCollection(
    (accuracy): Accuracy()
    (f1): F1Score()
    (loss): LossMetric(
      (loss): NLLLoss()
    )
    (precision): Precision()
    (recall): Recall(),
    prefix=val_
  )
  (acoustic_model): M3(
    (conv1): Conv1d(1, 256, kernel_size=(80,), stride=(4,))
    (bn1): BatchNo

In [18]:
filepath = 'model_test.onnx'
input_sample = train_features[0].unsqueeze(0)

model.to_onnx(filepath, input_sample, export_params=True)

TypeError: avg_pool1d(): argument 'kernel_size' (position 2) must be tuple of ints, not Tensor

In [21]:
def get_likely_index(tensor):
    # find most likely label index for each element in the batch
    return tensor.argmax(dim=-1)

def index_to_label(index):
    # Return the word corresponding to the index in labels
    # This is the inverse of label_to_index
    return labels[index]

In [25]:
def predict(mdl, audio):
    pred = model(audio)
    pred = get_likely_index(pred)
    pred = index_to_label(pred)
    return pred


In [29]:
waveform = train_features[0]
print(waveform.shape, waveform.unsqueeze(0).shape)
out = model(waveform.unsqueeze(0))
out = get_likely_index(out)
index_to_label(out)

torch.Size([1, 1024]) torch.Size([1, 1, 1024])


'consonants'

In [23]:
waveform.dtype

torch.float32

In [33]:
import sounddevice
from audiochains.streams import InputStream
from audiochains.block_methods import UnpackRawInInt16, UnpackRawInFloat32

import torch

with InputStream(
            samplerate=16000,
            blocksize=1024,
            channels=1,
            sampwidth=2
) as stream:
    stream.set_methods(
        UnpackRawInFloat32()
    )
    for _ in range(stream.get_iterations(seconds=5)):
        chunk = stream.apply()
        chunk = torch.from_numpy(chunk.copy())
        print(predict(model, chunk.reshape(1, 1, -1)))


consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
vowels
consonants
consonants
consonants
consonants
vowels
consonants
vowels
consonants
consonants
vowels
vowels
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants
consonants


In [48]:
from bokeh.plotting import figure, output_file, show

output_file("line.html")

p = figure(width=400, height=400)
x = torch.linspace(end=5, steps=5, start=0)
y = torch.rand(5)
# add a line renderer
p.line(x.tolist(), y.tolist(), line_width=2)

show(p)

In [47]:
list(y)

[tensor(0.3618),
 tensor(0.0357),
 tensor(0.3588),
 tensor(0.1287),
 tensor(0.9085)]

In [49]:
from bokeh.driving import bounce

@bounce([0, 1, 2])
def update(i):
    print(i)

In [59]:
update()

2
