In [1]:
# `os` is imported here because this cell runs before the notebook's main
# import cell; without it a fresh kernel fails with NameError on os.chdir.
import os

# Work from the project root so the relative paths used below resolve
# (presumably also needed for the project-local imports — TODO confirm).
# Set VAD_PROJECT_DIR to run on another machine; the default keeps the
# original location.
project_dir = os.environ.get("VAD_PROJECT_DIR", "/Users/joannarownicka/software/vad")
os.chdir(project_dir)

In [2]:
import numpy as np
import os
from pathlib import Path

import torch
import torch.nn as nn

from lhotse.cut import CutSet
from lhotse.dataset.sampling import SingleCutSampler, BucketingSampler
from lhotse.dataset.vad import VadDataset

from torch.utils.data import DataLoader

from preprocessing import prepare_vad_dataset
from models.dnn import DNN
from models.accuracy import compute_acc_without_pad

In [3]:
# Data locations, all relative to the current working directory.
root_dir = Path('evaluation', 'data')
# First argument handed to prepare_vad_dataset further down.
corpus_dir = root_dir.joinpath('vad_data')
# Where the cut manifests are read from and written to.
output_dir = root_dir.joinpath('vad_data_nb')

In [4]:
# Read the cut manifest from the output directory and print its summary.
cuts_manifest_path = output_dir / 'cuts_80.json.gz'
cuts = CutSet.from_json(cuts_manifest_path)
cuts.describe()

Cuts count: 957
Total duration (hours): 3.3
Speech duration (hours): 2.6 (80.6%)
***
Duration statistics (seconds):
mean    12.3
std      3.9
min      1.4
25%     11.1
50%     13.9
75%     15.1
max     17.2
dtype: float64


In [5]:
# Shuffle the cuts with a fixed seed, then split the supervision IDs 80/10/10
# into train/dev/eval following their order in the supervision manifest.
import random  # local import: only this cell needs it

split_seed = 0
train_ratio = 0.8

# A seeded RNG makes the shuffled cut order identical on every run.
cuts = cuts.shuffle(rng=random.Random(split_seed))
vad_manifests = prepare_vad_dataset.prepare_vad_dataset(corpus_dir, output_dir)

supervision_ids = [
    sup_seg.to_dict()["id"] for sup_seg in vad_manifests["supervisions"]
]

num_total = len(supervision_ids)
stop_train_idx = int(np.floor(num_total * train_ratio))
# Dev takes the first half of what is left after train; eval takes the rest.
stop_dev_idx = (num_total - stop_train_idx) // 2 + stop_train_idx

train_ids = supervision_ids[:stop_train_idx]
dev_ids = supervision_ids[stop_train_idx:stop_dev_idx]
eval_ids = supervision_ids[stop_dev_idx:]

# Fail with a clear message instead of an IndexError when a split is empty.
assert train_ids and dev_ids and eval_ids, (
    f"empty split: train={len(train_ids)}, dev={len(dev_ids)}, eval={len(eval_ids)}"
)
# The slices are contiguous, so unique IDs guarantee the splits do not overlap.
assert len(set(supervision_ids)) == num_total, "duplicate supervision IDs across splits"

cuts_train = cuts.subset(supervision_ids=train_ids)
cuts_dev = cuts.subset(supervision_ids=dev_ids)
cuts_eval = cuts.subset(supervision_ids=eval_ids)

# Persist the held-out split to the output directory.
cuts_eval.to_json(output_dir / 'cuts_80_eval.json.gz')


In [6]:
# A single dataset object serves both splits; each sampler decides which cuts
# make up a batch.
vad_dataset = VadDataset()

# Training split: sampler plus its loader. batch_size=None turns off the
# DataLoader's own batching, so every sampler item is passed to the dataset as-is.
# NOTE(review): shuffle=False means the per-epoch set_epoch() call in the
# training cell presumably has no effect on ordering — confirm this is intended.
train_sampler = SingleCutSampler(cuts_train, max_duration=300, shuffle=False)
train_dloader = DataLoader(vad_dataset, batch_size=None, sampler=train_sampler)

# Dev split, configured the same way.
dev_sampler = SingleCutSampler(cuts_dev, max_duration=300, shuffle=False)
dev_dloader = DataLoader(vad_dataset, batch_size=None, sampler=dev_sampler)

# Fetch the first dev batch by hand; the training cell reads the feature
# dimension from it.
cut_ids = next(iter(dev_sampler))
sample = vad_dataset[cut_ids]

In [7]:
# --- Hyper-parameters and run configuration ---
learning_rate = 1e-2
weight_decay = 1e-4
num_epochs = 15
seed = 0

model_id = "dnn_ce_fbank80_ignoreindex_lr2"
# Feature dimension, read from the first example of the sample batch.
input_size = sample['inputs'][0].shape[1]
log_dir = Path('storage/models') / model_id
log_dir.mkdir(parents=True, exist_ok=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed torch before the model is built so weight initialisation is repeatable.
torch.manual_seed(seed)
model = DNN(input_size=input_size, hidden_size=256, num_classes=2).to(device)

# Targets equal to -100 are excluded from the loss
# (presumably these are the padded frames — TODO confirm against VadDataset).
criterion = nn.CrossEntropyLoss(ignore_index=-100)

optim = torch.optim.SGD(model.parameters(), lr=learning_rate,
                        weight_decay=weight_decay)


def _flatten_batch(batch):
    """Flatten a batch to one row per frame and move it to `device`.

    Returns a tuple ``(inputs, targets)`` where ``inputs`` has shape
    ``(-1, input_size)`` and ``targets`` is 1-D, both on the training device.
    """
    flat_inputs = batch["inputs"].reshape(-1, input_size).to(device)
    flat_targets = batch["is_voice"].reshape(-1).to(device)
    return flat_inputs, flat_targets


train_acc = []
valid_acc = []
for epoch in range(num_epochs):
    # training
    acc = []
    # Loss of the most recent training batch; stays NaN if the loader is empty.
    t_r = float('nan')
    model.train()
    train_dloader.sampler.set_epoch(epoch)

    for batch_idx, data in enumerate(train_dloader):
        inputs, targets = _flatten_batch(data)

        out = model(inputs)
        model_acc, _, _ = compute_acc_without_pad(out, targets.unsqueeze(1))

        # Logits and targets must live on the same device for the loss.
        loss = criterion(out, targets.long())  # ce

        if batch_idx % 10 == 0:
            print(f'Batch {batch_idx} => loss {loss}')
        optim.zero_grad()
        loss.backward()
        acc.append(model_acc)
        # Clip the gradient norm to 5.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optim.step()
        t_r = loss.item()

    train_acc.append(np.mean(acc))

    # validation
    acc = []
    model.eval()
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for data in dev_dloader:
            inputs, targets = _flatten_batch(data)
            out = model(inputs)
            model_acc, _, _ = compute_acc_without_pad(out, targets.unsqueeze(1))
            acc.append(model_acc)
    valid_acc.append(np.mean(acc))
    print(f"epoch: {epoch}, train acc: {train_acc[-1]:.3f}, dev acc: {valid_acc[-1]:.3f}, loss:{t_r:.3f}")
    # One checkpoint per epoch: model and optimizer state.
    torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
        'optimizer': optim.state_dict()},
        f'{log_dir}/checkpoint_{epoch}.pth')
                    

Batch 0 => loss 0.7164076566696167
Batch 10 => loss 0.2961089015007019
Batch 20 => loss 0.23858827352523804
Batch 30 => loss 0.2541408836841583
epoch: 0, train acc: 83.970, dev acc: 89.400, loss:0.269
Batch 0 => loss 0.2525580823421478
Batch 10 => loss 0.2328510284423828
Batch 20 => loss 0.21473461389541626
Batch 30 => loss 0.2372913658618927
epoch: 1, train acc: 89.788, dev acc: 90.200, loss:0.249
Batch 0 => loss 0.2395000457763672
Batch 10 => loss 0.22123488783836365
Batch 20 => loss 0.2099178582429886
Batch 30 => loss 0.23031498491764069
epoch: 2, train acc: 90.182, dev acc: 90.600, loss:0.240
Batch 0 => loss 0.23320896923542023
Batch 10 => loss 0.21519312262535095
Batch 20 => loss 0.2079838365316391
Batch 30 => loss 0.22592659294605255
epoch: 3, train acc: 90.485, dev acc: 91.000, loss:0.234
Batch 0 => loss 0.2285228967666626
Batch 10 => loss 0.21124285459518433
Batch 20 => loss 0.20663806796073914
Batch 30 => loss 0.2228812873363495
epoch: 4, train acc: 90.697, dev acc: 91.000, lo