In [1]:
# `os` is imported here because this cell runs before the main import cell;
# without it a fresh kernel fails with NameError on `os.chdir`.
import os

# Project root that all relative paths below are resolved against.
# Override with the VAD_PROJECT_DIR environment variable on other machines;
# the default keeps the original location.
project_dir = os.environ.get("VAD_PROJECT_DIR", "/Users/joannarownicka/software/vad")
os.chdir(project_dir)

In [2]:
import os
import random
from pathlib import Path

import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from lhotse.cut import CutSet
from lhotse.dataset.sampling import SingleCutSampler, BucketingSampler
from lhotse.dataset.vad import VadDataset

from preprocessing import prepare_vad_dataset
from models.dnn import DNN
from models.accuracy import compute_acc_without_pad

In [3]:
# Data locations; these are relative paths, so they resolve against the
# current working directory.
root_dir = Path('evaluation/data')
corpus_dir = root_dir.joinpath('vad_data/')      # input corpus
output_dir = root_dir.joinpath('vad_data_nb/')   # manifests / cut sets

In [4]:
# Load the cut manifest from the output directory and print its summary
# statistics (cut count, durations).
cuts_manifest_path = output_dir / 'cuts_80_data_augment.json.gz'
cuts = CutSet.from_json(cuts_manifest_path)
cuts.describe()

Cuts count: 4785
Total duration (hours): 16.5
Speech duration (hours): 13.3 (80.6%)
***
Duration statistics (seconds):
mean    12.4
std      4.0
min      1.3
25%     11.0
50%     13.7
75%     15.2
max     19.1
dtype: float64


In [5]:
# Shuffle the cuts with a fixed seed, then split the supervision ids into
# train/dev/eval (80/10/10) and save the eval cuts for later evaluation.
SPLIT_SEED = 0
train_ratio = 0.8

# A dedicated, seeded RNG makes the cut order reproducible across kernel
# restarts (the original call was unseeded despite the stated intent).
# NOTE: `cuts` is rebound, so re-running this cell shuffles the already
# shuffled set again; reload `cuts` first to get the identical order.
cuts = cuts.shuffle(rng=random.Random(SPLIT_SEED))
vad_manifests = prepare_vad_dataset.prepare_vad_dataset(corpus_dir, output_dir)

# The split is taken over supervision ids in manifest order.
supervision_ids = [seg.to_dict()["id"] for seg in vad_manifests["supervisions"]]

num_total = len(supervision_ids)
stop_train_idx = int(np.floor(num_total * train_ratio))
# The remainder after the train portion is halved between dev and eval.
stop_dev_idx = int((num_total - stop_train_idx) // 2 + stop_train_idx)

train_ids = supervision_ids[:stop_train_idx]
dev_ids = supervision_ids[stop_train_idx:stop_dev_idx]
eval_ids = supervision_ids[stop_dev_idx:]

# Sanity checks: every split is non-empty, together they cover all ids,
# and no id leaks between splits.
assert train_ids and dev_ids and eval_ids, "every split must be non-empty"
assert len(train_ids) + len(dev_ids) + len(eval_ids) == num_total
assert set(train_ids).isdisjoint(dev_ids), "train/dev overlap"
assert set(train_ids).isdisjoint(eval_ids), "train/eval overlap"
assert set(dev_ids).isdisjoint(eval_ids), "dev/eval overlap"

cuts_train = cuts.subset(supervision_ids=train_ids)
cuts_dev = cuts.subset(supervision_ids=dev_ids)
cuts_eval = cuts.subset(supervision_ids=eval_ids)

# Persist the held-out cuts.
cuts_eval.to_json(output_dir / 'cuts_80_eval_data_augment.json.gz')


In [6]:
# One dataset object is shared by both loaders; the samplers decide which
# cuts make up each batch.
vad_dataset = VadDataset()

# Both samplers use the same settings: no shuffling, max_duration=300.
sampler_kwargs = dict(shuffle=False, max_duration=300)
train_sampler = SingleCutSampler(cuts_train, **sampler_kwargs)
dev_sampler = SingleCutSampler(cuts_dev, **sampler_kwargs)

# batch_size=None disables the DataLoader's own batching, so each item
# yielded by the sampler is passed to the dataset as-is.
train_dloader, dev_dloader = (
    DataLoader(vad_dataset, sampler=split_sampler, batch_size=None)
    for split_sampler in (train_sampler, dev_sampler)
)

# Fetch one dev batch up front; later cells read the feature size from it.
cut_ids = next(iter(dev_sampler))
sample = vad_dataset[cut_ids]

In [7]:
# Train a frame-level DNN voice-activity classifier with SGD and save a
# checkpoint after every epoch.

# Hyperparameters.
learning_rate = 1e-2
weight_decay = 1e-4
num_epochs = 15
max_grad_norm = 5.0

model_id = "dnn_ce_fbank80_ignoreindex_lr2_data_augment"
# Feature dimension is taken from the probe batch fetched earlier.
input_size = sample['inputs'][0].shape[1]
log_dir = Path('storage/models') / model_id
# Creates missing parents and is a no-op when the directory already exists.
log_dir.mkdir(parents=True, exist_ok=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fix the torch RNG so weight initialisation is reproducible across runs.
torch.manual_seed(0)
model = DNN(input_size=input_size, hidden_size=256, num_classes=2).to(device)

# Targets equal to -100 are excluded from the loss
# (presumably the label used for padded frames; confirm against VadDataset).
criterion = nn.CrossEntropyLoss(ignore_index=-100)

optim = torch.optim.SGD(model.parameters(), lr=learning_rate,
                        weight_decay=weight_decay)


def _batch_to_frames(data):
    """Flatten one batch into per-frame inputs and targets on `device`.

    Returns:
        inputs: tensor reshaped to (-1, input_size).
        targets: 1-D tensor with one label per frame.
    """
    frame_inputs = data["inputs"].reshape(-1, input_size).to(device)
    frame_targets = data["is_voice"].reshape(-1).to(device)
    return frame_inputs, frame_targets


train_acc = []
valid_acc = []
# Last training-batch loss; NaN until the first batch has been processed so
# the epoch summary cannot fail on an empty loader.
t_r = float('nan')
for epoch in range(num_epochs):
    # training
    acc = []
    model.train()
    train_dloader.sampler.set_epoch(epoch)

    for batch_idx, data in enumerate(train_dloader):
        inputs, targets = _batch_to_frames(data)

        out = model(inputs)
        model_acc, _, _ = compute_acc_without_pad(out, targets.unsqueeze(1))

        # Logits and targets are on the same device here; the original passed
        # CPU targets, which fails whenever the model runs on CUDA.
        loss = criterion(out, targets.long())  # ce

        if batch_idx % 10 == 0:
            print(f'Batch {batch_idx} => loss {loss}')
        optim.zero_grad()
        loss.backward()
        acc.append(model_acc)
        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optim.step()
        t_r = loss.item()

    train_acc.append(np.mean(acc))

    # validation
    acc = []
    model.eval()
    # No gradients are needed for evaluation; skipping autograd saves memory.
    with torch.no_grad():
        for data in dev_dloader:
            inputs, targets = _batch_to_frames(data)
            out = model(inputs)
            model_acc, _, _ = compute_acc_without_pad(out, targets.unsqueeze(1))
            acc.append(model_acc)
    valid_acc.append(np.mean(acc))
    # `loss` in this summary is the loss of the last training batch only.
    print(f"epoch: {epoch}, train acc: {train_acc[-1]:.3f}, dev acc: {valid_acc[-1]:.3f}, loss:{t_r:.3f}")
    torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
        'optimizer': optim.state_dict()},
        f'{log_dir}/checkpoint_{epoch}.pth')
                    

Batch 0 => loss 0.6250870227813721
Batch 10 => loss 0.27857843041419983
Batch 20 => loss 0.2462901622056961
Batch 30 => loss 0.20429855585098267
epoch: 0, train acc: 89.303, dev acc: 92.000, loss:0.197
Batch 0 => loss 0.24231553077697754
Batch 10 => loss 0.2113974392414093
Batch 20 => loss 0.22007986903190613
Batch 30 => loss 0.1873924285173416
epoch: 1, train acc: 92.545, dev acc: 92.250, loss:0.177
Batch 0 => loss 0.23180542886257172
Batch 10 => loss 0.20052513480186462
Batch 20 => loss 0.2121405452489853
Batch 30 => loss 0.1819898933172226
epoch: 2, train acc: 92.727, dev acc: 92.250, loss:0.170
Batch 0 => loss 0.22829879820346832
Batch 10 => loss 0.19605016708374023
Batch 20 => loss 0.20944048464298248
Batch 30 => loss 0.17929421365261078
epoch: 3, train acc: 92.758, dev acc: 92.500, loss:0.166
Batch 0 => loss 0.22661226987838745
Batch 10 => loss 0.19352906942367554
Batch 20 => loss 0.2072865068912506
Batch 30 => loss 0.17755171656608582
epoch: 4, train acc: 92.879, dev acc: 92.500