In [2]:
project_dir = "/Users/joannarownicka/software/vad"
os.chdir(project_dir)

In [3]:
import glob
import sox
import json
import numpy as np
import os
from pathlib import Path

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from lhotse import LilcomFilesWriter

from lhotse.features import Fbank, FeatureSetBuilder
from lhotse.cut import CutSet, SupervisionSet
from lhotse.dataset.sampling import SingleCutSampler, BucketingSampler
from lhotse.dataset.vad import VadDataset

from torch.utils.data import DataLoader

from preprocessing import prepare_vad_dataset
from models.dnn import DNN
from models.binary_dnn import binaryClassification

In [4]:
root_dir = Path('evaluation/data')
corpus_dir = root_dir / 'vad_data/'
output_dir = root_dir / 'vad_data_nb/'

In [5]:
cuts = CutSet.from_json(output_dir / 'cuts.json.gz')
cuts.describe()

Cuts count: 957
Total duration (hours): 3.3
Speech duration (hours): 2.6 (80.6%)
***
Duration statistics (seconds):
mean    12.3
std      3.9
min      1.4
25%     11.1
50%     13.9
75%     15.1
max     17.2
dtype: float64


In [6]:
# Shuffle data but keep seed fixed, split into 80/10/10
#cuts_train, cuts_dev_eval = train_test_split(cuts, train_size=0.8, random_state=0)
#cuts_dev, cuts_eval = train_test_split(cuts_dev_eval, train_size=0.5, random_state=0)

vad_manifests = prepare_vad_dataset.prepare_vad_dataset(corpus_dir, output_dir)

train_ratio = 0.8

num_total = len(vad_manifests["supervisions"])
stop_train_idx = int(np.floor(num_total * train_ratio))
stop_dev_idx = int((num_total - stop_train_idx) // 2 + stop_train_idx)

train_ids, dev_ids, eval_ids = [], [], []
counter = 0
for sup_seg in vad_manifests["supervisions"]:
    id = sup_seg.to_dict()["id"]
    if counter < stop_train_idx:
        train_ids.append(id)
    elif counter < stop_dev_idx:
        dev_ids.append(id)
    else:
        eval_ids.append(id)
    counter += 1

assert train_ids[-1] != dev_ids[0]
assert dev_ids[-1] != eval_ids[0]

cuts_train = cuts.subset(supervision_ids=train_ids)
cuts_dev = cuts.subset(supervision_ids=dev_ids)
cuts_eval = cuts.subset(supervision_ids=eval_ids)

cuts_train.describe()
cuts_dev.describe()
cuts_eval.describe()


Cuts count: 766
Total duration (hours): 2.6
Speech duration (hours): 2.1 (80.6%)
***
Duration statistics (seconds):
mean    12.4
std      3.9
min      1.6
25%     11.2
50%     13.9
75%     15.1
max     16.9
dtype: float64
Cuts count: 95
Total duration (hours): 0.3
Speech duration (hours): 0.3 (79.9%)
***
Duration statistics (seconds):
mean    12.4
std      4.1
min      1.4
25%     11.3
50%     13.9
75%     15.1
max     17.2
dtype: float64
Cuts count: 98
Total duration (hours): 0.3
Speech duration (hours): 0.3 (79.4%)
***
Duration statistics (seconds):
mean    12.2
std      4.0
min      2.1
25%      9.5
50%     13.8
75%     15.1
max     16.4
dtype: float64


In [22]:
vad_dataset = VadDataset()

train_sampler = SingleCutSampler(cuts_train.cut_into_windows(5.0, keep_excessive_supervisions=True), shuffle=False, max_duration=300)
dev_sampler = SingleCutSampler(cuts_dev.cut_into_windows(5.0, keep_excessive_supervisions=True), shuffle=False, max_duration=300)
eval_sampler = SingleCutSampler(cuts_eval.cut_into_windows(5.0, keep_excessive_supervisions=True), shuffle=False, max_duration=300)

train_dloader = DataLoader(vad_dataset, sampler=train_sampler, batch_size=None)
dev_dloader = DataLoader(vad_dataset, sampler=dev_sampler, batch_size=None)
eval_dloader = DataLoader(vad_dataset, sampler=eval_sampler, batch_size=None)

cut_ids = next(iter(dev_sampler))
sample = vad_dataset[cut_ids]

In [32]:
lr = 1e-2
wd = 1e-4

model_id = "dnn_ce"
input_size = sample['inputs'][0].shape[1]
log_dir = Path('storage/models') / model_id

if not os.path.exists(log_dir):
    os.makedirs(log_dir)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = DNN(input_size=input_size, hidden_size=256, num_classes=2).to(device)
#model = binaryClassification().to(device)

#criterion = nn.BCEWithLogitsLoss()
criterion = nn.CrossEntropyLoss()

#optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
optim = torch.optim.SGD(model.parameters(), lr=lr,
                        momentum=0.9, dampening=0.9,
                        weight_decay=wd)

def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)

    return acc

train_acc = []
valid_acc = []
for epoch in range(10):
  # training
    acc = []
    model.train()
    train_dloader.sampler.set_epoch(epoch)

    for batch_idx, data in enumerate(train_dloader):

        inputs = data["inputs"].reshape(-1,input_size)
        targets = data["is_voice"].reshape(-1,1).view(-1)

        out = model(inputs.to(device))

        #loss = criterion(out, targets.unsqueeze(1).to(device)) #bce
        loss = criterion(out, targets.long()) #ce
        model_acc = binary_acc(out, targets.unsqueeze(1).to(device))

        if batch_idx % 10 == 0:
            print(f'Batch {batch_idx} => loss {loss}')
        optim.zero_grad()
        loss.backward()
        acc.append(model_acc)
        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optim.step()
        t_r = loss.item()
        
    train_acc.append(np.mean(acc))
    # validation
    acc = []
    model.eval()
    for data in dev_dloader:
        inputs = data["inputs"].reshape(-1,input_size)
        targets = data["is_voice"].reshape(-1,1).view(-1)
        out = model(inputs.to(device))
        model_acc = binary_acc(out, targets.unsqueeze(1).to(device))
        acc.append(model_acc)
    valid_acc.append(np.mean(acc))
    print(f"epoch: {epoch}, train acc: {train_acc[-1]:.3f}, dev acc: {valid_acc[-1]:.3f}, loss:{t_r:.3f}")
    torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
        'optimizer': optim.state_dict()},
        '{}/checkpoint_{}.pth'.format(log_dir, epoch))
                    

IndexError: Target 1 is out of bounds.