In [17]:
import os
import sys
import random

import shutil
from pathlib import Path
from math import ceil
from pydub import AudioSegment

import numpy as np
import tensorflow as tf
import torch
from torch import nn
import torch.nn.functional as F
from tensorflow.python.keras import backend as K
from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from statistics import mean

In [19]:
import wandb
wandb.init(project="automatic-speaker-recognition")

Exception: The wandb backend process has shutdown

In [5]:
np.random.seed(1234)
random.seed(1234)

In [6]:
AUDIO_PATH = 'audio'
SOURCE_DIR = 'accents_features'
MODEL_PATH = 'model.pt'

TRAIN_SPLIT = 0.2

In [7]:
class ClassifierDataset(Dataset):
    """Load numpy files from directory structure where each numpy file represents
    the extracted features from the pre-trained model"""
    
    def __init__(self, directory):
        outputs = []
        labels = []

        speakers = [f for f in os.listdir(directory) if f != '.DS_Store']
        for i, speaker in enumerate(speakers):
            for clip in os.listdir(f'{directory}/{speaker}'):
                if 'npy' not in clip:
                    continue

                output = np.load(f'{directory}/{speaker}/{clip}')

                outputs.append(output)
                labels.append(i)

        self.outputs = np.array(outputs)
        self.labels = np.array(labels)

    def __len__(self):
        return len(self.outputs)

    def __getitem__(self, idx):
        return self.outputs[idx], self.labels[idx]

In [8]:
dataset_dir = f'{AUDIO_PATH}/{SOURCE_DIR}'
full_dataset = ClassifierDataset(dataset_dir)

classes = [f for f in os.listdir(dataset_dir) if f != '.DS_Store']

batch_size = 16

train_size = int(TRAIN_SPLIT * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [9]:
class Classifier(nn.Module):
    """Define a simple linear neural network

    Args:
        num_classes: the number of classes we are classifying

    """
    
    def __init__(self, num_classes):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(512, num_classes)
        
    def forward(self, x):
        x = self.fc1(x)
        x = F.softmax(x, dim=1)
        
        return x

In [14]:
def train_classifier(classifier_training_loader, classifier_validation_loader, num_classes, num_epochs=150, lr=0.003, use_checkpoint=False):
    classifier = Classifier(num_classes=num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(classifier.parameters(), lr=lr)
    initial_epoch_count = 0

    if use_checkpoint:
        print('INFO: Loading state from latest saved model')
        checkpoint = torch.load(MODEL_PATH)
        classifier.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        initial_epoch_count = checkpoint['epoch']
        print(f'INFO: Beginning from epoch {initial_epoch_count}')


    
    #weights = [subject_weight] * num_classes
    #weights[-1] = junk_weight
    #weights = torch.from_numpy(np.array(weights)).type(torch.FloatTensor)

    #criterion = nn.CrossEntropyLoss(weight=weights, reduction='mean')
    # disable weights when we aren't using junk

    wandb.watch(classifier)
    
    for epoch_num, epoch in enumerate(range(num_epochs)):
        
        wandb.log({'epoch': initial_epoch_count + epoch_num + 1})
        
        classifier.train()
        running_loss = 0.0
        for batch_index, (inputs, labels) in enumerate(classifier_training_loader):
            optimizer.zero_grad()
            outputs = classifier(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if batch_index % 120 == 119:
                msg = f'[{initial_epoch_count + epoch_num + 1}, {batch_index + 1}]: loss: {running_loss / 120}'
                print(msg)
                wandb.log({'train_loss': running_loss / 120})
                running_loss = 0.0

        classifier.eval()
        validation_loss = 0.0
        for batch_index, (inputs, labels) in enumerate(classifier_validation_loader):
            optimizer.zero_grad()
            outputs = classifier(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            validation_loss += loss.item()
            if batch_index % 120 == 119:
                msg = f'[{initial_epoch_count + epoch_num + 1}, {batch_index + 1}]: loss: {validation_loss / 120}'
                print(msg)
                wandb.log({'validation_loss': validation_loss / 120})

                validation_loss = 0.0

        torch.save({
            'epoch': initial_epoch_count + epoch_num,
            'model_state_dict': classifier.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()}, MODEL_PATH)
        torch.save(classifier.state_dict(), os.path.join(wandb.run.dir, 'model.pt'))

    return classifier

In [15]:
trained_classifier = train_classifier(train_loader, test_loader, num_classes=len(classes), num_epochs=10000, use_checkpoint=True)

INFO: Loading state from latest saved model
INFO: Beginning from epoch 123
[124, 120]: loss: 7.665716067949931
[124, 240]: loss: 7.6646699666976925
[124, 360]: loss: 7.6658719460169475
[124, 480]: loss: 7.667266277472178
[124, 120]: loss: 7.667253212134043
[124, 240]: loss: 7.664247266451517
[124, 360]: loss: 7.665185451507568
[124, 480]: loss: 7.6599707961082455
[124, 600]: loss: 7.6653106093406675
[124, 720]: loss: 7.660217805703481
[124, 840]: loss: 7.662793552875518
[124, 960]: loss: 7.663309629758199
[124, 1080]: loss: 7.66446715593338
[124, 1200]: loss: 7.666383163134257
[124, 1320]: loss: 7.663613386948904
[124, 1440]: loss: 7.665398426850637
[124, 1560]: loss: 7.660612801710765
[124, 1680]: loss: 7.6631582101186115
[124, 1800]: loss: 7.666545597712199
[124, 1920]: loss: 7.666243569056193
[125, 120]: loss: 7.663768955071768
[125, 240]: loss: 7.6641491214434305
[125, 360]: loss: 7.662774248917898
[125, 480]: loss: 7.6663167436917625
[125, 120]: loss: 7.666541357835134
[125, 240]:

KeyboardInterrupt: 

In [26]:
from sklearn.metrics import f1_score


def test_classifier(classifier, classifier_testing_loader, count, output_stats=False):
    class_correct = [0] * count
    class_total = [0] * count
    
    # used to calculate global f1
    all_labels = []
    all_predicted = []

    with torch.no_grad():
        for data in classifier_testing_loader:
            images, labels = data
            outputs = classifier(images)
            _, predicted = torch.max(outputs, 1)
            c = (predicted == labels).squeeze()
            
            all_labels += labels
            all_predicted += predicted

    f1 = f1_score(all_labels, all_predicted, average='weighted')

    if output_stats:
        print(f'f1: {f1}')
        
    return f1

In [68]:
# prev: f1: 0.8805042046315826
classifier = Classifier(num_classes=len(classes))
checkpoint = torch.load(MODEL_PATH)
classifier.load_state_dict(checkpoint['model_state_dict'])
classifier.eval()
f1 = test_classifier(classifier, test_loader, len(classes), output_stats=True)

f1: 0.9926611587401639
