The task of leraning semantic audio embeddings feel daunting - maybe we can help our model by pretraining the encoder to learn to distinguish words from audio?

This is providing a crutch to our model - it no longer will be unsupervised in the sense that we will levarge word labels for the pretraining. It might nonetheless be very useful to do as we work towards a fully working end to end unsupervised model.

In [2]:
from fastai.vision.all import *
from fastai.learner import *
from fastai.data.all import *
from fastai.callback.tracker import SaveModelCallback
import pandas as pd
import matplotlib.pyplot as plt
from pathlib2 import Path
import numpy as np
import random
from torch.nn import MSELoss

In [3]:
%%time

df = pd.read_csv('data/examples.csv')
df.shape

CPU times: user 28.4 s, sys: 2.86 s, total: 31.2 s
Wall time: 32.5 s


(17937758, 9)

In [21]:
vocab = list(df.target_word.unique())

In [8]:
%%time
fn2features = pd.read_pickle('data/fn2feature.pkl')

CPU times: user 18.5 s, sys: 13.9 s, total: 32.4 s
Wall time: 2min 24s


In [9]:
dataset_mean = -5
dataset_std = 15

def normalize_data(ary):
    return (ary - dataset_mean) / dataset_std

In [10]:
def empty_list(): return list()

In [11]:
word2row_idxs = pd.read_pickle('data/word2row_idxs.pkl')

In [22]:
def prepare_features(fn, pad_to=291, pad_left=False):
    ary = fn2features[fn][:pad_to]
    example = np.zeros((pad_to, 13))
    if pad_left:
        example[-ary.shape[0]:, :] = ary
    else: example[:ary.shape[0], :] = ary
    return example.astype(np.float32)

In [24]:
class Dataset():
    def __init__(self, df):
        self.df = df
    def __len__(self):
        return self.df.shape[0]
    def __getitem__(self, idx):
        target_fn = df.target_fn[idx]
        x = normalize_data(prepare_features(target_fn, pad_left=True))
        return x, vocab.index(df.target_word[idx])

In [25]:
train_examples = df[df.set_name.isin(['train-clean-360', 'train-clean-100', 'dev-clean'])]
valid_examples = df[df.set_name == 'test-clean']

In [29]:
train_ds = Dataset(train_examples)
valid_ds = Dataset(valid_examples)

In [30]:
len(train_ds), len(valid_ds)

(17743170, 194588)

In [31]:
BS = 2048
LR = 1e-3
NUM_WORKERS = 8

train_dl = DataLoader(train_ds, BS, NUM_WORKERS, shuffle=True)
valid_dl = DataLoader(valid_ds, BS, NUM_WORKERS)

dls = DataLoaders(train_dl, valid_dl)

In [33]:
# bidirectional encoder, 1 layer, concatenate hidden state
class Model(Module):
    def __init__(self, hidden_size=25, num_layers_encoder=3):
        self.return_embeddings = False
        self.num_layers_encoder = num_layers_encoder
        self.hidden_size = hidden_size
        
        self.encoder= nn.LSTM(
            input_size=13,
            hidden_size=hidden_size,
            num_layers=self.num_layers_encoder,
            batch_first=True,
            dropout=0,
            bidirectional=True
        )
        self.classifier = nn.Linear(2*hidden_size, len(vocab))
            
    def forward(self, x):
        _, (embeddings, _) = self.encoder(x)
        embeddings = torch.cat((embeddings[-1], embeddings[-2]), 1)
        return self.classifier(embeddings)

In [34]:
learn = Learner(dls.cuda(), Model().cuda(), loss_func=CrossEntropyLossFlat(), lr=1e-3, opt_func=Adam, metrics=[accuracy])

In [None]:
learn.fit(10, lr=1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,2.345063,2.355478,0.554258,21:41
