In [None]:
from fastai.vision.all import *
from fastai.learner import *
from fastai.data.all import *
from fastai.callback.tracker import SaveModelCallback
import pandas as pd
import matplotlib.pyplot as plt
from pathlib2 import Path
import numpy as np
import random
from torch.nn import MSELoss

In [None]:
%%time

# Load the table of examples; later cells read its set_name, source_fn,
# target_fn and source_word columns.
df = pd.read_csv('data/examples.csv')
df.shape

While attempting to train the model, it turned out that reading and unpickling this many individual files is very computationally expensive.

But the examples in their MFCC representations are very small. Let's read them all into memory before training.

In [None]:
# uniq_fns = np.unique(df.target_fn.values)

In [None]:
# %%time

# fn2feature = {}
# for fn in uniq_fns:
#     ary = pd.read_pickle(f'data/examples/{fn}.pkl')
#     fn2feature[fn] = ary

# pd.to_pickle(fn2feature, 'data/fn2feature.pkl')

In [None]:
%%time
# Load the filename -> feature-array cache written by the commented-out cell above.
# NOTE(review): unpickling can execute arbitrary code from the file; only load
# a cache that you produced yourself.
fn2features = pd.read_pickle('data/fn2feature.pkl')

I suspect that reading the data from a file saved with numpy (`numpy.save`) is much less computationally expensive than unpickling it, but I might be wrong. Either way, at ~4 million unique utterances, the dataset is small enough to fit comfortably within the memory of a GCP instance (~53 GB of RAM used during training).

This might not be ideal for experimentation on home rigs. Saving the data using `numpy.save` and evaluating performance would definitely be a very interesting and useful exercise.

In [None]:
# Peek at the first rows to check that the columns loaded as expected.
df.head()

In [None]:
# 'test-clean' is held out for validation; the three subsets listed below are
# used for training.
train_examples = df.loc[df['set_name'].isin(['train-clean-360', 'train-clean-100', 'dev-clean'])]
valid_examples = df.loc[df['set_name'].eq('test-clean')]

In [None]:
# (rows, columns) of each split. `DataFrame.size` would report rows * columns,
# which overstates the number of examples.
train_examples.shape, valid_examples.shape

In [None]:
%%time

unique_fns = df.source_fn.unique()
np.random.shuffle(unique_fns)
lengths = []
for i, features in enumerate(fn2features.values()):
    lengths.append(features.shape[0])

In [None]:
# Longest feature array (in rows); this is also the default padding length
# used by prepare_features.
max(lengths)

In [None]:
# Average number of rows per feature array.
np.mean(lengths)

In [None]:
# `features` is whichever array the length-collecting loop above visited last
# (the loop variable stays in the notebook namespace); it is shown transposed.
plt.imshow(features.T)

In [None]:
dataset_mean = -5
dataset_std = 15

def normalize_data(ary):
    """Shift `ary` by `dataset_mean` and scale it by `dataset_std`.

    Works on anything that supports subtraction and division with a scalar
    (plain numbers, numpy arrays); the result has the same shape as the input.
    """
    centered = ary - dataset_mean
    return centered / dataset_std

In [None]:
def prepare_features(fn, pad_to=max(lengths), pad_left=False, n_features=13):
    """Return the cached features for `fn`, zero-padded to a fixed number of rows.

    Parameters
    ----------
    fn : key into the module-level `fn2features` mapping.
    pad_to : number of rows of the returned array. The default is the largest
        value in `lengths` at the moment this function is defined.
    pad_left : when True the data occupies the last rows (zeros come first);
        otherwise the data occupies the first rows (zeros come last).
    n_features : number of columns of the feature arrays (13 by default).

    Returns
    -------
    A float32 array of shape (pad_to, n_features).

    Raises
    ------
    ValueError : if the stored array has more than `pad_to` rows.
    """
    ary = fn2features[fn]
    n_rows = ary.shape[0]
    if n_rows > pad_to:
        raise ValueError(f'features for {fn!r} have {n_rows} rows, more than pad_to={pad_to}')

    # Allocate in the output dtype directly instead of converting a float64 copy.
    example = np.zeros((pad_to, n_features), dtype=np.float32)
    if pad_left:
        # Use an explicit start row: `example[-n_rows:]` would select the whole
        # array when n_rows == 0 and make the assignment fail.
        example[pad_to - n_rows:, :] = ary
    else:
        example[:n_rows, :] = ary
    return example

In [None]:
def _source_features(row):
    """Normalized features of the row's source utterance, padded on the left."""
    return normalize_data(prepare_features(row.source_fn, pad_left=True))

def _target_features(row):
    """Normalized features of the row's target utterance, padded on the right."""
    return normalize_data(prepare_features(row.target_fn))

# Every item is (source, target, target). With n_inp=2 the first two elements
# are treated as inputs and the remaining one as the target.
dss = Datasets(
    df,
    [_source_features, _target_features, _target_features],
    n_inp=2,
    splits=[train_examples.index, valid_examples.index],
)

In [None]:
BS = 2048
LR = 1e-3
NUM_WORKERS = 10

# Only the training loader shuffles its examples.
train_dl = DataLoader(dss.train, bs=BS, num_workers=NUM_WORKERS, shuffle=True)
valid_dl = DataLoader(dss.valid, bs=BS, num_workers=NUM_WORKERS)

dls = DataLoaders(train_dl, valid_dl)

In [None]:
# Training failed with the following error:
#
#   DataLoader worker (pid 2073) is killed by signal: Bus error. It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
#
# Workaround taken from https://github.com/pytorch/pytorch/issues/5040:
# remount /dev/shm as a 50G tmpfs. This needs sudo and changes the host's mounts.
!sudo umount /dev/shm/ && sudo mount -t tmpfs -o rw,nosuid,nodev,noexec,relatime,size=50G shm /dev/shm

In [None]:
# Probability of feeding the ground-truth frame (instead of the model's own
# prediction) to the decoder at each step.
teacher_forcing_ratio = 0

class Model(Module):
    """Sequence-to-sequence model: a bidirectional LSTM encoder and an LSTM decoder.

    The encoder's final hidden state (forward and backward directions
    concatenated) is the utterance embedding. The decoder is initialised with
    that embedding and also receives it, concatenated to the previous frame,
    at every step.
    """

    def __init__(self, hidden_size=50):
        self.encoder = nn.LSTM(
            input_size=13,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
            dropout=0,
            bidirectional=True
        )
        # The decoder state is 2*hidden_size wide so the concatenated encoder
        # state can be used as its initial hidden state.
        self.decoder = nn.LSTM(
            input_size=2*hidden_size+13,
            hidden_size=2*hidden_size,
            num_layers=1,
            batch_first=True,
            dropout=0,
            bidirectional=False
        )
        self.lin = nn.Linear(2*hidden_size, 13)

    def forward(self, source_features, target_features):
        """Predict the target frames from the source utterance.

        source_features : (batch, source_len, 13) tensor fed to the encoder.
        target_features : (batch, target_len, 13) tensor; its first frame seeds
            the decoder and the remaining frames are only read when teacher
            forcing triggers.

        Returns a tensor shaped like `target_features`: frame 0 is copied from
        the target, frames 1.. are the decoder's predictions.
        """
        _, (h_n, _) = self.encoder(source_features)
        # h_n is (num_directions, batch, hidden). The batch axis must come first
        # before flattening; a plain view(batch, -1) would mix the states of
        # different examples instead of joining forward/backward per example.
        embeddings_dec = h_n.transpose(0, 1).reshape(h_n.shape[1], -1)

        outputs = torch.zeros_like(target_features)
        dec_input = target_features[:, :1, :]
        # squeeze(1) drops only the time axis, so a batch of size 1 keeps its batch axis.
        outputs[:, 0, :] = dec_input.squeeze(1)

        context = embeddings_dec.unsqueeze(1)  # (batch, 1, 2*hidden), same at every step
        hidden = embeddings_dec.unsqueeze(0)
        cell = torch.zeros_like(hidden)
        for t in range(1, target_features.shape[1]):
            step_input = torch.cat((dec_input, context), 2)
            x, (hidden, cell) = self.decoder(step_input, (hidden, cell))
            # NOTE(review): this confines predictions to (0, 25) while the
            # targets are produced by normalize_data - confirm the ranges match.
            dec_input = torch.sigmoid(self.lin(x)) * 25
            outputs[:, t, :] = dec_input.squeeze(1)

            if random.random() < teacher_forcing_ratio:
                dec_input = target_features[:, t, :].unsqueeze(1)
        return outputs

In [None]:
# Earlier experiment, kept for reference: an MSE that ignores positions where
# the target is zero.
# mse_loss = MSELoss()
# def modified_MSE(preds, targs):
#     mask = targs == 0
#     preds[mask] = 0
#     return mse_loss(preds, targs)

# Plain MSE between predicted and target frames; the learning rate comes from
# the LR constant (1e-3) defined with the other hyperparameters.
learn = Learner(
    dls.cuda(),
    Model().cuda(),
    loss_func=MSELoss(),
    lr=LR,
)

In [None]:
# Train for 4 epochs, passing a SaveModelCallback configured with
# fname='1e-3_Adam' and every_epoch=True.
learn.fit(4, cbs=SaveModelCallback(fname='1e-3_Adam', every_epoch=True))

## Calculate embedding for each unique word in the dataset

In [None]:
# One row per distinct source utterance file.
df_unique_utterances = df.drop_duplicates(subset=['source_fn'])

In [None]:
# The encoder was trained on inputs passed through normalize_data, so the
# utterances embedded here must be normalized the same way. The second element
# is a dummy 0 that is never read.
dss_all_utterances = Datasets(
    df_unique_utterances,
    [lambda row: normalize_data(prepare_features(row.source_fn, pad_left=True)), lambda row: 0],
    n_inp=2
)

In [None]:
# No shuffling is requested for this loader.
all_dl = DataLoader(dss_all_utterances, bs=BS, num_workers=NUM_WORKERS)

In [None]:
%%time

all_embeddings = []
with torch.no_grad():
    learn.model.train = False
    for batch in all_dl:
        _, (embeddings, _) = learn.model.encoder(batch[0].cuda())
        all_embeddings.append(embeddings.squeeze(0).detach().cpu())

In [None]:
# Join the per-batch tensors along the first axis. This rebinds the name from a
# list to a tensor, so the cell cannot be run twice in a row.
all_embeddings = torch.cat(all_embeddings, dim=0)

In [None]:
# Replace the index with 0..n-1 (in place) so the index labels collected in the
# next cell can be used to index rows of all_embeddings.
# NOTE(review): this relies on all_dl yielding rows in frame order - confirm.
df_unique_utterances.reset_index(drop=True, inplace=True)

In [None]:
%%time

word2row_idxs = defaultdict(lambda: list())

for idx, row in df_unique_utterances.iterrows():
    word2row_idxs[row.source_word].append(idx)
    
word2embedding = {}

for k, v in word2row_idxs.items():
    word2embedding[k] = all_embeddings[np.array(v)].mean(0)

In [None]:
# Keep only entries whose key is not NaN and whose vector contains no NaN,
# converting the kept vectors to numpy; count everything that is dropped.
word2embedding_without_nans = {}
nans_encountered = 0
for word, embedding in word2embedding.items():
    vector = embedding.numpy()
    # `word != word` is true only for a NaN key.
    if word != word or np.isnan(vector).any():
        nans_encountered += 1
        continue
    word2embedding_without_nans[word] = vector

print(f'Encountered rows with nan values: {nans_encountered}')

## Evaluating embeddings using [word-embeddings-benchmarks](https://github.com/kudkudak/word-embeddings-benchmarks)

In [None]:
import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.evaluate import evaluate_similarity
from web.embedding import Embedding, Vocabulary
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [None]:
# Word-similarity benchmarks, keyed by the name printed with each score.
tasks = dict(
    MEN=fetch_MEN(),
    WS353=fetch_WS353(),
    SIMLEX999=fetch_SimLex999(),
)

In [None]:
# Our embeddings: lower-cased words paired, in the same order, with their vectors.
our_words = [word.lower() for word in word2embedding_without_nans.keys()]
our_vectors = np.array(list(word2embedding_without_nans.values()))
our_embeddings = Embedding(Vocabulary(our_words), our_vectors)

# Reference embeddings: the pretrained 50-dimensional speech2vec vectors.
speech2vec = KeyedVectors.load_word2vec_format(
    '../speech2vec-pretrained-vectors/speech2vec/50.vec',
    binary=False,
)
speech2vec_words = list(speech2vec.vocab.keys())
speech2vec_embeddings = Embedding(Vocabulary(speech2vec_words), speech2vec.vectors)

In [None]:
# Baseline: score the pretrained speech2vec vectors on every benchmark.
for name, data in tasks.items():
    score = evaluate_similarity(speech2vec_embeddings, data.X, data.y)
    print("Spearman correlation of scores on {} {}".format(name, score))

In [None]:
# Score the embeddings trained in this notebook on the same benchmarks.
for name, data in tasks.items():
    score = evaluate_similarity(our_embeddings, data.X, data.y)
    print("Spearman correlation of scores on {} {}".format(name, score))