In [1]:
from fastai.vision.all import *
from fastai.learner import *
from fastai.data.all import *
from fastai.callback.tracker import SaveModelCallback
import pandas as pd
import matplotlib.pyplot as plt
from pathlib2 import Path
import numpy as np
import random
from torch.nn import MSELoss

In [2]:
%%time

# Load the full table of (source word, target word) examples; the path is relative to
# the notebook's working directory.
df = pd.read_csv('data/examples_with_length_speech2vec_vocab.csv')
# Display (n_rows, n_columns) of the unfiltered table.
df.shape

CPU times: user 28.5 s, sys: 3.84 s, total: 32.3 s
Wall time: 48.9 s


(17937758, 12)

In [3]:
%%time
# Mapping that `prepare_features` indexes by file name (`fn2features[fn]`).
# NOTE(review): read_pickle unpickles arbitrary objects -- only load files you trust.
fn2features = pd.read_pickle('data/fn2feature.pkl')

CPU times: user 17.6 s, sys: 14.1 s, total: 31.7 s
Wall time: 2min 34s


In [4]:
# Keep only rows where both the source and the target length lie strictly between
# 25 and 70, then drop rows not flagged as being in the speech2vec vocabulary.
length_ok = (
    df.source_length.gt(25) & df.source_length.lt(70)
    & df.target_length.gt(25) & df.target_length.lt(70)
)
df = df[length_ok]
# Renumber rows 0..n-1 so later positional lookups by row index line up with the index.
df = df[df.in_speech2vec_vocab].reset_index(drop=True)

In [5]:
# Distinct source words. A word's position in this list is used as its class index
# (see `Dataset.__getitem__` and the classifier width in `Model`).
vocab = list(df.source_word.unique())

In [6]:
# Number of distinct source words (= number of output classes of the model).
len(vocab)

33830

In [7]:
# Peek at the first rows of the filtered table.
df.head()

Unnamed: 0,source_word,target_word,source_fn,target_fn,set_name,speaker_id,book_id,distance_from_target,audio_fpath,source_length,target_length,in_speech2vec_vocab
0,TIME,INTRODUCE,9b2072544ac6476e9808f483db57be83,2e9837191d104f93bdc794cdc8c3d43a,train-clean-360,7000,83696,2,data/LibriSpeech/train-clean-360/7000/83696/7000-83696-0000.flac,31,49,True
1,INTRODUCE,TIME,2e9837191d104f93bdc794cdc8c3d43a,9b2072544ac6476e9808f483db57be83,train-clean-360,7000,83696,2,data/LibriSpeech/train-clean-360/7000/83696/7000-83696-0000.flac,49,31,True
2,INTRODUCE,MYSELF,2e9837191d104f93bdc794cdc8c3d43a,4b23c979991b438abaa7ba2227dc93fa,train-clean-360,7000,83696,1,data/LibriSpeech/train-clean-360/7000/83696/7000-83696-0000.flac,49,65,True
3,MYSELF,INTRODUCE,4b23c979991b438abaa7ba2227dc93fa,2e9837191d104f93bdc794cdc8c3d43a,train-clean-360,7000,83696,1,data/LibriSpeech/train-clean-360/7000/83696/7000-83696-0000.flac,65,49,True
4,FINE,EVENING,fe0bdbeb478148b48f226ed00e4fd5d2,3f3b57b9ce2b4085b60e01c4f50b7d8c,train-clean-360,7000,83696,1,data/LibriSpeech/train-clean-360/7000/83696/7000-83696-0000.flac,33,54,True


In [8]:
# Fixed statistics used to standardize feature arrays.
dataset_mean = -3
dataset_std = 12

def normalize_data(ary):
    """Standardize `ary` using the module-level `dataset_mean` and `dataset_std`.

    Returns `(ary - dataset_mean) / dataset_std`; works on scalars and on anything
    supporting element-wise arithmetic (e.g. numpy arrays).
    """
    centered = ary - dataset_mean
    return centered / dataset_std

In [9]:
def empty_list():
    """Return a fresh empty list.

    Used as the `defaultdict` factory for `word2row_idxs`; presumably a named function
    (rather than a lambda) so that the mapping can be pickled -- confirm.
    """
    return []

In [10]:
%%time

word2row_idxs = defaultdict(empty_list)

for idx, row in df.iterrows():
    word2row_idxs[row.source_word].append(idx)
    
pd.to_pickle(word2row_idxs, 'data/word2row_idxs_speech2vec_vocab_subset.pkl')

CPU times: user 4min 2s, sys: 388 ms, total: 4min 3s
Wall time: 4min 3s


In [11]:
# Reload the word -> row-indices mapping written by the previous cell.
# NOTE(review): read_pickle unpickles arbitrary objects -- only load files you trust.
word2row_idxs = pd.read_pickle('data/word2row_idxs_speech2vec_vocab_subset.pkl')

In [12]:
def prepare_features(fn, pad_to=69, pad_left=False, n_features=13):
    """Return the features stored for `fn` as a fixed-size, zero-padded float32 array.

    Args:
        fn: key into the module-level `fn2features` mapping.
        pad_to: number of rows of the result; longer inputs are truncated to their
            first `pad_to` rows.
        pad_left: if True the zero padding goes in front of the data (data is
            right-aligned); otherwise the padding follows the data.
        n_features: number of columns of the result (13 by default).

    Returns:
        A new `np.float32` array of shape `(pad_to, n_features)`.
    """
    ary = fn2features[fn][:pad_to]
    n_frames = ary.shape[0]
    example = np.zeros((pad_to, n_features))
    if pad_left:
        # Use an explicit start offset: with zero frames, `example[-0:]` would select
        # the whole buffer and the assignment would fail to broadcast.
        example[pad_to - n_frames:, :] = ary
    else:
        example[:n_frames, :] = ary
    return example.astype(np.float32)

In [13]:
# Drop missing values from the vocabulary. `np.nan in vocab` only matches when the
# stored NaN is the very same object as `np.nan` (NaN never compares equal to itself),
# so test each entry with `pd.isna` instead. The list is updated in place.
vocab[:] = [word for word in vocab if not pd.isna(word)]

In [14]:
class Dataset():
    """Map-style dataset yielding `(features, target_class)` pairs.

    The item list is the module-level `vocab` repeated `n` times. For the word at a
    given position, the input features always come from the first row of `df` listed
    for that word in `word2row_idxs`; the target is the class index (position in
    `vocab`) of the `target_word` of a randomly chosen row for that word.
    """
    def __init__(self, n):
        self.vocab = vocab * n
        # word -> position in `vocab`, so target lookup is O(1) instead of a linear
        # `vocab.index` scan per item. `setdefault` keeps the first position, which is
        # what `list.index` returns.
        self.word2idx = {}
        for position, word in enumerate(vocab):
            self.word2idx.setdefault(word, position)
    def __len__(self):
        return len(self.vocab)
    def __getitem__(self, idx):
        source_fn_row_idx = word2row_idxs[self.vocab[idx]][0]
        row_idx = np.random.choice(word2row_idxs[self.vocab[idx]])
        source_fn = df.source_fn[source_fn_row_idx]
        x = normalize_data(prepare_features(source_fn, pad_left=True))
        target_word = df.target_word[row_idx]
        try:
            return x, self.word2idx[target_word]
        except KeyError:
            # Same exception type `vocab.index` raised for a word outside the vocabulary.
            raise ValueError(f'{target_word!r} is not in vocab') from None

In [15]:
# Training hyper-parameters.
BS = 2048
LR = 1e-3
NUM_WORKERS = 6

# One pass over the training loader covers the vocabulary 10*270 times; the validation
# loader covers it 30 times and is not shuffled.
train_dl = DataLoader(Dataset(10*270), bs=BS, num_workers=NUM_WORKERS, shuffle=True)
valid_dl = DataLoader(Dataset(30), bs=BS, num_workers=NUM_WORKERS)

dls = DataLoaders(train_dl, valid_dl)

In [16]:
# Training initially crashed with the following error:
#
#   DataLoader worker (pid 2073) is killed by signal: Bus error. It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
#
# Workaround taken from https://github.com/pytorch/pytorch/issues/5040:
# remount /dev/shm as a 50G tmpfs. NOTE(review): needs sudo and changes a system-wide mount.
!sudo umount /dev/shm/ && sudo mount -t tmpfs -o rw,nosuid,nodev,noexec,relatime,size=50G shm /dev/shm

In [17]:
class Model(Module):
    """Bidirectional LSTM encoder followed by a linear classifier over `vocab`.

    The encoder consumes batch-first sequences of 13-dimensional frames. The final
    hidden states of the last two LSTM directions/layers entries (`h_n[-1]` and
    `h_n[-2]`) are concatenated into a `2 * hidden_size` embedding, which is either
    returned directly (when `return_embeddings` is True) or fed to the classifier.
    """
    def __init__(self, hidden_size=25, num_layers_encoder=1):
        # Toggle to make `forward` return the embedding instead of class logits.
        self.return_embeddings = False
        self.num_layers_encoder = num_layers_encoder
        self.hidden_size = hidden_size

        encoder_config = dict(
            input_size=13,
            hidden_size=hidden_size,
            num_layers=self.num_layers_encoder,
            batch_first=True,
            dropout=0,
            bidirectional=True,
        )
        self.encoder = nn.LSTM(**encoder_config)

        # One logit per vocabulary word.
        self.classifier = nn.Linear(2 * hidden_size, len(vocab))

    def forward(self, source_features):
        """Return class logits, or the embedding when `return_embeddings` is set."""
        _, (h_n, _) = self.encoder(source_features)

        # Concatenate the last two final hidden states along the feature dimension.
        embeddings = torch.cat((h_n[-1], h_n[-2]), 1)
        if self.return_embeddings:
            return embeddings

        return self.classifier(embeddings)

In [18]:
# Classification over the vocabulary: cross-entropy loss, Adam, accuracy as the metric.
# The learning rate comes from the `LR` constant defined with the other
# hyper-parameters rather than from a duplicated literal.
learn = Learner(
    dls.cuda(),
    Model().cuda(),
    loss_func=CrossEntropyLossFlat(),
    lr=LR,
    opt_func=Adam,
    metrics=[accuracy],
)

In [19]:
# Train for 2 epochs at the learner's default learning rate, checkpointing after every
# epoch under the name '1e-3_Adam'.
learn.fit(2, cbs=SaveModelCallback(fname='1e-3_Adam', every_epoch=True))

epoch,train_loss,valid_loss,accuracy,time
0,7.232174,7.218996,0.094288,1:27:14
1,7.114392,7.123175,0.101202,1:32:05


In [20]:
# Save the current learner state under the name 'temp'.
learn.save('temp')

Path('models/temp.pth')

In [21]:
# Continue training for 4 more epochs.
# NOTE(review): reuses fname '1e-3_Adam', so this presumably overwrites the checkpoints
# written by the earlier 2-epoch run -- confirm.
learn.fit(4, cbs=SaveModelCallback(fname='1e-3_Adam', every_epoch=True))

epoch,train_loss,valid_loss,accuracy,time
0,7.087136,7.090648,0.103526,1:27:54
1,7.067607,7.076911,0.104098,1:23:53
2,7.063848,7.061554,0.104314,1:22:40
3,7.047477,7.056731,0.105597,1:26:26


In [23]:
# Fine-tune for 2 more epochs at a 10x lower learning rate (1e-4), checkpointing under
# the name '1e-4_Adam'.
learn.fit(2, cbs=SaveModelCallback(fname='1e-4_Adam', every_epoch=True), lr=1e-4)

epoch,train_loss,valid_loss,accuracy,time
0,7.015036,7.020097,0.107841,1:35:08
1,7.020664,7.013508,0.10804,1:22:11


## Calculate an embedding for each unique word in the dataset

In [25]:
# Unshuffled loader over `Dataset(1)`, i.e. exactly one item per vocabulary word, in
# `vocab` order -- batch outputs can therefore be matched to words by position.
dl_unique_utterances = DataLoader(Dataset(1), BS, NUM_WORKERS)

In [26]:
%%time

learn.model.return_embeddings = True
learn.model.train = False

all_embeddings = []

with torch.no_grad():    
    for batch in dl_unique_utterances:
        embeddings = learn.model(batch[0].cuda())
        all_embeddings.append(embeddings.detach().cpu().squeeze(0))

CPU times: user 80 ms, sys: 2.24 s, total: 2.32 s
Wall time: 4.34 s


In [27]:
# Stack the per-batch outputs into one tensor (one row per item of the loader).
# NOTE: rebinds `all_embeddings` from a list to a tensor, so this cell is not re-runnable
# without re-running the previous one.
all_embeddings = torch.cat(all_embeddings)

In [28]:
# Sanity check: expect (len(vocab), 2 * hidden_size).
all_embeddings.shape

torch.Size([33830, 50])

In [29]:
# Should equal the number of embedding rows shown above.
len(vocab)

33830

In [31]:
# Pair each vocabulary word with the embedding row at the same position, as a numpy array.
word2embedding_without_nans = {
    word: all_embeddings[position].cpu().numpy()
    for position, word in enumerate(vocab)
}

In [32]:
from utils import Embeddings

In [33]:
# Wrap the embedding matrix and the lower-cased words (same order as the dict) in the
# project's `Embeddings` helper for nearest-neighbour queries.
e = Embeddings(
    np.array(list(word2embedding_without_nans.values())),
    [word.lower() for word in word2embedding_without_nans],
)

In [34]:
# Qualitative check: print what `nn_words_to` returns for a few probe words.
for w in ['fast', 'lost', 'small', 'true', 'crazy', 'slow']:
    print(f'{w}: {e.nn_words_to(e[w])}')

fast: ['fast', 'touched', 'moralists', 'ghost', 'mis']
lost: ['lost', 'last', 'washed', 'dispatched', 'flaps']
small: ['small', 'turtle', 'scuffle', 'bellew', 'renewal']
true: ['true', 'likely', 'ties', 'deny', 'without']
crazy: ['crazy', 'chambers', 'diffidence', 'extraneous', 'pores']
slow: ['slow', 'fo', 'miko', 'dilah', 'shawl']


## Evaluating embeddings using [word-embeddings-benchmarks](https://github.com/kudkudak/word-embeddings-benchmarks)

In [36]:
import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.evaluate import evaluate_similarity
from web.embedding import Embedding, Vocabulary
from gensim.models import Word2Vec
from gensim.models import KeyedVectors



In [37]:
# Word-similarity benchmarks used for evaluation, keyed by display name.
# NOTE(review): the fetch_* helpers presumably download/cache the datasets on first use.
tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999()
}

In [38]:
# Our learned embeddings in the benchmark library's `Embedding` format
# (lower-cased words plus the matching matrix of vectors, in dict order).
our_embeddings = Embedding(
    Vocabulary([w.lower() for w in list(word2embedding_without_nans.keys())]),
    np.array(list(word2embedding_without_nans.values()))
)

# Reference: pretrained 50-dimensional speech2vec vectors in word2vec text format.
# NOTE(review): `speech2vec.vocab` is the pre-4.0 gensim API; gensim 4 exposes
# `key_to_index` instead -- confirm the installed version.
speech2vec = KeyedVectors.load_word2vec_format('../speech2vec-pretrained-vectors/speech2vec/50.vec', binary=False) 
speech2vec_embeddings = Embedding(Vocabulary(list(speech2vec.vocab.keys())), speech2vec.vectors)

In [39]:
# Score our embeddings on every similarity benchmark.
for name, data in tasks.items():
    correlation = evaluate_similarity(our_embeddings, data.X, data.y)
    print("Spearman correlation of scores on {} {}".format(name, correlation))

Missing 417 words. Will replace them with mean vector
  A = np.vstack(w.get(word, mean_vector) for word in X[:, 0])
  B = np.vstack(w.get(word, mean_vector) for word in X[:, 1])
Missing 66 words. Will replace them with mean vector
Missing 26 words. Will replace them with mean vector


Spearman correlation of scores on MEN -0.03781824365133741
Spearman correlation of scores on WS353 0.14170910158434244
Spearman correlation of scores on SIMLEX999 0.00976532122028731


In [40]:
# Score the pretrained speech2vec embeddings on the same benchmarks, for comparison.
for name, data in tasks.items():
    correlation = evaluate_similarity(speech2vec_embeddings, data.X, data.y)
    print("Spearman correlation of scores on {} {}".format(name, correlation))

Missing 392 words. Will replace them with mean vector
Missing 61 words. Will replace them with mean vector
Missing 24 words. Will replace them with mean vector


Spearman correlation of scores on MEN 0.5896756323911225
Spearman correlation of scores on WS353 0.49890235673392536
Spearman correlation of scores on SIMLEX999 0.28202624769092116
