In [1]:
from fastai.vision.all import *
from fastai.learner import *
from fastai.data.all import *
from fastai.callback.tracker import SaveModelCallback
import pandas as pd
import matplotlib.pyplot as plt
from pathlib2 import Path
import numpy as np
import random
from torch.nn import MSELoss

In [2]:
%%time

# Load the full examples table; the recorded run below shows 14,276,908 rows x 11 columns.
df = pd.read_csv('data/examples_with_length_speech2vec_vocab.csv')
df.shape

CPU times: user 17.8 s, sys: 2.29 s, total: 20.1 s
Wall time: 20.1 s


(14276908, 11)

In [3]:
# %%time
# fn2features = pd.read_pickle('data/fn2feature.pkl')

CPU times: user 17.7 s, sys: 15.4 s, total: 33 s
Wall time: 2min 16s


In [4]:
# def prepare_features(fn, pad_to=70, pad_left=False):
#     ary = fn2features[fn][:pad_to]
#     example = np.zeros((pad_to, 13))
#     if pad_left:
#         example[-ary.shape[0]:, :] = ary
#     else: example[:ary.shape[0], :] = ary
#     return example.astype(np.float32)

In [5]:
# dataset_mean = -2
# dataset_std = 10

# def normalize_data(ary):
#     return (ary - dataset_mean) / dataset_std

In [6]:
# %%time

# features = []
# for fn in df.target_fn.unique():
#     features.append(normalize_data(prepare_features(fn)))

CPU times: user 50.2 s, sys: 7.08 s, total: 57.3 s
Wall time: 57.3 s


In [7]:
# np.stack(features).mean(), np.stack(features).std()

(-0.0002623361, 1.0108287)

In [6]:
# fn2features_norm = {fn: normalize_data(prepare_features(fn)) for fn, features in fn2features.items()}
# pd.to_pickle(fn2features_norm, 'data/fn2features_norm.pkl')

In [3]:
%%time
# Precomputed (normalised, padded) features keyed by file name; `Dataset.__getitem__` looks them up by `source_fn`.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files you generated yourself.
fn2features = pd.read_pickle('data/fn2features_norm.pkl')

CPU times: user 12.6 s, sys: 12.9 s, total: 25.5 s
Wall time: 25.5 s


In [4]:
# Narrow the table down to the examples used in this experiment.
# The filters are applied one after another, in the same order as before.

# 1. keep only pairs whose distance_from_target is exactly 1
df = df.loc[df.distance_from_target.eq(1)]

# 2. both the target and the source word must be present
has_both_words = df.target_word.notna() & df.source_word.notna()
df = df.loc[has_both_words]

# 3. keep rows flagged as being in the speech2vec vocabulary
df = df.loc[df.in_speech2vec_vocab]

# 4. source length strictly between 19 and 71, target length below 71
length_ok = df.source_length.lt(71) & df.target_length.lt(71) & df.source_length.gt(19)
df = df.loc[length_ok]

# Renumber rows 0..n-1 so row labels equal row positions.
df = df.reset_index(drop=True)

In [5]:
# Pool target and source words and keep the distinct ones. `np.unique` returns its result
# sorted, so the position of a word in `vocab` is deterministic for a given table.
words = np.concatenate((df.target_word, df.source_word))
vocab = list(np.unique(words))

In [6]:
def empty_list():
    """Return a fresh empty list; used as the `defaultdict` factory for the word-to-row-index maps."""
    return []

In [7]:
# %%time

# word2row_idxs = defaultdict(empty_list)

# for idx, row in df.iterrows():
#     word2row_idxs[row.source_word].append(idx)    

# pd.to_pickle(word2row_idxs, 'data/word2row_idxs_speech2vec_vocab_min_length_20_max_length_70_only_driect_neighbors.pkl')

In [8]:
# Cached mapping from a source word to the row indices where it occurs
# (produced by the commented-out cell above).
# NOTE(review): unpickling can execute arbitrary code — only load pickles generated locally.
word2row_idxs = pd.read_pickle('data/word2row_idxs_speech2vec_vocab_min_length_20_max_length_70_only_driect_neighbors.pkl')

# The class id of a word is its position in `vocab`.
word2index = dict(zip(vocab, range(len(vocab))))

In [9]:
# Train on the two training subsets plus dev-clean; validate on test-clean.
is_train = df.set_name.isin(['train-clean-360', 'train-clean-100', 'dev-clean'])
is_valid = df.set_name == 'test-clean'

# Each split gets its own 0..n-1 index.
train_examples = df[is_train].reset_index(drop=True)
valid_examples = df[is_valid].reset_index(drop=True)

In [10]:
class Dataset():
    """Map-style dataset yielding `(source features, target word index)` pairs.

    Reads the notebook-level lookups `fn2features` (file name -> features) and
    `word2index` (word -> class id) at item-access time.
    """
    def __init__(self, df):
        # df: table with `source_fn` and `target_word` columns, one example per row.
        self.df = df
    def __len__(self):
        """Return the number of examples (rows of the table)."""
        return self.df.shape[0]
    def __getitem__(self, idx):
        """Return `(features, class_id)` for the example at row position `idx`.

        Access is positional (`.iat`), so the result does not depend on the
        frame carrying a fresh 0..n-1 index; a filtered frame that was not
        re-indexed works as well.
        """
        source_fn = self.df.source_fn.iat[idx]
        target_word = self.df.target_word.iat[idx]
        return fn2features[source_fn], word2index[target_word]

In [11]:
# Wrap both splits; the final expression displays their sizes.
train_ds, valid_ds = (Dataset(split) for split in (train_examples, valid_examples))

len(train_ds), len(valid_ds)

(4131148, 46189)

In [12]:
# Batch size and number of loader worker processes.
BS = 2048
NUM_WORKERS = 8

# Only the training loader shuffles its examples.
train_dl = DataLoader(train_ds, bs=BS, num_workers=NUM_WORKERS, shuffle=True)
valid_dl = DataLoader(valid_ds, bs=BS, num_workers=NUM_WORKERS)

dls = DataLoaders(train_dl, valid_dl)

In [13]:
# While training, a DataLoader worker died with:
#
#   DataLoader worker (pid 2073) is killed by signal: Bus error. It is possible that dataloader's
#   workers are out of shared memory. Please try to raise your shared memory limit.
#
# Workaround taken from https://github.com/pytorch/pytorch/issues/5040: remount /dev/shm
# with a larger size limit (50G here).
# NOTE: this requires sudo and changes a system-wide mount, so it affects every process on the machine.
!sudo umount /dev/shm/ && sudo mount -t tmpfs -o rw,nosuid,nodev,noexec,relatime,size=50G shm /dev/shm

In [14]:
class Model(Module):
    """Bidirectional LSTM encoder over feature sequences with a linear word classifier on top.

    Set `return_embeddings = True` to make `forward` return the encoder summary
    vector instead of the classifier output.
    """
    def __init__(self, hidden_size=25):
        self.hidden_size = hidden_size
        self.return_embeddings = False

        # Single-layer bidirectional LSTM over 13-dimensional frames, batch-first input.
        self.encoder = nn.LSTM(
            input_size=13,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
            dropout=0,
            bidirectional=True,
        )

        # The two directions are concatenated, hence 2 * hidden_size inputs;
        # one output per entry of the notebook-level `vocab`.
        self.linear = nn.Linear(2 * hidden_size, len(vocab))

    def forward(self, source_features):
        """Encode `source_features`; return the embedding or the per-word scores."""
        _, (h_n, _) = self.encoder(source_features)
        # h_n stacks the final hidden state of each direction; join the last two entries
        # along the feature dimension.
        summary = torch.cat((h_n[-1], h_n[-2]), dim=1)

        return summary if self.return_embeddings else self.linear(summary)

In [None]:
# Move the loaders and a freshly initialised model to the GPU, then bundle them
# with the loss, optimiser and metric into a Learner.
learn = Learner(
    dls=dls.cuda(),
    model=Model().cuda(),
    loss_func=CrossEntropyLossFlat(),
    opt_func=Adam,
    metrics=[accuracy],
)

In [29]:
# Train for 120 epochs, writing a checkpoint named 'rnn_encoder' after every epoch.
checkpoint_cb = SaveModelCallback(fname='rnn_encoder', every_epoch=True)
learn.fit(120, lr=1e-3, cbs=checkpoint_cb)

epoch,train_loss,valid_loss,time
0,,00:11,


OSError: [Errno 12] Cannot allocate memory

## Calculate an embedding for each unique word in the dataset

In [28]:
# One row per distinct source utterance from the training subsets
# (`drop_duplicates` keeps the first occurrence), re-indexed 0..n-1.
in_train_sets = df.set_name.isin(['train-clean-360', 'train-clean-100', 'dev-clean'])
df_unique_utterances = (
    df[in_train_sets]
    .drop_duplicates(['source_fn'])
    .reset_index(drop=True)
)

In [30]:
# No shuffling is requested, so batches follow the row order of df_unique_utterances.
all_dl = DataLoader(Dataset(df_unique_utterances), bs=BS, num_workers=NUM_WORKERS)

In [31]:
%%time

# Make `forward` return the encoder embedding and put the model in evaluation mode.
# `eval()` has to be called as a method: assigning `learn.model.train = False` only
# overwrites the `train` method with a bool and leaves the mode unchanged (and breaks
# any later call to `model.train()`).
learn.model.return_embeddings = True
learn.model.eval()

all_embeddings = []

with torch.no_grad():
    for batch in all_dl:
        embeddings = learn.model(batch[0].cuda())
        # Keep the (batch, features) shape untouched: squeezing dim 0 would turn a
        # single-example batch into a 1-D tensor and break the later concatenation.
        all_embeddings.append(embeddings.cpu())

CPU times: user 9.15 s, sys: 6.12 s, total: 15.3 s
Wall time: 1min 58s


In [32]:
# Join the per-batch tensors along dim 0 into a single tensor.
# Note: this rebinds `all_embeddings` from a list of tensors to one tensor.
all_embeddings = torch.cat(all_embeddings)

In [33]:
# Sanity check: the recorded run has 1,810,253 rows and 50 columns (2 directions x hidden_size 25).
all_embeddings.shape

torch.Size([1810253, 50])

In [39]:
%%time

# Map every source word to the row indices of its utterances in df_unique_utterances.
# Iterating the single column with `Series.items()` yields the same (index, word) pairs as
# `DataFrame.iterrows()`, without constructing a Series object for each of the rows.
word2row_idxs_unique_utterances = defaultdict(empty_list)

for idx, source_word in df_unique_utterances.source_word.items():
    word2row_idxs_unique_utterances[source_word].append(idx)

# Cache the mapping on disk so later sessions can skip the loop.
pd.to_pickle(word2row_idxs_unique_utterances, 'word2row_idxs_unique_utterances_speech2vec_vocab_min_length_20_max_length_70_only_driect_neighbors.pkl')

CPU times: user 2min 34s, sys: 316 ms, total: 2min 34s
Wall time: 2min 34s


In [40]:
# Reload the cached word -> row-indices mapping written by the previous cell.
# NOTE(review): unpickling can execute arbitrary code — only load pickles generated locally.
word2row_idxs_unique_utterances = pd.read_pickle('word2row_idxs_unique_utterances_speech2vec_vocab_min_length_20_max_length_70_only_driect_neighbors.pkl')

In [41]:
# A word's embedding is the mean of the embeddings of all utterances of that word.
word2embedding = {
    word: all_embeddings[np.array(row_idxs)].mean(0)
    for word, row_idxs in word2row_idxs_unique_utterances.items()
}

In [42]:
# Keep only in-vocabulary words whose key is not NaN (`k == k`) and whose mean
# embedding has no NaN component; everything else is counted in `nans_encountered`.
# Membership is tested against a set: `k in vocab` on the list is a linear scan per word.
vocab_lookup = set(vocab)

word2embedding_without_nans= {}
nans_encountered = 0
for k, v in word2embedding.items():
    vector = v.numpy()  # converted once, reused for the check and the stored value
    if k in vocab_lookup and k == k and (not np.isnan(vector).any()):
        word2embedding_without_nans[k] = vector
    else: nans_encountered += 1

print(f'Encountered rows with nan values: {nans_encountered}')

Encountered rows with nan values: 0


In [43]:
from utils import Embeddings

In [44]:
# Build the lookup helper from parallel collections of vectors and lower-cased words
# (a dict iterates its keys and values in the same order).
embedding_matrix = np.array(list(word2embedding_without_nans.values()))
embedding_words = [word.lower() for word in word2embedding_without_nans]
e = Embeddings(embedding_matrix, embedding_words)

In [45]:
# Spot-check: print the nearest words for a handful of probe words.
probe_words = ('fast', 'lost', 'small', 'true', 'crazy', 'slow')
for w in probe_words:
    neighbours = e.nn_words_to(e[w])
    print(f'{w}: {neighbours}')

fast: ['fast', 'priest', 'trust', 'legs', 'cast']
lost: ['lost', 'paused', 'closed', 'most', 'beast']
small: ['small', 'call', 'tall', 'cell', 'bow']
true: ['true', 'two', 'through', 'too', 'to']
crazy: ['crazy', 'freely', 'liberty', 'hearty', 'party']
slow: ['slow', 'so', 'swallow', 'below', 'subtle']


## Evaluating embeddings using [word-embeddings-benchmarks](https://github.com/kudkudak/word-embeddings-benchmarks)

In [46]:
import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.evaluate import evaluate_similarity
from web.embedding import Embedding, Vocabulary
from gensim.models import Word2Vec
from gensim.models import KeyedVectors



In [47]:
# Word-similarity benchmarks, keyed by the name used when reporting scores.
tasks = dict(
    MEN=fetch_MEN(),
    WS353=fetch_WS353(),
    SIMLEX999=fetch_SimLex999(),
)

In [48]:
# Wrap our averaged embeddings (lower-cased words + their vectors) for the benchmark library.
our_words = [w.lower() for w in word2embedding_without_nans]
our_vectors = np.array(list(word2embedding_without_nans.values()))
our_embeddings = Embedding(Vocabulary(our_words), our_vectors)

# Reference embeddings: pretrained speech2vec vectors loaded from the text-format `50.vec` file.
speech2vec = KeyedVectors.load_word2vec_format('../speech2vec-pretrained-vectors/speech2vec/50.vec', binary=False)
speech2vec_words = list(speech2vec.vocab.keys())
speech2vec_embeddings = Embedding(Vocabulary(speech2vec_words), speech2vec.vectors)

In [49]:
# Score our embeddings on every similarity benchmark.
for name, data in tasks.items():
    score = evaluate_similarity(our_embeddings, data.X, data.y)
    print("Spearman correlation of scores on {} {}".format(name, score))

Missing 417 words. Will replace them with mean vector
  A = np.vstack(w.get(word, mean_vector) for word in X[:, 0])
  B = np.vstack(w.get(word, mean_vector) for word in X[:, 1])
Missing 66 words. Will replace them with mean vector
Missing 26 words. Will replace them with mean vector


Spearman correlation of scores on MEN -0.016457845881016953
Spearman correlation of scores on WS353 0.0895924100304081
Spearman correlation of scores on SIMLEX999 -0.08322106510718805


In [50]:
# Score the pretrained speech2vec embeddings on the same benchmarks for comparison.
for name, data in tasks.items():
    score = evaluate_similarity(speech2vec_embeddings, data.X, data.y)
    print("Spearman correlation of scores on {} {}".format(name, score))

Missing 392 words. Will replace them with mean vector
Missing 61 words. Will replace them with mean vector
Missing 24 words. Will replace them with mean vector


Spearman correlation of scores on MEN 0.5896756323911225
Spearman correlation of scores on WS353 0.49890235673392536
Spearman correlation of scores on SIMLEX999 0.28202624769092116
