In [1]:
from fastai.vision.all import *
from fastai.learner import *
from fastai.data.all import *
from fastai.callback.tracker import SaveModelCallback
import pandas as pd
import matplotlib.pyplot as plt
from pathlib2 import Path
import numpy as np
import random
from torch.nn import MSELoss

In [2]:
%%time

df = pd.read_csv('data/examples_with_length_speech2vec_vocab.csv')
df.shape

CPU times: user 31.5 s, sys: 3.42 s, total: 34.9 s
Wall time: 48.2 s


(17937758, 12)

In [3]:
%%time
fn2features = pd.read_pickle('data/fn2feature.pkl')

CPU times: user 18.5 s, sys: 13 s, total: 31.5 s
Wall time: 2min 20s


In [4]:
df = df[((df.source_length < 70) & (df.target_length < 70) & (df.source_length > 25) & (df.target_length > 25))]
df = df[df.in_speech2vec_vocab]
df.reset_index(drop=True, inplace=True)

In [5]:
vocab = list(df.source_word.unique())

In [6]:
len(vocab)

33881

In [7]:
df.head()

Unnamed: 0,source_word,target_word,source_fn,target_fn,set_name,speaker_id,book_id,distance_from_target,audio_fpath,source_length,target_length,in_speech2vec_vocab
0,TIME,INTRODUCE,9b2072544ac6476e9808f483db57be83,2e9837191d104f93bdc794cdc8c3d43a,train-clean-360,7000,83696,2,data/LibriSpeech/train-clean-360/7000/83696/7000-83696-0000.flac,31,49,True
1,INTRODUCE,TIME,2e9837191d104f93bdc794cdc8c3d43a,9b2072544ac6476e9808f483db57be83,train-clean-360,7000,83696,2,data/LibriSpeech/train-clean-360/7000/83696/7000-83696-0000.flac,49,31,True
2,INTRODUCE,MYSELF,2e9837191d104f93bdc794cdc8c3d43a,4b23c979991b438abaa7ba2227dc93fa,train-clean-360,7000,83696,1,data/LibriSpeech/train-clean-360/7000/83696/7000-83696-0000.flac,49,65,True
3,MYSELF,INTRODUCE,4b23c979991b438abaa7ba2227dc93fa,2e9837191d104f93bdc794cdc8c3d43a,train-clean-360,7000,83696,1,data/LibriSpeech/train-clean-360/7000/83696/7000-83696-0000.flac,65,49,True
4,FINE,EVENING,fe0bdbeb478148b48f226ed00e4fd5d2,3f3b57b9ce2b4085b60e01c4f50b7d8c,train-clean-360,7000,83696,1,data/LibriSpeech/train-clean-360/7000/83696/7000-83696-0000.flac,33,54,True


In [8]:
dataset_mean = -3
dataset_std = 12

def normalize_data(ary):
    return (ary - dataset_mean) / dataset_std

In [9]:
def empty_list(): return list()

In [10]:
# %%time

# word2row_idxs = defaultdict(empty_list)

# for idx, row in df.iterrows():
#     word2row_idxs[row.source_word].append(idx)
    
# pd.to_pickle(word2row_idxs, 'data/word2row_idxs_speech2vec_vocab_subset.pkl')

In [11]:
word2row_idxs = pd.read_pickle('data/word2row_idxs_speech2vec_vocab_subset.pkl')

In [12]:
def prepare_features(fn, pad_to=69, pad_left=False):
    ary = fn2features[fn][:pad_to]
    example = np.zeros((pad_to, 13))
    if pad_left:
        example[-ary.shape[0]:, :] = ary
    else: example[:ary.shape[0], :] = ary
    return example.astype(np.float32)

In [13]:
if np.nan in vocab: vocab.remove(np.nan)

In [14]:
for i, (k, v) in enumerate(fn2features.items()):
    fn2features[k] = normalize_data(prepare_features(k, pad_left=True))

class Dataset():
    def __init__(self, n):
        self.vocab = vocab * n
    def __len__(self):
        return len(self.vocab)
    def __getitem__(self, idx):
        row_idx = np.random.choice(word2row_idxs[self.vocab[idx]])
        source_fn = df.source_fn[row_idx]
        
        if random.random() < 0.5:
            target_fn = df.target_fn[row_idx]
            target = 1
        else:
            target_fn = df.target_fn[np.random.randint(df.target_fn.shape[0])]
            target = 0

        a = fn2features[source_fn]
        b = fn2features[target_fn]
        return np.stack((a, b)), target

In [15]:
BS = 128
LR = 1e-3
NUM_WORKERS = 6

train_dl = DataLoader(Dataset(10*270), BS, NUM_WORKERS, shuffle=True, pin_memory=True)
valid_dl = DataLoader(Dataset(30), BS, NUM_WORKERS)

dls = DataLoaders(train_dl, valid_dl)

In [16]:
# # Got the following error while training:

# # DataLoader worker (pid 2073) is killed by signal: Bus error. It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
# # trying the solution I found here: https://github.com/pytorch/pytorch/issues/5040
# # which is to execute
!sudo umount /dev/shm/ && sudo mount -t tmpfs -o rw,nosuid,nodev,noexec,relatime,size=70G shm /dev/shm

In [17]:
# bidirectional encoder, 1 layer, concatenate hidden state
class Model(Module):
    def __init__(self, hidden_size=25, num_layers_encoder=1):
        self.return_embeddings = False
        self.num_layers_encoder = num_layers_encoder
        self.hidden_size = hidden_size
        
        self.encoder= nn.LSTM(
            input_size=13,
            hidden_size=hidden_size,
            num_layers=self.num_layers_encoder,
            batch_first=True,
            dropout=0,
            bidirectional=True
        )
        
#         self.classifier = nn.Sequential(
#             LinBnDrop(100, 100, p=0.2),
#             LinBnDrop(100, 100, p=0.1),
#             LinBnDrop(100, 1)
#         )
            
    def forward(self, source_and_target_features):
        source_features = source_and_target_features[:, 0]
        _, (source_embeddings, _) = self.encoder(source_features)
        source_embeddings = torch.cat((source_embeddings[-1], source_embeddings[-2]), 1)
        if self.return_embeddings: return source_embeddings
        
        target_features = source_and_target_features[:, 1]
        _, (target_embeddings, _) = self.encoder(target_features)
        target_embeddings = torch.cat((target_embeddings[-1], target_embeddings[-2]), 1)
        
#         return self.classifier(torch.cat((source_embeddings, target_embeddings), 1))
        return torch.sum(source_embeddings * target_embeddings, -1)

In [18]:
learn = Learner(dls.cuda(), Model().cuda(), loss_func=BCEWithLogitsLossFlat(), opt_func=SGD, metrics=[accuracy_multi])

In [19]:
learn.fit(4, cbs=SaveModelCallback(fname='1e-3_sgd_adam', every_epoch=True))

epoch,train_loss,valid_loss,accuracy_multi,time
0,0.507166,0.512729,0.742463,2:59:21
1,0.495369,0.497761,0.748112,2:56:12
2,0.49148,0.488964,0.752384,2:52:15
3,0.485294,0.483425,0.757532,2:53:14


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [22]:
learn.fit(4, cbs=SaveModelCallback(fname='1e-3_siamese_adam', every_epoch=True))

epoch,train_loss,valid_loss,accuracy_multi,time
0,0.450644,0.449098,0.785276,51:44
1,0.441434,0.443099,0.788482,57:39
2,0.438916,0.439137,0.789262,51:14
3,0.438113,0.437682,0.790504,53:29


In [44]:
learn.load('1e-3_siamese_adam_3')

<fastai.learner.Learner at 0x7fd978ea8250>

In [45]:
learn.fit(4, cbs=SaveModelCallback(fname='1e-4_siamese_adam', every_epoch=True), lr=1e-4)

epoch,train_loss,valid_loss,accuracy_multi,time
0,0.43445,0.433558,0.791991,1:11:59
1,0.43241,0.434373,0.791121,51:54
2,0.431528,0.43395,0.791736,52:52
3,0.434072,0.434962,0.79018,47:56


## Calculate embedding for each unique word in the dataset

In [20]:
df_unique_utterances = df[df.set_name.isin(['train-clean-360', 'train-clean-100', 'dev-clean'])].drop_duplicates(['source_fn'])
df_unique_utterances.reset_index(drop=True, inplace=True)

In [21]:
class DatasetAllUtterances():
    def __len__(self):
        return df_unique_utterances.shape[0]
    def __getitem__(self, idx):
        source_fn = df_unique_utterances.iloc[idx].source_fn
        target_fn = df_unique_utterances.iloc[idx].target_fn
        x = normalize_data(prepare_features(source_fn, pad_left=True))
        y = normalize_data(prepare_features(target_fn))
        return np.stack((x, y)), y

In [22]:
all_dl = DataLoader(DatasetAllUtterances(), BS, NUM_WORKERS)

In [23]:
%%time

learn.model.return_embeddings = True
learn.model.train = False

all_embeddings = []

with torch.no_grad():    
    for batch in all_dl:
        embeddings = learn.model(batch[0].cuda())
        all_embeddings.append(embeddings.detach().cpu().squeeze(0))

CPU times: user 1min 12s, sys: 14.7 s, total: 1min 27s
Wall time: 2min 41s


In [24]:
all_embeddings = torch.cat(all_embeddings)

In [25]:
all_embeddings.shape

torch.Size([1810253, 50])

In [26]:
# %%time

# word2row_idxs_unique_utterances = defaultdict(empty_list)

# for idx, row in df_unique_utterances.iterrows():
#     word2row_idxs_unique_utterances[row.source_word].append(idx)
    
# pd.to_pickle(word2row_idxs_unique_utterances, 'word2row_idxs_unique_utterances_speech2vec_vocab_subset.pkl')

In [27]:
word2row_idxs_unique_utterances = pd.read_pickle('word2row_idxs_unique_utterances_speech2vec_vocab_subset.pkl')

In [28]:
word2embedding = {}

for k, v in word2row_idxs_unique_utterances.items():
    word2embedding[k] = all_embeddings[np.array(v)].mean(0)

In [29]:
word2embedding_without_nans= {}
nans_encountered = 0
for k, v in word2embedding.items():
    if k in vocab and k == k and (not np.isnan(v.numpy()).any()):
        word2embedding_without_nans[k] = v.numpy()
    else: nans_encountered += 1

print(f'Encountered rows with nan values: {nans_encountered}')

Encountered rows with nan values: 0


In [30]:
from utils import Embeddings

In [31]:
e = Embeddings(
    np.array(list(word2embedding_without_nans.values())),
    [w.lower() for w in list(word2embedding_without_nans.keys())]
)

In [32]:
for w in ['fast', 'lost', 'small', 'true', 'crazy', 'slow']:
    print(f'{w}: {e.nn_words_to(e[w])}')

fast: ['fast', 'past', 'passed', 'cast', 'vast']
lost: ['lost', 'announced', 'honest', 'proofs', 'pronounced']
small: ['small', 'smell', 'simple', 'flannel', 'tall']
true: ['true', 'withdrew', 'too', 'crew', 'fog']
crazy: ['crazy', 'sympathy', 'lazy', 'filthy', 'healthy']
slow: ['slow', 'flow', 'hello', 'steel', 'oh']


## Evaluating embeddings using [word-embeddings-benchmarks](https://github.com/kudkudak/word-embeddings-benchmarks)

In [33]:
import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.evaluate import evaluate_similarity
from web.embedding import Embedding, Vocabulary
from gensim.models import Word2Vec
from gensim.models import KeyedVectors



In [34]:
tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999()
}

In [35]:
our_embeddings = Embedding(
    Vocabulary([w.lower() for w in list(word2embedding_without_nans.keys())]),
    np.array(list(word2embedding_without_nans.values()))
)

speech2vec = KeyedVectors.load_word2vec_format('../speech2vec-pretrained-vectors/speech2vec/50.vec', binary=False) 
speech2vec_embeddings = Embedding(Vocabulary(list(speech2vec.vocab.keys())), speech2vec.vectors)

In [36]:
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(our_embeddings, data.X, data.y)))

Missing 417 words. Will replace them with mean vector
  A = np.vstack(w.get(word, mean_vector) for word in X[:, 0])
  B = np.vstack(w.get(word, mean_vector) for word in X[:, 1])
Missing 66 words. Will replace them with mean vector
Missing 26 words. Will replace them with mean vector


Spearman correlation of scores on MEN -0.016762819642329257
Spearman correlation of scores on WS353 0.00432674472936977
Spearman correlation of scores on SIMLEX999 -0.03186349447228689


In [37]:
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(speech2vec_embeddings, data.X, data.y)))

Missing 392 words. Will replace them with mean vector
Missing 61 words. Will replace them with mean vector
Missing 24 words. Will replace them with mean vector


Spearman correlation of scores on MEN 0.5896756323911225
Spearman correlation of scores on WS353 0.49890235673392536
Spearman correlation of scores on SIMLEX999 0.28202624769092116
