In [1]:
from fastai.vision.all import *
from fastai.learner import *
from fastai.data.all import *
from fastai.callback.tracker import SaveModelCallback
import pandas as pd
import matplotlib.pyplot as plt
from pathlib2 import Path
import numpy as np
import random
from torch.nn import MSELoss

In [2]:
%%time

df = pd.read_csv('data/examples_with_length_speech2vec_vocab.csv')
df.shape

CPU times: user 17.8 s, sys: 2.53 s, total: 20.3 s
Wall time: 20.3 s


(14276908, 11)

In [3]:
df.head()

Unnamed: 0,source_word,target_word,source_fn,target_fn,set_name,speaker_id,book_id,distance_from_target,source_length,target_length,in_speech2vec_vocab
0,I,FELT,d0af6ad1469446abb6b3b8c98570f139,00ac9fc716964726bc0c17b850c712c0,train-clean-360,7000,83696,1,16,22,True
1,I,THAT,d0af6ad1469446abb6b3b8c98570f139,70f6c024d94549b1a2c19fb42298a8ec,train-clean-360,7000,83696,2,16,12,True
2,FELT,I,00ac9fc716964726bc0c17b850c712c0,d0af6ad1469446abb6b3b8c98570f139,train-clean-360,7000,83696,1,22,16,True
3,FELT,THAT,00ac9fc716964726bc0c17b850c712c0,70f6c024d94549b1a2c19fb42298a8ec,train-clean-360,7000,83696,1,22,12,True
4,FELT,IT,00ac9fc716964726bc0c17b850c712c0,acab1acfa29046c883b4193a4d50a82e,train-clean-360,7000,83696,2,22,8,True


In [4]:
df = df[df.distance_from_target == 1]
df = df[(~df.target_word.isna() & ~df.source_word.isna())]

df.shape

(7841852, 11)

In [5]:
df = df[df.in_speech2vec_vocab]

df.shape

(7718702, 11)

In [6]:
df = df[((df.source_length < 71) & (df.target_length < 71) & (df.source_length > 19))]

df.shape

(4177337, 11)

In [7]:
df.reset_index(inplace=True, drop=True)

In [8]:
words = np.concatenate((df.target_word, df.source_word))
vocab = np.unique(words)

In [9]:
len(vocab)

34360

In [10]:
# %%time

# uniq_fns = np.unique(df.target_fn.values)
# fn2feature = {}
# for fn in uniq_fns:
#     ary = pd.read_pickle(f'data/examples/{fn}.pkl')
#     fn2feature[fn] = ary

# pd.to_pickle(fn2feature, 'data/fn2feature.pkl')

In [11]:
%%time
fn2features = pd.read_pickle('data/fn2feature.pkl')

CPU times: user 17.5 s, sys: 11.6 s, total: 29 s
Wall time: 29.1 s


In [12]:
def empty_list(): return list()

In [13]:
# %%time

# word2row_idxs = defaultdict(empty_list)

# for idx, row in df.iterrows():
#     word2row_idxs[row.source_word].append(idx)
    
# pd.to_pickle(word2row_idxs, 'data/word2row_idxs.pkl')

In [14]:
word2row_idxs = pd.read_pickle('data/word2row_idxs.pkl')

In [15]:
class Dataset():
    def __init__(self, df):
        self.df = df
    def __len__(self):
        return self.df.shape[0]
    def __getitem__(self, idx):
        source_word = self.df.source_word[idx]
        target_word = self.df.target_word[idx]
        return vocab.index(self.df.source_word[idx]), vocab.index(self.df.target_word[idx])

In [16]:
train_examples = df[df.set_name.isin(['train-clean-360', 'train-clean-100', 'dev-clean'])]
valid_examples = df[df.set_name == 'test-clean']

train_examples.reset_index(inplace=True, drop=True)
valid_examples.reset_index(inplace=True, drop=True)

In [17]:
train_ds = Dataset(train_examples)
valid_ds = Dataset(valid_examples)

In [18]:
len(train_ds), len(valid_ds)

(4131148, 46189)

In [19]:
BS = 2048
NUM_WORKERS = 8

train_dl = DataLoader(train_ds, BS, NUM_WORKERS, shuffle=True)
valid_dl = DataLoader(valid_ds, BS, NUM_WORKERS)

dls = DataLoaders(train_dl, valid_dl)

In [20]:
class Model(Module):
    def __init__(self, hidden_size=50):
        self.embeddings =nn.Embedding(len(vocab), hidden_size)
        self.linear = nn.Linear(hidden_size, len(vocab))
            
    def forward(self, x):
        x = self.embeddings(x)
        return self.linear(x)

In [21]:
learn = Learner(
    dls.cuda(),
    Model().cuda(),
    loss_func=CrossEntropyLossFlat(),
    opt_func=Adam,
    metrics=[accuracy]
)

In [22]:
learn.fit(120, lr=1e-3, cbs=SaveModelCallback(fname='text_embeddings_smaller_dataset', every_epoch=True))

epoch,train_loss,valid_loss,accuracy,time
0,,00:07,,


AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 185, in _worker_loop
    data = fetcher.fetch(index)
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 34, in fetch
    data = next(self.dataset_iter)
  File "/home/radek/workspace/fastai/fastai/data/load.py", line 111, in create_batches
    yield from map(self.do_batch, self.chunkify(res))
  File "/opt/conda/lib/python3.7/site-packages/fastcore/utils.py", line 371, in chunked
    res = list(itertools.islice(it, chunk_sz))
  File "/home/radek/workspace/fastai/fastai/data/load.py", line 124, in do_item
    try: return self.after_item(self.create_item(s))
  File "/home/radek/workspace/fastai/fastai/data/load.py", line 130, in create_item
    def create_item(self, s):  return next(self.it) if s is None else self.dataset[s]
  File "<ipython-input-15-bb90a9afd58f>", line 9, in __getitem__
    return vocab.index(self.df.source_word[idx]), vocab.index(self.df.target_word[idx])
AttributeError: 'numpy.ndarray' object has no attribute 'index'


In [None]:
%%capture

val_losses, accuracies, task_perf = [], [], []
for i in range(120):
    learn.load(f'text_embeddings_{i}')
    loss, accuracy = learn.validate()
    val_losses.append(loss)
    accuracies.append(accuracy)
    
    embeddings = learn.model.embeddings.weight.cpu().detach().numpy()
    our_embeddings = Embedding(
        Vocabulary([w.lower() for w in vocab]),
        embeddings
    )

    task_perf.append([evaluate_similarity(our_embeddings, data.X, data.y) for name, data in iteritems(tasks)])

In [None]:
men, ws353, simlex999 = list(zip(*task_perf))

In [None]:
import matplotlib.pyplot as plt

In [None]:
fig, ax1 = plt.subplots()

ax1.plot(val_losses, label='val loss')

ax2 = ax1.twinx()
ax2.plot(men, label='MEN', c='g')
ax2.plot(ws353, label='WS353', c='m')
ax2.plot(simlex999, label='SIMLEX999', c='y')

ax1.legend(loc=[0.07, 0.9])
ax2.legend(loc=[0.7, 0.15])

ax1.set_xlabel('epochs');

## Evalute embeddings

In [None]:
from utils import Embeddings

In [None]:
embeddings = learn.model.embeddings.weight.cpu().detach().numpy()

In [None]:
word2embedding_without_nans= {}
nans_encountered = 0
for i in range(len(vocab)):    
    if vocab[i] == vocab[i]:
        word2embedding_without_nans[vocab[i]] = embeddings[i]
    else: nans_encountered += 1

print(f'Encountered rows with nan values: {nans_encountered}')

In [None]:
e = Embeddings(
    np.array(list(word2embedding_without_nans.values())),
    [w.lower() for w in list(word2embedding_without_nans.keys())]
)

In [None]:
for w in ['fast', 'lost', 'small', 'true', 'crazy', 'slow']:
    print(f'{w}: {e.nn_words_to(e[w])}')

## Evaluating embeddings using [word-embeddings-benchmarks](https://github.com/kudkudak/word-embeddings-benchmarks)

In [None]:
import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.evaluate import evaluate_similarity
from web.embedding import Embedding, Vocabulary
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [None]:
tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999()
}

In [None]:
our_embeddings = Embedding(
    Vocabulary([w.lower() for w in list(word2embedding_without_nans.keys())]),
    np.array(list(word2embedding_without_nans.values()))
)

speech2vec = KeyedVectors.load_word2vec_format('../speech2vec-pretrained-vectors/speech2vec/50.vec', binary=False) 
speech2vec_embeddings = Embedding(Vocabulary(list(speech2vec.vocab.keys())), speech2vec.vectors)

In [None]:
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(our_embeddings, data.X, data.y)))

In [None]:
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(speech2vec_embeddings, data.X, data.y)))