In [1]:
from fastai.vision.all import *
from fastai.learner import *
from fastai.data.all import *
from fastai.callback.tracker import SaveModelCallback
import pandas as pd
import matplotlib.pyplot as plt
from pathlib2 import Path
import numpy as np
import random
from torch.nn import MSELoss

In [2]:
%%time

df = pd.read_csv('data/examples.csv')
df.shape

CPU times: user 15.9 s, sys: 2.08 s, total: 18 s
Wall time: 18 s


(14276908, 8)

In [3]:
df.head()

Unnamed: 0,source_word,target_word,source_fn,target_fn,set_name,speaker_id,book_id,distance_from_target
0,I,FELT,d0af6ad1469446abb6b3b8c98570f139,00ac9fc716964726bc0c17b850c712c0,train-clean-360,7000,83696,1
1,I,THAT,d0af6ad1469446abb6b3b8c98570f139,70f6c024d94549b1a2c19fb42298a8ec,train-clean-360,7000,83696,2
2,FELT,I,00ac9fc716964726bc0c17b850c712c0,d0af6ad1469446abb6b3b8c98570f139,train-clean-360,7000,83696,1
3,FELT,THAT,00ac9fc716964726bc0c17b850c712c0,70f6c024d94549b1a2c19fb42298a8ec,train-clean-360,7000,83696,1
4,FELT,IT,00ac9fc716964726bc0c17b850c712c0,acab1acfa29046c883b4193a4d50a82e,train-clean-360,7000,83696,2


In [4]:
df = df[df.distance_from_target == 1]
df = df[(~df.target_word.isna() & ~df.source_word.isna())]
df.reset_index(inplace=True, drop=True)

In [5]:
words = np.concatenate((df.target_word, df.source_word))
vocab = np.unique(words)

In [6]:
len(vocab)

64763

In [7]:
vocab = list(df.target_word.unique())

In [8]:
# %%time

# uniq_fns = np.unique(df.target_fn.values)
# fn2feature = {}
# for fn in uniq_fns:
#     ary = pd.read_pickle(f'data/examples/{fn}.pkl')
#     fn2feature[fn] = ary

# pd.to_pickle(fn2feature, 'data/fn2feature.pkl')

In [9]:
%%time
fn2features = pd.read_pickle('data/fn2feature.pkl')

CPU times: user 16.6 s, sys: 12.1 s, total: 28.7 s
Wall time: 28.7 s


In [10]:
dataset_mean = -5
dataset_std = 15

def normalize_data(ary):
    return (ary - dataset_mean) / dataset_std

In [11]:
def empty_list(): return list()

In [12]:
# %%time

# word2row_idxs = defaultdict(empty_list)

# for idx, row in df.iterrows():
#     word2row_idxs[row.source_word].append(idx)
    
# pd.to_pickle(word2row_idxs, 'data/word2row_idxs.pkl')

In [13]:
word2row_idxs = pd.read_pickle('data/word2row_idxs.pkl')

In [14]:
def prepare_features(fn, pad_to=291, pad_left=False):
    ary = fn2features[fn][:pad_to]
    example = np.zeros((pad_to, 13))
    if pad_left:
        example[-ary.shape[0]:, :] = ary
    else: example[:ary.shape[0], :] = ary
    return example.astype(np.float32)

In [15]:
class Dataset():
    def __init__(self, df):
        self.df = df
    def __len__(self):
        return self.df.shape[0]
    def __getitem__(self, idx):
        source_word = df.source_word[idx]
        target_word = df.target_word[idx]
        return vocab.index(df.source_word[idx]), vocab.index(df.target_word[idx])

In [16]:
train_examples = df[df.set_name.isin(['train-clean-360', 'train-clean-100', 'dev-clean'])]
valid_examples = df[df.set_name == 'test-clean']

In [17]:
train_ds = Dataset(train_examples)
valid_ds = Dataset(valid_examples)

In [18]:
len(train_ds), len(valid_ds)

(7754734, 87118)

In [19]:
BS = 2048
LR = 1e-3
NUM_WORKERS = 8

train_dl = DataLoader(train_ds, BS, NUM_WORKERS, shuffle=True)
valid_dl = DataLoader(valid_ds, BS, NUM_WORKERS)

dls = DataLoaders(train_dl, valid_dl)

In [20]:
class Model(Module):
    def __init__(self, hidden_size=50):
        self.embeddings =nn.Embedding(len(vocab), hidden_size)
        self.linear = nn.Linear(hidden_size, len(vocab))
            
    def forward(self, x):
        x = self.embeddings(x)
        return self.linear(x)

In [34]:
learn = Learner(dls.cuda(), Model().cuda(), loss_func=CrossEntropyLossFlat(), lr=1e-3, opt_func=Adam, metrics=[accuracy])

In [47]:
learn.fit(80, lr=1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,6.006123,5.900098,0.122386,06:28
1,5.978576,5.891648,0.121995,06:29
2,5.978074,5.88459,0.123235,06:29
3,5.965701,5.877883,0.122271,06:28
4,5.964121,5.871982,0.123511,06:28
5,5.964297,5.866182,0.123476,06:29
6,5.964773,5.860165,0.124383,06:29
7,5.961366,5.855416,0.124211,06:30
8,5.9568,5.850979,0.124188,06:30
9,5.940906,5.846665,0.124567,06:30


In [48]:
learn.save('text_embeddings')

Path('models/text_embeddings.pth')

In [58]:
learn.fit(40, lr=1e-4)

epoch,train_loss,valid_loss,accuracy,time
0,5.815872,5.749569,0.127276,06:23
1,5.812735,5.748744,0.127253,06:26
2,5.823493,5.748303,0.127184,06:26
3,5.822514,5.74802,0.127505,06:27
4,5.816574,5.747809,0.127333,06:34
5,5.822571,5.747888,0.127253,06:25
6,5.815116,5.747584,0.127586,06:25
7,5.811428,5.747555,0.127528,06:27
8,5.807635,5.747655,0.127528,06:27
9,5.821029,5.747511,0.127517,06:26


## Evalute embeddings

In [59]:
from utils import Embeddings

In [60]:
embeddings = learn.model.embeddings.weight.cpu().detach().numpy()

In [61]:
word2embedding_without_nans= {}
nans_encountered = 0
for i in range(len(vocab)):    
    if vocab[i] == vocab[i]:
        word2embedding_without_nans[vocab[i]] = embeddings[i]
    else: nans_encountered += 1

print(f'Encountered rows with nan values: {nans_encountered}')

Encountered rows with nan values: 0


In [62]:
e = Embeddings(
    np.array(list(word2embedding_without_nans.values())),
    [w.lower() for w in list(word2embedding_without_nans.keys())]
)

In [63]:
for w in ['fast', 'lost', 'small', 'true', 'crazy', 'slow']:
    print(f'{w}: {e.nn_words_to(e[w])}')

fast: ['fast', 'softly', 'heavily', 'rapidly', 'promiscuously']
lost: ['lost', 'stinted', 'taken', 'spoiled', 'recovered']
small: ['small', 'large', 'big', 'little', 'largest']
true: ['true', 'gratifying', 'divinest', 'noble', 'faultless']
crazy: ['crazy', 'worthless', 'patchwork', 'clever', 'lucky']
slow: ['slow', 'quick', 'swift', 'hasty', 'brisk']


## Evaluating embeddings using [word-embeddings-benchmarks](https://github.com/kudkudak/word-embeddings-benchmarks)

In [64]:
import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.evaluate import evaluate_similarity
from web.embedding import Embedding, Vocabulary
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [65]:
tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999()
}

In [66]:
our_embeddings = Embedding(
    Vocabulary([w.lower() for w in list(word2embedding_without_nans.keys())]),
    np.array(list(word2embedding_without_nans.values()))
)

speech2vec = KeyedVectors.load_word2vec_format('../speech2vec-pretrained-vectors/speech2vec/50.vec', binary=False) 
speech2vec_embeddings = Embedding(Vocabulary(list(speech2vec.vocab.keys())), speech2vec.vectors)

In [67]:
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(our_embeddings, data.X, data.y)))

Missing 254 words. Will replace them with mean vector
Missing 50 words. Will replace them with mean vector
Missing 11 words. Will replace them with mean vector


Spearman correlation of scores on MEN 0.34706545804388017
Spearman correlation of scores on WS353 0.1660529090897729
Spearman correlation of scores on SIMLEX999 0.19159358728669124


In [57]:
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(speech2vec_embeddings, data.X, data.y)))

Missing 392 words. Will replace them with mean vector
Missing 61 words. Will replace them with mean vector
Missing 24 words. Will replace them with mean vector


Spearman correlation of scores on MEN 0.5896756323911225
Spearman correlation of scores on WS353 0.49890235673392536
Spearman correlation of scores on SIMLEX999 0.28202624769092116
