In [1]:
from fastai.vision.all import *
from fastai.learner import *
from fastai.data.all import *
from fastai.callback.tracker import SaveModelCallback
import pandas as pd
import matplotlib.pyplot as plt
from pathlib2 import Path
import numpy as np
import random
from torch.nn import MSELoss

In [2]:
%%time

df = pd.read_csv('data/examples.csv')
df.shape

CPU times: user 27.7 s, sys: 3.55 s, total: 31.3 s
Wall time: 33.1 s


(17937758, 9)

In [3]:
vocab = list(df.target_word.unique())

In [4]:
%%time
fn2features = pd.read_pickle('data/fn2feature.pkl')

CPU times: user 19.3 s, sys: 16.8 s, total: 36.1 s
Wall time: 2min 9s


In [5]:
dataset_mean = -5
dataset_std = 15

def normalize_data(ary):
    return (ary - dataset_mean) / dataset_std

In [6]:
def empty_list(): return list()

In [7]:
word2row_idxs = pd.read_pickle('data/word2row_idxs.pkl')

In [8]:
def prepare_features(fn, pad_to=291, pad_left=False):
    ary = fn2features[fn][:pad_to]
    example = np.zeros((pad_to, 13))
    if pad_left:
        example[-ary.shape[0]:, :] = ary
    else: example[:ary.shape[0], :] = ary
    return example.astype(np.float32)

In [9]:
class Dataset():
    def __init__(self, df):
        self.df = df
    def __len__(self):
        return self.df.shape[0]
    def __getitem__(self, idx):
        source_word = df.source_word[idx]
        target_word = df.target_word[idx]
        return vocab.index(df.source_word[idx]), vocab.index(df.target_word[idx])

In [10]:
train_examples = df[df.set_name.isin(['train-clean-360', 'train-clean-100', 'dev-clean'])]
valid_examples = df[df.set_name == 'test-clean']

In [11]:
train_ds = Dataset(train_examples)
valid_ds = Dataset(valid_examples)

In [12]:
len(train_ds), len(valid_ds)

(17743170, 194588)

In [13]:
BS = 2048
LR = 1e-3
NUM_WORKERS = 8

train_dl = DataLoader(train_ds, BS, NUM_WORKERS, shuffle=True)
valid_dl = DataLoader(valid_ds, BS, NUM_WORKERS)

dls = DataLoaders(train_dl, valid_dl)

In [14]:
class Model(Module):
    def __init__(self, hidden_size=50):
        self.embeddings =nn.Embedding(len(vocab), hidden_size)
        self.linear = nn.Linear(hidden_size, len(vocab))
            
    def forward(self, x):
        x = self.embeddings(x)
        return self.linear(x)

In [15]:
learn = Learner(dls.cuda(), Model().cuda(), loss_func=CrossEntropyLossFlat(), lr=1e-3, opt_func=Adam, metrics=[accuracy])

In [45]:
learn.fit(4, lr=1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,6.845256,6.770251,0.068576,15:40
1,6.749409,6.693825,0.071803,16:09
2,6.720186,6.658025,0.073139,15:56
3,6.692617,6.634218,0.074244,15:34


In [46]:
learn.fit(4, lr=1e-4)

epoch,train_loss,valid_loss,accuracy,time
0,6.654045,6.626616,0.074773,15:10
1,6.666471,6.623824,0.07482,15:32
2,6.643942,6.621236,0.074943,15:14
3,6.638732,6.619186,0.075118,15:01


In [47]:
learn.save('text_embeddings')

Path('models/text_embeddings.pth')

## Evalute embeddings

In [48]:
from utils import Embeddings

In [49]:
embeddings = learn.model.embeddings.weight.cpu().detach().numpy()

In [50]:
word2embedding_without_nans= {}
nans_encountered = 0
for i in range(len(vocab)):    
    if vocab[i] == vocab[i]:
        word2embedding_without_nans[vocab[i]] = embeddings[i]
    else: nans_encountered += 1

print(f'Encountered rows with nan values: {nans_encountered}')

Encountered rows with nan values: 1


In [51]:
e = Embeddings(
    np.array(list(word2embedding_without_nans.values())),
    [w.lower() for w in list(word2embedding_without_nans.keys())]
)

In [52]:
for w in ['fast', 'lost', 'small', 'true', 'crazy', 'slow']:
    print(f'{w}: {e.nn_words_to(e[w])}')

fast: ['fast', 'soon', 'sausage', 'caskeys', 'weishaupts']
lost: ['lost', 'counterpane', 'bithynians', 'created', 'undeceive']
small: ['small', 'large', 'big', 'little', 'vast']
true: ['true', "oglethorpe's", 'evident', 'estimated', 'remarkable']
crazy: ['crazy', "lightwood's", 'undertaken', 'cables', 'quieted']
slow: ['slow', 'clinton', 'rehearsal', "englishman's", 'fissures']


## Evaluating embeddings using [word-embeddings-benchmarks](https://github.com/kudkudak/word-embeddings-benchmarks)

In [53]:
import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.evaluate import evaluate_similarity
from web.embedding import Embedding, Vocabulary
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [54]:
tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999()
}

In [55]:
our_embeddings = Embedding(
    Vocabulary([w.lower() for w in list(word2embedding_without_nans.keys())]),
    np.array(list(word2embedding_without_nans.values()))
)

speech2vec = KeyedVectors.load_word2vec_format('../speech2vec-pretrained-vectors/speech2vec/50.vec', binary=False) 
speech2vec_embeddings = Embedding(Vocabulary(list(speech2vec.vocab.keys())), speech2vec.vectors)

In [56]:
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(our_embeddings, data.X, data.y)))

Missing 242 words. Will replace them with mean vector
Missing 49 words. Will replace them with mean vector
Missing 11 words. Will replace them with mean vector


Spearman correlation of scores on MEN 0.15531257943634613
Spearman correlation of scores on WS353 0.12475258100502422
Spearman correlation of scores on SIMLEX999 -0.00859022433002065


In [57]:
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(speech2vec_embeddings, data.X, data.y)))

Missing 392 words. Will replace them with mean vector
Missing 61 words. Will replace them with mean vector
Missing 24 words. Will replace them with mean vector


Spearman correlation of scores on MEN 0.5896756323911225
Spearman correlation of scores on WS353 0.49890235673392536
Spearman correlation of scores on SIMLEX999 0.28202624769092116
