In [1]:
import os
import train_embeddings_helpers

In [2]:
df = train_embeddings_helpers.main()

In [3]:
df.shape

(7375, 3)

In [5]:
training_df = df[(df['Folder'] == 'train-clean-360') | (df['Folder'] == 'train-clean-100')]

In [6]:
training_df.shape

(2682, 3)

In [7]:
training_df.Folder.value_counts()

train-clean-360    2097
train-clean-100     585
Name: Folder, dtype: int64

In [8]:
# Text preprocessing: 
# lowercase text
training_df.Text = training_df.Text.str.lower()

In [9]:
# Preprocess with SpaCy - Init preprocessing class
preproc = train_embeddings_helpers.TextPreprocess()

In [10]:
%%time
# lemmatize text - we don't need it as FastText includes morphological structure
training_df.Text2 = training_df.Text.apply(preproc.lemmatize)

CPU times: user 7min 18s, sys: 42.8 s, total: 8min 1s
Wall time: 8min 3s


In [11]:
%%time
# remove stop words
training_df.Text2 = training_df.Text.apply(preproc.remove_stopwords)

CPU times: user 7min 5s, sys: 41.8 s, total: 7min 47s
Wall time: 7min 49s


In [12]:
# replace multiple spaces with one
training_df.Text2 = training_df.Text2.str.replace('[ ]{2,}', ' ')

In [13]:
from gensim.models.fasttext import FastText 

In [14]:
# vocab is a list of text chunks, not a list of sentences - does it matter?
vocab = training_df.Text2.to_list()
vocab = [chunk.split() for chunk in vocab]
len(vocab)

2682

In [15]:
%%time
# Init model with following parameters
# Rest of parameters are default, see: https://radimrehurek.com/gensim_3.8.3/auto_examples/tutorials/run_fasttext.html#sphx-glr-auto-examples-tutorials-run-fasttext-py
model = FastText(vocab,
    sg=1, #skipgram, not cbow
    size=50, # from the speech2vec authors
    window=2) # from the speech2vec authors

CPU times: user 1min 31s, sys: 1.53 s, total: 1min 33s
Wall time: 38.7 s


In [61]:
'freedom' in model.wv.vocab

True

In [60]:
model.wv.most_similar('freedom')

[('prosperity', 0.8996621370315552),
 ('welfare', 0.8982266187667847),
 ('censorship', 0.8971235752105713),
 ('discipleship', 0.8961487412452698),
 ('championship', 0.8959816694259644),
 ('victuals', 0.8880187273025513),
 ('immunity', 0.8850102424621582),
 ('independence', 0.8830586671829224),
 ('freedmen', 0.8828678131103516),
 ('depends', 0.8816428780555725)]

In [20]:
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("librispeech.model")
model.save(fname)

In [21]:
# load speech2vec paper pretrained vectors
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
speech2vec = KeyedVectors.load_word2vec_format(datapath('path/to/vectors'), binary=False) 

In [22]:
# Comparing embeddings

In [25]:
import logging
# Assumes the word-embeddings-benchmark repo is available
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.evaluate import evaluate_similarity
from web.embedding import Embedding, Vocabulary
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [26]:
tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999()
}


Dataset created in /Users/susan_jose/web_data/similarity



In [40]:
model_embeddings = model.wv

In [43]:
our_embeddings = Embedding(Vocabulary(list(model_embeddings.vocab.keys())), model_embeddings.vectors)

In [42]:
speech2vec_embeddings = Embedding(Vocabulary(list(speech2vec.vocab.keys())), speech2vec.vectors)

In [64]:
our_embeddings['friend']

array([-0.18957646, -0.16634491,  0.17309435, -0.10543959,  0.40227103,
        0.46282825,  0.36515525,  0.40559214, -0.04941564, -0.14805059,
       -0.43525735,  0.23659192,  0.24746099, -0.40600204,  0.47878677,
        0.03553356,  0.38474318,  0.5425196 ,  0.2359668 , -0.31592923,
        0.00404495,  0.8307184 ,  0.59101576,  0.30376983,  0.4759713 ,
        0.37360162, -0.21704458,  0.02078121, -0.7336424 , -0.25237545,
        0.15958194, -0.14043784, -0.47706226,  0.06924255, -0.10315103,
       -0.33340412,  0.22820638, -0.12231199, -0.28851575,  0.09375487,
       -1.1073669 ,  0.38313985, -0.07264016, -0.11328681,  0.45372888,
        0.273097  ,  0.22372684,  0.3424083 ,  0.5861805 , -0.0139005 ],
      dtype=float32)

In [65]:
speech2vec_embeddings['friend']

array([ 9.3413e-02, -2.8946e-02,  4.7311e-02, -6.3347e-01,  8.1125e-02,
       -5.3966e-01,  2.4762e-01,  2.5534e-01,  7.6590e-01,  6.0520e-01,
       -2.6295e-01, -6.2333e-01,  1.0154e-03,  4.5374e-02, -2.9838e-01,
        1.5166e-01, -4.0217e-02, -2.2317e-01,  3.7647e-01, -9.2910e-02,
       -3.9502e-01, -8.6376e-06,  3.9423e-02,  1.4150e-01, -2.2859e-01,
        2.7005e-02,  8.4433e-02, -2.0136e-01, -1.5508e-01, -9.8139e-02,
        9.0681e-02,  7.5589e-01,  1.8953e-02,  2.4214e-01, -4.0025e-01,
       -9.9142e-02,  1.6269e-01, -8.4639e-02,  1.2320e-01, -3.2370e-01,
        4.4290e-01,  3.1282e-01,  2.5814e-02,  8.5238e-02,  5.5223e-01,
        1.7657e-01,  2.3437e-01, -1.0683e-01, -3.2922e-01,  1.4219e-01],
      dtype=float32)

In [None]:
s2v_emb_no_nan = {}
nans_encountered = 0
for i in range(len(vocab)):    
    if vocab[i] == vocab[i]:
        word2embedding_without_nans[vocab[i]] = embeddings[i]
    else: nans_encountered += 1

print(f'Encountered rows with nan values: {nans_encountered}')

In [44]:
for name, data in tasks.items():
    print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(our_embeddings, data.X, data.y)))

Missing 701 words. Will replace them with mean vector
Missing 92 words. Will replace them with mean vector
Missing 113 words. Will replace them with mean vector
Spearman correlation of scores on MEN -0.05461907793994027
Spearman correlation of scores on WS353 0.08576695593634177
Spearman correlation of scores on SIMLEX999 0.05970123870565761


In [45]:
for name, data in tasks.items():
    print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(speech2vec_embeddings, data.X, data.y)))

Missing 392 words. Will replace them with mean vector
Missing 61 words. Will replace them with mean vector
Missing 24 words. Will replace them with mean vector
Spearman correlation of scores on MEN 0.6056592803599269
Spearman correlation of scores on WS353 0.43349390636024643
Spearman correlation of scores on SIMLEX999 0.25938770901422736
