In [28]:
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import text_to_word_sequence

def load_data(percentage_of_sentences=None):
    """Load the IMDB reviews dataset and turn every review into a word list.

    Args:
        percentage_of_sentences: Optional number in the range (0, 100]. When
            given, only that leading share of the train split and of the
            test split is kept. ``None`` keeps everything.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. ``X_train`` and
        ``X_test`` are lists with one entry per review, each entry being the
        output of ``text_to_word_sequence`` on the UTF-8 decoded review.
        ``y_train`` and ``y_test`` are the second elements returned by
        ``tfds.as_numpy`` for each split, sliced the same way as the reviews.

    Raises:
        AssertionError: If ``percentage_of_sentences`` is outside (0, 100].
    """
    train_data, test_data = tfds.load(
        name="imdb_reviews",
        split=["train", "test"],
        batch_size=-1,
        as_supervised=True,
    )

    raw_train, y_train = tfds.as_numpy(train_data)
    raw_test, y_test = tfds.as_numpy(test_data)

    # Optionally truncate both splits to the requested leading share.
    if percentage_of_sentences is not None:
        assert 0 < percentage_of_sentences <= 100
        fraction = percentage_of_sentences / 100

        n_train = int(fraction * len(raw_train))
        raw_train, y_train = raw_train[:n_train], y_train[:n_train]

        n_test = int(fraction * len(raw_test))
        raw_test, y_test = raw_test[:n_test], y_test[:n_test]

    def _to_word_lists(raw_reviews):
        # Reviews are bytes objects here, hence the explicit decode.
        return [
            text_to_word_sequence(review.decode("utf-8"))
            for review in raw_reviews
        ]

    return _to_word_lists(raw_train), y_train, _to_word_lists(raw_test), y_test

# Work with only the first 10% of the train and test splits.
X_train, y_train, X_test, y_test = load_data(percentage_of_sentences=10)

In [29]:
from gensim.models import Word2Vec

# Train a Word2Vec model on the tokenized training reviews, using the
# library defaults for every hyperparameter.
word2vec = Word2Vec(sentences=X_train)
# Shortcut to the trained word vectors, used for the lookups in later cells.
wv = word2vec.wv

In [30]:
# Display the vector learned for the word 'dog'.
wv['dog']

array([-0.23497668,  0.21410768, -0.09329746,  0.28843257, -0.04212553,
       -0.34174207,  0.1387363 ,  0.6288712 , -0.20370507, -0.18113895,
       -0.09912767, -0.34090236, -0.00578235,  0.12549575, -0.03556455,
       -0.19897054,  0.19211519, -0.23955515,  0.01756367, -0.35183042,
        0.10034265,  0.07496872,  0.21786611, -0.09130505,  0.05712227,
        0.01246604, -0.22605494, -0.15108247, -0.14810516,  0.02504244,
        0.18676655, -0.02100156,  0.17425832, -0.31824365, -0.09372737,
        0.26894048,  0.11289704, -0.1902635 , -0.2051254 , -0.35592568,
       -0.20140712, -0.23146987, -0.23238839,  0.19963369,  0.3337569 ,
        0.00120562, -0.1379078 , -0.08244433,  0.25404862,  0.14747573,
        0.09173041, -0.11931178, -0.24258114,  0.01009944, -0.19944556,
        0.14878528,  0.11641503, -0.07407069, -0.21641934,  0.07837123,
        0.09931982,  0.03796556, -0.02898055,  0.09036348, -0.22365566,
        0.17359386, -0.08948612,  0.14312512, -0.312696  ,  0.30

In [31]:
# The length of a single word vector is the dimensionality of the embedding.
size = len(wv['cat'])

print(f'The embedding space is of size {size}')

The embedding space is of size 100


In [32]:
# List the words whose vectors are most similar to the one for 'car'.
wv.most_similar("car")

[('hotel', 0.9776648283004761),
 ('wedding', 0.9737498760223389),
 ('letters', 0.9731951951980591),
 ('door', 0.9720884561538696),
 ('revenge', 0.9708735942840576),
 ('mate', 0.9703019261360168),
 ('breasts', 0.9700096845626831),
 ('apartment', 0.9693183898925781),
 ('attend', 0.9692969918251038),
 ('storm', 0.9689933657646179)]

In [33]:
# Same neighbour query, but starting from the raw vector instead of the word.
# The query word is not excluded here, so 'car' itself comes back first.
word_embedding = wv['car']
wv.similar_by_vector(word_embedding)

[('car', 1.0),
 ('hotel', 0.9776648283004761),
 ('wedding', 0.9737498760223389),
 ('letters', 0.9731951951980591),
 ('door', 0.9720884561538696),
 ('revenge', 0.9708735942840576),
 ('mate', 0.9703019261360168),
 ('breasts', 0.9700096845626831),
 ('apartment', 0.9693183898925781),
 ('attend', 0.9692969918251038)]

In [34]:
# Element-wise difference between the 'good' and 'bad' vectors.
wv['good'] - wv['bad']

array([-0.38255894, -0.10721177,  0.2304197 ,  0.02755582,  0.13324553,
       -0.39485073, -0.1168834 ,  0.44016442, -0.05154616, -0.57956505,
       -0.09273262, -0.07966226,  0.5475657 ,  0.07492611, -0.06786942,
        0.21997654, -0.37384874,  0.23065782, -0.2574172 ,  0.15629458,
       -0.14411569, -0.05025716,  0.09265947,  0.1087577 ,  0.19852541,
       -0.05322106, -0.20482506,  0.5323632 , -0.08989573, -0.531576  ,
       -0.3143677 ,  0.28394455, -0.00750485,  0.05733454, -0.12472564,
       -0.25121814,  0.03185821, -0.2427192 , -0.32929644,  0.92004144,
       -0.3644895 , -0.03496844, -0.3654007 , -0.08925325,  0.36635056,
       -0.3328017 ,  0.41363016, -0.38901007,  0.0645439 ,  0.42165482,
        0.08537297, -0.01028484, -0.35063824, -0.15137872,  0.23469532,
        0.30082825,  0.08236474,  0.03962462,  0.25411546,  0.8313312 ,
        0.14700057,  0.29615444, -0.22249144, -0.41692823,  0.37150043,
       -0.05624306, -0.3431719 , -0.3161069 ,  0.65464365,  0.16

In [35]:
# Vector arithmetic: shift 'stupid' along the good-minus-bad direction.
res = wv['good'] - wv['bad'] + wv['stupid']

In [36]:
# Words whose vectors are closest to the arithmetic result held in `res`.
wv.similar_by_vector(res)

[('nice', 0.8048093914985657),
 ('good', 0.7815244197845459),
 ('great', 0.771703839302063),
 ('decent', 0.7659851908683777),
 ('such', 0.7566556334495544),
 ('always', 0.7431327104568481),
 ('given', 0.7388507127761841),
 ('potential', 0.7275189161300659),
 ('also', 0.7275111675262451),
 ('tight', 0.7247037887573242)]

In [37]:
# Analogy-style arithmetic: queen - king + actor, then look up its neighbours.
# Note that this overwrites the `res` computed in an earlier cell.
res = wv['queen'] - wv['king'] + wv['actor']
wv.similar_by_vector(res)

[('actor', 0.9623402953147888),
 ('performance', 0.8658728003501892),
 ('actress', 0.8373754024505615),
 ('role', 0.8170179724693298),
 ('job', 0.8021565079689026),
 ('guy', 0.767738401889801),
 ('character', 0.7672237753868103),
 ('gunslinger', 0.7588706612586975),
 ('man', 0.7535820007324219),
 ('touchy', 0.7453116774559021)]

In [38]:
# Retrain on the same sentences, this time asking for 50-dimensional vectors.
word2vec_2 = Word2Vec(sentences=X_train, vector_size=50)
wv2 = word2vec_2.wv
# Check the dimensionality of the new vectors.
len(wv2['movie'])

50

In [39]:
# Number of words that received a vector in the second model.
print('Vocabulary size', len(wv2.key_to_index))

# Distinct tokens across all training reviews. A set comprehension avoids
# building a throwaway list first and gives the loop variables real names.
diff_words = {word for sentence in X_train for word in sentence}
print('Number of different words in the train set', len(diff_words))

Vocabulary size 8006
Number of different words in the train set 30419


In [40]:
# Train two more 50-dimensional models that differ only in `min_count`,
# to see how that threshold changes the number of words kept.
word2vec_3 = Word2Vec(sentences=X_train, vector_size=50, min_count=40)
word2vec_4 = Word2Vec(sentences=X_train, vector_size=50, min_count=2)

# Compare the vocabulary sizes of the four models trained so far.
print(f'Number of word in W2V #1 : {len(wv.key_to_index)}')
print(f'Number of word in W2V #2 : {len(wv2.key_to_index)}')
print(f'Number of word in W2V #3 : {len(word2vec_3.wv.key_to_index)}')
print(f'Number of word in W2V #4 : {len(word2vec_4.wv.key_to_index)}')

Number of word in W2V #1 : 8006
Number of word in W2V #2 : 8006
Number of word in W2V #3 : 1385
Number of word in W2V #4 : 16729


In [41]:
# Same settings as model #3, with the `window` parameter set to 10.
word2vec_5 = Word2Vec(sentences=X_train, vector_size=50, min_count=40, window=10)

In [42]:
import numpy as np

# Sentence whose eight words are all expected to be in the model vocabulary.
example = ['this', 'movie', 'is', 'the', 'worst', 'action', 'movie', 'ever']
# Sentence whose last token is a made-up word, expected to be out of vocabulary.
example_missing_words = ['this', 'movie', 'is', 'laaaaaaaaaame']

def embed_sentence(word2vec, sentence):
    """Convert a tokenized sentence into a matrix of word vectors.

    Words that are not in the model vocabulary are skipped, so the result
    can have fewer rows than ``sentence`` has words.

    Args:
        word2vec: Trained model exposing its word vectors as ``word2vec.wv``.
        sentence: Iterable of word strings.

    Returns:
        A ``numpy.ndarray`` of shape ``(n_known_words, vector_size)``, with
        rows in the order the known words appear in ``sentence``. If no word
        is known (or the sentence is empty) the shape is ``(0, vector_size)``.
    """
    vectors = word2vec.wv
    known_vectors = [vectors[word] for word in sentence if word in vectors]

    if not known_vectors:
        # np.array([]) would collapse to shape (0,) in float64; keep the
        # embedding axis and the model dtype so every result is 2-D.
        return np.empty((0, vectors.vector_size), dtype=vectors.vectors.dtype)

    return np.array(known_vectors)

# Every word of `example` is expected to be known: one 100-d row per word.
embedded_sentence = embed_sentence(word2vec, example)
assert isinstance(embedded_sentence, np.ndarray)
assert embedded_sentence.shape == (8, 100)

# The made-up last word should be dropped, leaving three rows.
embedded_sentence_missing_words = embed_sentence(word2vec, example_missing_words)
assert isinstance(embedded_sentence_missing_words, np.ndarray)
assert embedded_sentence_missing_words.shape == (3, 100)