In [1]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
def load_glove_model(glove_file_path):
    glove_model = KeyedVectors.load_word2vec_format(glove_file_path, binary=False, no_header=True)
    return glove_model

def get_corpus():
    with open("../data/processed/glove_corpus", "r") as f:
        sentences = f.readlines()
    return sentences

In [3]:
sentences = get_corpus()
tokens = [sent.split() for sent in sentences]

In [4]:
base_model = Word2Vec(vector_size=300, min_count = 20, epochs=20)
base_model.build_vocab(tokens)
total_examples = base_model.corpus_count

In [5]:
base_model.wv.most_similar('what', topn=10)

[('onto', 0.18589073419570923),
 ('buildings', 0.18523985147476196),
 ('bungee', 0.17935125529766083),
 ('accidents', 0.17021802067756653),
 ('fair', 0.16874520480632782),
 ('figures.', 0.16867220401763916),
 ('families', 0.16840752959251404),
 ('size.', 0.1681390255689621),
 ('individual', 0.16532500088214874),
 ('department?', 0.16068579256534576)]

In [6]:
corpus_path = '../embeds/GloVe/glove.840B.300d.txt'
corpus_model = load_glove_model(corpus_path)
base_model.build_vocab([list(corpus_model.key_to_index.keys())], update=True)

In [7]:
corpus_model.most_similar('what', topn=10)

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


[('why', 0.8907533884048462),
 ('how', 0.8789804577827454),
 ('know', 0.8698911666870117),
 ('think', 0.8441894054412842),
 ('something', 0.8239272236824036),
 ('thing', 0.8229970335960388),
 ('that', 0.8225524425506592),
 ('exactly', 0.8140221238136292),
 ('thought', 0.8101893663406372),
 ('tell', 0.8013006448745728)]

In [8]:
base_model.train(tokens, total_examples=total_examples, epochs=base_model.epochs)
base_model_wv = base_model.wv

In [9]:
base_model_wv.most_similar('what', topn=10)

[('how', 0.4596197009086609),
 ('anything', 0.4548874795436859),
 ('where', 0.36545345187187195),
 ("what's", 0.36517012119293213),
 ('whatever', 0.3540089428424835),
 ('well...', 0.3507554829120636),
 ('what?', 0.33418241143226624),
 ('something', 0.33149653673171997),
 ('why', 0.3289160132408142),
 ('nothing', 0.2993703782558441)]

In [10]:
base_model_wv.most_similar('does', topn=10)

[("doesn't", 0.5743285417556763),
 ('did', 0.40398046374320984),
 ('do', 0.38602036237716675),
 ("doesn't.", 0.3456938564777374),
 ('does,', 0.34460973739624023),
 ('is', 0.3222086429595947),
 ('needs', 0.3137502670288086),
 ('prefers', 0.3080991506576538),
 ('makes', 0.3018656075000763),
 ('likes', 0.2977132797241211)]

In [11]:
base_model_wv.save_word2vec_format('../models/GloVe-Word2Vec/glove.bin')