In [1]:
from gensim.models import word2vec

import os
import logging
import gensim
import numpy as np
import pickle



In [2]:
class Sentence(object):
    """Lazily iterate over the tokens of every text file in a directory.

    Files are streamed line by line, so neither the whole data set nor a
    whole file has to be held in memory at once.

    Parameters
    ----------
    dirname : str
        Directory whose regular files are read as UTF-8 text.

    Yields
    ------
    str
        One whitespace-separated token at a time, file after file
        (files are visited in sorted name order).

    NOTE(review): this yields single tokens, not lists of tokens; gensim's
    Word2Vec expects each item to be a list of tokens — confirm before
    passing an instance directly to the trainer.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        # Sort so the iteration order does not depend on the file system.
        for file_name in sorted(os.listdir(self.dirname)):
            file_path = os.path.join(self.dirname, file_name)
            # Skip sub-directories and other non-regular entries.
            if not os.path.isfile(file_path):
                continue
            # Explicit encoding: the corpus is read as UTF-8 elsewhere in this notebook.
            with open(file_path, 'r', encoding='utf-8') as f:
                # Stream line by line instead of reading the whole file.
                for line in f:
                    for word in line.split():
                        yield word

In [3]:
# Send log records of level INFO and above to the notebook output,
# each prefixed with a timestamp and its severity level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [4]:
# Training corpus: a plain-text file wrapped in gensim's LineSentence reader.
# Alternative reader (currently disabled): word2vec.Text8Corpus('text')
path = "processed_text_data/text"
sentences = word2vec.LineSentence(path)

In [5]:
# Train the embeddings: 300-dimensional vectors, 10 passes over the corpus,
# words occurring fewer than 20 times are dropped, 4 worker threads.
# NOTE(review): `iter` and `size` are the gensim 3.x keyword names; gensim 4
# renamed them to `epochs` and `vector_size` — confirm the installed version.
model = word2vec.Word2Vec(
    sentences,
    size=300,
    min_count=20,
    iter=10,
    workers=4,
)

2018-06-05 10:33:51,334 : INFO : collecting all words and their counts
2018-06-05 10:33:54,431 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-06-05 10:33:59,436 : INFO : collected 302084 word types from a corpus of 10346611 raw words and 1035 sentences
2018-06-05 10:33:59,438 : INFO : Loading a fresh vocabulary
2018-06-05 10:33:59,758 : INFO : min_count=20 retains 39233 unique words (12% of original 302084, drops 262851)
2018-06-05 10:33:59,759 : INFO : min_count=20 leaves 9432604 word corpus (91% of original 10346611, drops 914007)
2018-06-05 10:33:59,967 : INFO : deleting the raw counts dictionary of 302084 items
2018-06-05 10:34:00,163 : INFO : sample=0.001 downsamples 22 most-common words
2018-06-05 10:34:00,165 : INFO : downsampling leaves estimated 8854970 word corpus (93.9% of prior 9432604)
2018-06-05 10:34:00,377 : INFO : estimated required memory for 39233 words and 300 dimensions: 113775700 bytes
2018-06-05 10:34:00,379 : INFO : resetting lay

2018-06-05 10:34:59,361 : INFO : EPOCH 4 - PROGRESS: at 0.10% examples, 6449 words/s, in_qsize 8, out_qsize 0
2018-06-05 10:35:00,375 : INFO : EPOCH 4 - PROGRESS: at 10.14% examples, 385619 words/s, in_qsize 7, out_qsize 0
2018-06-05 10:35:01,377 : INFO : EPOCH 4 - PROGRESS: at 19.23% examples, 510913 words/s, in_qsize 7, out_qsize 0
2018-06-05 10:35:02,378 : INFO : EPOCH 4 - PROGRESS: at 28.21% examples, 573686 words/s, in_qsize 8, out_qsize 0
2018-06-05 10:35:03,387 : INFO : EPOCH 4 - PROGRESS: at 38.16% examples, 628986 words/s, in_qsize 8, out_qsize 0
2018-06-05 10:35:04,392 : INFO : EPOCH 4 - PROGRESS: at 47.34% examples, 658006 words/s, in_qsize 7, out_qsize 0
2018-06-05 10:35:05,398 : INFO : EPOCH 4 - PROGRESS: at 57.29% examples, 683981 words/s, in_qsize 8, out_qsize 0
2018-06-05 10:35:06,404 : INFO : EPOCH 4 - PROGRESS: at 67.44% examples, 706555 words/s, in_qsize 8, out_qsize 0
2018-06-05 10:35:07,404 : INFO : EPOCH 4 - PROGRESS: at 77.49% examples, 728073 words/s, in_qsize 7

2018-06-05 10:35:56,942 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-06-05 10:35:56,952 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-05 10:35:56,964 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-05 10:35:56,966 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-05 10:35:56,966 : INFO : EPOCH - 8 : training on 10346611 raw words (8854762 effective words) took 11.7s, 757124 effective words/s
2018-06-05 10:35:58,322 : INFO : EPOCH 9 - PROGRESS: at 0.10% examples, 6307 words/s, in_qsize 8, out_qsize 1
2018-06-05 10:35:59,327 : INFO : EPOCH 9 - PROGRESS: at 9.76% examples, 367224 words/s, in_qsize 7, out_qsize 0
2018-06-05 10:36:00,339 : INFO : EPOCH 9 - PROGRESS: at 19.90% examples, 523940 words/s, in_qsize 8, out_qsize 0
2018-06-05 10:36:01,344 : INFO : EPOCH 9 - PROGRESS: at 30.05% examples, 605733 words/s, in_qsize 8, out_qsize 1
2018-06-05 10:36:02,347 : INFO : EPOCH 9 - PROGRE

In [6]:
# Print the first ten entries of the vocabulary list, i.e. the ten most
# common words (presumably the list is ordered by descending frequency).
for rank in range(10):
    print(model.wv.index2word[rank])

ve
bir
i
da
bu
de
için
ile
çok
türkiye


In [7]:
# Print the ten least common retained words by walking backwards
# from the end of the vocabulary list.
vocab_size = len(model.wv.vocab)
for offset in range(1, 11):
    print(model.wv.index2word[vocab_size - offset])

yidoğan
şikesi
buducnost
gallinari
eurochallenge
ea7
rajon
lowdon
amachree
sakatlığından


In [12]:
# Similarity score between two related words ('portakal' = orange, 'elma' = apple).
print(model.wv.similarity('portakal', 'elma'))

0.5929566350028003


In [13]:
# Which word does not fit? Two colours ('kırmızı' = red, 'mavi' = blue)
# and one fruit ('portakal' = orange).
print(model.wv.doesnt_match("kırmızı mavi portakal".split()))

2018-06-05 10:37:31,454 : INFO : precomputing L2-norms of word weight vectors


portakal


In [26]:
# Analogy-style query: words closest to "istanbul" + "ankara" - "erkek"
# ("erkek" = man/male); prints (word, similarity) pairs.
print(model.wv.most_similar(positive=["istanbul", "ankara"], negative=["erkek"]))

[('center', 0.5331943035125732), ('halkalı', 0.4877026081085205), ('cnr', 0.477074533700943), ('sofya', 0.4761894643306732), ('garden', 0.47203749418258667), ('barselona', 0.4646550714969635), ('plaza', 0.4631446599960327), ('sheraton', 0.46216270327568054), ('contemporary', 0.4611188769340515), ('mudanya', 0.46009308099746704)]


In [27]:
# Vocabulary size: number of distinct words retained after the min_count
# filter (not the total number of tokens in the corpus).
len(model.wv.vocab)

39233

In [28]:
# create a dict for mapping words to index
def word_to_index(string_data, wv):
    """Map every known word in ``string_data`` to its vocabulary index.

    Parameters
    ----------
    string_data : iterable of str
        Tokens to look up; duplicates are allowed.
    wv : object
        Word-vector container supporting ``word in wv`` and exposing
        ``wv.vocab[word].index``.

    Returns
    -------
    dict
        ``{word: index}`` for each token that is present in ``wv``;
        tokens missing from ``wv`` are left out.
    """
    return {
        token: wv.vocab[token].index
        for token in string_data
        if token in wv
    }

In [30]:
# Read the whole corpus as a flat list of whitespace-separated tokens.
# The context manager guarantees the file handle is closed after reading.
with open(path, encoding="utf-8") as corpus_file:
    str_data = corpus_file.read().split()

In [31]:
# Build the word -> index lookup from the corpus tokens, then invert it
# to obtain the index -> word lookup.
word_2_index = word_to_index(str_data, model.wv)
index_2_word = {index: word for word, index in word_2_index.items()}

In [32]:
# Spot check: vocabulary index assigned to a known word.
word_2_index['uganda']

36084

In [34]:
# Spot check: the reverse mapping should return the word for that index.
index_2_word[36084]

'uganda'

In [35]:
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    Parameters
    ----------
    obj : object
        Any picklable object.
    name : str
        Target path without the ``.pkl`` extension; an existing file
        at that path is overwritten.
    """
    target = name + '.pkl'
    with open(target, 'wb+') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [36]:
# Create the output directory if it is missing; exist_ok=True makes this a
# single idempotent call instead of a separate check followed by a create.
os.makedirs("word2vec_data", exist_ok=True)

# Persist both lookup tables as pickles (save_obj appends the ".pkl" suffix).
save_obj(word_2_index, "word2vec_data/word_2_index")
save_obj(index_2_word, "word2vec_data/index_2_word")

In [37]:
# Persist the trained model; per the log below, the large arrays are stored
# in separate .npy files next to the main file.
model.save("word2vec_data/my_word2vec")

2018-06-05 10:43:32,611 : INFO : saving Word2Vec object under word2vec_data/my_word2vec, separately None
2018-06-05 10:43:32,612 : INFO : storing np array 'vectors' to word2vec_data/my_word2vec.wv.vectors.npy
2018-06-05 10:43:32,725 : INFO : not storing attribute vectors_norm
2018-06-05 10:43:32,726 : INFO : storing np array 'syn1neg' to word2vec_data/my_word2vec.trainables.syn1neg.npy
2018-06-05 10:43:32,836 : INFO : not storing attribute cum_table
2018-06-05 10:43:32,915 : INFO : saved word2vec_data/my_word2vec


In [38]:
# Convert the trained word vectors into a NumPy matrix that is suitable for
# insertion into TensorFlow or Keras models. Row i holds the vector of the
# word model.wv.index2word[i].
# Take the dimensionality from the model so it cannot drift from the value
# used at training time.
vector_dim = model.wv.vector_size
embedding_matrix = np.zeros((len(model.wv.vocab), vector_dim))
for i, word in enumerate(model.wv.index2word):
    # Every entry of index2word is a vocabulary word, so the lookup always
    # yields a vector; no None check is needed.
    embedding_matrix[i] = model.wv[word]

In [39]:
# Write the embedding matrix to disk; np.save appends the ".npy" extension
# to the given name.
np.save("word2vec_data/embeddings", embedding_matrix)