In [1]:
from gensim.models import word2vec

import os
import logging
import gensim
import numpy as np
import pickle

In [2]:
class Sentence(object):
    """Stream whitespace-separated words from every file in a directory.

    Each file is read one line at a time, so neither the directory as a
    whole nor any single file is held in memory in full. This is vital,
    as some text data sets are huge.

    The iterator yields individual words (``str``), one at a time.
    NOTE(review): gensim's Word2Vec expects each item of its corpus to be a
    list of tokens; confirm the intended consumer before passing this in.

    Parameters
    ----------
    dirname : str
        Directory whose entries are all opened and read as text files.
    encoding : str or None, optional
        Text encoding handed to ``open``. ``None`` (the default) keeps the
        platform default encoding.
    """

    def __init__(self, dirname, encoding=None):
        self.dirname = dirname
        self.encoding = encoding

    def __iter__(self):
        # os.listdir returns entries in arbitrary order; sorting makes the
        # traversal reproducible from run to run.
        for file_name in sorted(os.listdir(self.dirname)):
            file_path = os.path.join(self.dirname, file_name)
            with open(file_path, 'r', encoding=self.encoding) as f:
                # Iterate over lines instead of f.read() so only one line
                # of the file is in memory at any time.
                for line in f:
                    for word in line.split():
                        yield word

In [3]:
# Emit timestamped log records at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [4]:
# Training corpus: a plain-text file read through LineSentence.
# Alternative corpus reader, kept for reference:
# sentences = word2vec.Text8Corpus('text')
path = "processed_text_data/text"
sentences = word2vec.LineSentence(path)

In [5]:
# Train the embeddings: 300-dimensional vectors, 10 passes over the corpus,
# words seen fewer than 20 times are dropped, 4 worker threads.
model = word2vec.Word2Vec(
    sentences,
    size=300,
    min_count=20,
    iter=10,
    workers=4,
)

2018-05-25 12:40:59,581 : INFO : collecting all words and their counts
2018-05-25 12:41:00,757 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-05-25 12:41:03,456 : INFO : collected 302084 word types from a corpus of 10346611 raw words and 1035 sentences
2018-05-25 12:41:03,457 : INFO : Loading a fresh vocabulary
2018-05-25 12:41:03,643 : INFO : min_count=20 retains 39233 unique words (12% of original 302084, drops 262851)
2018-05-25 12:41:03,643 : INFO : min_count=20 leaves 9432604 word corpus (91% of original 10346611, drops 914007)
2018-05-25 12:41:03,750 : INFO : deleting the raw counts dictionary of 302084 items
2018-05-25 12:41:03,810 : INFO : sample=0.001 downsamples 22 most-common words
2018-05-25 12:41:03,811 : INFO : downsampling leaves estimated 8854970 word corpus (93.9% of prior 9432604)
2018-05-25 12:41:03,945 : INFO : estimated required memory for 39233 words and 300 dimensions: 113775700 bytes
2018-05-25 12:41:03,945 : INFO : resetting lay

2018-05-25 12:41:58,659 : INFO : EPOCH 4 - PROGRESS: at 17.00% examples, 364295 words/s, in_qsize 7, out_qsize 0
2018-05-25 12:41:59,672 : INFO : EPOCH 4 - PROGRESS: at 23.96% examples, 413133 words/s, in_qsize 7, out_qsize 0
2018-05-25 12:42:00,696 : INFO : EPOCH 4 - PROGRESS: at 30.53% examples, 437333 words/s, in_qsize 8, out_qsize 1
2018-05-25 12:42:01,717 : INFO : EPOCH 4 - PROGRESS: at 37.68% examples, 461582 words/s, in_qsize 7, out_qsize 0
2018-05-25 12:42:02,718 : INFO : EPOCH 4 - PROGRESS: at 44.83% examples, 479828 words/s, in_qsize 7, out_qsize 0
2018-05-25 12:42:03,739 : INFO : EPOCH 4 - PROGRESS: at 51.50% examples, 490738 words/s, in_qsize 7, out_qsize 0
2018-05-25 12:42:04,748 : INFO : EPOCH 4 - PROGRESS: at 58.07% examples, 498937 words/s, in_qsize 7, out_qsize 0
2018-05-25 12:42:05,756 : INFO : EPOCH 4 - PROGRESS: at 64.73% examples, 506512 words/s, in_qsize 7, out_qsize 0
2018-05-25 12:42:06,776 : INFO : EPOCH 4 - PROGRESS: at 70.82% examples, 508503 words/s, in_qsiz

2018-05-25 12:42:59,803 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-25 12:42:59,815 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-25 12:42:59,818 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-25 12:42:59,820 : INFO : EPOCH - 7 : training on 10346611 raw words (8854927 effective words) took 15.6s, 566280 effective words/s
2018-05-25 12:43:00,905 : INFO : EPOCH 8 - PROGRESS: at 0.10% examples, 8207 words/s, in_qsize 7, out_qsize 0
2018-05-25 12:43:01,906 : INFO : EPOCH 8 - PROGRESS: at 7.44% examples, 327373 words/s, in_qsize 7, out_qsize 0
2018-05-25 12:43:02,912 : INFO : EPOCH 8 - PROGRESS: at 14.78% examples, 439395 words/s, in_qsize 7, out_qsize 0
2018-05-25 12:43:03,914 : INFO : EPOCH 8 - PROGRESS: at 22.13% examples, 495331 words/s, in_qsize 7, out_qsize 0
2018-05-25 12:43:04,925 : INFO : EPOCH 8 - PROGRESS: at 29.28% examples, 520196 words/s, in_qsize 7, out_qsize 0
2018-05-25 12:43:05,928 : I

In [6]:
# get top 10 most common words
# A slice (rather than indexing 0..9) also copes with a vocabulary of fewer
# than 10 words instead of raising IndexError.
for word in model.wv.index2word[:10]:
    print(word)

ve
bir
i
da
bu
de
için
ile
çok
türkiye


In [7]:
# get top 10 least common words
vocab_size = len(model.wv.vocab)
# Walk the tail of index2word backwards (last entry first). Slicing avoids the
# negative-index wrap-around that `index2word[vocab_size - i]` runs into when
# the vocabulary holds fewer than 10 words.
for word in reversed(model.wv.index2word[-10:]):
    print(word)

yidoğan
şeküri
kalınbağırsak
gargara
wen
göğüste
başvurulmalıdır
intikamzamani
kbb
greenpeace


In [8]:
# Similarity score between the word vectors of 'erkek' (man) and 'kadın' (woman)
print(model.wv.similarity('erkek', 'kadın'))

0.5309982688681165


In [9]:
# Which word doesn't fit with the others? 'kırmızı' (red) and 'mavi' (blue)
# are colours, 'portakal' (orange) is a fruit.
print(model.wv.doesnt_match("kırmızı mavi portakal".split()))

2018-05-25 12:44:04,590 : INFO : precomputing L2-norms of word weight vectors


portakal


In [10]:
# Vocabulary size: the number of distinct words kept in the model
# (not the total number of tokens in the corpus)
len(model.wv.vocab)

39233

In [11]:
# create a dict for mapping words to index
def word_to_index(string_data, wv):
    """Map every in-vocabulary word of ``string_data`` to its index.

    Parameters
    ----------
    string_data : iterable of str
        Words to look up; a repeated word collapses into a single entry.
    wv : object
        Must support ``word in wv`` and expose ``wv.vocab[word].index``.

    Returns
    -------
    dict
        ``{word: index}`` for each word for which ``word in wv`` holds;
        all other words are skipped.
    """
    return {
        token: wv.vocab[token].index
        for token in string_data
        if token in wv
    }

In [12]:
# Read the whole corpus into one flat list of whitespace-separated tokens.
# The `with` block closes the file handle once the text has been read, and
# UTF-8 is requested explicitly so decoding does not depend on the platform
# default (LineSentence, which fed the model, decodes its input as UTF-8).
with open(path, encoding="utf-8") as corpus_file:
    str_data = corpus_file.read().split()

In [13]:
# Forward mapping: word -> vocabulary index.
word_2_index = word_to_index(str_data, model.wv)
# Inverse mapping: vocabulary index -> word.
index_2_word = {index: word for word, index in word_2_index.items()}

In [14]:
# Spot check: the index assigned to a known word
word_2_index['uganda']

36222

In [15]:
# Spot check: the inverse mapping must lead back to the same word. The index is
# looked up rather than hard-coded, because the literal value only holds for
# one particular trained vocabulary.
index_2_word[word_2_index['uganda']]

'uganda'

In [16]:
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    The file is created, or truncated if it already exists, and ``obj`` is
    written with the highest pickle protocol available.
    """
    target = name + '.pkl'
    with open(target, 'wb+') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
# Create the output directory if it is missing. exist_ok=True makes this one
# call that tolerates an existing directory, instead of a separate exists()
# check followed by mkdir() (which can race with another process).
os.makedirs("word2vec_data", exist_ok=True)

# Persist both lookup tables as pickles next to the model files.
save_obj(word_2_index, "word2vec_data/word_2_index")
save_obj(index_2_word, "word2vec_data/index_2_word")

In [18]:
# Persist the trained model; per the log output, the large arrays are written
# to companion .npy files alongside the main file.
model.save("word2vec_data/my_word2vec")

2018-05-25 12:45:27,034 : INFO : saving Word2Vec object under word2vec_data/my_word2vec, separately None
2018-05-25 12:45:27,036 : INFO : storing np array 'vectors' to word2vec_data/my_word2vec.wv.vectors.npy
2018-05-25 12:45:27,220 : INFO : not storing attribute vectors_norm
2018-05-25 12:45:27,221 : INFO : storing np array 'syn1neg' to word2vec_data/my_word2vec.trainables.syn1neg.npy
2018-05-25 12:45:27,255 : INFO : not storing attribute cum_table
2018-05-25 12:45:27,339 : INFO : saved word2vec_data/my_word2vec


In [19]:
# convert the wv word vectors into a numpy matrix that is suitable for insertion
# into TensorFlow or Keras models
# The width is read from the model rather than hard-coded, so it cannot drift
# from the `size` used at training time.
vector_dim = model.wv.vector_size
# Row i of model.wv.vectors is the vector of model.wv.index2word[i], so one
# array copy replaces a per-word lookup loop. astype() returns a fresh array
# (the model's own weights are untouched); float64 is the dtype np.zeros
# produces by default.
embedding_matrix = model.wv.vectors.astype(np.float64)
assert embedding_matrix.shape == (len(model.wv.vocab), vector_dim)

In [20]:
# Write the embedding matrix to disk in NumPy's binary format.
np.save("word2vec_data/embeddings", embedding_matrix)