# Gensim Word2Vec Embedding model

- https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial
- https://rare-technologies.com/word2vec-tutorial/

# Mount & package setup

## [Colab] google drive mount

In [1]:
# Run in the Google Colab environment: mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Base directory on the mounted Google Drive. File names are appended with `+`
# elsewhere in this notebook, so the trailing slash is required.
path = '/content/drive/My Drive/Colab Notebooks/2020-PoscoICT/Data/'

In [None]:
import pandas as pd
import gensim, logging
import multiprocessing
import os

# Show INFO-level log records as "<timestamp> : <level> : <message>"
# (presumably so gensim's progress messages are visible during vocab building and training).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Memory Efficient File Loading for Word2Vec Training

In [16]:
class SentenceReader:
    """Re-iterable, memory-efficient reader for a one-sentence-per-line corpus.

    Every call to ``iter()`` reopens the file and streams it line by line, so
    the object can be traversed several times without loading the whole file
    into memory.

    Args:
        filepath: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that decoding does not depend on the platform locale.
    """

    def __init__(self, filepath, encoding='utf-8'):
        self.filepath = filepath
        self.encoding = encoding

    def __iter__(self):
        """Yield each line as a list of tokens split on single spaces."""
        # The context manager closes the handle once the generator is
        # exhausted or closed, instead of leaking one handle per pass.
        with open(self.filepath, encoding=self.encoding) as corpus:
            for line in corpus:
                ### TODO Tokenizer
                yield line.replace('\n', '').split(' ')

In [17]:
# Streaming reader over the training corpus; nothing is read until it is iterated
sentences = SentenceReader(path + 'news_sentence_train.txt')

In [13]:
# Notebook echo: shows only the object's repr, the corpus itself is not read here
sentences

<__main__.SentenceReader at 0x7fb488a89390>

## Most Frequent Words:
Mainly a sanity check of the effectiveness of the lemmatization, removal of stopwords, and addition of bigrams.

In [18]:
from collections import defaultdict

# Count how often every token occurs across the whole corpus (one full pass
# over `sentences`); the final expression is the number of distinct tokens.
word_freq = defaultdict(int)
for tokens in sentences:
    for token in tokens:
        word_freq[token] = word_freq[token] + 1
len(word_freq)

283149

In [None]:
# The 100 most frequent tokens, highest count first
sorted(word_freq, key=word_freq.get, reverse=True)[:100]

# Training the model
## Gensim Word2Vec Implementation

In [20]:
import multiprocessing
from gensim.models import Word2Vec

In [21]:
# Number of CPU cores on this machine; used later to size the training worker pool
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
print(cores)

2


## The parameters:

* `min_count` <font color='purple'>=</font> <font color='green'>int</font> - Ignores all words with total absolute frequency lower than this - (2, 100)


* `window` <font color='purple'>=</font> <font color='green'>int</font> - The maximum distance between the current and predicted word within a sentence. E.g. `window` words on the left and `window` words on the right of our target - (2, 10)


* `size` <font color='purple'>=</font> <font color='green'>int</font> - Dimensionality of the feature vectors. - (50, 300)


* `sample` <font color='purple'>=</font> <font color='green'>float</font> - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influential.  - (0, 1e-5)


* `alpha` <font color='purple'>=</font> <font color='green'>float</font> - The initial learning rate - (0.01, 0.05)


* `min_alpha` <font color='purple'>=</font> <font color='green'>float</font> - Learning rate will linearly drop to `min_alpha` as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00


* `negative` <font color='purple'>=</font> <font color='green'>int</font> - If > 0, negative sampling will be used; the int for negative specifies how many "noise words" should be drawn. If set to 0, no negative sampling is used. - (5, 20)


* `workers` <font color='purple'>=</font> <font color='green'>int</font> - Use these many worker threads to train the model (=faster training with multicore machines)

In [23]:
# Create an untrained Word2Vec model; the vocabulary and the weights are built
# in the following cells.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) - confirm against the installed gensim version.
w2v_model = Word2Vec(min_count=10,
                     window=5,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     # Leave one core free, but never request 0 worker
                     # threads on a single-core machine.
                     workers=max(1, cores - 1))

## Building the Vocabulary Table:
Word2Vec requires us to build the vocabulary table (simply digesting all the words and filtering out the unique words, and doing some basic counts on them):

In [24]:
from time import time

# Scan the corpus once to build the vocabulary, logging progress every
# 10000 sentences, and report the wall-clock duration in minutes.
started_at = time()
w2v_model.build_vocab(sentences, progress_per=10000)
elapsed_mins = round((time() - started_at) / 60, 2)

print('Time to build vocab: {} mins'.format(elapsed_mins))

Time to build vocab: 0.07 mins


## Training of the model:
_Parameters of the training:_
* `total_examples` <font color='purple'>=</font> <font color='green'>int</font> - Count of sentences;
* `epochs` <font color='purple'>=</font> <font color='green'>int</font> - Number of iterations (epochs) over the corpus - [10, 20, 30]

In [25]:
# Train for 50 passes over the corpus and report the wall-clock duration in minutes.
started_at = time()

w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=50,
    report_delay=1,
)

elapsed_mins = round((time() - started_at) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_mins))

Time to train the model: 4.84 mins


As we do not plan to train the model any further, we are calling init_sims(), which will make the model much more memory-efficient:

In [26]:
# Precompute normalised vectors. With replace=True the raw vectors are
# overwritten to save memory, so the model should not be trained any further
# after this call.
# NOTE(review): init_sims is a gensim 3.x API - confirm before upgrading gensim.
w2v_model.init_sims(replace=True) # much more memory-efficient

In [27]:
# Persist the trained model under `path` as "w2v_news_train"
w2v_model.save(path + "w2v_news_train")

# Exploring the model
## Most similar to:



In [28]:
from gensim.models import KeyedVectors, Word2Vec

# The file was written by Word2Vec.save(), so it holds a full Word2Vec model.
# Reload it with the matching class; the queries below read its `.wv` vectors.
# mmap='r' memory-maps the large arrays read-only instead of copying them into RAM.
w2v_model = Word2Vec.load(path + "w2v_news_train", mmap='r')

In [29]:
# Nearest neighbours of "청년" ("youth / young adults") in the embedding space
w2v_model.wv.most_similar(positive=["청년"])

  if np.issubdtype(vec.dtype, np.int):


[('남동산단', 0.4753042459487915),
 ('창출을', 0.45043906569480896),
 ('취업', 0.44848722219467163),
 ('일자리', 0.4475654065608978),
 ('연출가', 0.441278874874115),
 ('창업을', 0.43236786127090454),
 ('창출', 0.4313412308692932),
 ('창업', 0.42924684286117554),
 ('스타트업', 0.424064576625824),
 ('저소득층', 0.4182107150554657)]

In [30]:
# Nearest neighbours of "청소년" ("adolescents / teenagers")
w2v_model.wv.most_similar(positive=["청소년"])

  if np.issubdtype(vec.dtype, np.int):


[('어린이와', 0.5152695775032043),
 ('아동', 0.4739971458911896),
 ('성인', 0.46220508217811584),
 ('EBS', 0.44438570737838745),
 ('청소년을', 0.4132959544658661),
 ('독감', 0.40381166338920593),
 ('소아', 0.4004584550857544),
 ('청소년이', 0.40038204193115234),
 ('가출', 0.38508570194244385),
 ('모집한다', 0.38276952505111694)]

In [31]:
# Nearest neighbours of "강원도" ("Gangwon Province")
w2v_model.wv.most_similar(positive=["강원도"])

  if np.issubdtype(vec.dtype, np.int):


[('영광', 0.502439022064209),
 ('원주', 0.4965716600418091),
 ('고향인', 0.49260371923446655),
 ('일대에서', 0.4854246973991394),
 ('청주', 0.4750401973724365),
 ('강릉', 0.4679393172264099),
 ('강원', 0.46717369556427),
 ('순천', 0.4649154543876648),
 ('평창', 0.4627965986728668),
 ('여수', 0.4618801474571228)]

In [None]:
# Nearest neighbours of "춘천" ("Chuncheon", a city in Gangwon Province)
w2v_model.wv.most_similar(positive=["춘천"])

## Similarities:
Here we look at how similar two words are to each other:

In [32]:
# Similarity score between "청년" ("youth") and "청소년" ("adolescents")
w2v_model.wv.similarity("청년", '청소년')

  if np.issubdtype(vec.dtype, np.int):


0.18243223

In [33]:
# Similarity score between "강원도" ("Gangwon Province") and "도내" ("within the province")
w2v_model.wv.similarity("강원도", '도내')

  if np.issubdtype(vec.dtype, np.int):


0.0848353

In [34]:
# Similarity score between "강원도" ("Gangwon Province") and "원주" ("Wonju")
w2v_model.wv.similarity("강원도", '원주')

  if np.issubdtype(vec.dtype, np.int):


0.49657166

In [35]:
# Similarity score between "서울" ("Seoul") and "춘천" ("Chuncheon")
w2v_model.wv.similarity('서울', '춘천')

  if np.issubdtype(vec.dtype, np.int):


0.32474127


## Analogy difference:


In [36]:
# Analogy query: "취업" ("employment") + "청년" ("youth") - "게임" ("game"); show the 5 closest words
w2v_model.wv.most_similar(positive=["취업", "청년"], negative=["게임"], topn=5)

  if np.issubdtype(vec.dtype, np.int):


[('소득세', 0.4937027096748352),
 ('남동산단', 0.4596513509750366),
 ('일자리', 0.424974262714386),
 ('저소득층', 0.42217764258384705),
 ('비정규직', 0.4212227761745453)]

In [37]:
# Analogy query: "춘천" ("Chuncheon") + "서울" ("Seoul") - "일본" ("Japan"); show the 5 closest words
w2v_model.wv.most_similar(positive=["춘천", "서울"], negative=["일본"], topn=5)

  if np.issubdtype(vec.dtype, np.int):


[('방면', 0.543633222579956),
 ('사거리', 0.5430142283439636),
 ('강릉', 0.5377534627914429),
 ('인근에서', 0.53123539686203),
 ('용인', 0.5289182662963867)]

In [38]:
# Words closest to the combination of "춘천" ("Chuncheon") and "서울" ("Seoul"); show the top 5
w2v_model.wv.most_similar(positive=["춘천", "서울"], topn=5)

  if np.issubdtype(vec.dtype, np.int):


[('강릉', 0.6352565288543701),
 ('사거리', 0.6134753227233887),
 ('원주', 0.6037938594818115),
 ('삼성동', 0.5981919169425964),
 ('인근에서', 0.5937576293945312)]

In [None]:
# vocab_dict - key: vocab, value: embedding vector
# read test file 
# compute sentence embedding vectors
# save sentences + sentence embedding vectors