# Implementation of Word2Vec and FastText Word Embedding with Gensim

# Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

import string
from string import digits

from time import time 
from collections import defaultdict

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

# Load data source

In [2]:
source_dir = '../data/yorubaDS2020/'
df = pd.read_table(source_dir + 'yoruba_on_tweets.txt', names=['text'], encoding='utf-8-sig') 

In [3]:
def text_to_word_list(text, remove_polish_letters):
    ''' Pre process and convert texts to a list of words 
    method inspired by method from eliorc github repo: https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb'''
    # Remove all numbers from text
    remove_digits = str.maketrans('', '', digits)
    text = sub("[0123456789]", "", text)
    text = sub(" +", " ", text)
    text = str(text)
    text = text.lower()
    return text  

In [4]:
df.text = df.text.apply(lambda x: text_to_word_list(x, unidecode))

In [5]:
df[:2]

Unnamed: 0,text
0,búrẹ́dì rèé bí àmàlà yìí
1,káááábíbèsí o. mo yí'kàá ọ̀tún mo yí'kàá òsì o.


In [6]:
yoruba_model = df.copy()
yoruba_model = yoruba_model[yoruba_model.text.str.len()>1]

In [7]:
sent = [row for row in yoruba_model.text]
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[0:5]

INFO - 14:06:47: collecting all words and their counts
INFO - 14:06:47: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 14:06:50: collected 3277 word types from a corpus of 1254223 words (unigram + bigrams) and 13913 sentences
INFO - 14:06:50: using 3277 counts as vocab in Phrases<0 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 14:06:50: source_vocab length 3277
INFO - 14:06:50: Phraser built with 5 5 phrasegrams


['búrẹ́dì rèé bí àmàlà yìí',
 "káááábíbèsí o. mo yí'kàá ọ̀tún mo yí'kàá òsì o.",
 'bó ti wà ní lìkì ní nbẹ ní gbànja. ní tèmi o, adìẹ funfun ọ̀hún ò bá dùn ún dín jẹ o jàre.',
 'a ti gba òmìnira ọjọ́ ti pẹ́, ó wá ku ìdándè.',
 "ìbànújẹ́ ò sí fún ẹni t'éyín rẹ̀ ta'áta. ẹ̀rín ni ní gbogboògbà."]

In [8]:
model_word2vec = Word2Vec(min_count=3,window=4,size=300,sample=1e-5, 
                          alpha=0.03, min_alpha=0.0007, negative=20,
                          sg=0, workers=multiprocessing.cpu_count()-1)
# size: The number of dimensions of the embeddings and the default is 100.
# window: The maximum distance between a target word and words around the target word. The default window is 5.
# min_count: The minimum count of words to consider when training the model; 
#            words with occurrence less than this count will be ignored. The default for min_count is 5.
# sg: it is used to indicate skip-gram or CBOW but when CBOW=0 or skip gram=1
start = time()
model_word2vec.build_vocab(sentences, progress_per=50000)
print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 14:06:50: collecting all words and their counts
INFO - 14:06:50: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 14:06:50: collected 141 word types from a corpus of 1254223 raw words and 13913 sentences
INFO - 14:06:50: Loading a fresh vocabulary
INFO - 14:06:50: min_count=3 retains 102 unique words (72% of original 141, drops 39)
INFO - 14:06:50: min_count=3 leaves 1254174 word corpus (99% of original 1254223, drops 49)
INFO - 14:06:50: deleting the raw counts dictionary of 141 items
INFO - 14:06:50: sample=1e-05 downsamples 76 most-common words
INFO - 14:06:50: downsampling leaves estimated 27995 word corpus (2.2% of prior 1254174)
INFO - 14:06:50: estimated required memory for 102 words and 300 dimensions: 295800 bytes
INFO - 14:06:50: resetting layer weights


Time to build vocab: 0.0 mins


In [9]:
start = time()
model_word2vec.train(sentences, total_examples=model_word2vec.corpus_count, epochs=30, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))
model_word2vec.init_sims(replace=True)

INFO - 14:06:50: training model with 7 workers on 102 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=4
INFO - 14:06:50: worker thread finished; awaiting finish of 6 more threads
INFO - 14:06:50: worker thread finished; awaiting finish of 5 more threads
INFO - 14:06:50: worker thread finished; awaiting finish of 4 more threads
INFO - 14:06:50: worker thread finished; awaiting finish of 3 more threads
INFO - 14:06:50: worker thread finished; awaiting finish of 2 more threads
INFO - 14:06:50: worker thread finished; awaiting finish of 1 more threads
INFO - 14:06:50: worker thread finished; awaiting finish of 0 more threads
INFO - 14:06:50: EPOCH - 1 : training on 1254223 raw words (28110 effective words) took 0.4s, 78414 effective words/s
INFO - 14:06:51: worker thread finished; awaiting finish of 6 more threads
INFO - 14:06:51: worker thread finished; awaiting finish of 5 more threads
INFO - 14:06:51: worker thread finished; awaiting finish of 4 more threads

Time to train the model: 0.2 mins


In [10]:
model_word2vec.save("yoruba_word2vec.model")

INFO - 14:07:02: saving Word2Vec object under yoruba_word2vec.model, separately None
INFO - 14:07:02: not storing attribute vectors_norm
INFO - 14:07:02: not storing attribute cum_table
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
INFO - 14:07:02: saved yoruba_word2vec.model


In [None]:
file_export = yoruba_model.copy()
file_export['old_title'] = file_export.text

In [None]:
file_export.old_title = file_export.old_title.str.join('')
file_export.text = file_export.text.apply(lambda x: ''.join(bigram[x]))

In [None]:
file_export[['text']].to_csv('cleaned_yoruba_on_tweet.csv', index=False)

In [None]:
file_export

# T-SNE Visualizations

In [None]:
def display_closestwords_tsnescatterplot(model, word, size):
    arr = np.empty((0,size), dtype='f')
    word_labels = [word]
    
    close_words = model.similar_by_word(word)

    arr = np.append(arr, np.array([model[word]]), axis=0)
    for wrd_score in close_words:
        wrd_vector = model[wrd_score[0]]
        word_labels.append(wrd_score[0])
        arr = np.append(arr, np.array([wrd_vector]), axis=0)

        tsne = TSNE(n_components=2, random_state=0)
        np.set_printoptions(suppress=True)
        Y = tsne.fit_transform(arr)

        x_coords = Y[:, 0]
        y_coords = Y[:, 1]
        plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
        plt.xlim(x_coords.min()+0.00005, x_coords.max()+0.00005)
        plt.ylim(y_coords.min()+0.00005, y_coords.max()+0.00005)
        plt.show()

In [None]:
display_closestwords_tsnescatterplot(model_word2vec, 'gba', 50) 

# FastText

In [None]:
from gensim.models import FastText
model_fastText = FastText(sentences, size=100, window=5, min_count=5, workers=4,sg=1)