# Implementation of Word2Vec and FastText Word Embedding with Gensim

In [1]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import spacy  # For preprocessing
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
# Load data source
source = '../data/yorubaDS2020/'

In [3]:
df = pd.read_table(source + 'yoruba_proverbs_tweets.txt', names=['Yoruba_proverb_tweets'])
df.shape

(2700, 1)

In [4]:
df.head(3)

Unnamed: 0,Yoruba_proverb_tweets
0,A di gàárì sílẹ̀ ewúrẹ́ ńyọjú; ẹrù ìran rẹ̀ ni?
1,A fi ọ́ jọba ò ńṣàwúre o fẹ́ jẹ Ọlọ́run ni?
2,"A fijó gba Awà; a fìjà gba Awà; bí a ò bá jó, ..."


In [5]:
# Lowercase all characters
df.Yoruba_proverb_tweets=df.Yoruba_proverb_tweets.astype(str).apply(lambda x: x.lower())

In [6]:
import string
from string import digits
exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
df.Yoruba_proverb_tweets=df.Yoruba_proverb_tweets.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [7]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
df.mar = df.Yoruba_proverb_tweets.apply(lambda x: re.sub("[0123456789]", "", x))

  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
# Remove extra spaces
df.Yoruba_proverb_tweets=df.Yoruba_proverb_tweets.apply(lambda x: x.strip())
df.Yoruba_proverb_tweets=df.Yoruba_proverb_tweets.apply(lambda x: re.sub(" +", " ", x))

In [9]:
# Vocabulary of Yoruba 
all_yoruba_words=set()
for yoruba in df.Yoruba_proverb_tweets:
    for word in yoruba.split():
        if word not in all_yoruba_words:
            all_yoruba_words.add(word)

In [12]:
all_yoruba_words

{'àwòròǹṣoṣòó',
 'ru',
 'féwe',
 'ọtí',
 'ìkòkò',
 'mànàmáná',
 'kẹ́hìn',
 'ọ̀dọ̀fín',
 'ìja',
 'lalágbẹ̀dẹ',
 'abẹ́nú',
 'àkẹ̀hìnsí',
 'gbogbo',
 'adámálèṣe',
 'sún',
 'ṣípá',
 'rímúu',
 'àjẹìwálé',
 'kèrègbè',
 'ehínín',
 'ìjàm̀pere',
 'dídì',
 'ọ̀rẹ́',
 'ńwòye',
 'ìṣokùn',
 'ṣánko',
 'mọ́dìí',
 'ṣọ̀tẹ̀',
 'màjèṣín',
 'kẹ́tẹ́kẹ́tẹ́',
 'légbògi',
 'jiná',
 'ẹgàn',
 'níle',
 'ìpàṣán',
 'ati',
 'gbígbọ́n',
 'ḿmu',
 'àìmọ̀kan',
 'lẹ́ẹ̀kan',
 'ìgbẹ̀hìnin',
 'filẹ̀',
 'lábẹ́ẹ',
 'aṣeburúkú',
 '“fọ́maàn”',
 'ọ̀láà',
 'owóò',
 'yàrà',
 'dìtùfù',
 'ńnáni',
 'ìyẹn',
 'àmọ̀tẹ́kùnún',
 'ṣìsọ',
 'tẹ́lẹ̀',
 'pè',
 'ńmayo',
 'dán',
 'fájá',
 'mọdẹẹ́',
 'tìyẹ́',
 'nínu',
 'ḿmọ”',
 'rìkíṣí',
 'ńgbọ́',
 'àkùnsínú',
 'ńfòsì',
 'kúgbé”',
 'sọlẹ̀',
 'ọ̀gàǹgà',
 'láyà',
 'aṣebi',
 'yìnbọn',
 'làgbà',
 'mọ̀ọ́',
 'gbẹ́kẹ̀lé',
 'ńkọ́kọ́',
 'ọkùnrin',
 'wọgbó',
 'gẹṣin',
 'rọ́ba',
 'wòó',
 'ńlẹ̀',
 'ìyà',
 'ẹ̀fà',
 'aláìnítìjú',
 'lójúlójú',
 'gángan',
 'òṣónú',
 'lèèyàn',
 'làkèré',
 'òjòwú',
 'ọ̀wọ̀ọ',
 'àj

In [13]:
from gensim.models import Word2Vec
model_ted = Word2Vec(sentences=all_yoruba_words, size=100, window=5, min_count=5, workers=4, sg=0)

INFO - 13:40:32: 'pattern' package not found; tag filters are not available for English
INFO - 13:40:32: collecting all words and their counts
INFO - 13:40:32: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 13:40:32: collected 46 word types from a corpus of 30571 raw words and 5517 sentences
INFO - 13:40:32: Loading a fresh vocabulary
INFO - 13:40:32: min_count=5 retains 41 unique words (89% of original 46, drops 5)
INFO - 13:40:32: min_count=5 leaves 30562 word corpus (99% of original 30571, drops 9)
INFO - 13:40:32: deleting the raw counts dictionary of 46 items
INFO - 13:40:32: sample=0.001 downsamples 39 most-common words
INFO - 13:40:32: downsampling leaves estimated 7080 word corpus (23.2% of prior 30562)
INFO - 13:40:32: estimated required memory for 41 words and 100 dimensions: 53300 bytes
INFO - 13:40:32: resetting layer weights
INFO - 13:40:32: training model with 4 workers on 41 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 wi

In [22]:
model_ted.wv.most_similar("oyè")

KeyError: "word 'oyè' not in vocabulary"

# FastText

In [23]:
from gensim.models import FastText
model_fastText = FastText(all_yoruba_words, size=100, window=5, min_count=5, workers=4,sg=1)

INFO - 13:52:03: collecting all words and their counts
INFO - 13:52:03: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 13:52:03: collected 46 word types from a corpus of 30571 raw words and 5517 sentences
INFO - 13:52:03: Loading a fresh vocabulary
INFO - 13:52:03: min_count=5 retains 41 unique words (89% of original 46, drops 5)
INFO - 13:52:03: min_count=5 leaves 30562 word corpus (99% of original 30571, drops 9)
INFO - 13:52:03: deleting the raw counts dictionary of 46 items
INFO - 13:52:03: sample=0.001 downsamples 39 most-common words
INFO - 13:52:03: downsampling leaves estimated 7080 word corpus (23.2% of prior 30562)
INFO - 13:52:03: estimated required memory for 41 words, 41 buckets and 100 dimensions: 71996 bytes
INFO - 13:52:03: resetting layer weights
INFO - 13:52:48: Total number of ngrams is 41
INFO - 13:52:50: training model with 4 workers on 41 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
INFO - 13:52:52: EPOCH 

In [24]:
model_fastText.wv.most_similar("oyè")

INFO - 14:08:57: precomputing L2-norms of word weight vectors
INFO - 14:08:57: precomputing L2-norms of ngram weight vectors


KeyError: 'all ngrams for word oyè absent from model'