# Train a FastText Word Embedding [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dadelani/NLP_DL_Intro/blob/main/Lesson1.ipynb)


##### Sections:

There are four sections in this notebook:

1. Installations: here, we install the relevant dependencies
2. Imports: here, we import all the dependencies needed
3. Code: here, we define the helper functions used for preprocessing and training
4. Training: here, the actual training process is done

### Installations

In [1]:
!pip install gensim==4.3.2
!pip install pandas
!pip install scipy



### Imports

In [2]:
from gensim.models import FastText, KeyedVectors
import string
import os

punctuations = set(string.punctuation)

### Code:

In [3]:
def create_dir(output_dir):
    """Create ``output_dir``, including missing parent directories, if needed.

    Calling this on a directory that already exists is a no-op.

    Args:
        output_dir: Path of the directory to create.

    Raises:
        FileExistsError: If ``output_dir`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, which could race with
    # another process creating the directory between the check and makedirs().
    os.makedirs(output_dir, exist_ok=True)


def preprocess(myfile):
    """Read a corpus file into whitespace-split sentences without lone punctuation tokens.

    Every line of ``myfile`` is treated as one sentence and split on whitespace.
    A token is dropped only when it is itself a member of the module-level
    ``punctuations`` set; punctuation attached to a word is left untouched.
    Lines that end up with no tokens are still kept as empty sentences.

    The number of sentences, tokens and distinct tokens is printed as a side effect.

    Args:
        myfile: Path to a UTF-8 encoded text file with one sentence per line.

    Returns:
        A list with one list of token strings per line of the file.
    """
    sentences = []
    n_tokens = 0
    vocabulary = set()  # distinct tokens only; avoids holding every token a second time

    with open(myfile, encoding='utf-8') as f:
        # Stream the file line by line instead of loading it whole with readlines().
        for line in f:
            tokens = [word for word in line.split() if word not in punctuations]
            sentences.append(tokens)
            n_tokens += len(tokens)
            vocabulary.update(tokens)

    print("# sentences", len(sentences))
    print("# Tokens ", n_tokens)
    print("# Vocabulary ", len(vocabulary))

    return sentences


#https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#
#https://radimrehurek.com/gensim/models/word2vec.html#module-gensim.models.word2vec
#https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html
def train_fastText(data_path, lang):
    """Train a FastText embedding for ``lang`` and save its word vectors.

    The corpus is read from ``<data_path>/data/<lang>.all`` via ``preprocess``.
    The trained vectors are written to ``<data_path>/embeddings/<lang>/<lang>.bin``
    with ``model.wv.save``, so they are meant to be read back with
    ``KeyedVectors.load``.

    Args:
        data_path: Root directory that contains the ``data/`` folder; a trailing
            path separator is optional.
        lang: Language code used for the corpus file name and the output folder.
    """
    # os.path.join keeps the paths valid whether or not data_path ends with a separator.
    corpus_file = os.path.join(data_path, 'data', lang + '.all')
    output_dir = os.path.join(data_path, 'embeddings', lang)

    model_full = FastText(
        preprocess(corpus_file),
        vector_size=300,
        window=5,
        min_count=3,
        workers=4,
        sg=1,  # skip-gram training, per the gensim parameter documentation
        epochs=10,
        negative=10,
    )

    create_dir(output_dir)
    model_full.wv.save(os.path.join(output_dir, lang + ".bin"))
    print("embedding training Done")

### Training (modify this section for your own data and language)

In [4]:
!git clone https://github.com/dadelani/NLP_DL_Intro.git

Cloning into 'NLP_DL_Intro'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 17 (delta 3), reused 12 (delta 1), pack-reused 0[K
Receiving objects: 100% (17/17), 3.26 MiB | 10.20 MiB/s, done.
Resolving deltas: 100% (3/3), done.


In [5]:
data_path = "NLP_DL_Intro/"

In [6]:
train_fastText(data_path, 'yo')

# sentences 105919
# Tokens  1980552
# Vocabulary  136608
embedding training Done


### Analysis

In [7]:
# Load the word vectors from the path that train_fastText(data_path, 'yo') writes to.
model = KeyedVectors.load(data_path+'embeddings/yo/yo.bin')


### Word similarity Evaluation

In [8]:
# Read the word-pair table shipped in the cloned repository's data/ folder.
# The cells below use its Yoruba1, Yoruba2 and EngSim columns.
import pandas as pd
wordSim353 = pd.read_csv(data_path+"data/wordSim353.csv")

In [9]:
wordSim353

Unnamed: 0,S/N,English1,English2,Yoruba1,Yoruba2,EngSim
0,1,tiger,cat,ẹkùn,ológbò,7.35
1,2,tiger,tiger,ẹkùn,ẹkùn,10.00
2,3,book,paper,ìwé,bébà,7.46
3,4,computer,keyboard,kọ̀mpútà,pátákó ìtẹ̀wé,7.62
4,5,computer,internet,kọ̀mpútà,ayélujára,7.58
...,...,...,...,...,...,...
348,349,weather,forecast,ojú-ọjọ́,àsọtẹ́lẹ̀ ojú ọjọ́,8.34
349,350,disaster,area,àjálù,agbègbè,6.25
350,351,governor,office,gómìnà,ọ́físì,6.34
351,352,architecture,century,àwòrán ilégbèé,ọgbọ́rùn-ún ọdún,3.78


In [10]:
# Score every Yoruba word pair with the embedding model, scaled by 10 to match EngSim's range.
# NOTE(review): FastText vectors can presumably build vectors for out-of-vocabulary or
# multi-word entries from character n-grams, so "found" may not mean "seen in training" —
# confirm against model.key_to_index if in-vocabulary coverage is what should be reported.
found = []
model_scores = []
for i in wordSim353.index:  # follow the table's real index rather than a hard-coded 353
    yor1 = wordSim353.at[i, 'Yoruba1']
    yor2 = wordSim353.at[i, 'Yoruba2']
    score = float('nan')  # stays NaN when the pair cannot be scored
    # Missing cells are read as NaN floats, which have no .strip(); skip them explicitly.
    if isinstance(yor1, str) and isinstance(yor2, str):
        try:
            score = round(model.similarity(yor1.strip(), yor2.strip()) * 10, 2)
            found.append(i)
        except KeyError:
            # The model has no vector for one of the words; leave the score as NaN.
            pass
    model_scores.append(score)

# Assign the column once instead of growing it cell by cell with .at.
wordSim353['model'] = model_scores
print("# pairs of words found: ", len(found))

# pairs of words found:  353


In [11]:
# Spearman rank correlation between the EngSim ratings and the model's similarity scores.
# nan_policy="omit" leaves out pairs whose model score is missing (NaN).
from scipy.stats import spearmanr
ourcorr1, p_value1 = spearmanr(wordSim353.EngSim,wordSim353.model, nan_policy="omit")

In [12]:
ourcorr1

0.15765993421369978

In [13]:
# Bound lookup function: wv(word) returns the vector for `word`.
# get_vector is the primary API; word_vec is only a compatibility alias for it.
wv = model.get_vector

In [14]:
model['ọdọ̀']

array([-7.92991892e-02, -1.80173784e-01, -1.73998863e-01,  3.47394422e-02,
        3.57980616e-06, -2.33404934e-02,  8.26034248e-02,  8.86629522e-02,
       -8.34870487e-02,  1.33198187e-01,  1.85880810e-01, -8.69626626e-02,
        2.31984053e-02, -1.18206061e-01, -8.52655619e-02,  1.07652403e-01,
        1.02428766e-02, -5.17650135e-02, -1.44465538e-02,  3.33454870e-02,
        9.89369676e-03, -1.17027178e-01,  1.02621928e-01, -1.39791593e-02,
        1.24087550e-01,  8.30306709e-02, -1.21862441e-02, -6.31167889e-02,
        5.70970662e-02,  1.32579375e-02, -2.14290097e-02, -8.95439237e-02,
       -6.83886260e-02,  1.46315824e-02,  8.38238746e-02,  3.14537855e-03,
        1.81687213e-02,  9.68609378e-02, -2.51433611e-01,  5.79100847e-02,
       -1.48740903e-01,  9.69939977e-02, -4.72241081e-02, -1.03834353e-01,
        4.72501473e-04,  1.23524509e-01, -1.11884817e-01,  2.63256490e-01,
        1.58634692e-01,  1.90831833e-02, -2.59112800e-03,  3.84469070e-02,
        3.89112979e-02,  

In [15]:
model.most_similar("igi")

[('ìgi', 0.7853971719741821),
 ('jigi', 0.7300555109977722),
 ('Igi', 0.7245133519172668),
 ('gi', 0.6883360147476196),
 ('ewédú', 0.6740989089012146),
 ('èso', 0.671331524848938),
 ('dòdò', 0.6674048900604248),
 ('ihò', 0.6612542867660522),
 ('imú', 0.6603915691375732),
 ('igbó', 0.6600068807601929)]