# Train a FastText Word Embedding [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dadelani/NLP_DL_Intro/blob/main/Lesson1.ipynb)


##### Sections:

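There are five sections in this notebook:

1. Installations: this is where we install the relevant dependencies
2. Imports: here, we import all the dependencies needed
3. Code: here, the helper functions for preprocessing the corpus and training the embedding are defined
4. Training: here, the actual training process is done (the "Modify this" section)
5. Analysis: here, the trained embedding is loaded and inspected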

### Installations

In [1]:
!pip install gensim==4.3.2



### Imports

In [10]:
from gensim.models import FastText, KeyedVectors
import string
import os

# Characters of string.punctuation; a token equal to one of them is filtered out by preprocess().
punctuations = {*string.punctuation}

### Code:

In [5]:
def create_dir(output_dir):
    """Create ``output_dir``, including missing parent directories.

    Args:
        output_dir: Path of the directory to create.

    Calling this on a directory that already exists is a no-op.
    """
    # exist_ok=True makes the call idempotent and avoids the race between a
    # separate os.path.exists() check and the creation itself.
    os.makedirs(output_dir, exist_ok=True)


def preprocess(myfile):
    """Read a corpus file and drop stand-alone punctuation tokens.

    Each line of ``myfile`` is treated as one sentence and split on
    whitespace. A token is removed only when it is itself a member of the
    module-level ``punctuations`` set; punctuation attached to a word is kept.

    Args:
        myfile: Path to a UTF-8 encoded text file, one sentence per line.

    Returns:
        A list with one token list per input line, in file order. Blank or
        punctuation-only lines yield an empty token list.

    Side effects:
        Prints the number of sentences, tokens and distinct tokens.
    """
    sentences = []
    n_tokens = 0
    # A real set: avoids accumulating a list with one entry per corpus token
    # just to count the distinct tokens at the end.
    vocabulary = set()

    with open(myfile, encoding='utf-8') as f:
        # Stream the file line by line instead of loading every line first.
        for line in f:
            tokens = [word for word in line.split() if word not in punctuations]
            sentences.append(tokens)
            n_tokens += len(tokens)
            vocabulary.update(tokens)

    print("# sentences", len(sentences))
    print("# Tokens ", n_tokens)
    print("# Vocabulary ", len(vocabulary))

    return sentences


#https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#
#https://radimrehurek.com/gensim/models/word2vec.html#module-gensim.models.word2vec
#https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html
def train_fastText(data_path, lang):
    """Train FastText embeddings for one language and save the word vectors.

    Reads the corpus ``<data_path>/data/<lang>.all``, trains a model on the
    preprocessed sentences and writes the word vectors to
    ``<data_path>/embeddings/<lang>/<lang>.bin``.

    Args:
        data_path: Root directory that contains the ``data`` folder. A
            trailing path separator is optional.
        lang: Language code used for the corpus file name and for the output
            folder and file name (e.g. ``'yo'``).
    """
    # os.path.join keeps the paths valid whether or not data_path ends with
    # a separator (plain concatenation would yield e.g. "rootdata/yo.all").
    corpus_file = os.path.join(data_path, 'data', lang + '.all')
    model_full = FastText(
        preprocess(corpus_file),
        vector_size=300,  # embedding dimensionality
        window=5,
        min_count=3,      # ignore tokens occurring fewer than 3 times
        workers=4,
        sg=1,             # skip-gram training
        epochs=10,
        negative=10,      # negative samples drawn per training example
    )
    output_dir = os.path.join(data_path, "embeddings", lang)
    create_dir(output_dir)
    # Only the word vectors (model_full.wv) are saved, not the full model.
    model_full.wv.save(os.path.join(output_dir, lang + ".bin"))
    print("embedding training Done")

### Modify this

In [6]:
!git clone https://github.com/dadelani/NLP_DL_Intro.git

Cloning into 'NLP_DL_Intro'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 9 (delta 1), reused 5 (delta 0), pack-reused 0[K
Receiving objects: 100% (9/9), 3.25 MiB | 10.29 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [7]:
# Root folder of the cloned repository; train_fastText reads the corpus from
# its data/ sub-folder and writes the vectors under embeddings/.
data_path = "NLP_DL_Intro/"

In [8]:
# Train on the 'yo' corpus (data/yo.all) and save the vectors to embeddings/yo/yo.bin.
train_fastText(data_path, 'yo')

# sentences 105919
# Tokens  1980552
# Vocabulary  136608
embedding training Done


### Analysis

In [14]:
# Reload the 'yo' word vectors saved by train_fastText. os.path.join keeps
# the path valid whether or not data_path ends with a separator.
model = KeyedVectors.load(os.path.join(data_path, 'embeddings', 'yo', 'yo.bin'))


In [18]:
# Bound lookup function: wv(word) returns the vector for `word`.
# `word_vec` is deprecated in gensim 4; `get_vector` is its equivalent replacement.
wv = model.get_vector

In [20]:
# Look up the embedding vector of a single word (vectors were trained with vector_size=300).
model['ọdọ̀']

array([-0.3009538 ,  0.05521351, -0.06429673,  0.03832434, -0.00474682,
        0.19180416,  0.15293947, -0.0270202 ,  0.01817678, -0.02435626,
       -0.12085856,  0.04649919,  0.11483972, -0.20378807, -0.02971853,
        0.01525708, -0.12156061, -0.10994134,  0.10481274, -0.0447474 ,
        0.08684216, -0.05806672,  0.03063249,  0.08800097, -0.0788087 ,
        0.3184726 , -0.00492311, -0.09382931,  0.03710625, -0.06846689,
        0.03872345, -0.03963874,  0.04347344, -0.07017129,  0.01230362,
       -0.0480312 ,  0.02866575,  0.06393517, -0.03471699,  0.17563716,
       -0.13303536, -0.06490525,  0.03045777,  0.0809239 , -0.01946333,
       -0.09839525,  0.00418832,  0.04204311,  0.18329844, -0.2132204 ,
        0.10813662,  0.00101487,  0.25050825,  0.02499843, -0.1711096 ,
        0.0842272 , -0.02111266,  0.0498506 , -0.05969177,  0.07424942,
       -0.08946718,  0.02495108,  0.13357803,  0.10525002, -0.01230905,
       -0.0306818 , -0.02400396,  0.10772721,  0.04698715,  0.00

In [22]:
# List the words whose vectors are most similar to that of "igi", with their similarity scores.
model.most_similar("igi")

[('ìgi', 0.759122371673584),
 ('jigi', 0.7245650887489319),
 ('gi', 0.7015208601951599),
 ('Igi', 0.694075882434845),
 ('dòdò', 0.6915137767791748),
 ('ewédú', 0.690724790096283),
 ('Dòdò', 0.6883922219276428),
 ('Ìgi', 0.6787936091423035),
 ('èso', 0.6696765422821045),
 ('èédú', 0.6695988178253174)]