# Fasttext word2vec

### In this notebook we train FastText and Word2Vec models with gensim and plot the embeddings of common Persian words.

In [None]:
!pip install hazm

In [1]:
# Import necessary libraries
from gensim.models import Word2Vec
from gensim.models import FastText
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go


In [2]:
# Mount Google Drive so files stored there are reachable under /content/drive
# (this import only exists inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')
# Read the documents from the JSON file into a DataFrame; the preprocessing
# step further down reads its 'text' column.
dataset = pd.read_json('/content/drive/MyDrive/datasets/data.json')

Mounted at /content/drive


In [3]:
# @title Preprocessing_Class
import hazm
import string
import re

class preprocessing:
  """Cleaning pipeline for Persian text held in the 'text' column of a DataFrame.

  `fit` runs, in order: diacritic removal, punctuation removal, collapsing of
  repeated characters, letter normalisation, whitespace tokenisation,
  stop-word removal and lemmatisation. Each row ends up as a list of lemmas.
  """

  def __init__(self):
    """Build the punctuation list, diacritics regex, stop-word set and lemmatizer."""
    persian_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    self.punctuations_list = string.punctuation + persian_punctuations
    # re.VERBOSE lets the alternatives be laid out one per line with comments.
    self.arabic_diacritics = re.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    # Stored as a set so the per-token membership test in _remove_stopwords is
    # O(1) instead of a linear scan over the stop-word list.
    self.stop_words = set(hazm.stopwords_list())
    self.lemmatizer = hazm.Lemmatizer()

  def fit(self, train_data):
    """Run the full pipeline over train_data['text'].

    Args:
      train_data: DataFrame with a 'text' column of raw strings.

    Returns:
      A new DataFrame equal to `train_data` except that 'text' holds the list
      of lemmas for each row. The input frame is left untouched, so the call
      can be repeated on the same raw frame (e.g. when a notebook cell is
      re-run) without feeding already-tokenised lists back into the regexes.
    """
    steps = (
        self._remove_diacritics,
        self._remove_punctuations,
        self._remove_repeating_char,
        self._normalize_persian,
        self._tokenize,
        self._remove_stopwords,
        self._lemmatizer,
    )
    texts = train_data['text']
    for step in steps:
      texts = texts.apply(step)
    # assign() returns a copy with the column replaced instead of mutating
    # the caller's frame.
    return train_data.assign(text=texts)


  def _remove_diacritics(self, text):
    """Strip the diacritics/kashida matched by `self.arabic_diacritics`."""
    return self.arabic_diacritics.sub('', text)

  def _remove_punctuations(self, text):
    """Delete every character listed in `self.punctuations_list`."""
    translator = str.maketrans('', '', self.punctuations_list)
    return text.translate(translator)

  def _remove_repeating_char(self, text):
    """Collapse each run of an identical character into a single occurrence.

    NOTE: this applies to every character (except newline, which `.` does not
    match), so genuine double letters and runs of spaces are collapsed too.
    """
    return re.sub(r'(.)\1+', r'\1', text)


  def _normalize_persian(self, text):
    """Map Arabic letter variants to Persian forms and blank out everything else.

    After the letter substitutions, every character that is not one of the
    Persian letters listed in the whitelist pattern is replaced by a space,
    and runs of whitespace are squeezed to a single space.
    """
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ي", "ی", text)
    text = re.sub("ؤ", "و", text)
    text = re.sub("ئ", "ی", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("ك" ,"ک" , text)
    text = re.sub("[^ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی]", " ", text)
    # Raw string: "\S" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    text = re.sub(r"[^\S\n\t]+", ' ', text)
    return text


  def _tokenize(self, text):
    """Split the text on whitespace into a list of tokens."""
    return text.split()

  def _remove_stopwords(self, words):
    """Return the tokens that are not in `self.stop_words`, order preserved."""
    return [word  for word in words if word not in self.stop_words]

  def _lemmatizer(self, words):
    """Lemmatize the tokens and return the distinct lemmas in first-seen order.

    De-duplicating through a plain set made the output order arbitrary and
    different between interpreter runs (string hashing is randomised), which
    scrambles the context windows used by the embedding models.
    dict.fromkeys keeps the same distinct lemmas but in a stable order.

    NOTE(review): repeated lemmas are still dropped, as before; confirm that
    discarding word frequency is intended for embedding training.
    """
    lemmas = (self.lemmatizer.lemmatize(token) for token in words)
    return list(dict.fromkeys(lemmas))

In [5]:
# Build the preprocessing pipeline and run it on the loaded documents;
# `data` is the frame whose 'text' column holds a list of lemmas per document.
pp = preprocessing()
data = pp.fit(dataset)

# Training Word2vec and Fasttext models

In [7]:
#Function to train Word2Vec model
def train_word2vec(data, vector_size=100, window=5, min_count=1, workers=4, seed=1):
    """Train a gensim Word2Vec model on the tokenised documents in data['text'].

    Args:
        data: mapping/DataFrame whose 'text' entry is an iterable of token lists.
        vector_size: dimensionality of the word vectors.
        window: context window size passed to Word2Vec.
        min_count: words with a lower total count are ignored by Word2Vec.
        workers: number of training threads.
        seed: seed forwarded to Word2Vec.
            NOTE(review): with workers > 1 the thread scheduling presumably
            still makes runs non-identical; use workers=1 if exact
            reproducibility matters.

    Returns:
        The trained Word2Vec model.
    """
    # Materialise the corpus once so it is a plain, re-iterable list of token lists.
    sentences = list(data['text'])
    model = Word2Vec(sentences, vector_size=vector_size, window=window,
                     min_count=min_count, workers=workers, seed=seed)
    return model

In [27]:
# Function to train FastText model
def train_fasttext(data, vector_size=100, window=5, min_count=1, workers=4, seed=1):
    """Train a gensim FastText model on the tokenised documents in data['text'].

    Args:
        data: mapping/DataFrame whose 'text' entry is an iterable of token lists.
        vector_size: dimensionality of the word vectors.
        window: context window size passed to FastText.
        min_count: words with a lower total count are ignored by FastText.
        workers: number of training threads.
        seed: seed forwarded to FastText.
            NOTE(review): with workers > 1 the thread scheduling presumably
            still makes runs non-identical; use workers=1 if exact
            reproducibility matters.

    Returns:
        The trained FastText model.
    """
    # Materialise the corpus once so it is a plain, re-iterable list of token lists.
    sentences = list(data['text'])
    model = FastText(sentences, vector_size=vector_size, window=window,
                     min_count=min_count, workers=workers, seed=seed)
    return model


# Plotting common Persian words

In [28]:
# Function to plot embeddings
def plot_embeddings(model, name,
                    words_path='/content/drive/MyDrive/datasets/827_common_parsi_words.txt',
                    random_state=None):
    """Project the embeddings of a word list to 2-D with t-SNE and plot them.

    Args:
        model: trained model exposing `model.wv[word]` (Word2Vec / FastText).
        name: prefix for the output file; the plot is written to
            name + 'tsne_persianwords.html'.
        words_path: text file with one word per line; blank lines are ignored.
        random_state: forwarded to TSNE; set an int for a reproducible layout.

    Raises:
        ValueError: if fewer than two of the listed words have a vector,
            since there is then nothing for t-SNE to lay out.
    """
    import inspect  # local import: only needed for the TSNE keyword check below

    # Read the word list. The encoding is explicit because the file holds
    # Persian text and the platform default is not guaranteed to be UTF-8.
    with open(words_path, 'r', encoding='utf-8') as f:
        words = [line.strip() for line in f]
    words = [w for w in words if w]

    # Collect the vectors together with the words they belong to. Labels must
    # come from `found_words`, not `words`: indexing the full list shifts every
    # label that follows an out-of-vocabulary word onto the wrong point.
    found_words = []
    vecs = []
    for w in words:
        try:
            vecs.append(model.wv[w])
        except KeyError:
            # word (and, for FastText, all of its n-grams) unknown to the model
            continue
        found_words.append(w)

    out_of_vocabulary = len(words) - len(found_words)
    print("out_of_vocabulary:",out_of_vocabulary)

    if len(found_words) < 2:
        raise ValueError(
            "need at least two in-vocabulary words to plot, got %d" % len(found_words))
    vecs = np.array(vecs)

    # apply dimensionality reduction to the vector representations using tSNE method
    tsne_kwargs = dict(
        n_components=2,
        verbose=1,
        # t-SNE requires perplexity < number of samples
        perplexity=min(30, len(found_words) - 1),
        random_state=random_state,
    )
    # scikit-learn renamed `n_iter` to `max_iter`; use whichever keyword the
    # installed version accepts.
    iter_kw = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'
    tsne_kwargs[iter_kw] = 20000
    results = TSNE(**tsne_kwargs).fit_transform(vecs)

    # plot the results with plotly package: one trace per word so every word
    # gets its own legend entry
    plots = [
        go.Scatter(x=[x], y=[y], mode='markers+text', text=[word],
                   textposition='bottom center',
                   marker=dict(size=10, color=i, colorscale='Jet', opacity=0.8),
                   textfont=dict(size=14,), name=word)
        for i, (word, (x, y)) in enumerate(zip(found_words, results))
    ]

    py.plot(plots, filename= name +'tsne_persianwords.html', auto_open=True)


In [29]:
# Train models
# Both models are fit on the same preprocessed frame `data`, so their
# embeddings can be compared on the same word list in the plotting cell.
word2vec_model = train_word2vec(data)
fasttext_model = train_fasttext(data)


In [30]:
# Plot embeddings
# The second argument is the prefix of the HTML file each call writes.
plot_embeddings(word2vec_model, 'Word2Vec_Embeddings')
# NOTE: no `glove_model` is defined anywhere in this notebook, so the GloVe plot stays disabled.
#plot_embeddings(glove_model, 'GloVe Embeddings')
plot_embeddings(fasttext_model, 'FastText_Embeddings')

out_of_vocabulary: 107
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 721 samples in 0.001s...
[t-SNE] Computed neighbors for 721 samples in 0.047s...
[t-SNE] Computed conditional probabilities for sample 721 / 721
[t-SNE] Mean sigma: 0.703712
[t-SNE] KL divergence after 250 iterations with early exaggeration: 64.031982
[t-SNE] KL divergence after 20000 iterations: 0.177927




out_of_vocabulary: 0
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 828 samples in 0.001s...
[t-SNE] Computed neighbors for 828 samples in 0.031s...
[t-SNE] Computed conditional probabilities for sample 828 / 828
[t-SNE] Mean sigma: 2.514999
[t-SNE] KL divergence after 250 iterations with early exaggeration: 64.555176
[t-SNE] KL divergence after 20000 iterations: 0.678683
