In [None]:
#The First International Workshop on Arabic Big Data & AI (IWABigDAI) May 11 and May 12 2022
#https://sites.google.com/view/arabicbigdata/home

#Tutorial 3: Visualising with Word Embeddings
#author: Dr Mahmoud El-Haj (with help from the Internet)
#GitHub repository: https://github.com/drelhaj/NLP_ML_Visualization_Tutorial

#Thanks to Jeff Delaney kaggle.com/jeffd23 for this simple tutorial on Kaggle https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne

#Visualizing Word Vectors with t-SNE
# t-SNE is pretty useful when it comes to visualizing similarity between objects. It takes a group of high-dimensional vocabulary word feature vectors (100 dimensions via Word2Vec) and compresses them down to 2-dimensional x,y coordinate pairs. The idea is to keep similar words close together on the plane, while maximizing the distance between dissimilar words.

#Steps
#1.Clean the data
#2.Build a corpus
#3.Train a Word2Vec Model
#4.Visualize t-SNE representations of the most common words


#TIP: In Jupyter you can put %%capture on the first line of a cell to hide warnings (careful: this suppresses all output from that cell)
#Otherwise use something like:
#with warnings.catch_warnings():
#    warnings.simplefilter('ignore')
#    # Your problematic instruction(s) here

In [None]:
%%capture
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import re
import nltk

from gensim.models import word2vec

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline



In [None]:
# Read the tweets stored in csvs/2020-Qatar.csv. This file is comma-separated;
# check the delimiter of your own files first, it is not always a comma.
df = pd.read_csv("csvs/2020-Qatar.csv", delimiter=',', header=0, encoding='utf8')
print('Number of tweets: {:,}\n'.format(df.shape[0]))
# Preview a few rows; the fixed seed shows the same rows on every
# Restart & Run All, and min() keeps the cell working on very small files.
df.sample(min(3, len(df)), random_state=23)

In [None]:
#remove chars that are not letters or numbers, downcase, then remove stop words

import nltk
import re
import math
# Make sure the NLTK stop-word corpus is installed, downloading it on first use,
# so the notebook runs on a fresh environment without editing this cell.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Arabic stop words as a set for fast membership tests in clean_sentence()
ar_stop = set(nltk.corpus.stopwords.words('arabic'))

# Patterns are compiled once; raw strings keep `\s` and `\w` from being read as
# (invalid) string escape sequences.
_MULTI_SPACE_RE = re.compile(r' +')
_NON_WORD_RE = re.compile(r'([^\s\w]|_)+')


def clean_sentence(text, stop_words=None, min_length=3):
    """Normalise one sentence and drop stop words and very short tokens.

    Steps, in order: remove every digit character, turn ``<`` and ``>`` into
    spaces, collapse runs of spaces, strip everything that is not a word
    character or whitespace (underscores are stripped too), lower-case, split
    on single spaces, and finally drop tokens that are stop words or shorter
    than ``min_length`` characters.

    Parameters
    ----------
    text : str
        Raw sentence. Anything that is not a string (e.g. ``None`` or the NaN
        pandas uses for missing values) is treated as an empty sentence.
    stop_words : collection of str, optional
        Words to remove. Defaults to the notebook-level ``ar_stop`` set.
    min_length : int, optional
        Minimum token length to keep (default 3).

    Returns
    -------
    str
        The surviving tokens joined by single spaces (possibly ``""``).
    """
    if not isinstance(text, str):
        return ""
    if stop_words is None:
        stop_words = ar_stop

    text = ''.join(ch for ch in text if not ch.isdigit())
    text = text.replace('>', ' ').replace('<', ' ')
    text = _MULTI_SPACE_RE.sub(' ', text)
    text = _NON_WORD_RE.sub('', text).lower()

    # A single filtering pass replaces the repeated list.remove() calls; empty
    # strings produced by the split are dropped by the length test.
    kept = [token for token in text.split(" ")
            if len(token) >= min_length and token not in stop_words]
    return " ".join(kept)

In [None]:
#Creates a list of lists containing words from each sentence
import arabic_reshaper # this was missing in your code
from bidi.algorithm import get_display

def build_corpus(columnData):
    """Turn a column of raw sentences into a list of token lists.

    Each string in ``columnData`` is cleaned with ``clean_sentence``, passed
    through ``arabic_reshaper.reshape`` and ``get_display``, and split on
    single spaces. Entries that are not strings (missing values) and sentences
    that are empty after cleaning are skipped, so the result can be shorter
    than the input.

    Because the tokens are stored in reshaped/display form, any word looked up
    in a model trained on this corpus has to go through the same two calls.
    NOTE(review): reshaping is usually a display-only step; confirm that
    training on the reshaped text is intended.

    Parameters
    ----------
    columnData : iterable of str
        For example a pandas Series such as ``df["tweet_text"]``.

    Returns
    -------
    list of list of str
    """
    corpus = []
    # Iterate over the argument itself rather than the global `df`, so the
    # function works for any column or list and does not depend on index labels.
    for raw_text in columnData:
        if not isinstance(raw_text, str):
            continue  # missing value (NaN / None)
        sentence = clean_sentence(raw_text)
        if not sentence:
            continue  # nothing left after cleaning; avoids an empty-string token
        sentence = get_display(arabic_reshaper.reshape(sentence))
        corpus.append(sentence.split(" "))
    return corpus

corpus = build_corpus(df["tweet_text"])




In [None]:
# First two tweets exactly as they appear in the CSV file ...
print("[", df.loc[0, "tweet_text"], "],[", df.loc[1, "tweet_text"], "]")
# ... and the first two token lists after cleaning and stop-word removal
print(corpus[:2])

In [None]:
#Word2Vec model
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
# The Word2Vec model produces a vocabulary in which every word is represented
# by an n-dimensional numpy array (100 values in this example).
#
# sg=0 switches skip-gram off, so the default CBOW architecture is used. CBOW
# tries to predict a word on the basis of its neighbours, while skip-gram tries
# to predict the neighbours of a word. In simpler words, CBOW estimates the
# probability of a word occurring in a context, so it generalises over all the
# different contexts in which a word can be used.
#
# The vector dimensionality is left at its default of 100 on purpose: the
# keyword is `size` in gensim 3.x but `vector_size` in gensim 4.x, so not naming
# it keeps this cell runnable on both.
import os

model = Word2Vec(sentences=corpus, window=5, min_count=1, workers=4, sg=0)
assert model.wv.vector_size == 100, "the tutorial assumes 100-dimensional vectors"

os.makedirs("models", exist_ok=True)  # save() does not create missing directories
model.save("models/word2vec.model")




In [None]:
# Look up the embedding of a word. The corpus tokens were stored in reshaped,
# display-ordered form, so the query goes through the same two calls first.
word = 'عيد'
arb_word = get_display(arabic_reshaper.reshape(word))
model.wv[arb_word]

In [None]:
# OOV: in some cases, and especially with small training data, some words are
# never observed while training the embedding. These are known as
# out-of-vocabulary (OOV) words, and looking one up raises a KeyError.
word = 'الأضحى'
arb_word = get_display(arabic_reshaper.reshape(word))
oov_word = arb_word
try:
    model.wv[oov_word]
except KeyError:
    print("Oops! The word","[",word,"] not in vocabulary!")

In [None]:
%%capture
#Plotting similarities of a word embedding model using a scatter plot from t-SNE
#t-Distributed Stochastic Neighbor Embedding (t-SNE) is a non-linear dimensionality reduction algorithm used for exploring high-dimensional data. It maps multi-dimensional data to two or more dimensions suitable for human observation. With the help of the t-SNE algorithm, you may need fewer exploratory data analysis plots the next time you work with high-dimensional data.
def tsne_plot(model,modelName):
    """Create a t-SNE model of every word vector in ``model`` and plot it.

    All words in the model's vocabulary are projected to two dimensions with
    t-SNE (PCA initialisation, fixed random state) and drawn as an annotated
    scatter plot. The figure is saved to
    ``plots/word_embeddings_<modelName>.pdf`` and then shown.

    Parameters
    ----------
    model : trained gensim Word2Vec model (only ``model.wv`` is used).
    modelName : str
        Label used in the name of the saved PDF.
    """
    import inspect
    import os

    wv = model.wv
    # gensim 4 exposes the vocabulary as `key_to_index`; gensim 3 as `vocab`.
    vocab = getattr(wv, "key_to_index", None)
    if vocab is None:
        vocab = wv.vocab
    labels = list(vocab)
    # Index through `.wv`: indexing the model object itself is deprecated.
    tokens = np.array([wv[word] for word in labels])

    # t-SNE needs perplexity < number of samples, so cap it for small vocabularies.
    perplexity = max(1, min(40, len(labels) - 1))
    # scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter`; use whichever
    # keyword the installed version accepts.
    iter_kwarg = "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"
    tsne_model = TSNE(perplexity=perplexity, n_components=2, init='pca',
                      random_state=23, **{iter_kwarg: 2500})
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One scatter call per point, as before, so each word gets its own colour
    # from the colour cycle.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    os.makedirs('plots', exist_ok=True)  # savefig() does not create directories
    pltFileName = 'plots'+'/'+'word_embeddings'+'_'+modelName+'.pdf'
    plt.savefig(pltFileName)
    plt.show()

In [None]:
# Second CBOW model (sg=0), trained with the same settings as `model`.
# NOTE(review): the name suggests min_count=10, but min_count=1 is used here,
# so every word in the corpus stays in the vocabulary. Confirm which is intended.
#
# The vector dimensionality is left at its default of 100: the keyword is `size`
# in gensim 3.x but `vector_size` in gensim 4.x, so not naming it keeps this
# cell runnable on both.
import os

model10 = Word2Vec(sentences=corpus, window=5, min_count=1, workers=4, sg=0)

os.makedirs("./models", exist_ok=True)  # save() does not create missing directories
model10.save("./models/word2vec10.model")


In [None]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # Plot the whole vocabulary of model10: it was trained with min_count=1, so
    # no frequency cut-off is applied. Warnings raised while plotting are silenced.
    tsne_plot(model10,"model10")

In [None]:
from sklearn.decomposition import PCA
import seaborn as sns
sns.set_style("darkgrid")


def tsnescatterplot(model, word, list_names=None):
    """Plot a query word, its most similar words and optional extra words.

    The vectors are first reduced with PCA (to at most 10 components) and then
    projected to two dimensions with t-SNE. The query word is drawn in red,
    its most similar words in blue, and additional words from ``list_names``
    in green. The figure is saved to
    ``plots/word_embeddings_similarity_to_<word>.pdf``.

    Parameters
    ----------
    model : trained gensim Word2Vec model (only ``model.wv`` is used).
    word : str
        Query word, in the same form as the tokens the model was trained on.
    list_names : iterable of str, optional
        Extra words to show. Words that are already plotted or that are not in
        the vocabulary are skipped.

    Raises
    ------
    KeyError
        If ``word`` is not in the model's vocabulary.
    """
    import os

    wv = model.wv
    word_labels = [word]
    color_list = ['red']

    # the closest words to the query word
    for neighbour, _score in wv.most_similar([word]):
        word_labels.append(neighbour)
        color_list.append('blue')

    # extra words requested by the caller, without duplicating plotted points
    for extra in (list_names or []):
        if extra in word_labels or extra not in wv:
            continue
        word_labels.append(extra)
        color_list.append('green')

    # One lookup for all labels; the width follows the model rather than a
    # hard-coded 100.
    arrays = wv[word_labels]
    n_samples, n_features = arrays.shape

    # Reduce the dimensionality with PCA before t-SNE. PCA cannot keep more
    # components than there are samples or features.
    reduc = PCA(n_components=min(10, n_samples, n_features)).fit_transform(arrays)

    # Kept from the original implementation: switches numpy printing to
    # non-scientific notation for the whole session.
    np.set_printoptions(suppress=True)

    # Find the t-SNE coordinates in 2 dimensions. Perplexity must be smaller
    # than the number of points, and the query word plus its neighbours can be
    # fewer than 15.
    perplexity = max(1, min(15, n_samples - 1))
    Y = TSNE(n_components=2, random_state=0, perplexity=perplexity).fit_transform(reduc)

    # Named `points` so it does not shadow the notebook-level `df`
    points = pd.DataFrame({'x': Y[:, 0],
                           'y': Y[:, 1],
                           'words': word_labels,
                           'color': color_list})

    fig, ax = plt.subplots(figsize=(9, 9))

    # Basic plot
    sns.regplot(data=points,
                x="x",
                y="y",
                fit_reg=False,
                marker="o",
                scatter_kws={'s': 40,
                             'facecolors': points['color']
                            },
                ax=ax
               )

    # Add the word labels one by one
    for row in points.itertuples(index=False):
        ax.text(row.x,
                row.y,
                '  ' + row.words.title(),
                horizontalalignment='left',
                verticalalignment='bottom',
                size=15,
                color=row.color,
                weight='normal'
               )

    # Leave a margin around the outermost points so labels are not clipped
    ax.set_xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    ax.set_ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)

    ax.set_title('t-SNE visualization for {}'.format(word.title()))

    os.makedirs('plots', exist_ok=True)  # savefig() does not create directories
    pltFileName = 'plots'+'/'+'word_embeddings_similarity_to'+'_'+word+'.pdf'
    fig.savefig(pltFileName)
    

In [None]:
# Words most similar to the query word. The query is reshaped and put in display
# order first, matching the form of the tokens the model was trained on.
word = 'عيد'
arb_word = get_display(arabic_reshaper.reshape(word))
# Query through `.wv`, as the other cells do; `model.most_similar` is the
# deprecated spelling and is absent from newer gensim releases.
model.wv.most_similar(arb_word)


In [None]:
# Plot the query word together with the words most similar to it
tsnescatterplot(
    model,
    arb_word,
    [neighbour for neighbour, _score in model.wv.most_similar(arb_word)],
)