In [1]:
import glob
import tqdm

In [None]:
def make_corpus(input_path, output_file_path):
    """Concatenate every file of a folder into a one-document-per-line corpus.

    Each regular file directly inside ``input_path`` is read as UTF-8, its
    newlines are replaced by spaces, and the result is appended as a single
    line to ``output_file_path``. Files are processed in sorted path order.

    Note: the output file is opened in append mode, so calling this twice
    with the same output path duplicates the corpus; delete the file first
    for a clean rebuild.

    Args:
        input_path: Folder holding the prepared text files.
        output_file_path: Corpus file to append to (created if missing).
    """
    import os
    # Imported locally under a private alias: a later notebook cell runs
    # `from tqdm.notebook import tqdm`, which rebinds the global name `tqdm`
    # from the module to a class and would break a `tqdm.tqdm(...)` call here.
    from tqdm import tqdm as progress_bar

    # glob('/*') also returns sub-directories; opening one raises, so keep
    # regular files only.
    file_list = sorted(
        path for path in glob.glob(input_path + '/*') if os.path.isfile(path)
    )
    with open(output_file_path, 'a', encoding='utf-8') as output_file:
        for file in progress_bar(file_list):
            with open(file, 'r', encoding='utf-8') as input_file:
                file_content = input_file.read().replace('\n', ' ')
            output_file.write(file_content)
            output_file.write('\n')

In [None]:
# Build the corpus file from every file found in the "prepared_dataset" folder.
make_corpus ("prepared_dataset" , "Arabic_corpus.txt")

In [2]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import svds
from IPython.display import clear_output
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import pickle
import json
import pandas as pd 

In [3]:
from gensim.models import Word2Vec

In [4]:
from graph_utils import *

# make SVD

In [None]:
class Vectorizer():
    """Build SVD word embeddings from a one-document-per-line corpus.

    Pipeline (see ``get_emb_dict``): load the corpus, build a TF-IDF
    document/term matrix, take a truncated SVD of it and expose the result
    as a ``{word: vector}`` dictionary that is also written to a JSON file.
    """

    def __init__(self, corp_path):
        """Remember the corpus path; nothing is read until ``upload_corp``."""
        self.corp_path = corp_path

    def upload_corp(self):
        """Load the corpus into ``self.corp`` as a list of unique lines.

        Duplicated lines are dropped and the first-occurrence order of the
        file is kept, so the document order is the same on every run.
        """
        with open(self.corp_path, 'r', encoding='utf-8') as f:
            lines = f.read().split('\n')
        # Drop only the empty tail produced by a trailing newline; slicing
        # off the last element unconditionally would lose the final document
        # of a file that does not end with a newline.
        if lines and lines[-1] == '':
            lines.pop()
        # dict.fromkeys de-duplicates while preserving insertion order.
        self.corp = list(dict.fromkeys(lines))

    def log(self, part):
        """Clear the cell output and print which pipeline stage is running."""
        clear_output(wait=True)
        print(f'{part} is processing')

    def make_tf_idf_matrix(self, token_pattern=None, min_df = 1, max_df=None, use_idf = True):
        """Fit a TF-IDF vectorizer on ``self.corp``.

        Stores the fitted vectorizer in ``self.tfidf``, the document/term
        matrix in ``self.W`` and the vocabulary in ``self.words_list``.

        Args:
            token_pattern: Optional regex for tokens; the vectorizer's own
                default is used when falsy.
            min_df: Forwarded to ``TfidfVectorizer``.
            max_df: Forwarded to ``TfidfVectorizer`` when not ``None``;
                ``None`` keeps the vectorizer's default.
            use_idf: Forwarded to ``TfidfVectorizer``.
        """
        tfidf_kwargs = dict(analyzer='word', min_df=min_df, use_idf=use_idf)
        if token_pattern:
            tfidf_kwargs['token_pattern'] = token_pattern
        # max_df used to be accepted but silently ignored.
        if max_df is not None:
            tfidf_kwargs['max_df'] = max_df
        self.tfidf = TfidfVectorizer(**tfidf_kwargs)

        self.W = self.tfidf.fit_transform(self.corp)
        self.words_list = self.tfidf.get_feature_names_out()

    def make_svd(self, output_folder, k=6 ):
        """Compute a rank-``k`` SVD of ``self.W`` and derive word embeddings.

        After the call ``self.u`` and ``self.vT`` hold the singular vectors
        ordered by decreasing singular value, ``self.sigma`` is the k x k
        diagonal matrix of singular values, ``self.embedded_matrix`` is
        ``sigma @ vT`` and ``self.words_embedding_dict`` maps every word of
        ``self.words_list`` to its column of that matrix. The dictionary is
        then written to JSON through ``save_word_embedding``.

        Args:
            output_folder: Currently unused; kept so existing callers keep
                working.
            k: Number of singular values/vectors to compute.
        """
        self.u, self.sigma, self.vT = svds(self.W, k)
        # Reorder explicitly so the largest singular value always comes first.
        self.descending_order_of_inds = np.argsort(-self.sigma)

        self.u = self.u[:, self.descending_order_of_inds]
        self.sigma = np.diag(self.sigma[self.descending_order_of_inds])
        self.vT = self.vT[self.descending_order_of_inds, :]

        # Shapes of W, sigma, vT and u, printed as a quick sanity check.
        print (self.W.shape)
        print (self.sigma.shape)
        print (self.vT.shape)
        print (self.u.shape)

        self.embedded_matrix = self.sigma@self.vT
        # One column of the embedded matrix per vocabulary word.
        self.words_embedding_dict = dict(zip(self.words_list, self.embedded_matrix.T))

        self.save_word_embedding(k)

    def get_emb_dict(self):
        """Run the whole pipeline with default settings and return the embeddings.

        Returns:
            dict: word -> embedding vector (``self.words_embedding_dict``).
        """
        self.log('Upload')
        self.upload_corp()
        self.log('TfIdf')
        self.make_tf_idf_matrix()
        self.log('SVD')
        self.make_svd('./Matrixes')

        return self.words_embedding_dict

    def save_word_embedding(self, shape):
        """Write ``self.words_embedding_dict`` to a JSON file.

        The file is named ``Arabic_WordVectorDict_SVD<shape>.json`` and is
        created in the current working directory.

        Args:
            shape: Value appended to the file name (``make_svd`` passes k).
        """
        # Arrays are not JSON serialisable; convert each vector to a list.
        dict_ = {key: value.tolist() for key, value in self.words_embedding_dict.items()}
        save_name = "Arabic_WordVectorDict_SVD" + str(shape) + ".json"
        with open(save_name, 'w', encoding='utf-8') as f:
            json.dump(dict_, f)
            
    

In [None]:
# Run the full pipeline (load corpus -> TF-IDF -> SVD) on the Arabic corpus;
# emb_dict maps each vocabulary word to its embedding vector.
vect = Vectorizer('./Arabic_corpus.txt')
emb_dict = vect.get_emb_dict()

# make CBOW

In [None]:
def load_corpus(corp_path):
    """Load a one-document-per-line corpus as a list of token lists.

    Duplicated lines are dropped, keeping the first-occurrence order of the
    file so that the result is identical on every run. Each remaining line
    is split on whitespace.

    Args:
        corp_path: Path of the UTF-8 corpus file.

    Returns:
        list[list[str]]: One list of tokens per unique line.
    """
    with open(corp_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    # Drop only the empty tail produced by a trailing newline; slicing off
    # the last element unconditionally would lose the final document of a
    # file that does not end with a newline.
    if lines and lines[-1] == '':
        lines.pop()
    # dict.fromkeys de-duplicates while preserving insertion order.
    return [text.split() for text in dict.fromkeys(lines)]

In [None]:
# Tokenised documents, later passed to Word2Vec as training sentences.
documents = load_corpus('Arabic_corpus.txt')

In [None]:
# Number of unique documents loaded from the corpus.
len(documents)

In [None]:
# Embedding size; also used later in the name of the saved JSON dictionary.
dimension = 6
# min_count=1 keeps every word, including words that occur only once.
# NOTE(review): no seed/workers are fixed, so the vectors presumably differ
# between runs — confirm whether reproducibility matters here.
model = Word2Vec(sentences=documents, vector_size=dimension, min_count=1)
# Persist the trained model so it can be reloaded without retraining.
model.save("word2vec_Arabic.model")

In [None]:
# Map every word of the trained vocabulary to its vector.
dictionary = {key : model.wv[key] for key in model.wv.key_to_index}

In [None]:
def save_dictionary(dictionary,shape):
    """Write a ``{word: vector}`` dictionary to a JSON file.

    The file is named ``Arabic_WordVectorDict_CBOW<shape>.json`` and is
    created in the current working directory. The input dictionary is left
    untouched, so the function can safely be called more than once.

    Args:
        dictionary: Mapping of word -> vector (array with ``tolist`` or any
            iterable of numbers).
        shape: Value appended to the file name (the embedding size).
    """
    # Build a JSON-friendly copy instead of overwriting the caller's values:
    # in-place conversion made a second call fail on the already-converted lists.
    serializable = {
        key: value.tolist() if hasattr(value, 'tolist') else list(value)
        for key, value in dictionary.items()
    }
    save_name = "Arabic_WordVectorDict_CBOW" + str(shape) + ".json"
    with open(save_name, 'w', encoding='utf-8') as f:
        json.dump(serializable, f)

In [None]:
# Write the Word2Vec vectors to Arabic_WordVectorDict_CBOW<dimension>.json.
save_dictionary( dictionary, dimension)

# Vectorized corpora

In [None]:
# Corpus file to vectorise (same file the embeddings were built from).
corpus_path = './Arabic_corpus.txt'

In [None]:
def convert_text_to_vector(text, dictionary, n, m):
    """Map every fully-known n-gram of a text to one flat vector.

    The text is split on whitespace and scanned with a sliding window of
    ``n`` words. A window is kept only when all of its words are in
    ``dictionary``; each word contributes the first ``m`` components of its
    vector, and the pieces are concatenated in word order.

    Args:
        text: Raw text of one document.
        dictionary: Mapping of word -> vector (sliceable sequence).
        n: Number of words per n-gram.
        m: Number of leading vector components taken from each word.

    Returns:
        dict: n-gram (words joined by a single space) -> 1-D numpy array of
        length ``n * m``. A repeated n-gram appears once.
    """
    tokens = text.strip().split()
    ngram_vectors = {}

    for start in range(len(tokens) - n + 1):
        window = tokens[start:start + n]
        pieces = []
        for token in window:
            if token not in dictionary:
                # One unknown word invalidates the whole n-gram.
                break
            pieces.append(dictionary[token][:m])
        else:
            # Keep the n-gram only if it is complete and its last word
            # really supplied m components.
            if len(pieces) == n and len(pieces[-1]) == m:
                ngram_vectors[' '.join(window)] = np.array(pieces).flatten()
    return ngram_vectors

In [None]:
def vectorize_corpus_with_ngrams(corpus_path: str,
                                 separator: str,
                                 word_vector_dict: dict, n: int, m: int) -> np.ndarray:
    """Vectorise every document of a one-document-per-line corpus.

    The corpus is read as plain UTF-8 text, one document per line. It is
    deliberately not parsed as CSV: commas or quotes inside a document would
    be treated as field separators/quoting and corrupt the text.

    Args:
        corpus_path: Path of the corpus file.
        separator: Unused; kept so existing callers keep working.
        word_vector_dict: Mapping of word -> vector.
        n: Number of words per n-gram.
        m: Number of vector components taken from each word.

    Returns:
        np.ndarray: 1-D object array with one dict per non-blank line,
        holding ``'document_index'`` (zero-based line number in the file)
        and ``'vectorized_text'`` (result of ``convert_text_to_vector``).
    """
    with open(corpus_path, 'r', encoding='utf-8') as corpus_file:
        corpus = corpus_file.read().split('\n')
    # A trailing newline leaves one empty element at the end; drop it so the
    # progress bar total matches the number of lines.
    if corpus and corpus[-1] == '':
        corpus.pop()

    vectorized_corpus = []
    for text_index, text in enumerate(tqdm(corpus)):
        # Blank lines carry no n-grams; skip them but keep counting so that
        # document_index stays the line number.
        if not text.strip():
            continue
        vectorized_corpus.append({
            'document_index': text_index,
            'vectorized_text': convert_text_to_vector(text, word_vector_dict, n, m),
        })
    return np.array(vectorized_corpus, dtype=object)

In [None]:
# n: number of consecutive words in one n-gram.
n = 1
# m: vector length budget; the calls below pass int(m/n) as the number of
# components taken from each word, so one n-gram vector has about m entries.
m = 6


In [None]:
# Load the CBOW word-vector dictionary from its JSON file.
with open('Arabic_WordVectorDict_CBOW6.json', encoding='utf-8') as f:
    emb_dic = json.load(f)
#print(CBOW_dic)

In [None]:
# Spot-check the vector stored for one word.
print(emb_dic['التي'])

In [None]:
# Try the vectoriser on a single sentence before running the whole corpus.
convert_text_to_vector('والالقاب التي حققها طيله مسيرته', emb_dic, n, int(m/n))

In [None]:
# Vectorise the whole corpus with the embeddings currently held in emb_dic
# (the CBOW dictionary at this point of the notebook).
vectorized_corpus = vectorize_corpus_with_ngrams(corpus_path,
                                               ' ',
                                               emb_dic,n,int(m/n))

In [None]:
# Save the CBOW-vectorised corpus; np.save appends ".npy" to the name.
np.save("Arabic_vectorized_corpus_CBOW_1gram_6", vectorized_corpus)

In [None]:
# Load the SVD word-vector dictionary. This rebinds emb_dic, which held the
# CBOW dictionary until now.
with open("Arabic_WordVectorDict_SVD6.json", encoding='utf-8') as f:
    emb_dic = json.load(f)

In [None]:
# Vectorise the whole corpus again, now with the SVD embeddings in emb_dic.
vectorized_corpus = vectorize_corpus_with_ngrams(corpus_path,
                                               ' ',
                                               emb_dic,n,int(m/n))

In [None]:
# Save the SVD-vectorised corpus; np.save appends ".npy" to the name.
np.save("Arabic_vectorized_corpus_SVD_1gram_6", vectorized_corpus)

 # Arabic Graphs

In [5]:
# Load the SVD-vectorised corpus written earlier in this notebook.
At_VECTORIZED_CORPUS = "./Arabic_vectorized_corpus_SVD_1gram_6.npy"
# The file stores an object array of dicts, hence allow_pickle=True;
# unpickling can execute code, so only load files you produced yourself.
at_corpus = np.load(At_VECTORIZED_CORPUS, allow_pickle=True)
# Convert to a plain Python list of per-document dicts.
at_corpus = at_corpus.tolist()

In [6]:
# Number of vectorised documents.
len(at_corpus)

2912

In [7]:
import random

# Draw a reproducible ~1% sample of the documents.
# A dedicated name is used for the sample size: `n` is the n-gram length read
# by the vectorisation cells, and overwriting it here silently changed their
# behaviour whenever they were re-run afterwards.
SAMPLE_SEED = 42
sample_size = int(len(at_corpus)/100)
# random.sample draws without replacement, so no document appears twice
# (random.choices could repeat documents).
test = random.Random(SAMPLE_SEED).sample(at_corpus, k=sample_size)

In [8]:
# Number of documents in the sample.
print (len(test))

29


In [9]:
# Write the sample to disk; np.save appends ".npy" to the name.
# The 10% variant below is kept commented out for reference.
#np.save("Arabic_vectorized_corpus_sample10%_SVD_1gram_6", np.array(test))
np.save("Arabic_vectorized_corpus_sample1%_SVD_1gram_6", np.array(test))

In [None]:
# Reload the sample that the notebook actually writes (the 1% file). The 10%
# file was referenced here, but its np.save call is commented out, so a fresh
# Restart & Run All failed with FileNotFoundError.
At_VECTORIZED_CORPUS = "./Arabic_vectorized_corpus_sample1%_SVD_1gram_6.npy"
# The file stores an object array of dicts, hence allow_pickle=True;
# only load files you produced yourself.
at_corpus = np.load(At_VECTORIZED_CORPUS, allow_pickle=True)
at_corpus = at_corpus.tolist()

In [None]:
# Build graph vertices from the vectorised documents (helper from graph_utils).
# NOTE(review): 6 is presumably the embedding length — confirm against create_vertices.
vertices = create_vertices(at_corpus, 6)

In [None]:
# Number of vertices created.
len(vertices)

In [None]:
# GG comes from the graph_utils star import; presumably the Gabriel-graph
# container — confirm against graph_utils.
gg = GG(vertices)



In [None]:
# Presumably clears any neighbour sets before edges are added — confirm in graph_utils.
gg.reset_graph_neighbors()
        

In [None]:
# Vertex vectors; these are the points passed to the Delaunay triangulation below.
vectors = gg.get_vectors()
        

In [None]:
# Vertex words; presumably in the same order as the vectors — confirm in graph_utils.
words = gg.get_words()
        

In [None]:
# Lookup from position in `words` to the word itself (enumerate yields
# (index, item)), used to translate point indices back into words.
word_num_dict = {index: word for index, word in enumerate(words)}
        

In [None]:
# Progress log for the triangulation. The handle stays open across the
# following cells and must be closed with file_.close() when they are done.
file_ = open('eng_delaunay_progress.txt', 'w')
        

In [None]:
# Mark the start of the triangulation in the progress log.
file_.write("Delaunay start... \n")
        

In [None]:
# Triangulate the vertex vectors and keep the result on the graph object.
# NOTE(review): Delaunay presumably is scipy.spatial.Delaunay brought in by the
# graph_utils star import; the qhull option string should be checked against
# the Qhull documentation — confirm both.
gg.delaunay = Delaunay(np.array(vectors), qhull_options="Qbb Qc Qz Qx Q12")
        

In [None]:
# Simplices as plain lists; presumably each holds point indices into `vectors` — confirm.
delaunay_graph = gg.delaunay.simplices.tolist()
        

In [None]:
# Mark the end of the triangulation in the progress log.
file_.write("Delaunay done!")
       

In [None]:
 for triangle in tqdm(delaunay_graph, file=file_):
            triangle_words = set(map(word_num_dict.get, triangle))
            self.triangles.append(triangle_words)
            for word in triangle_words:
                new_neighbors = triangle_words.difference(set([word]))
                self.vertices[word].neighbors.update(new_neighbors)
    file_.close()

In [None]:
# Build the Gabriel graph for the SVD embeddings (implemented in graph_utils).
gg.create_gabriel_graph()

In [None]:
# Persist the SVD graph object with the highest pickle protocol available.
with open('Arabic_Gabriel_Graph_SVD_1gram_6.pkl', 'wb') as outp:
    pickle.dump(gg, outp, pickle.HIGHEST_PROTOCOL)

In [None]:
# with open('Arabic_Gabriel_Graph_SVD_1gram_6.pkl', 'rb') as inp:
#     At_graph_svd_1g6 = pickle.load(inp)

In [10]:
# Load the CBOW-vectorised corpus; this rebinds at_corpus, which held SVD data.
At_VECTORIZED_CORPUS = "./Arabic_vectorized_corpus_CBOW_1gram_6.npy"
# The file stores an object array of dicts, hence allow_pickle=True;
# only load files you produced yourself.
at_corpus = np.load(At_VECTORIZED_CORPUS, allow_pickle=True)
at_corpus = at_corpus.tolist()

In [11]:
import random

# Draw a reproducible ~1% sample of the CBOW-vectorised documents.
# A dedicated name is used for the sample size: `n` is the n-gram length read
# by the vectorisation cells, and overwriting it here silently changed their
# behaviour whenever they were re-run afterwards.
SAMPLE_SEED = 42
sample_size = int(len(at_corpus)/100)
# random.sample draws without replacement, so no document appears twice
# (random.choices could repeat documents).
test = random.Random(SAMPLE_SEED).sample(at_corpus, k=sample_size)

In [12]:
# Write the sample to disk; np.save appends ".npy" to the name.
# The 10% variant below is kept commented out for reference.
#np.save("Arabic_vectorized_corpus_sample10%_CBOW_1gram_6", np.array(test))
np.save("Arabic_vectorized_corpus_sample1%_CBOW_1gram_6", np.array(test))

In [None]:
# Build graph vertices from at_corpus. Note this is the full CBOW corpus, not
# the sample held in `test`.
# NOTE(review): 6 is presumably the embedding length — confirm against create_vertices.
vertices = create_vertices(at_corpus, 6)

In [None]:
# Build the Gabriel graph for the CBOW embeddings; this rebinds gg, which
# held the SVD graph.
gg = GG(vertices)
gg.create_gabriel_graph()


In [None]:
# Persist the CBOW graph object with the highest pickle protocol available.
with open('Arabic_Gabriel_Graph_CBOW_1gram_6.pkl', 'wb') as outp:
    pickle.dump(gg, outp, pickle.HIGHEST_PROTOCOL)

In [None]:
# with open('Arabic_Gabriel_Graph_CBOW_1gram_6.pkl', 'rb') as inp:
#     At_graph_cbow_1g6 = pickle.load(inp)