In [1]:
import io
import json
import pickle
import random
from itertools import islice

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import clear_output
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import tqdm

In [2]:
from graph_utils import *

In [5]:
def convert_text_to_vector(text, dictionary, n, m):
    """Map every n-gram of ``text`` to the concatenation of its word vectors.

    Args:
        text: Document as a string; it is split on whitespace into words.
        dictionary: Mapping from word to a sliceable vector (list or array).
        n: Number of consecutive words per n-gram (must be >= 1).
        m: Number of leading components taken from each word vector.

    Returns:
        dict mapping the n-gram (its words joined by a single space) to a
        flat ``np.ndarray`` of length ``n * m``. An n-gram is skipped when
        any of its words is missing from ``dictionary`` or has a vector
        shorter than ``m``. A repeated n-gram appears once.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive n-gram length, got {n}")

    tokens = text.strip().split()
    text_dict = {}

    for start in range(len(tokens) - n + 1):
        words = tokens[start:start + n]
        if any(word not in dictionary for word in words):
            continue
        gram_vec = [dictionary[word][:m] for word in words]
        # Every word vector must supply exactly m components; checking only
        # the last one would let a ragged n-gram reach np.array().
        if any(len(vec) != m for vec in gram_vec):
            continue
        text_dict[' '.join(words)] = np.array(gram_vec).flatten()
    return text_dict

In [12]:
def vectorize_corpus_with_ngrams(corpus_path: str,
                                   separator: str,
                                   word_vector_dict: dict, n : int , m : int  ) -> np.ndarray:
    """Vectorize every document of a corpus file as n-grams of word vectors.

    The file is parsed as a ``$``-delimited table with the columns
    ``'Text Title', 'X', 'Y', 'Text'``; only the ``'Text'`` column is
    vectorized. (Presumably each line is ``title$$$text`` so that the two
    middle fields are empty -- verify against the corpus format.)

    Args:
        corpus_path: Path of the corpus file.
        separator: Currently ignored; the delimiter is fixed to ``"$"``.
            Kept so existing call sites keep working.
        word_vector_dict: Mapping from word to its embedding vector.
        n: Number of words per n-gram.
        m: Number of vector components taken per word.

    Returns:
        1-D object array of dicts with the keys ``'document_index'``
        (position of the document among the rows that have a text) and
        ``'vectorized_text'`` (result of ``convert_text_to_vector``).
    """
    colnames = ['Text Title', 'X', 'Y', 'Text']
    # dtype=str keeps purely numeric cells as strings, so every surviving
    # text supports the string operations done by convert_text_to_vector.
    df_corpus = pd.read_csv(corpus_path, delimiter="$", names=colnames,
                            header=None, dtype=str)[['Text Title', 'Text']]
    corpus = df_corpus.dropna(subset=['Text'])['Text'].tolist()

    vectorized_corpus = []
    for text_index, text in enumerate(tqdm(corpus)):
        if not text:
            continue
        vectorized_corpus.append({
            'document_index': text_index,
            'vectorized_text': convert_text_to_vector(text, word_vector_dict, n, m),
        })
    return np.array(vectorized_corpus, dtype=object)

In [13]:
# Number of consecutive words that form one n-gram.
n = 1
# Total vector length budget for one n-gram: the vectorization cells pass
# int(m/n) as the number of components taken from every word vector.
m = 6


# Thai SVD

In [14]:
class Vectorizer():
    """Builds SVD word embeddings from the TF-IDF matrix of a text corpus.

    Typical use is ``Vectorizer(path).get_emb_dict()``, which runs
    ``upload_corp`` -> ``make_tf_idf_matrix`` -> ``make_svd`` and returns a
    dict mapping each vocabulary word to its embedding vector.
    """

    def __init__(self, corp_path):
        # Path of the corpus text file; it is read by upload_corp().
        self.corp_path = corp_path

    def upload_corp(self):
        """Load the corpus into ``self.corp`` as a list of unique documents.

        Every line of the file is one document and ``'$$$'`` is replaced by
        a space. Duplicate lines are dropped, keeping first-occurrence order.
        """
        with open(self.corp_path, 'r', encoding='utf-8') as f:
            lines = f.read().split('\n')
        # Only the empty string produced by a trailing newline is discarded;
        # slicing off the last element unconditionally would lose the last
        # document of a file that does not end with a newline.
        if lines and lines[-1] == '':
            lines.pop()
        # dict.fromkeys de-duplicates while preserving file order, so the row
        # order of the TF-IDF matrix is the same on every run.
        self.corp = [text.replace('$$$', ' ') for text in dict.fromkeys(lines)]

    def log(self, part):
        """Replace the cell output with a one-line progress message."""
        clear_output(wait=True)
        print(f'{part} is processing')

    def make_tf_idf_matrix(self, token_pattern=None, min_df = 1, max_df=None, use_idf = True):
        """Fit a word-level TF-IDF model on ``self.corp``.

        Sets ``self.tfidf`` (the fitted vectorizer), ``self.W`` (documents x
        words matrix) and ``self.words_list`` (feature names).

        Args:
            token_pattern: Optional token regex; the vectorizer's default is
                used when falsy.
            min_df: Passed to ``TfidfVectorizer``.
            max_df: Passed to ``TfidfVectorizer`` when not ``None``.
            use_idf: Passed to ``TfidfVectorizer``.
        """
        tfidf_kwargs = dict(analyzer='word', min_df=min_df, use_idf=use_idf)
        if token_pattern:
            tfidf_kwargs['token_pattern'] = token_pattern
        if max_df is not None:
            tfidf_kwargs['max_df'] = max_df
        self.tfidf = TfidfVectorizer(**tfidf_kwargs)

        self.W = self.tfidf.fit_transform(self.corp)
        self.words_list = self.tfidf.get_feature_names_out()

    def make_svd(self, output_folder, k=30 ):
        """Compute a rank-``k`` truncated SVD of ``self.W`` and word embeddings.

        Sets ``self.u``, ``self.sigma`` (a ``k x k`` diagonal matrix) and
        ``self.vT`` with components ordered by descending singular value,
        ``self.embedded_matrix = sigma @ vT`` and ``self.words_embedding_dict``
        (word -> column of the embedded matrix). The dictionary is also
        written to JSON via ``save_word_embedding``.

        Args:
            output_folder: Currently unused (saving the raw matrices is
                disabled); kept for call-site compatibility.
            k: Number of singular values/vectors to compute.
        """
        u, singular_values, vT = svds(self.W, k)
        # Reorder the components so the largest singular value comes first.
        self.descending_order_of_inds = np.argsort(-singular_values)

        self.u = u[:, self.descending_order_of_inds]
        self.sigma = np.diag(singular_values[self.descending_order_of_inds])
        self.vT = vT[self.descending_order_of_inds, :]

        # Checking that sizes are ok
        assert self.sigma.shape == (k, k)
        assert self.vT.shape == (k, self.W.shape[1])
        assert self.u.shape == (self.W.shape[0], k)
        print(self.W.shape)
        print(self.sigma.shape)
        print(self.vT.shape)
        print(self.u.shape)

        self.embedded_matrix = self.sigma @ self.vT
        # One column of sigma @ vT per vocabulary word.
        self.words_embedding_dict = dict(zip(self.words_list, self.embedded_matrix.T))

        self.save_word_embedding(k)

    def save_word_embedding(self, shape):
        """Write ``self.words_embedding_dict`` to a JSON file.

        The file is named ``Thai_WordVectorDict_SVD<shape>.json`` and is
        created in the current working directory.
        """
        serializable = {key: value.tolist()
                        for key, value in self.words_embedding_dict.items()}
        save_name = "Thai_WordVectorDict_SVD" + str(shape) + ".json"
        with open(save_name, 'w', encoding='utf-8') as f:
            json.dump(serializable, f)

    def get_emb_dict(self, k=6, output_folder='./Matrixes'):
        """Run the whole pipeline and return the word -> embedding dict.

        Args:
            k: Embedding dimension (number of SVD components).
            output_folder: Forwarded to ``make_svd``.
        """
        self.log('Upload')
        self.upload_corp()
        self.log('TfIdf')
        self.make_tf_idf_matrix()
        self.log('SVD')
        self.make_svd(output_folder, k=k)

        return self.words_embedding_dict

In [15]:
# Build the SVD word-embedding dictionary (word -> vector) for the Thai corpus.
vect = Vectorizer('./Thai_corpus.txt')
emb_dict = vect.get_emb_dict()

SVD is processing
(5781, 8985590)
(6, 6)
(6, 8985590)
(5781, 6)


In [None]:
# #How long is the n-gram
# n = 1
# #How long is a vector for every word
# m = 100


In [16]:
# Vectorize every corpus document as n-grams of SVD word vectors;
# each word contributes int(m / n) components.
vectorized_corpus = vectorize_corpus_with_ngrams(
    './Thai_corpus.txt', ' ', emb_dict, n, int(m / n)
)

100%|██████████| 5878/5878 [01:31<00:00, 63.96it/s] 


In [17]:
# Persist the vectorized corpus (np.save appends the ".npy" extension).
np.save("Thai_vectorized_corpus_SVD_1gram_6", vectorized_corpus)

In [3]:
# Reload the saved SVD-vectorized corpus as a plain Python list.
# allow_pickle is needed because the array holds dict objects; only load
# files produced by this notebook.
At_VECTORIZED_CORPUS = "./Thai_vectorized_corpus_SVD_1gram_6.npy"
at_corpus = np.load(At_VECTORIZED_CORPUS, allow_pickle=True).tolist()

In [4]:
# Draw a 1% sample of the vectorized documents.
# The sample size gets its own name: `n` is the n-gram length used by the
# vectorization cells further down and must not be overwritten here.
sample_size = len(at_corpus) // 100
# Sample without replacement (no duplicated documents) from a fixed seed so
# the saved sample is reproducible.
test = random.Random(42).sample(at_corpus, k=sample_size)
np.save("Thai_vectorized_corpus_sample1%_SVD_1gram_6", np.array(test, dtype=object))


In [None]:
# vectorized_corpus = vectorized_corpus.tolist()

In [None]:
# vertices = create_vertices(vectorized_corpus, 6)

In [None]:
# len(vertices)

In [None]:
# gg = GG(vertices)
# gg.create_gabriel_graph()


In [None]:
# with open('Thai_Gabriel_Graph_SVD_1gram_100.pkl', 'wb') as outp:
#     pickle.dump(gg, outp, pickle.HIGHEST_PROTOCOL)

In [None]:
# degrees = gg.degree_distribution()
# dmax = max(degrees)

# fig = plt.figure("Degree of a random graph", figsize=(8, 8))
# # Create a gridspec for adding subplots of different sizes
# axgrid = fig.add_gridspec(5, 4)



# ax1 = fig.add_subplot(axgrid[:, :2])
# ax1.plot(degrees, "b-", marker="o")
# ax1.set_title("Degree Rank Plot")
# ax1.set_ylabel("Degree")
# ax1.set_xlabel("Rank")

# ax2 = fig.add_subplot(axgrid[:, 2:])
# ax2.bar(*np.unique(degrees, return_counts=True))
# ax2.set_title("Degree histogram")
# ax2.set_xlabel("Degree")
# ax2.set_ylabel("# of Nodes")

# fig.tight_layout()

In [None]:
# gabriel = gg.get_networkx_graph()

In [None]:
# print (nx.diameter(gabriel))

In [None]:
# print (nx.radius(gabriel))

In [None]:
# print (nx.average_shortest_path_length(gabriel))

In [None]:
# print (gabriel.number_of_nodes())

In [None]:
# print (gabriel.number_of_edges())

In [None]:
# print(nx.is_connected(gabriel))

# Thai CBOW

In [18]:
def get_vocab(corp_path):
    """Return the set of distinct words found in the corpus file.

    Each line has ``'$$$'`` replaced by a space and is then split on
    whitespace. The file is read as UTF-8 and closed on exit.
    """
    vocab = set()
    with open(corp_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            vocab.update(line.replace('$$$', ' ').split())
    return vocab

In [19]:
# Collect the distinct whitespace-separated tokens of the Thai corpus.
vocab = get_vocab('Thai_corpus.txt')

In [20]:
# Number of distinct tokens in the corpus.
len(vocab)

16074599

In [21]:
from gensim.models import Word2Vec
# Embedding dimension of the Word2Vec model.
dimension = 6
model = Word2Vec(vector_size=dimension, min_count=1)
# build_vocab expects an iterable of token lists. Passing the set of strings
# directly would treat every word as a sentence made of its characters, so
# each word is wrapped in its own one-token sentence.
model.build_vocab([word] for word in vocab)
model.save("word2vec_Thai.model")

In [22]:
%%time

# Number of corpus lines fed to the model per incremental training step.
BATCH_LINES = 1000

# Start from the saved model once; it is checkpointed after every batch.
model = Word2Vec.load("word2vec_Thai.model")
with open('Thai_corpus.txt', 'r', encoding='utf-8') as file:
    while True:
        # Tokenize like get_vocab(): '$$$' separates fields and must not stay
        # glued to the neighbouring words.
        documents = [line.replace('$$$', ' ').split()
                     for line in islice(file, BATCH_LINES)]
        if not documents:
            # End of file reached; never train on an empty batch.
            break
        model.build_vocab(documents, update=True)
        model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
        model.save("word2vec_Thai.model")

Wall time: 13min 39s


In [23]:
# Word -> trained vector for every word in the model vocabulary.
emb_dict = {word: model.wv[word] for word in model.wv.key_to_index}

In [24]:
# len(emb_dict)

In [25]:
# #How long is the n-gram
# n = 1
# #How long is a vector for every word
# m = 100


In [26]:
# Vectorize every corpus document as n-grams of CBOW word vectors;
# each word contributes int(m / n) components.
vectorized_corpus = vectorize_corpus_with_ngrams(
    './Thai_corpus.txt', ' ', emb_dict, n, int(m / n)
)

100%|██████████| 5878/5878 [04:13<00:00, 23.16it/s]


In [27]:
# Persist the vectorized corpus (np.save appends the ".npy" extension).
np.save("Thai_vectorized_corpus_CBOW_1gram_6", vectorized_corpus)

In [5]:
# Reload the saved CBOW-vectorized corpus as a plain Python list.
# allow_pickle is needed because the array holds dict objects; only load
# files produced by this notebook.
At_VECTORIZED_CORPUS = "./Thai_vectorized_corpus_CBOW_1gram_6.npy"
at_corpus = np.load(At_VECTORIZED_CORPUS, allow_pickle=True).tolist()

In [6]:
# Draw a 1% sample of the vectorized documents.
# The sample size gets its own name so that `n`, the n-gram length set in the
# configuration cell, is not overwritten.
sample_size = len(at_corpus) // 100
# Sample without replacement (no duplicated documents) from a fixed seed so
# the saved sample is reproducible.
test = random.Random(42).sample(at_corpus, k=sample_size)
np.save("Thai_vectorized_corpus_sample1%_CBOW_1gram_6", np.array(test, dtype=object))


In [None]:
# vectorized_corpus = vectorized_corpus.tolist()

In [None]:
# vertices = create_vertices(vectorized_corpus, 6)

In [None]:
# len(vertices)

In [None]:
# gg = GG(vertices)
# gg.create_gabriel_graph()


In [None]:
# with open('Thai_Gabriel_Graph_CBOW_1gram_100.pkl', 'wb') as outp:
#     pickle.dump(gg, outp, pickle.HIGHEST_PROTOCOL)

In [None]:
# degrees = gg.degree_distribution()
# dmax = max(degrees)

# fig = plt.figure("Degree of a random graph", figsize=(8, 8))
# # Create a gridspec for adding subplots of different sizes
# axgrid = fig.add_gridspec(5, 4)



# ax1 = fig.add_subplot(axgrid[:, :2])
# ax1.plot(degrees, "b-", marker="o")
# ax1.set_title("Degree Rank Plot")
# ax1.set_ylabel("Degree")
# ax1.set_xlabel("Rank")

# ax2 = fig.add_subplot(axgrid[:, 2:])
# ax2.bar(*np.unique(degrees, return_counts=True))
# ax2.set_title("Degree histogram")
# ax2.set_xlabel("Degree")
# ax2.set_ylabel("# of Nodes")

# fig.tight_layout()

In [None]:
# gabriel = gg.get_networkx_graph()

In [None]:
# print (nx.diameter(gabriel))

In [None]:
# print (nx.radius(gabriel))

In [None]:
# print (nx.average_shortest_path_length(gabriel))

In [None]:
# print (gabriel.number_of_nodes())

In [None]:
# print (gabriel.number_of_edges())

In [None]:
# print(nx.is_connected(gabriel))