In [13]:
import json
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import clear_output
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import tqdm

In [10]:
class Vectorizer():
    """Build TF-IDF + truncated-SVD word embeddings from a line-per-document corpus.

    The usual entry point is ``get_emb_dict``, which runs the three stages in
    order: ``upload_corp`` -> ``make_tf_idf_matrix`` -> ``make_svd``.
    """

    def __init__(self, corp_path):
        """Store the corpus path; nothing is read until ``upload_corp`` is called."""
        self.corp_path = corp_path

    def upload_corp(self):
        """Read the corpus into ``self.corp`` as a list of unique documents.

        Each line of the file is one document. Exact duplicate lines are
        dropped (first occurrence wins) and every ``$$$`` field separator is
        replaced by a single space, so all fields of the line end up in the
        document text.
        """
        with open(self.corp_path, 'r', encoding='utf-8') as f:
            lines = f.read().split('\n')

        # Only discard the final piece when it is the empty string left behind
        # by a trailing newline; slicing it off unconditionally would lose the
        # last document of a file that does not end with '\n'.
        if lines and lines[-1] == '':
            lines.pop()

        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the row order of the TF-IDF matrix is the same on every run (a set
        # would reorder the documents from one interpreter session to the next).
        unique_lines = list(dict.fromkeys(lines))

        # NOTE(review): the first field looks like a percent-encoded title
        # (the vocabulary contains tokens such as '89cole'); it is kept as-is
        # here so the vocabulary does not change -- confirm whether titles
        # should be decoded or excluded.
        self.corp = [text.replace('$$$', ' ') for text in unique_lines]

    def log(self, part):
        """Clear the cell output and print which stage is currently running."""
        clear_output(wait=True)
        print(f'{part} is processing')

    def make_tf_idf_matrix(self, token_pattern=None, min_df=1, max_df=None, use_idf=True):
        """Fit a word-level TF-IDF model on ``self.corp``.

        Parameters
        ----------
        token_pattern : str or None
            Regular expression defining a token; when falsy, the
            ``TfidfVectorizer`` default is used.
        min_df, max_df : int or float
            Document-frequency cut-offs forwarded to ``TfidfVectorizer``.
            ``max_df=None`` keeps the ``TfidfVectorizer`` default.
        use_idf : bool
            Whether to apply inverse-document-frequency re-weighting.

        Sets ``self.tfidf`` (fitted vectorizer), ``self.W`` (documents x words
        matrix) and ``self.words_list`` (feature names, in column order).
        """
        params = {'analyzer': 'word', 'min_df': min_df, 'use_idf': use_idf}
        if token_pattern:
            params['token_pattern'] = token_pattern
        # max_df used to be accepted but silently ignored; forward it when given.
        if max_df is not None:
            params['max_df'] = max_df
        self.tfidf = TfidfVectorizer(**params)

        self.W = self.tfidf.fit_transform(self.corp)
        self.words_list = self.tfidf.get_feature_names_out()

    def make_svd(self, output_folder, k=6):
        """Compute a rank-``k`` SVD of ``self.W`` and derive the word embeddings.

        Parameters
        ----------
        output_folder : str
            Folder that receives ``<k>_sigma_vt.npy``, ``<k>_sigma.npy``,
            ``<k>_u.npy`` and ``<k>_vt.npy``. It is created when missing.
        k : int
            Number of singular triplets to keep (the embedding dimension).

        After the call ``self.sigma`` is the ``k x k`` *diagonal matrix* of
        singular values in descending order, ``self.u`` / ``self.vT`` are
        reordered to match, ``self.embedded_matrix`` is ``sigma @ vT`` and
        ``self.words_embedding_dict`` maps each word to its length-``k`` column.
        The dictionary is also written to JSON through ``save_word_embedding``.
        """
        self.u, self.sigma, self.vT = svds(self.W, k)

        # svds makes no promise about the order of the singular values, so sort
        # them (and the matching singular vectors) from largest to smallest.
        self.descending_order_of_inds = np.argsort(-self.sigma)
        self.u = self.u[:, self.descending_order_of_inds]
        self.sigma = np.diag(self.sigma[self.descending_order_of_inds])
        self.vT = self.vT[self.descending_order_of_inds, :]

        # Checking that sizes are ok
        n_docs, n_words = self.W.shape
        assert self.u.shape == (n_docs, k)
        assert self.sigma.shape == (k, k)
        assert self.vT.shape == (k, n_words)
        print (self.W.shape)
        print (self.sigma.shape)
        print (self.vT.shape)
        print (self.u.shape)

        # Row i of vT weighted by the i-th singular value; column j is word j.
        self.embedded_matrix = self.sigma @ self.vT
        self.words_embedding_dict = dict(zip(self.words_list, self.embedded_matrix.T))

        # Create the target folder on first use instead of failing in open().
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)
        arrays_to_save = {
            'sigma_vt': self.embedded_matrix.T,  # words x k
            'sigma': self.sigma,
            'u': self.u,
            'vt': self.vT,
        }
        for suffix, array in arrays_to_save.items():
            target = os.path.join(output_folder, str(k) + '_' + suffix + '.npy')
            with open(target, 'wb') as f:
                np.save(f, array)

        self.save_word_embedding(k)

    def get_emb_dict(self, output_folder='./Matrixes', k=6):
        """Run the whole pipeline and return the word -> embedding dictionary.

        ``output_folder`` and ``k`` are forwarded to ``make_svd``; their
        defaults reproduce the previous hard-coded behaviour.
        """
        self.log('Upload')
        self.upload_corp()
        self.log('TfIdf')
        self.make_tf_idf_matrix()
        self.log('SVD')
        self.make_svd(output_folder, k)

        return self.words_embedding_dict

    def save_word_embedding(self, shape):
        """Dump ``self.words_embedding_dict`` to ``Atikamekw_WordVectorDict<shape>.json``.

        The file is written to the current working directory; vectors are
        converted to plain lists so they are JSON-serialisable.
        """
        dict_ = {key: value.tolist() for key, value in self.words_embedding_dict.items()}
        save_name = "Atikamekw_WordVectorDict" + str(shape) + ".json"
        with open(save_name, 'w', encoding='utf-8') as f:
            json.dump(dict_, f)
            
    

In [11]:
# Build the embeddings end to end (read corpus -> TF-IDF -> SVD). get_emb_dict also
# writes the factor matrices and a JSON copy of the dictionary to disk.
vect = Vectorizer('./Atikamekw_corpus.txt')
emb_dict = vect.get_emb_dict()

SVD is processing
(1555, 12506)
(6, 6)
(6, 12506)
(1555, 6)


In [12]:
# Print a rounded preview of the embeddings, stopping once the entry at
# position 1000 has been shown (so at most 1001 lines are printed).
for c, (k, embedding) in enumerate(emb_dict.items()):
    print(f'{k}: ', embedding.round(4))
    if c >= 1000:
        break



27:  [-0.0413  0.0557  0.007   0.0049 -0.0901 -0.0077]
27ancienne:  [-0.0058 -0.0003 -0.0007  0.0009 -0.0031 -0.002 ]
27as:  [-0.0027 -0.0021 -0.0001  0.0035  0.0009 -0.0013]
27assomption:  [-0.0054  0.0019  0.0002 -0.0012 -0.0031 -0.0008]
27axstal:  [-0.0027 -0.0021 -0.0001  0.0035  0.0009 -0.0013]
27s:  [-0.0335  0.0471  0.0171  0.0024 -0.0151 -0.0305]
27urf:  [-0.0141  0.0199  0.0028  0.0029 -0.0315  0.0024]
80:  [-0.0092  0.0061  0.0012  0.0007 -0.0157  0.0007]
89cole:  [-0.0039 -0.0024 -0.0002  0.002  -0.0005  0.    ]
89piphanie:  [-0.0077  0.0074  0.0002 -0.0025 -0.006  -0.0119]
8ele:  [-0.0635  0.0897  0.0227  0.0115 -0.1086 -0.0248]
93il:  [-0.0127  0.0182  0.0023  0.0015 -0.0236  0.0015]
93lac:  [-0.0059  0.0083  0.0012  0.0016 -0.0156  0.0014]
93ur:  [-0.0127  0.0182  0.0023  0.0015 -0.0236  0.0015]
99a:  [-0.0038 -0.0018  0.     -0.0009 -0.001  -0.0007]
a0:  [-0.0059  0.0083  0.0012  0.0016 -0.0156  0.0014]
a2teau:  [-0.0068  0.0097  0.0013  0.0013 -0.0153  0.0009]
a2teaugua

In [29]:
def corpus_as_csv(corp_path):
    """Load the corpus file as a DataFrame with 'Text Title' and 'Text' columns.

    Each line of the file is expected to look like ``<title>$$$<text>``.
    The line is split on the literal three-character ``$$$`` separator, so a
    single ``$`` occurring inside the text no longer shifts or breaks the
    columns (splitting on one ``$`` and throwing away two dummy columns did).

    Parameters
    ----------
    corp_path : str
        Path of the corpus file.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, columns ``['Text Title', 'Text']``.
    """
    colnames = ['Text Title', 'Text']
    # A multi-character separator is interpreted as a regular expression and
    # needs the python parsing engine; '$' must be escaped in the pattern.
    return pd.read_csv(corp_path, sep=r'\$\$\$', names=colnames, header=None, engine='python')
    

In [38]:
def vectorize_text(text: str, word_vector_dict: dict) -> dict:
    """Map every whitespace-separated token of ``text`` to its embedding.

    Parameters
    ----------
    text : str
        The document; it is tokenised with ``str.split()``.
    word_vector_dict : dict
        Word -> vector lookup table.

    Returns
    -------
    dict
        ``{token: vector}`` in order of first appearance. A repeated token
        appears once; a token with no embedding maps to ``None``.

    The exact token is looked up first. If it is missing, its lower-cased
    form is tried, because a vocabulary produced by ``TfidfVectorizer`` is
    lower-cased by default and would otherwise never match capitalised tokens.
    """
    text_dict = {}
    for word in text.split():
        vector = word_vector_dict.get(word)
        if vector is None:
            # `is None` rather than truthiness: vectors are usually arrays.
            vector = word_vector_dict.get(word.lower())
        text_dict[word] = vector
    return text_dict
    
def vectorize_corpus_with_dots(corpus: list,
                               separator: str,
                               word_vector_dict: dict) -> np.ndarray:
    """Vectorize every document of ``corpus`` with ``vectorize_text``.

    Parameters
    ----------
    corpus : list
        The documents, one string per entry.
    separator : str
        Currently unused: tokenisation is done inside ``vectorize_text``.
        Kept so existing callers keep working.
    word_vector_dict : dict
        Word -> vector lookup table passed on to ``vectorize_text``.

    Returns
    -------
    numpy.ndarray
        1-D object array with one dict per kept document:
        ``{'document_index': <position in corpus>, 'vectorized_text': {...}}``.
        Empty and non-string entries are skipped, so ``document_index`` can
        have gaps.
    """
    vectorized_corpus = list()
    for text_index, text in enumerate(tqdm(corpus)):
        # A column coming from pandas holds NaN (a float) for missing cells;
        # NaN is truthy, so a plain `not text` check lets it through and the
        # later .split() call fails. Skip anything that is not a non-empty str.
        if not isinstance(text, str) or not text:
            continue
        vectorized_corpus.append({
            'document_index': text_index,
            'vectorized_text': vectorize_text(text, word_vector_dict),
        })
    return np.array(vectorized_corpus, dtype=object)

In [37]:
# Read the same corpus file again, this time as a two-column table (title, text).
# The bare `corpus` expression on the last line renders the frame as the cell output.
corpus = corpus_as_csv('./Atikamekw_corpus.txt')
corpus

Unnamed: 0,Text Title,Text
0,%C3%89cole,otapi kiskinohamatowikam part_prox ici acteam ...
1,%C3%89cole,otapi kiskinohamatowikam part_prox ici acteam ...
2,%C3%8Ele,cap otenw aski ici acteam kanatw irik part_sub...
3,A-ca-oo-mah-ca-ye,part_inter ca oo mah ca aka matci part_conn sw...
4,Aatsista-Mahkan,aatsistw mah matci part_conn Pron nit part_sub...
...,...,...
1579,Pincourt,otenw kepew ask ici acteam kanatw irik part_su...
1580,Pipatciw,Pron pirecic part_sub icinikatakaniwiw part_su...
1581,Pipikorikic,arikic part_sub tipotokanikesiw ospiskoniw eko...
1582,Pipon,pipo enkw peiw part_sub aist ekoni apiw part_s...


In [39]:
# Look up an embedding for every whitespace-separated token of every text.
# The ' ' separator argument is accepted by the function but not used by it.
vectorized_corpus = vectorize_corpus_with_dots(corpus['Text'].tolist(),
                                               ' ',
                                               emb_dict)

  0%|          | 0/1584 [00:00<?, ?it/s]

In [40]:
# Persist the result; np.save appends the ".npy" extension to the given name.
# The array has dtype=object (one dict per document), so NumPy pickles it and
# reading it back requires np.load(..., allow_pickle=True).
np.save("Atikamekw_vectorized_corpus6", vectorized_corpus)

In [41]:
# Peek at the first two vectorized documents.
vectorized_corpus[:2]

array([{'document_index': 0, 'vectorized_text': {'otapi': array([-0.04764308, -0.03287035, -0.00241814, -0.00122465, -0.00122185,
               0.00409874]), 'kiskinohamatowikam': array([-0.03106345, -0.01966565, -0.00214713,  0.02871427, -0.00074987,
              -0.00171411]), 'part_prox': array([-1.02567021, -0.64670653, -0.07451045, -0.22958758, -0.15007555,
              -0.1009112 ]), 'ici': array([-2.5174693 ,  1.52444357, -0.43262462, -0.2451195 ,  0.21619956,
              -0.04247688]), 'acteam': array([-2.22932306,  2.93890509, -0.53626136,  0.07769396,  0.2386532 ,
              -0.00551912]), 'atikam': array([-0.36070451, -0.23475505, -0.05531922,  0.2303212 , -0.02510502,
              -0.00348017]), 'oten': array([-0.29676909, -0.0981933 , -0.033418  , -0.0323603 , -0.14303653,
              -0.12652829]), 'manawan': array([-0.19881873, -0.15709143, -0.01774884,  0.25443941,  0.01664875,
              -0.01545245]), 'icinikatcikateam': array([-0.07810037, -0.04264253, 