In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import tqdm

In [2]:
class Vectorizer():
    """Build SVD-based word embeddings from a line-oriented text corpus.

    Pipeline: read the corpus file (one document per line), build a TF-IDF
    document-term matrix ``W``, take a rank-``k`` truncated SVD of ``W`` and
    use the columns of ``sigma @ vT`` as per-word embedding vectors.
    """

    def __init__(self, corp_path):
        """Store the corpus location; nothing is read until ``upload_corp``.

        Args:
            corp_path: Path to a UTF-8 text file with one document per line.
        """
        self.corp_path = corp_path

    def upload_corp(self):
        """Read the corpus file into ``self.corp`` as a list of unique documents.

        Lines are de-duplicated (first occurrence wins, file order is kept so
        repeated runs give the same row order in ``W``), then every ``'$$$'``
        marker is replaced by a single space.
        """
        with open(self.corp_path, 'r', encoding='utf-8') as f:
            lines = f.read().split('\n')

        # A file that ends with '\n' produces one empty trailing piece; drop
        # only that piece so a file WITHOUT a final newline keeps its last
        # document instead of silently losing it.
        if lines and lines[-1] == '':
            lines.pop()

        # dict.fromkeys de-duplicates while preserving order, unlike set(),
        # whose iteration order for strings changes between interpreter runs.
        unique_lines = dict.fromkeys(lines)
        self.corp = [text.replace('$$$', ' ') for text in unique_lines]

    def log(self, part):
        """Clear the current cell output and print which stage is running.

        Args:
            part: Human-readable name of the pipeline stage.
        """
        clear_output(wait=True)
        print(f'{part} is processing')

    def make_tf_idf_matrix(self, token_pattern=None, min_df=1, max_df=None, use_idf=True):
        """Fit a word-level TF-IDF model on ``self.corp``.

        Sets ``self.tfidf`` (the fitted vectorizer), ``self.W`` (the
        document-term matrix) and ``self.words_list`` (feature names, in the
        column order of ``W``).

        Args:
            token_pattern: Optional regex passed to ``TfidfVectorizer``; when
                falsy the vectorizer's own default pattern is used.
            min_df: Forwarded to ``TfidfVectorizer(min_df=...)``.
            max_df: Forwarded to ``TfidfVectorizer(max_df=...)`` when not
                ``None``; ``None`` keeps the vectorizer's default.
            use_idf: Forwarded to ``TfidfVectorizer(use_idf=...)``.
        """
        options = {'analyzer': 'word', 'min_df': min_df, 'use_idf': use_idf}
        if token_pattern:
            options['token_pattern'] = token_pattern
        # Previously max_df was accepted but never forwarded, so callers'
        # values were silently ignored.
        if max_df is not None:
            options['max_df'] = max_df

        self.tfidf = TfidfVectorizer(**options)
        self.W = self.tfidf.fit_transform(self.corp)
        self.words_list = self.tfidf.get_feature_names_out()

    def make_svd(self, output_folder, k=6):
        """Compute a rank-``k`` truncated SVD of ``self.W`` and save the factors.

        Sets ``self.u`` (n_docs x k), ``self.sigma`` (k x k diagonal matrix),
        ``self.vT`` (k x n_terms), ``self.embedded_matrix`` (``sigma @ vT``)
        and ``self.words_embedding_dict`` (word -> length-k vector). Components
        are ordered by decreasing singular value.

        Writes ``{k}_sigma_vt.npy``, ``{k}_sigma.npy``, ``{k}_u.npy`` and
        ``{k}_vt.npy`` into ``output_folder``, creating the folder if needed.

        Args:
            output_folder: Directory that receives the ``.npy`` files.
            k: Number of singular triplets to compute; forwarded to ``svds``.
        """
        u, singular_values, vT = svds(self.W, k=k)

        # Sort explicitly rather than relying on the order svds returns.
        self.descending_order_of_inds = np.argsort(-singular_values)
        self.u = u[:, self.descending_order_of_inds]
        self.sigma = np.diag(singular_values[self.descending_order_of_inds])
        self.vT = vT[self.descending_order_of_inds, :]

        # Sanity-check the factor shapes (sigma is stored as a k x k matrix).
        assert self.u.shape == (self.W.shape[0], k)
        assert self.sigma.shape == (k, k)
        assert self.vT.shape == (k, self.W.shape[1])
        print(self.W.shape)
        print(self.sigma.shape)
        print(self.vT.shape)
        print(self.u.shape)

        # Column j of sigma @ vT is the embedding of the j-th vocabulary word.
        self.embedded_matrix = self.sigma @ self.vT
        self.words_embedding_dict = dict(zip(self.words_list, self.embedded_matrix.T))

        # Create the target directory up front so a fresh checkout does not
        # fail with FileNotFoundError on the first save.
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)

        artifacts = {
            'sigma_vt': self.embedded_matrix.T,
            'sigma': self.sigma,
            'u': self.u,
            'vt': self.vT,
        }
        for suffix, array in artifacts.items():
            with open(os.path.join(output_folder, f'{k}_{suffix}.npy'), 'wb') as f:
                np.save(f, array)

    def get_emb_dict(self, output_folder='./Matrixes', k=6):
        """Run the whole pipeline and return the word -> embedding mapping.

        Args:
            output_folder: Directory passed to ``make_svd`` for the saved
                factors.
            k: Number of SVD components passed to ``make_svd``.

        Returns:
            dict mapping each vocabulary word to its length-``k`` vector.
        """
        self.log('Upload')
        self.upload_corp()
        self.log('TfIdf')
        self.make_tf_idf_matrix()
        self.log('SVD')
        self.make_svd(output_folder, k=k)

        return self.words_embedding_dict

In [4]:
# Run the full pipeline (load corpus -> TF-IDF -> SVD) on the Atikamekw corpus.
# NOTE(review): the saved output of this cell reports 100 SVD components,
# while the visible code path uses make_svd's default k=6 -- the output looks
# stale; re-run after a kernel restart to confirm.
vect = Vectorizer(corp_path='./Atikamekw_corpus.txt')
emb_dict = vect.get_emb_dict()

SVD is processing
(1555, 12506)
(100, 100)
(100, 12506)
(1555, 100)


In [6]:
# Preview the embeddings: print the first 1001 words (indices 0..1000) with
# their vectors rounded to 4 decimals.
for c, (k, vector) in enumerate(emb_dict.items()):
    print(f'{k}: ', vector.round(4))
    if c >= 1000:
        break



27:  [ 0.0413 -0.0557 -0.007  -0.0049  0.0901 -0.0077 -0.0072  0.0083  0.0066
 -0.0062 -0.017   0.0095 -0.0072 -0.0126  0.0252 -0.0075  0.0193 -0.1594
 -0.1821  0.1127  0.1269  0.0064 -0.003  -0.0272 -0.0104 -0.01   -0.0132
 -0.0167  0.0193 -0.0093 -0.0145 -0.0021  0.0224 -0.0051  0.0155  0.0077
 -0.002  -0.0059 -0.0024  0.0035  0.0063  0.0218  0.0074 -0.0179  0.0193
  0.0338  0.0004  0.037   0.0129  0.0319  0.0189 -0.0018 -0.0272  0.0169
 -0.0039 -0.026   0.0133  0.0025  0.0111 -0.0186  0.0031  0.0085  0.0107
  0.0153 -0.0018  0.0424 -0.0236  0.0266  0.0744 -0.004  -0.0246 -0.0603
  0.0293  0.0655 -0.2903 -0.2201  0.1597 -0.118   0.0591  0.0462 -0.2458
  0.0256 -0.259   0.076  -0.0338 -0.037  -0.0006 -0.0537 -0.0237  0.0524
 -0.0164 -0.0043  0.0347  0.0183 -0.0021 -0.0386 -0.0128 -0.0199  0.0276
 -0.01  ]
27ancienne:  [ 0.0058  0.0003  0.0007 -0.0009  0.0031 -0.002   0.0066 -0.0101  0.0003
  0.0068 -0.0087 -0.0068  0.0099 -0.001   0.0024 -0.0014  0.0086  0.0016
 -0.0015  0.0015 -0.008

aiapeam:  [ 0.0134 -0.008   0.0442  0.0064 -0.008   0.0018  0.0065  0.0077 -0.007
 -0.0013  0.0061 -0.0013  0.0001  0.0057 -0.0027  0.0017 -0.0015  0.0021
 -0.0014  0.0021  0.0007 -0.0001  0.0084  0.0024 -0.0038 -0.0015 -0.0007
  0.0022  0.0012 -0.0059 -0.0076  0.0035  0.0066 -0.0059  0.0114  0.0109
  0.0102  0.0008  0.0067  0.0052 -0.0009  0.0029 -0.0079 -0.0004  0.0057
  0.0067 -0.0118 -0.006  -0.0045  0.0005  0.0011  0.0009 -0.0062 -0.0014
 -0.0042  0.0024 -0.0009 -0.0013  0.0059 -0.004  -0.0017  0.0006  0.0001
 -0.0069 -0.0011 -0.0018  0.0046 -0.005   0.0048 -0.0048 -0.0104  0.0039
 -0.0029  0.0028  0.0096 -0.0023  0.0027 -0.0049  0.0041 -0.0017 -0.004
 -0.0068  0.0029  0.0017 -0.0024  0.0027  0.0028  0.0062  0.0031  0.0087
  0.0092 -0.0054  0.0011 -0.0022 -0.0026 -0.0057  0.0102  0.0036 -0.001
 -0.0129]
aiapew:  [ 0.0075 -0.0146  0.0499  0.0003 -0.0105 -0.      0.0006  0.0019 -0.0009
 -0.0003  0.0016 -0.0003 -0.0011  0.0008  0.0004 -0.0003 -0.0002 -0.0009
 -0.0001  0.      0.0004 

akoskewkano:  [ 0.0001  0.0001  0.      0.     -0.      0.      0.      0.     -0.
 -0.     -0.     -0.     -0.      0.     -0.      0.      0.      0.
  0.     -0.      0.      0.     -0.      0.      0.      0.     -0.
 -0.      0.     -0.     -0.     -0.      0.      0.     -0.     -0.
 -0.     -0.      0.      0.      0.      0.      0.     -0.     -0.
  0.     -0.      0.      0.      0.      0.      0.      0.      0.
 -0.     -0.     -0.     -0.     -0.     -0.      0.     -0.      0.
  0.      0.     -0.      0.      0.     -0.      0.     -0.     -0.
  0.     -0.     -0.      0.     -0.      0.     -0.      0.      0.
 -0.      0.      0.     -0.      0.     -0.     -0.     -0.      0.
  0.      0.      0.      0.     -0.     -0.     -0.     -0.     -0.
 -0.    ]
akosw:  [ 0.0312  0.0214  0.0017 -0.003  -0.0038  0.0029 -0.0031  0.0072 -0.0023
  0.0204  0.016   0.0028  0.0118 -0.0104 -0.0248  0.0177  0.0014  0.0185
 -0.0159  0.005   0.0132 -0.0242  0.0406  0.021   0.017   0.023

apiskotaw:  [ 0.0043  0.0026 -0.0003  0.0033 -0.0004  0.0008 -0.0029  0.0026  0.0004
 -0.     -0.0055 -0.0016 -0.0028  0.0006  0.0016 -0.0033  0.0082  0.0001
  0.0018 -0.0005 -0.0001  0.0001 -0.0054  0.0044  0.0045  0.0027  0.0053
 -0.0072  0.0015 -0.0047 -0.0028 -0.0025  0.0041  0.0019 -0.0017 -0.0032
 -0.0131 -0.005   0.0014  0.0074 -0.0011  0.0028 -0.0009  0.001   0.0054
 -0.0059  0.0023 -0.0005  0.0012  0.0064 -0.0012  0.0084  0.003   0.0034
  0.0038 -0.0031 -0.0042 -0.0016 -0.0025  0.0068 -0.0008 -0.0019  0.0036
 -0.0023  0.0022  0.0024  0.0037 -0.0053  0.0043  0.0015  0.0011 -0.0029
  0.0036 -0.0043  0.002  -0.      0.0029  0.0007 -0.0051 -0.0026  0.001
  0.0013 -0.0022  0.0014  0.0009  0.003   0.0085  0.0037  0.0004 -0.0007
  0.0014 -0.0024  0.0062 -0.0014 -0.0119 -0.0051  0.0009 -0.0049  0.003
 -0.003 ]
apit:  [ 1.430e-01  1.000e-01  6.000e-04  1.592e-01 -1.740e-02  7.100e-03
 -2.248e-01 -1.017e-01  2.940e-02  1.280e-02  3.700e-02  1.726e-01
 -1.582e-01 -1.761e-01  1.853e-01  1