In [18]:
import numpy as np
import json
import pandas as pd
from tqdm import tqdm

In [2]:
# Relative path to the corpus file; passed to vectorize_corpus_with_ngrams below,
# which reads it with pd.read_csv using "$" as the field delimiter.
corpus_path = './Atikamekw_corpus.txt'

In [3]:
# Load the CBOW word-vector dictionary from JSON. Per the output below it maps
# each token to a list of floats (the entries shown have 6 components each).
with open('Atikamekw_WordVectorDict_CBOW6.json', encoding='utf-8') as f:
    CBOW_dic = json.load(f)
# NOTE: this prints the whole dictionary on one line; the recorded output is cut off.
print(CBOW_dic)

{'part_sub': [0.44148096442222595, -1.7316007614135742, 6.537610054016113, 3.303389072418213, 6.366267681121826, -0.8693588376045227], 'Pron': [1.5865613222122192, 0.5234644412994385, 6.584729194641113, 2.7325732707977295, 7.91921329498291, -1.4414900541305542], 'part_conn': [0.1397046148777008, -0.47278907895088196, 6.67777681350708, 2.7297861576080322, 7.023789882659912, -0.6262089610099792], 'ici': [-3.497478485107422, -3.053382158279419, 7.427798271179199, 3.33215594291687, 3.950025796890259, -1.2739609479904175], 'w_demo': [-0.46305274963378906, -0.767657995223999, 6.427474021911621, 3.1356327533721924, 5.265308380126953, -0.7726060152053833], 'part_quan': [0.414537250995636, -0.8636816143989563, 6.5042243003845215, 2.4882123470306396, 6.620715618133545, 0.031597573310136795], 'kir': [-0.14793549478054047, -0.7427756190299988, 6.574421405792236, 2.598635196685791, 6.338756084442139, -0.90300053358078], 'part_time': [0.4056661128997803, -0.8957804441452026, 6.447068214416504, 2.939

In [4]:
# Spot-check a single entry of the CBOW dictionary (raises KeyError if the key is absent).
print(CBOW_dic['part_comp'])

[-0.4728509485721588, -1.206741213798523, 5.689608573913574, 2.5249478816986084, 5.329906463623047, 0.009926319122314453]


In [5]:
# Load the SVD word-vector dictionary from JSON. Per the output below it has the
# same layout as CBOW_dic: token -> list of floats (6 components in the entries shown).
with open('Atikamekw_WordVectorDict_SVD6.json', encoding='utf-8') as f:
    SVD_dic = json.load(f)
# NOTE: this prints the whole dictionary on one line; the recorded output is cut off.
print(SVD_dic)

{'27': [-0.041304377057255454, 0.05570783024825155, 0.006979340775684273, 0.004931295155092913, -0.0900947725217365, -0.007723167164375795], '27ancienne': [-0.005824610088950816, -0.00026546416014596125, -0.0006553050194603175, 0.000854260287881318, -0.003149740718460547, -0.0019634165559767724], '27as': [-0.0027192804551133165, -0.002135555183373085, -0.00010609482534898735, 0.00349558597285552, 0.0009042155299378019, -0.0013037643090760985], '27assomption': [-0.0053664687852333585, 0.001928591591039309, 0.00016136527430210592, -0.0011622791197661699, -0.0030744047512420857, -0.0008191367228486183], '27axstal': [-0.002719280455109253, -0.002135555183375478, -0.00010609482534917245, 0.0034955859728557728, 0.0009042155299378019, -0.0013037643090760985], '27s': [-0.033522761180787515, 0.04705786223202121, 0.01709845403054557, 0.0024433517410546923, -0.01514241381639943, -0.030518468236150422], '27urf': [-0.014071478317387797, 0.01994680126479937, 0.0028039529720731335, 0.0029145517362522

In [6]:
def convert_text_to_vector(text, dictionary, n, m):
    """Map every n-gram of ``text`` to the concatenation of its word vectors.

    The text is split on whitespace. For each window of ``n`` consecutive
    words, the first ``m`` components of every word's vector are taken from
    ``dictionary`` and concatenated into one flat array of length ``n * m``.

    An n-gram is skipped when
      * any of its words is missing from ``dictionary``, or
      * any of its (truncated) word vectors does not have exactly ``m``
        components.

    Parameters
    ----------
    text : str
        Whitespace-separated text.
    dictionary : dict
        Maps a word to a sequence of numbers (its vector).
    n : int
        Number of words per n-gram. A value <= 0 yields an empty result.
    m : int
        Number of leading vector components to keep per word.

    Returns
    -------
    dict
        Maps the n-gram (its words joined by a single space) to a 1-D
        ``np.ndarray``. If the same n-gram occurs several times in the text
        it appears once in the result.
    """
    # A non-positive n defines no n-grams. Without this guard n == 0 would
    # index into an empty list when validating the vectors.
    if n <= 0:
        return {}

    tokens = text.strip().split()
    text_dict = {}

    # Slide a window of n words over the text.
    for start in range(len(tokens) - n + 1):
        words = tokens[start:start + n]

        # Skip the n-gram as soon as one of its words is unknown.
        if any(word not in dictionary for word in words):
            continue

        gram_vec = [dictionary[word][:m] for word in words]

        # Every vector must have exactly m components (not only the last one);
        # otherwise the rows are ragged and cannot be stacked into one array.
        if any(len(vec) != m for vec in gram_vec):
            continue

        text_dict[' '.join(words)] = np.array(gram_vec).flatten()

    return text_dict

In [7]:
# Number of words per n-gram.
n = 2
# Total length of the flattened vector for one n-gram.
m = 6

# Each word contributes int(m/n) = 3 components, so every 2-gram vector below has
# 6 entries. N-grams containing a word missing from SVD_dic are dropped.
convert_text_to_vector('Pron part_sub pimickaw apowi part_sub apitisiw tcima', SVD_dic, n, int(m/n))

{'part_sub pimickaw': array([-6.74005356e+00, -2.50705829e+00,  3.23092440e-01, -7.33867470e-03,
        -5.41052291e-03,  2.64921146e-04]),
 'pimickaw apowi': array([-0.00733867, -0.00541052,  0.00026492, -0.0246183 , -0.0181497 ,
         0.00012461]),
 'apowi part_sub': array([-2.46183027e-02, -1.81497041e-02,  1.24612650e-04, -6.74005356e+00,
        -2.50705829e+00,  3.23092440e-01]),
 'part_sub apitisiw': array([-6.74005356e+00, -2.50705829e+00,  3.23092440e-01, -4.12126100e-02,
        -2.88967803e-02,  2.16413002e-03]),
 'apitisiw tcima': array([-0.04121261, -0.02889678,  0.00216413, -0.00867581, -0.00652213,
         0.00015374])}

In [13]:
def vectorize_corpus_with_ngrams(corpus_path: str,
                                   separator: str,
                                   word_vector_dict: dict, n : int , m : int  ) -> np.ndarray:
    """Vectorize every text of a corpus file n-gram by n-gram.

    The corpus is read as a header-less, "$"-delimited table with the columns
    'Text Title', 'X', 'Y' and 'Text'; only the 'Text' column is vectorized.

    Parameters
    ----------
    corpus_path : str
        Path of the corpus file.
    separator : str
        Currently unused: the field delimiter is fixed to "$" and words are
        split on whitespace. Kept so existing callers keep working.
    word_vector_dict : dict
        Maps a word to its vector; forwarded to ``convert_text_to_vector``.
    n : int
        Number of words per n-gram.
    m : int
        Number of vector components kept per word.

    Returns
    -------
    np.ndarray
        1-D object array with one dict per non-empty text, holding
        'document_index' (row position in the corpus file) and
        'vectorized_text' (the result of ``convert_text_to_vector``).
    """
    colnames = ['Text Title', 'X', 'Y', 'Text']
    df_corpus = pd.read_csv(corpus_path, delimiter="$", names=colnames, header=None)[['Text Title', 'Text']]
    corpus = df_corpus['Text'].tolist()

    vectorized_corpus = []
    for text_index, text in enumerate(tqdm(corpus)):
        # pandas reports a missing 'Text' field as NaN, a float that is truthy,
        # so a plain `if not text` would let it through and `.strip()` would
        # then fail. Skip anything that is not a non-empty string.
        if not isinstance(text, str) or not text:
            continue
        vectorized_corpus.append({
            'document_index': text_index,
            'vectorized_text': convert_text_to_vector(text, word_vector_dict, n, m),
        })
    return np.array(vectorized_corpus, dtype=object)

In [14]:
# Settings for the corpus runs below: 1-grams, 6 vector components in total.
# These rebind the n and m used by the 2-gram demo above.
n =1 
m = 6

In [19]:
# Vectorize the corpus with the CBOW vectors; each word keeps int(m/n) components.
# The ' ' separator argument has no effect: the function does not read it.
vectorized_corpus = vectorize_corpus_with_ngrams(corpus_path,
                                               ' ',
                                               CBOW_dic,n,int(m/n))

100%|██████████| 1584/1584 [00:00<00:00, 2462.71it/s]


In [20]:
# np.save appends ".npy" to the name. The array has dtype=object (it holds dicts),
# so it is pickled and must be read back with np.load(..., allow_pickle=True).
np.save("Atikamekw_vectorized_corpus_CBOW_1gram_6", vectorized_corpus)

In [21]:
# Same run with the SVD vectors. This rebinds vectorized_corpus, replacing the
# CBOW result in memory. The ' ' separator argument has no effect here either.
vectorized_corpus = vectorize_corpus_with_ngrams(corpus_path,
                                               ' ',
                                               SVD_dic,n,int(m/n))

100%|██████████| 1584/1584 [00:00<00:00, 3244.39it/s]


In [22]:
# np.save appends ".npy" to the name. The array has dtype=object (it holds dicts),
# so it is pickled and must be read back with np.load(..., allow_pickle=True).
np.save("Atikamekw_vectorized_corpus_SVD_1gram_6", vectorized_corpus)