In [None]:
import stanfordnlp
import glob

In [None]:
# Download the stanfordnlp resources for the 'bxr' language code (the later cells of
# this notebook name their outputs "buryat"). Presumably required before the Pipeline
# in the next cell can be constructed — confirm against the stanfordnlp documentation.
stanfordnlp.download('bxr')

In [None]:
# Pipeline object that parse() calls on every cleaned word to obtain its lemma(s).
# It requests the tokenize, mwt, pos and lemma processors for the 'bxr' language code.
# NOTE(review): whether the 'bxr' resources include an 'mwt' component is not visible here — confirm.
nlp = stanfordnlp.Pipeline(lang='bxr', processors='tokenize,mwt,pos,lemma')

In [None]:
import re
# Text preprocessing.
# Letters are stored as adjacent pairs: the upper-case form sits at an even index and
# its lower-case form at the following odd index. parse() relies on this layout to
# lower-case words (even index -> replace with the character at index + 1).
alphabet = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоӨөПпРрСсТтУуҮүФфХхҺһЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
# Characters that parse() removes from words: punctuation, brackets, the double quote
# and the non-breaking space (U+00A0).
special = ",.;:!@#$%^&*(){}[]\"\xa0"
def parse(word, output_file):
    """Normalize one raw token and append its lemma(s) to ``output_file``.

    The token is cleaned by deleting every character listed in the module-level
    ``special`` string and lower-casing the letters of the module-level
    ``alphabet`` (characters outside ``alphabet`` are left untouched).  Tokens
    that are empty after cleaning, do not start with a letter of ``alphabet``,
    or are shorter than two characters are ignored.  Every other token is
    passed to the module-level ``nlp`` pipeline, and each lemma it yields is
    written to ``output_file`` followed by a single space.

    Args:
        word: Raw token taken from the source text.
        output_file: Writable text stream that receives the lemmas.

    Returns:
        None. The only effect is the text written to ``output_file``.
    """
    # Strip punctuation *before* inspecting the first character. Checking the
    # raw token first silently dropped words that merely start with a quote or
    # an opening bracket, e.g. '"word' or '(word'.
    word = word.translate({ord(symbol): None for symbol in special})
    if not word or word[0] not in alphabet:
        return

    # ``alphabet`` interleaves upper/lower pairs, so the even-indexed characters
    # are the upper-case letters and the odd-indexed ones their lower-case forms.
    word = word.translate(str.maketrans(alphabet[0::2], alphabet[1::2]))

    if len(word) >= 2:
        for sent in nlp(word).sentences:
            for w in sent.words:
                output_file.write(w.lemma)
                output_file.write(' ')

# Corpus construction
def make_corpus(input_paths, output_file_path):
    """Convert raw dump files into a corpus with one processed document per line.

    Every input file is read as UTF-8 and split on the ``</doc>`` marker.  A
    chunk is kept only when splitting it on single spaces yields at least 20
    pieces.  For a kept chunk, everything up to and including the first ``>``
    is cut off (the whole chunk is kept when it contains no ``>``), newlines
    are turned into spaces, and each space-separated token is handed to
    ``parse`` together with the output stream.  A newline is written after
    each kept chunk.

    Args:
        input_paths: Iterable of paths to the raw input files.
        output_file_path: Path of the corpus file to create; it is written
            with the ``utf_8_sig`` codec.

    Returns:
        None.
    """
    with open(output_file_path, 'w', encoding="utf_8_sig") as corpus_out:
        for raw_path in input_paths:
            with open(raw_path, encoding='utf_8') as raw_file:
                chunks = raw_file.read().split("</doc>")
            for chunk in chunks:
                # Skip chunks that are too short to be useful documents.
                if len(chunk.split(' ')) < 20:
                    continue
                body = chunk[chunk.find('>') + 1:]
                for token in body.replace("\n", " ").split(" "):
                    parse(token, corpus_out)
                corpus_out.write('\n')

In [None]:
# Run the preprocessing over the six raw dump files and write the processed corpus.
raw_wiki_paths = [
    'Corpus/Raw_data/wiki_00',
    'Corpus/Raw_data/wiki_01',
    'Corpus/Raw_data/wiki_02',
    'Corpus/Raw_data/wiki_03',
    'Corpus/Raw_data/wiki_04',
    'Corpus/Raw_data/wiki_05',
]
make_corpus(raw_wiki_paths, 'Corpus/processed.txt')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
def make_table_and_dict(corpus_path, min_df, max_df, token_pattern = None, use_idf = True):
    """Fit a TF-IDF model on a corpus file, treating every line as one document.

    Args:
        corpus_path: Path to the corpus text file.
        min_df: Forwarded to ``TfidfVectorizer(min_df=...)``.
        max_df: Forwarded to ``TfidfVectorizer(max_df=...)``.
        token_pattern: Optional regular expression forwarded as
            ``token_pattern``; ``None`` keeps the vectorizer's own default.
        use_idf: Forwarded to ``TfidfVectorizer(use_idf=...)``.

    Returns:
        A tuple ``(matrix, feature_names, idfs)``: the result of
        ``fit_transform``, the vocabulary as a list of strings, and
        ``vectorizer.idf_`` (``None`` when ``use_idf`` is false, because the
        attribute is only defined when IDF weighting is enabled).
    """
    # The filtering arguments used to be accepted but never reached the
    # vectorizer, so the vocabulary was not cut at all.
    options = {'min_df': min_df, 'max_df': max_df, 'use_idf': use_idf}
    if token_pattern is not None:
        # Passing token_pattern=None explicitly would replace the default regex.
        options['token_pattern'] = token_pattern
    vectorizer = TfidfVectorizer(**options)

    # 'utf-8-sig' also reads plain UTF-8, and drops the BOM that make_corpus
    # writes at the start of the file (it uses the utf_8_sig codec).
    with open(corpus_path, 'r', encoding="utf-8-sig") as corpus_file:
        data_vectorized = vectorizer.fit_transform(corpus_file)

    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); support both and keep returning a list.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    idfs = vectorizer.idf_ if use_idf else None
    return data_vectorized, feature_names, idfs

In [None]:
# Fit TF-IDF on the processed corpus, then store the vocabulary one word per line.
buryat_data_vectorized, buryat_dictionary, idfs = make_table_and_dict('./Corpus/processed.txt', 3, 0.8)
# Lookup from vocabulary word to its IDF value.
pairs = {term: weight for term, weight in zip(buryat_dictionary, idfs)}
with open('./Corpus/cutted_buryat_dict.txt', 'w', encoding="utf8") as output_file:
    output_file.writelines(term + '\n' for term in buryat_dictionary)

In [None]:
from scipy.sparse.linalg import svds
import numpy as np

In [None]:
# Persist the TF-IDF matrix next to the other corpus artifacts.
# NOTE(review): if this is a SciPy sparse matrix, np.save presumably stores it as a
# pickled object array — confirm how the file is meant to be loaded back.
tfidf_matrix_path = './Corpus/' + 'Tf-Idf_Matrix.npy'
with open(tfidf_matrix_path, 'wb') as tfidf_file:
    np.save(tfidf_file, buryat_data_vectorized)

In [None]:
# Truncated SVD of the TF-IDF matrix
def create_table(data_vectorized, k, name):
    """Compute a rank-``k`` truncated SVD and save its factors under ``./Corpus``.

    The components are stored in order of descending singular value, so the
    leading columns of every saved array belong to the strongest components.

    Files written:
        ./Corpus/SVD_U.npy       left singular vectors ``u``
        ./Corpus/SVD_sigma.npy   singular values ``sigma``
        ./Corpus/SVD_VT.npy      right singular vectors ``vt``
        ./Corpus/<name><k>.npy   ``(diag(sigma) @ vt).T``: one row per column
                                 of ``data_vectorized``, ``k`` columns

    Args:
        data_vectorized: Matrix to factorize (anything ``svds`` accepts).
        k: Number of singular triplets to compute.
        name: File-name prefix of the combined table.

    Returns:
        None. The singular values are also printed.
    """
    u, sigma, vt = svds(data_vectorized, k)

    # svds does not return the triplets largest-first the way a dense SVD does.
    # Sort them explicitly: code that truncates a row to its first few columns
    # would otherwise keep the weakest components instead of the strongest.
    order = np.argsort(sigma)[::-1]
    u, sigma, vt = u[:, order], sigma[order], vt[order, :]

    with open('./Corpus/SVD_U.npy', 'wb') as f:
        np.save(f, u)
    with open('./Corpus/SVD_sigma.npy', 'wb') as f:
        np.save(f, sigma)
    print(sigma)
    with open('./Corpus/SVD_VT.npy', 'wb') as f:
        np.save(f, vt)
    with open('./Corpus/' + name + str(k) + '.npy', 'wb') as f:
        # Row-wise scaling equals diag(sigma) @ vt without building a k x k matrix.
        np.save(f, (sigma[:, np.newaxis] * vt).T)

In [None]:
# Factorize the TF-IDF matrix with k = 1024. With this name and k, create_table stores
# the combined table as './Corpus/buryat_sigma_v1024.npy'.
create_table(buryat_data_vectorized, 1024, 'buryat_sigma_v')

In [None]:
# Pair every vocabulary word (one per line of the dictionary file) with the row of the
# saved SVD table that has the same index, then store the mapping.
table = np.load('./Corpus/buryat_sigma_v1024.npy')
with open('./Corpus/cutted_buryat_dict.txt',  'r', encoding="utf8") as f:
    # line[:-1] drops the trailing '\n' of each word
    dictionary = {line[:-1]: table[row] for row, line in enumerate(f)}
with open('./Corpus/buryat_dictionary.npy', 'wb') as f:
    np.save(f, dictionary)

In [None]:
# Build the dataset for the chosen parameters: N is the n-gram length in words and
# M is the number of leading vector components kept for each word.
N = 3
M = 10
# The dictionary was saved as a pickled object array, so it must be loaded with
# allow_pickle=True (a real boolean, not the string 'TRUE') and unwrapped with .item().
# NOTE: unpickling can execute arbitrary code — only load files produced by this notebook.
new_dict = np.load('./Corpus/buryat_dictionary.npy', allow_pickle=True)
dictionary = new_dict.item()
cnt = 0
ngrams = []

# make_corpus writes processed.txt with the utf_8_sig codec; reading it with 'utf-8-sig'
# drops the BOM, which would otherwise stay glued to the first token of the first line
# and keep that token from ever matching a dictionary key.
with open('./Corpus/processed.txt',  'r', encoding="utf-8-sig") as f:
    for line in f:
        current = line.split(' ')
        for i in range(len(current) - N + 1):
            # Slide a window of N consecutive tokens (the window size used to be
            # hard-coded to three words regardless of N).
            window = current[i:i + N]
            if all(token in dictionary for token in window):
                # One sample = the first M components of each word, concatenated.
                ngrams.append(np.concatenate([dictionary[token][:M] for token in window]))
res = np.array(ngrams)
print(res.shape)
with open('./Corpus/result_dataset.npy', 'wb') as f:
    # Save the array that was already built instead of converting the list again.
    np.save(f, res)