# Микродиахроническое исследование русских приставок методами дистрибутивной семантики
## Автор: Елизавета Клыкова, БКЛ181
### Часть 2: подготовка данных
1. Считать лемматизированные предложения из json-файлов
2. Преобразовать предложения в списки словарей вида {token, lemma, pos, gram}
3. Записать тексты в базу, сохраняя информацию о леммах и токенах в словари
4. Сохранить леммы и токены в файлы .pickle

#### Импорт модулей

In [1]:
%load_ext pycodestyle_magic
%pycodestyle_on

In [2]:
import re
import os
import json
import pickle
import pymongo
import jsonlines
from tqdm.auto import tqdm
from collections import Counter

#### Считываем файлы, приводим к нужному виду

In [3]:
# Group the Mystem output files found in the working directory by their
# period prefix.  The three prefixes are mutually exclusive, so filtering
# the listing once per prefix keeps each list in os.listdir() order.
dir_entries = os.listdir()
presov_files = [name for name in dir_entries
                if name.startswith('presov_lem')]
sov_files = [name for name in dir_entries
             if name.startswith('sov_lem')]
postsov_files = [name for name in dir_entries
                 if name.startswith('postsov_lem')]

In [4]:
def read_mystem_results(filename):
    """Load one JSON-lines file of Mystem output.

    Every non-blank line of the file is parsed as a separate JSON value.
    The file is streamed line by line instead of being materialised with
    ``readlines()`` first, and blank lines (e.g. a trailing empty line)
    are skipped rather than handed to ``json.loads``, which would raise
    ``JSONDecodeError`` on them.

    Args:
        filename: path to a UTF-8 encoded file with one JSON value per line.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    print('Loading file {}...'.format(filename))
    with open(filename, encoding='utf-8') as f:
        lines = [json.loads(line) for line in f if line.strip()]
    print('File {} loaded successfully!'.format(filename))
    return lines

In [5]:
def clean_mystem_results(mystem_lines):
    """Turn raw Mystem output into (surface tokens, token records) pairs.

    Args:
        mystem_lines: iterable of sentences; each sentence is an iterable
            of Mystem items, i.e. dicts with a 'text' key and, for
            word-like items, an 'analysis' list.

    Returns:
        A list with one ``(orig_sent, parsed_sent)`` tuple per sentence.
        ``orig_sent`` holds the item texts except bare line breaks;
        ``parsed_sent`` holds dicts with 'token', 'lemma', 'pos' and,
        only for items Mystem analysed, 'gram'.
    """
    print('Processing sentences...')
    line_breaks = ('\r\n', '\n')
    sent_info = []

    for mystem_sent in tqdm(mystem_lines):
        surface = []
        records = []

        for item in mystem_sent:
            text = item['text']
            # rebuild the sentence: keep everything except bare line breaks
            if text not in line_breaks:
                surface.append(text)

            if 'analysis' not in item:
                # items without an 'analysis' key are split into single
                # characters (CR/LF removed), each recorded as PUNCT
                stripped = re.sub('\\r|\\n', '', text)
                records.extend({'token': char,
                                'lemma': char,
                                'pos': 'PUNCT'}
                               for char in stripped)
                continue

            # word-like item: the lemma defaults to the token itself
            record = {'token': text, 'lemma': text}
            analyses = item['analysis']
            if analyses:
                # only the first analysis is used
                first = analyses[0]
                record['lemma'] = first['lex']
                gram = first['gr']
                # TODO: refine (marked as unfinished by the author);
                # POS is whatever precedes the first ',' or '=' in 'gr'
                record['pos'] = gram.partition(',')[0].partition('=')[0]
                record['gram'] = gram
            else:
                # no analysis: classify by the characters in the token
                record['pos'] = ('FOREIGN' if re.search('[a-zA-Z]', text)
                                 else 'NUMBER' if re.search('[0-9]', text)
                                 else 'UNK')
            records.append(record)

        sent_info.append((surface, records))

    print()
    return sent_info

#### Подключаемся к базе данных

In [6]:
# Connect to the MongoDB server on localhost:27017 and select the
# `thesis` database; the bare `db` expression displays it in the notebook.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.thesis
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'thesis')

In [7]:
# Handles to the three collections of the `thesis` database.
sentences = db['sentences']
lemmas = db['lemmas']
tokens = db['tokens']

In [8]:
# Uncomment to empty all three collections before re-running the import.
# sentences.delete_many({})
# lemmas.delete_many({})
# tokens.delete_many({})

Коллекция **sentences** содержит поля:
* sent_text -- текст предложения
* period -- период, к которому относится предложение (0, 1 или 2)
* token_info -- массивы с информацией, полученной от Mystem

Коллекция **lemmas** содержит поля:
* lemma -- начальная форма
* pos -- часть речи
* freq -- частотность (абсолютная)
* docs -- id предложений, содержащих лемму (в словарях, собираемых ниже, docs хранится как множество, поэтому каждый id входит в него один раз, даже если лемма встречается в предложении несколько раз)

Коллекция **tokens** содержит поля:
* token -- словоформа
* lemma -- начальная форма
* pos -- часть речи
* freq -- частотность (абсолютная)
* gram -- (опциональная) строка с грамматической информацией
* docs -- id предложений, содержащих токен (в словарях, собираемых ниже, docs хранится как множество, поэтому каждый id входит в него один раз, даже если токен встречается в предложении несколько раз)

In [9]:
def insert_sentences_and_get_lemmas(sentence_info, period,
                                    lemma_dict, token_dict):
    """Store sentences in MongoDB and tally their lemmas and tokens.

    Every sentence is inserted into the module-level ``sentences``
    collection; each non-punctuation token then updates ``lemma_dict``
    and ``token_dict`` in place.

    Args:
        sentence_info: iterable of ``(orig_sent, parsed_sent)`` pairs,
            where ``orig_sent`` is a list of strings and ``parsed_sent``
            a list of dicts with 'token', 'lemma' and 'pos' keys.
        period: period label stored with each sentence; it also selects
            the ``freq_<period>`` counter, and new records only carry
            counters for 0, 1 and 2.
        lemma_dict: maps ``'<lemma>_<pos>'`` to a lemma record.
        token_dict: maps ``'<token>_<lemma>_<pos>'`` to a token record.

    Returns:
        The same ``(lemma_dict, token_dict)`` objects, updated.
    """
    period_field = f'freq_{period}'

    for sent_text, token_info in tqdm(sentence_info):
        # write the sentence to the database and keep its id
        sent_id = sentences.insert_one({
            'sent_text': ''.join(sent_text),
            'period': period,
            'token_info': token_info,
        }).inserted_id

        for item in token_info:
            pos = item['pos']
            # punctuation is not counted
            if pos == 'PUNCT':
                continue

            lemma = item['lemma']
            lem_key = '_'.join((lemma, pos))
            form = item['token']
            tok_key = '_'.join((form, lem_key))

            # lemma record: create on first sight, otherwise update
            lemma_entry = lemma_dict.get(lem_key)
            if lemma_entry is None:
                lemma_entry = {'lemma': lemma,
                               'pos': pos,
                               'freq': 1,
                               'freq_0': 0,
                               'freq_1': 0,
                               'freq_2': 0,
                               'docs': {sent_id}}
                lemma_dict[lem_key] = lemma_entry
                lemma_entry[period_field] += 1
            else:
                lemma_entry['freq'] += 1
                lemma_entry[period_field] += 1
                lemma_entry['docs'].add(sent_id)

            # token record: same bookkeeping, keyed by the word form too
            token_entry = token_dict.get(tok_key)
            if token_entry is None:
                token_entry = {'token': form,
                               'lemma': lemma,
                               'pos': pos,
                               'freq': 1,
                               'freq_0': 0,
                               'freq_1': 0,
                               'freq_2': 0,
                               'docs': {sent_id}}
                token_dict[tok_key] = token_entry
                token_entry[period_field] += 1
            else:
                token_entry['freq'] += 1
                token_entry[period_field] += 1
                token_entry['docs'].add(sent_id)

    # hand the updated dictionaries back to the caller
    return lemma_dict, token_dict

In [10]:
def insert_all_batches(filenames, period, lemma_dict, token_dict):
    """Import a series of Mystem output files belonging to one period.

    Each file is read with ``read_mystem_results``, converted with
    ``clean_mystem_results`` and passed to
    ``insert_sentences_and_get_lemmas``.

    Args:
        filenames: iterable of file paths to process, in order.
        period: period label forwarded to the insertion step.
        lemma_dict: lemma dictionary to extend.
        token_dict: token dictionary to extend.

    Returns:
        The ``(lemma_dict, token_dict)`` pair after all files were
        processed; the inputs are returned as-is when ``filenames`` is
        empty.
    """
    for batch_file in filenames:
        raw_lines = read_mystem_results(batch_file)
        batch = clean_mystem_results(raw_lines)
        # drop the raw data as soon as it has been converted
        del raw_lines
        lemma_dict, token_dict = insert_sentences_and_get_lemmas(
            batch, period, lemma_dict, token_dict)
        # release the batch before the next file is loaded
        del batch

    return lemma_dict, token_dict

In [11]:
# Import every `presov_lem*` batch as period 0, starting from empty
# lemma and token dictionaries.
lemma_dict, token_dict = insert_all_batches(presov_files, 0, {}, {})

Loading file presov_lem0_1_1.json...
File presov_lem0_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem0_1_2.json...
File presov_lem0_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem0_2_1.json...
File presov_lem0_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem0_2_2.json...
File presov_lem0_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem1_1_1.json...
File presov_lem1_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem1_1_2.json...
File presov_lem1_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem1_2_1.json...
File presov_lem1_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem1_2_2.json...
File presov_lem1_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem2_1_1.json...
File presov_lem2_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem2_1_2.json...
File presov_lem2_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem2_2_1.json...
File presov_lem2_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem2_2_2.json...
File presov_lem2_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem3_1_1.json...
File presov_lem3_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem3_1_2.json...
File presov_lem3_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem3_2_1.json...
File presov_lem3_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem3_2_2.json...
File presov_lem3_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem4_1_1.json...
File presov_lem4_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem4_1_2.json...
File presov_lem4_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem4_2_1.json...
File presov_lem4_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem4_2_2.json...
File presov_lem4_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem5_1_1.json...
File presov_lem5_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem5_1_2.json...
File presov_lem5_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem5_2_1.json...
File presov_lem5_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem5_2_2.json...
File presov_lem5_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem6_1_1.json...
File presov_lem6_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem6_1_2.json...
File presov_lem6_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem6_2_1.json...
File presov_lem6_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem6_2_2.json...
File presov_lem6_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem7_1_1.json...
File presov_lem7_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem7_1_2.json...
File presov_lem7_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem7_2_1.json...
File presov_lem7_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem7_2_2.json...
File presov_lem7_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem8_1_1.json...
File presov_lem8_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem8_1_2.json...
File presov_lem8_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem8_2_1.json...
File presov_lem8_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file presov_lem8_2_2.json...
File presov_lem8_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/73941 [00:00<?, ?it/s]




  0%|          | 0/73941 [00:00<?, ?it/s]

In [12]:
# save to files: pickle the lemma dictionary built from the presov batches
with open('presov_lemmas.pickle', 'wb') as fl1:
    pickle.dump(lemma_dict, fl1)

In [13]:
# pickle the token dictionary built from the presov batches
with open('presov_tokens.pickle', 'wb') as ft1:
    pickle.dump(token_dict, ft1)

In [14]:
# Unbind both dictionaries now that they are pickled
# (presumably to free memory before the next period is processed).
del lemma_dict
del token_dict

In [11]:
# Import every `sov_lem*` batch as period 1, starting from empty
# lemma and token dictionaries.
lemma_dict, token_dict = insert_all_batches(sov_files, 1, {}, {})

Loading file sov_lem0_1_1.json...
File sov_lem0_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem0_1_2.json...
File sov_lem0_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem0_2_1.json...
File sov_lem0_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem0_2_2.json...
File sov_lem0_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem10_1_1.json...
File sov_lem10_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem10_1_2.json...
File sov_lem10_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem10_2_1.json...
File sov_lem10_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem10_2_2.json...
File sov_lem10_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem11_1_1.json...
File sov_lem11_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem11_1_2.json...
File sov_lem11_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem11_2_1.json...
File sov_lem11_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem11_2_2.json...
File sov_lem11_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem12_1_1.json...
File sov_lem12_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem12_1_2.json...
File sov_lem12_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem12_2_1.json...
File sov_lem12_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem12_2_2.json...
File sov_lem12_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem13_1_1.json...
File sov_lem13_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem13_1_2.json...
File sov_lem13_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem13_2_1.json...
File sov_lem13_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem13_2_2.json...
File sov_lem13_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem14_1_1.json...
File sov_lem14_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/33181 [00:00<?, ?it/s]




  0%|          | 0/33181 [00:00<?, ?it/s]

Loading file sov_lem1_1_1.json...
File sov_lem1_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem1_1_2.json...
File sov_lem1_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem1_2_1.json...
File sov_lem1_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem1_2_2.json...
File sov_lem1_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem2_1_1.json...
File sov_lem2_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem2_1_2.json...
File sov_lem2_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem2_2_1.json...
File sov_lem2_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem2_2_2.json...
File sov_lem2_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem3_1_1.json...
File sov_lem3_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem3_1_2.json...
File sov_lem3_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem3_2_1.json...
File sov_lem3_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem3_2_2.json...
File sov_lem3_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem4_1_1.json...
File sov_lem4_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem4_1_2.json...
File sov_lem4_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem4_2_1.json...
File sov_lem4_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem4_2_2.json...
File sov_lem4_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem5_1_1.json...
File sov_lem5_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem5_1_2.json...
File sov_lem5_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem5_2_1.json...
File sov_lem5_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem5_2_2.json...
File sov_lem5_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem6_1_1.json...
File sov_lem6_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem6_1_2.json...
File sov_lem6_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem6_2_1.json...
File sov_lem6_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem6_2_2.json...
File sov_lem6_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem7_1_1.json...
File sov_lem7_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem7_1_2.json...
File sov_lem7_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem7_2_1.json...
File sov_lem7_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem7_2_2.json...
File sov_lem7_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem8_1_1.json...
File sov_lem8_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem8_1_2.json...
File sov_lem8_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem8_2_1.json...
File sov_lem8_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem8_2_2.json...
File sov_lem8_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem9_1_1.json...
File sov_lem9_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem9_1_2.json...
File sov_lem9_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem9_2_1.json...
File sov_lem9_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file sov_lem9_2_2.json...
File sov_lem9_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

In [12]:
# save to files: pickle the lemma dictionary built from the sov batches
with open('sov_lemmas.pickle', 'wb') as fl2:
    pickle.dump(lemma_dict, fl2)

In [13]:
# pickle the token dictionary built from the sov batches
with open('sov_tokens.pickle', 'wb') as ft2:
    pickle.dump(token_dict, ft2)

In [14]:
# Unbind both dictionaries now that they are pickled
# (presumably to free memory before the next period is processed).
del lemma_dict
del token_dict

In [11]:
# Import every `postsov_lem*` batch as period 2, starting from empty
# lemma and token dictionaries.
lemma_dict, token_dict = insert_all_batches(postsov_files, 2, {}, {})

Loading file postsov_lem0_1_1.json...
File postsov_lem0_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem0_1_2.json...
File postsov_lem0_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem0_2_1.json...
File postsov_lem0_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem0_2_2.json...
File postsov_lem0_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem10_1_1.json...
File postsov_lem10_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem10_1_2.json...
File postsov_lem10_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem10_2_1.json...
File postsov_lem10_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem10_2_2.json...
File postsov_lem10_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem11_1_1.json...
File postsov_lem11_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem11_1_2.json...
File postsov_lem11_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem11_2_1.json...
File postsov_lem11_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/103824 [00:00<?, ?it/s]




  0%|          | 0/103824 [00:00<?, ?it/s]

Loading file postsov_lem11_2_2.json...
File postsov_lem11_2_2.json loaded successfully!
Processing sentences...


0it [00:00, ?it/s]




0it [00:00, ?it/s]

Loading file postsov_lem1_1_1.json...
File postsov_lem1_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem1_1_2.json...
File postsov_lem1_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem1_2_1.json...
File postsov_lem1_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem1_2_2.json...
File postsov_lem1_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem2_1_1.json...
File postsov_lem2_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem2_1_2.json...
File postsov_lem2_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem2_2_1.json...
File postsov_lem2_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem2_2_2.json...
File postsov_lem2_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem3_1_1.json...
File postsov_lem3_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem3_1_2.json...
File postsov_lem3_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem3_2_1.json...
File postsov_lem3_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem3_2_2.json...
File postsov_lem3_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem4_1_1.json...
File postsov_lem4_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem4_1_2.json...
File postsov_lem4_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem4_2_1.json...
File postsov_lem4_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem4_2_2.json...
File postsov_lem4_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem5_1_1.json...
File postsov_lem5_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem5_1_2.json...
File postsov_lem5_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem5_2_1.json...
File postsov_lem5_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem5_2_2.json...
File postsov_lem5_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem6_1_1.json...
File postsov_lem6_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem6_1_2.json...
File postsov_lem6_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem6_2_1.json...
File postsov_lem6_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem6_2_2.json...
File postsov_lem6_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem7_1_1.json...
File postsov_lem7_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem7_1_2.json...
File postsov_lem7_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem7_2_1.json...
File postsov_lem7_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem7_2_2.json...
File postsov_lem7_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem8_1_1.json...
File postsov_lem8_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem8_1_2.json...
File postsov_lem8_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem8_2_1.json...
File postsov_lem8_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem8_2_2.json...
File postsov_lem8_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem9_1_1.json...
File postsov_lem9_1_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem9_1_2.json...
File postsov_lem9_1_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem9_2_1.json...
File postsov_lem9_2_1.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

Loading file postsov_lem9_2_2.json...
File postsov_lem9_2_2.json loaded successfully!
Processing sentences...


  0%|          | 0/125000 [00:00<?, ?it/s]




  0%|          | 0/125000 [00:00<?, ?it/s]

In [12]:
# save to files: pickle the lemma dictionary built from the postsov batches
with open('postsov_lemmas.pickle', 'wb') as fl3:
    pickle.dump(lemma_dict, fl3)

In [13]:
# pickle the token dictionary built from the postsov batches
with open('postsov_tokens.pickle', 'wb') as ft3:
    pickle.dump(token_dict, ft3)

In [14]:
# Unbind both dictionaries now that they are pickled
# (presumably to free memory).
del lemma_dict
del token_dict

#### Индексируем коллекцию предложений

In [15]:
# Create a non-unique MongoDB text index on the `sent_text` field.
sentences.create_index([('sent_text', pymongo.TEXT)], unique=False)

'sent_text_text'