# Микродиахроническое исследование русских приставок методами дистрибутивной семантики
## Автор: Елизавета Клыкова, БКЛ181
### Часть 3: запись лемм и токенов в базу
1. Считать леммы и токены из pickle-файлов
2. Объединить словари
3. Записать в базу
4. Проиндексировать все коллекции

#### Импорт модулей

In [1]:
# Load the pycodestyle_magic extension and switch its style checking on
# (presumably applies to every cell run afterwards - TODO confirm).
%load_ext pycodestyle_magic
%pycodestyle_on

In [2]:
import pickle
import gridfs
import pymongo
from tqdm.auto import tqdm
from collections import Counter
from pymongo.errors import DocumentTooLarge

#### Открываем и считываем файлы с леммами

In [3]:
# Load the 'presov' lemma dictionary (presumably the pre-Soviet period).
# NOTE: pickle is only safe for files produced by this project itself.
with open('presov_lemmas.pickle', 'rb') as presov_file:
    presov_lemmas = pickle.load(presov_file)

In [4]:
# Number of entries in the 'presov' lemma dictionary.
len(presov_lemmas)

429760

In [5]:
# Load the 'sov' lemma dictionary (presumably the Soviet period).
# NOTE: pickle is only safe for files produced by this project itself.
with open('sov_lemmas.pickle', 'rb') as sov_file:
    sov_lemmas = pickle.load(sov_file)

In [6]:
# Number of entries in the 'sov' lemma dictionary.
len(sov_lemmas)

475193

In [7]:
# Load the 'postsov' lemma dictionary (presumably the post-Soviet period).
# NOTE: pickle is only safe for files produced by this project itself.
with open('postsov_lemmas.pickle', 'rb') as postsov_file:
    postsov_lemmas = pickle.load(postsov_file)

In [8]:
# Number of entries in the 'postsov' lemma dictionary.
len(postsov_lemmas)

471930

#### Объединяем словари

In [9]:
# Target dictionary that collects the merged lemma entries.
all_lemmas = {}

In [10]:
# Merge period 1 ('presov') with periods 2 ('sov') and 3 ('postsov').
# Entries found only in period 1 are reused as-is; for shared keys the
# frequency counters are summed and the document collections are united.
freq_fields = ('freq', 'freq_0', 'freq_1', 'freq_2')

# presov_lemmas itself is never modified in this loop, so its items() view
# is iterated directly (tqdm takes the total from len()) without a copy.
for lem_key, lemma in tqdm(presov_lemmas.items()):

    later_periods = [period for period in (sov_lemmas, postsov_lemmas)
                     if lem_key in period]

    if not later_periods:
        all_lemmas[lem_key] = lemma
        continue

    totals = {field: lemma[field] for field in freq_fields}
    # NOTE: same object as stored in presov_lemmas; it is extended in
    # place instead of being copied.
    combined_docs = lemma['docs']

    for period in later_periods:
        # pop() removes the merged entry, so the later period dictionary
        # keeps only the keys that still have to be processed
        other = period.pop(lem_key)
        for field in freq_fields:
            totals[field] += other[field]
        combined_docs.update(other['docs'])

    all_lemmas[lem_key] = {'lemma': lemma['lemma'],
                           'pos': lemma['pos'],
                           **totals,
                           'docs': combined_docs}

  0%|          | 0/429760 [00:00<?, ?it/s]

In [11]:
# Merge period 2 ('sov') with period 3 ('postsov'): entries found only in
# period 2 are reused as-is, shared keys get summed frequency counters
# and united document collections.
# sov_lemmas is not modified in this loop, so no list copy is needed.
for lem_key, lemma in tqdm(sov_lemmas.items()):

    # a single lookup that also removes the entry from period 3, leaving
    # there only the keys that were not merged
    postsov_lemma = postsov_lemmas.pop(lem_key, None)

    if postsov_lemma is None:
        all_lemmas[lem_key] = lemma
        continue

    # NOTE: same object as stored in sov_lemmas; extended in place
    combined_docs = lemma['docs']
    combined_docs.update(postsov_lemma['docs'])

    all_lemmas[lem_key] = {
        'lemma': lemma['lemma'],
        'pos': lemma['pos'],
        'freq': lemma['freq'] + postsov_lemma['freq'],
        'freq_0': lemma['freq_0'] + postsov_lemma['freq_0'],
        'freq_1': lemma['freq_1'] + postsov_lemma['freq_1'],
        'freq_2': lemma['freq_2'] + postsov_lemma['freq_2'],
        'docs': combined_docs,
    }

  0%|          | 0/294726 [00:00<?, ?it/s]

In [12]:
# Copy every entry of the 'postsov' dictionary into the merged one.
# The dictionary is not modified while iterating, so its items() view is
# passed to tqdm directly instead of a throwaway list copy.
for lem_key, lemma in tqdm(postsov_lemmas.items()):
    all_lemmas[lem_key] = lemma

  0%|          | 0/253995 [00:00<?, ?it/s]

In [13]:
# Drop the per-period names so that the memory can be reclaimed.
del presov_lemmas, sov_lemmas, postsov_lemmas

In [14]:
# Number of entries in the merged lemma dictionary.
len(all_lemmas)

978481

In [15]:
# Save the merged dictionary to disk so it can be reloaded later.
with open('all_lemmas.pickle', 'wb') as merged_file:
    pickle.dump(all_lemmas, merged_file)

#### Подключаемся к базе данных

In [3]:
# Connect to the MongoDB server on localhost and select the 'thesis'
# database; a GridFS handle is opened on the same database.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.get_database('thesis')
fs = gridfs.GridFS(db)
# last expression of the cell: displays the database object
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'thesis')

In [4]:
# Handle to the 'lemmas' collection of the database.
lemmas = db.lemmas

In [5]:
# lemmas.delete_many({})

#### Записываем все леммы в базу данных

In [6]:
# Reload the merged lemma dictionary from disk.
# NOTE: pickle is only safe for files produced by this project itself.
with open('all_lemmas.pickle', 'rb') as merged_file:
    all_lemmas = pickle.load(merged_file)

In [7]:
def insert_lemmas(lemma_dict, lemma_collection):
    """Insert all lemma documents of ``lemma_dict`` with one bulk call.

    Args:
        lemma_dict: mapping whose values are the documents to store.
        lemma_collection: collection object providing ``insert_many``.

    Returns:
        None. Nothing is sent when ``lemma_dict`` is empty, because
        pymongo's ``insert_many`` rejects an empty document list.
    """
    lemmas_to_insert = list(lemma_dict.values())
    if not lemmas_to_insert:
        return
    lemma_collection.insert_many(lemmas_to_insert)

In [None]:
# insert_lemmas(all_lemmas, lemmas)  # fails here with DocumentTooLarge

In [9]:
# Sets cannot be written to the database, so every 'docs' collection is
# replaced by a list with the same elements.
for lemma_entry in tqdm(all_lemmas.values()):
    lemma_entry['docs'] = list(lemma_entry['docs'])

  0%|          | 0/978481 [00:00<?, ?it/s]

In [14]:
def insert_lemmas_one_by_one(lemma_dict, lemma_collection):
    """Insert the lemma documents individually.

    Args:
        lemma_dict: mapping whose values are the documents to store.
        lemma_collection: collection object providing ``insert_one``.

    Returns:
        list of the documents whose insertion raised DocumentTooLarge,
        in iteration order; all other documents have been inserted.
    """
    oversized = []
    for lemma_doc in tqdm(lemma_dict.values()):
        try:
            lemma_collection.insert_one(lemma_doc)
        except DocumentTooLarge:
            # keep the rejected document so the caller can store it
            # in some other way
            oversized.append(lemma_doc)
    return oversized

In [15]:
# Insert the lemmas one at a time; the documents rejected with
# DocumentTooLarge are returned and kept in big_lemmas.
big_lemmas = insert_lemmas_one_by_one(all_lemmas, lemmas)

  0%|          | 0/978481 [00:00<?, ?it/s]

In [22]:
# Store each oversized lemma as a pickle file in GridFS and remember the
# id of the stored file on the lemma itself.
for idx, big_lemma in enumerate(tqdm(big_lemmas)):
    pickle_path = f'lemma_{idx}.pickle'
    # the complete record is written to a local file first ...
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(big_lemma, out_file)
    # ... and that file is then uploaded to GridFS
    with open(pickle_path, 'rb') as in_file:
        stored_id = fs.put(in_file)
    big_lemma['file_id'] = stored_id

  0%|          | 0/26 [00:00<?, ?it/s]

In [23]:
# Insert the oversized lemmas into the collection with a shortened 'docs'
# list, so that the documents become smaller.
max_inline_docs = 100000

for lemma in tqdm(big_lemmas):
    # Build a separate dictionary instead of aliasing ``lemma``: slicing
    # through an alias would permanently truncate the 'docs' list of the
    # in-memory record as well.
    info_to_insert = {**lemma, 'docs': lemma['docs'][:max_inline_docs]}
    lemmas.insert_one(info_to_insert)

  0%|          | 0/26 [00:00<?, ?it/s]

#### Индексируем

In [24]:
# Create a unique compound index on the (lemma, pos) pair, both fields
# in ascending order.
index_keys = [('lemma', pymongo.ASCENDING), ('pos', pymongo.ASCENDING)]
lemmas.create_index(index_keys, unique=True)

'lemma_1_pos_1'

In [None]:
# lemmas.drop_index('lemma_1_pos_1')