# Микродиахроническое исследование русских приставок методами дистрибутивной семантики
## Автор: Елизавета Клыкова, БКЛ181
### Часть 4: запись токенов в базу
1. Считать токены из pickle-файлов
2. Объединить словари
3. Записать в базу
4. Проиндексировать коллекцию

#### Импорт модулей

In [1]:
%load_ext pycodestyle_magic
%pycodestyle_on

In [2]:
import pickle
import gridfs
import pymongo
from tqdm.auto import tqdm
from collections import Counter
from pymongo.errors import DocumentTooLarge, OperationFailure

#### Открытие файлов

In [3]:
# Load the token dictionary stored in presov_tokens.pickle.
# NOTE(review): pickle.load can run arbitrary code from the file;
# presumably this file comes from an earlier part of the project - confirm.
with open('presov_tokens.pickle', 'rb') as ft1:
    presov_tokens = pickle.load(ft1)

In [4]:
len(presov_tokens)

1356924

In [5]:
# Load the token dictionary stored in sov_tokens.pickle.
# NOTE(review): pickle.load can run arbitrary code from the file;
# presumably this file comes from an earlier part of the project - confirm.
with open('sov_tokens.pickle', 'rb') as ft2:
    sov_tokens = pickle.load(ft2)

In [6]:
len(sov_tokens)

1618041

In [7]:
# Load the token dictionary stored in postsov_tokens.pickle.
# NOTE(review): pickle.load can run arbitrary code from the file;
# presumably this file comes from an earlier part of the project - confirm.
with open('postsov_tokens.pickle', 'rb') as ft3:
    postsov_tokens = pickle.load(ft3)

In [8]:
len(postsov_tokens)

1519257

#### Объединение

In [9]:
all_tokens = {}

In [10]:
# Merge period 1 (presov_tokens) with periods 2 (sov_tokens) and
# 3 (postsov_tokens). A key found in a later period is deleted from that
# period once its values have been added to the merged entry.
for tok_key, token in tqdm(list(presov_tokens.items())):
    in_sov = tok_key in sov_tokens
    in_postsov = tok_key in postsov_tokens

    # Key occurs only in period 1: reuse the entry unchanged.
    if not (in_sov or in_postsov):
        all_tokens[tok_key] = token
        continue

    # Start from the period-1 values; 'docs' is the very same object as in
    # the period-1 entry and is extended in place below.
    merged = {'token': token['token'],
              'lemma': token['lemma'],
              'pos': token['pos'],
              'freq': token['freq'],
              'freq_0': token['freq_0'],
              'freq_1': token['freq_1'],
              'freq_2': token['freq_2'],
              'docs': token['docs']}

    for present, later_period in ((in_sov, sov_tokens),
                                  (in_postsov, postsov_tokens)):
        if not present:
            continue
        other = later_period[tok_key]
        for freq_key in ('freq', 'freq_0', 'freq_1', 'freq_2'):
            merged[freq_key] += other[freq_key]
        merged['docs'].update(other['docs'])
        del later_period[tok_key]

    all_tokens[tok_key] = merged

  0%|          | 0/1356924 [00:00<?, ?it/s]

In [11]:
# Merge the entries of sov_tokens (period 2) with postsov_tokens
# (period 3). Keys shared with period 3 are deleted from postsov_tokens.
for tok_key, token in tqdm(list(sov_tokens.items())):
    # Key absent from period 3: reuse the entry unchanged.
    if tok_key not in postsov_tokens:
        all_tokens[tok_key] = token
        continue

    later = postsov_tokens[tok_key]

    # Descriptive fields come from the period-2 entry, counters are summed.
    merged = {field: token[field] for field in ('token', 'lemma', 'pos')}
    merged.update(
        (freq_key, token[freq_key] + later[freq_key])
        for freq_key in ('freq', 'freq_0', 'freq_1', 'freq_2')
    )

    # 'docs' stays the same object as in the period-2 entry, extended in
    # place with the period-3 documents.
    merged['docs'] = token['docs']
    merged['docs'].update(later['docs'])
    del postsov_tokens[tok_key]

    all_tokens[tok_key] = merged

  0%|          | 0/882148 [00:00<?, ?it/s]

In [12]:
# Whatever is still left in postsov_tokens (period 3) is copied as is;
# tqdm wraps the (key, entry) pairs so the progress bar is still shown.
all_tokens.update(tqdm(list(postsov_tokens.items())))

  0%|          | 0/631597 [00:00<?, ?it/s]

In [13]:
del presov_tokens
del sov_tokens
del postsov_tokens

In [14]:
len(all_tokens)

2870669

In [15]:
# Save the merged dictionary to all_tokens.pickle in the working directory.
with open('all_tokens.pickle', 'wb') as ftok:
    pickle.dump(all_tokens, ftok)

#### Считываем токены из файла

In [3]:
# Read the merged token dictionary back from all_tokens.pickle.
# NOTE(review): pickle.load can run arbitrary code; only open trusted files.
with open('all_tokens.pickle', 'rb') as ftok:
    all_tokens = pickle.load(ftok)

In [4]:
# Sets cannot be written to the database, so every 'docs' value is
# replaced with a list holding the same elements.
for token in tqdm(all_tokens.values()):
    token['docs'] = list(token['docs'])

  0%|          | 0/2870669 [00:00<?, ?it/s]

#### Подключение к базе

In [5]:
# Connect to MongoDB on localhost:27017, select the 'thesis' database and
# create a GridFS handle on that same database.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.thesis
fs = gridfs.GridFS(db)
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'thesis')

In [6]:
tokens = db.tokens

In [7]:
# tokens.delete_many({})

#### Запись в базу

In [8]:
def insert_tokens(token_dict, token_collection):
    """Insert all values of ``token_dict`` with a single ``insert_many``.

    Args:
        token_dict: mapping whose values are the documents to insert.
        token_collection: object providing ``insert_many``.

    Returns:
        None.

    Note:
        Every document goes into one call; in this notebook that call
        fails on the full dictionary with a BSONObjectTooLarge error.
    """
    token_collection.insert_many(list(token_dict.values()))

In [9]:
# Fails: the batch contains a document above the 16 MB BSON limit; the
# error raised is OperationFailure (BSONObjectTooLarge).
insert_tokens(all_tokens, tokens)

OperationFailure: BSONObj size: 46680242 (0x2C848B2) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('627e58f97be76f179fef699d'), full error: {'ok': 0.0, 'errmsg': "BSONObj size: 46680242 (0x2C848B2) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('627e58f97be76f179fef699d')", 'code': 10334, 'codeName': 'BSONObjectTooLarge'}

In [11]:
def insert_tokens_one_by_one(token_dict, token_collection):
    """Insert each token separately and collect those that were rejected.

    Args:
        token_dict: mapping whose values are the documents to insert.
        token_collection: collection object providing ``insert_one``.

    Returns:
        list: the documents for which ``insert_one`` raised
        ``DocumentTooLarge`` or a non-duplicate ``OperationFailure``,
        in iteration order.

    Raises:
        DuplicateKeyError: when a document is already stored. It is a
            subclass of ``OperationFailure``, so without the explicit
            re-raise it would be collected as if it were too large.
    """
    # Imported here because the notebook's import cell does not provide it.
    from pymongo.errors import DuplicateKeyError

    big_tokens = []
    for token in tqdm(token_dict.values()):
        try:
            token_collection.insert_one(token)
        except DuplicateKeyError:
            # A duplicate means the collection already holds this token
            # (e.g. after a partly successful earlier run); surface it
            # instead of treating the token as oversized.
            raise
        except (DocumentTooLarge, OperationFailure):
            big_tokens.append(token)
    return big_tokens

In [13]:
big_tokens = insert_tokens_one_by_one(all_tokens, tokens)

  0%|          | 0/2870669 [00:00<?, ?it/s]

In [14]:
len(big_tokens)

13

In [15]:
# Each oversized token is pickled to token_<n>.pickle, that file is
# uploaded to GridFS, and the resulting GridFS id is saved on the token
# under 'file_id'.
for idx, big_token in enumerate(tqdm(big_tokens)):
    pickle_name = f'token_{idx}.pickle'

    with open(pickle_name, 'wb') as dump_file:
        pickle.dump(big_token, dump_file)

    with open(pickle_name, 'rb') as upload_file:
        big_token['file_id'] = fs.put(upload_file)

  0%|          | 0/13 [00:00<?, ?it/s]

In [16]:
# Insert the oversized tokens with 'docs' cut to its first 100000 items.
# The slice replaces 'docs' on the dict held in big_tokens itself, so the
# in-memory entry is shortened as well.
for big_token in tqdm(big_tokens):
    big_token['docs'] = big_token['docs'][:100000]
    tokens.insert_one(big_token)

  0%|          | 0/13 [00:00<?, ?it/s]

#### Индексация

In [17]:
# Unique compound index over token, lemma and pos, each ascending.
tokens.create_index(
    [(field, pymongo.ASCENDING) for field in ('token', 'lemma', 'pos')],
    unique=True,
)

'token_1_lemma_1_pos_1'

In [None]:
# tokens.drop_index('token_1_lemma_1_pos_1')