# Микродиахроническое исследование русских приставок методами дистрибутивной семантики
## Автор: Елизавета Клыкова, БКЛ181
### Получение предложений с наиболее изменившимися глаголами для кластеризации ELMo

#### Импорт модулей

In [2]:
# %load_ext pycodestyle_magic
# %pycodestyle_on

In [3]:
import random
import pickle
import pymongo
import pandas as pd
from tqdm.auto import tqdm

In [4]:
# Fix the seed of the global random generator so that the random.sample()
# draws made later in this notebook are reproducible between runs.
random.seed(42)

#### Подключение к базе данных

In [5]:
# Open a client for the MongoDB server on localhost:27017 and select the
# 'thesis' database.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['thesis']
# Bare expression: the notebook displays the database handle as cell output.
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'thesis')

In [6]:
# Handles for the three collections of the database used by this notebook.
sentences, lemmas, tokens = (
    db[collection_name]
    for collection_name in ('sentences', 'lemmas', 'tokens')
)

#### Получение списка глаголов

In [7]:
# Verbs whose sentences are looked up in the database below.
verbs_to_search = [
    'заметать',
    'постановлять',
    'изготавливаться',
    'наследовать',
    'претерпеть',
    'взводить',
    'засылать',
    'дозволять',
    'устрашать',
    'округлять',
]

1:80: E501 line too long (157 > 79 characters)


#### Поиск предложений с глаголами в базе

In [8]:
# For every verb, fetch the list of sentences in which it occurs: the 'docs'
# field of the lemma document with that lemma and part of speech 'V'.
verbs_with_sents = {}
for verb in tqdm(verbs_to_search):
    sents_with_verb = lemmas.find_one({'lemma': verb, 'pos': 'V'},
                                      {'docs': True, '_id': False})
    # find_one() returns None when no document matches. Fail with a message
    # that names the verb instead of an opaque TypeError on the subscript.
    if sents_with_verb is None:
        raise LookupError(
            'no lemma document with pos "V" found for verb {!r}'.format(verb))
    verbs_with_sents[verb] = sents_with_verb['docs']

  0%|          | 0/10 [00:00<?, ?it/s]

In [9]:
# Three independent {verb: []} accumulators, one per period bucket; each is
# filled by its own collection loop further down.
presov_sents, sov_sents, postsov_sents = (
    {verb: [] for verb in verbs_with_sents} for _ in range(3)
)

In [10]:
# Draw a random sample of at most 500 sentence ids per verb (all of them when
# a verb has fewer than 500).
ids_to_search = {
    verb: random.sample(sent_ids, min(len(sent_ids), 500))
    for verb, sent_ids in verbs_with_sents.items()
}

In [11]:
# Gather up to 50 sampled sentences with period == 0 for every verb, stored
# as (token list, position of the verb) pairs in presov_sents.
for verb in tqdm(verbs_with_sents):
    period_query = {'_id': {'$in': ids_to_search[verb]}, 'period': 0}
    wanted_fields = {'token_info': True, '_id': True}
    for sentence in sentences.find(period_query, wanted_fields).limit(50):
        token_info = sentence['token_info']
        tok_list = [entry['token'] for entry in token_info]
        # index() yields the first token whose lemma equals the verb and
        # raises ValueError if the sentence has no such token.
        target_tok = [entry['lemma'] for entry in token_info].index(verb)
        presov_sents[verb].append((tok_list, target_tok))
        # Take the consumed id out of the pool (presumably so that it is not
        # reused by a later query).
        ids_to_search[verb].remove(sentence['_id'])

  0%|          | 0/10 [00:00<?, ?it/s]

2:80: E501 line too long (86 > 79 characters)
3:80: E501 line too long (80 > 79 characters)
5:80: E501 line too long (81 > 79 characters)


In [12]:
# Serialize the period-0 sentence collection to disk for the clustering step.
with open('presov_sents_for_clust.pickle', mode='wb') as out_file:
    pickle.dump(presov_sents, out_file)

In [13]:
# The data has already been pickled above; drop the name (presumably to free
# memory before the next period is processed).
del presov_sents

In [14]:
# Gather up to 50 sampled sentences with period == 1 for every verb, stored
# as (token list, position of the verb) pairs in sov_sents.
for verb in tqdm(verbs_with_sents):
    period_query = {'_id': {'$in': ids_to_search[verb]}, 'period': 1}
    wanted_fields = {'token_info': True, '_id': True}
    for sentence in sentences.find(period_query, wanted_fields).limit(50):
        token_info = sentence['token_info']
        tok_list = [entry['token'] for entry in token_info]
        # index() yields the first token whose lemma equals the verb and
        # raises ValueError if the sentence has no such token.
        target_tok = [entry['lemma'] for entry in token_info].index(verb)
        sov_sents[verb].append((tok_list, target_tok))
        # Take the consumed id out of the pool (presumably so that it is not
        # reused by a later query).
        ids_to_search[verb].remove(sentence['_id'])

  0%|          | 0/10 [00:00<?, ?it/s]

2:80: E501 line too long (86 > 79 characters)
3:80: E501 line too long (80 > 79 characters)
5:80: E501 line too long (81 > 79 characters)


In [15]:
# Serialize the period-1 sentence collection to disk for the clustering step.
with open('sov_sents_for_clust.pickle', mode='wb') as out_file:
    pickle.dump(sov_sents, out_file)

In [16]:
# The data has already been pickled above; drop the name (presumably to free
# memory before the next period is processed).
del sov_sents

In [17]:
# Gather up to 50 sampled sentences with period == 2 for every verb, stored
# as (token list, position of the verb) pairs in postsov_sents.
for verb in tqdm(verbs_with_sents):
    period_query = {'_id': {'$in': ids_to_search[verb]}, 'period': 2}
    wanted_fields = {'token_info': True, '_id': True}
    for sentence in sentences.find(period_query, wanted_fields).limit(50):
        token_info = sentence['token_info']
        tok_list = [entry['token'] for entry in token_info]
        # index() yields the first token whose lemma equals the verb and
        # raises ValueError if the sentence has no such token.
        target_tok = [entry['lemma'] for entry in token_info].index(verb)
        postsov_sents[verb].append((tok_list, target_tok))
        # Take the consumed id out of the pool (presumably so that it is not
        # reused by a later query).
        ids_to_search[verb].remove(sentence['_id'])

  0%|          | 0/10 [00:00<?, ?it/s]

2:80: E501 line too long (86 > 79 characters)
3:80: E501 line too long (80 > 79 characters)
5:80: E501 line too long (81 > 79 characters)


In [18]:
# Serialize the period-2 sentence collection to disk for the clustering step.
with open('postsov_sents_for_clust.pickle', mode='wb') as out_file:
    pickle.dump(postsov_sents, out_file)

In [19]:
# The data has already been pickled above; drop the name (presumably to free
# the memory it holds).
del postsov_sents