# Микродиахроническое исследование русских приставок методами дистрибутивной семантики
## Автор: Елизавета Клыкова, БКЛ181
### Получение предложений со всеми глаголами для ELMo

#### Импорт модулей

In [1]:
# %load_ext pycodestyle_magic
# %pycodestyle_on

In [2]:
import random
import pickle
import pymongo
import pandas as pd
from tqdm.auto import tqdm

In [3]:
# Fix the RNG seed so that the random.sample() draw of sentence ids further
# down in this notebook gives the same result on every run.
random.seed(42)

#### Подключение к базе данных

In [4]:
# Create a client for the MongoDB server at localhost:27017 and select the
# 'thesis' database; the bare `db` expression shows the handle as cell output.
client = pymongo.MongoClient('localhost', 27017)
db = client['thesis']
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'thesis')

In [5]:
# Handles to the collections of the database:
#   sentences - queried below by _id and period, yields 'token_info'
#   lemmas    - queried below by lemma and pos, yields 'docs'
#   tokens    - not used in the cells visible in this notebook
sentences, lemmas, tokens = db.sentences, db.lemmas, db.tokens

#### Получение списка глаголов

In [6]:
# Load the tab-separated prefix/lemma frequency table and display it.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks
# like a saved row index read back as data; only the 'lemma' column is used
# further down in this notebook.
pref_df = pd.read_csv('prefixes_and_lemmas_dropna.tsv', sep='\t')
pref_df

Unnamed: 0,prefix,lemma,abs_freq,abs_freq0,abs_freq1,abs_freq2,freq,freq0,freq1,freq2
0,бе[зс],бездействовать,541,155,255,131,2.194,2.147,2.741,1.609
1,бе[зс],бездельничать,308,41,169,98,1.249,0.568,1.817,1.204
2,бе[зс],безмолвствовать,726,361,191,174,2.944,5.000,2.053,2.138
3,бе[зс],безобразить,149,99,43,7,0.604,1.371,0.462,0.086
4,бе[зс],безобразничать,306,110,136,60,1.241,1.524,1.462,0.737
...,...,...,...,...,...,...,...,...,...,...
8429,у,ущемляться,62,8,16,38,0.251,0.111,0.172,0.467
8430,у,ущипывать,510,147,222,141,2.068,2.036,2.386,1.732
8431,у,уязвлять,1305,606,411,288,5.291,8.393,4.418,3.538
8432,у,уяснять,1803,677,740,386,7.311,9.377,7.954,4.742


In [7]:
# Convert the table into a list of dicts, one per row (column name -> value).
# Despite its name, pref_dict is a list.
pref_dict = pref_df.to_dict(orient='records')

In [8]:
# Unique lemmas from the table, in sorted order.
verbs_to_search = sorted({row['lemma'] for row in pref_dict})

#### Поиск предложений с глаголами в базе

In [9]:
# For every verb, fetch the list of sentences in which it occurs: the 'docs'
# field of its entry (pos 'V') in the lemmas collection. These values are
# later used as sentence _id values when querying the sentences collection.
verbs_with_sents = {}
for verb in tqdm(verbs_to_search):
    sents_with_verb = lemmas.find_one({'lemma': verb, 'pos': 'V'},
                                      {'docs': True, '_id': False})
    # find_one() returns None when no document matches. Fail with a message
    # that names the verb instead of the opaque
    # "'NoneType' object is not subscriptable" TypeError.
    if sents_with_verb is None:
        raise LookupError(
            "no entry with pos 'V' in the lemmas collection for {!r}".format(verb))
    verbs_with_sents[verb] = sents_with_verb['docs']

  0%|          | 0/8434 [00:00<?, ?it/s]

In [10]:
# One result dict per period, each mapping every verb to its own empty list.
# The generator builds a fresh dict (with fresh lists) for each target name,
# so the three containers share nothing.
presov_sents, sov_sents, postsov_sents = (
    {verb: [] for verb in verbs_with_sents} for _ in range(3)
)

In [11]:
# For each verb, draw a random sample of at most 500 of its sentence ids
# (all of them, in random order, when the verb has fewer than 500).
ids_to_search = {}
for verb, doc_ids in verbs_with_sents.items():
    sample_size = min(len(doc_ids), 500)
    ids_to_search[verb] = random.sample(doc_ids, sample_size)

In [12]:
# Collect up to 100 sentences with period == 0 for every verb, taken from the
# verb's sampled ids, and store them in presov_sents (presumably the
# pre-Soviet period - confirm against the corpus description).
for verb in tqdm(verbs_with_sents):
    remaining_ids = ids_to_search[verb]
    query = {'_id': {'$in': remaining_ids}, 'period': 0}
    projection = {'token_info': True, '_id': True}
    cursor = sentences.find(query, projection).limit(100)
    for doc in cursor:
        token_info = doc['token_info']
        word_forms = [entry['token'] for entry in token_info]
        lemma_seq = [entry['lemma'] for entry in token_info]
        # Position of the first token whose lemma equals the verb
        # (part of speech is not checked here).
        verb_position = lemma_seq.index(verb)
        presov_sents[verb].append((word_forms, verb_position))
        # Take the used sentence out of the verb's id pool.
        remaining_ids.remove(doc['_id'])

  0%|          | 0/8434 [00:00<?, ?it/s]

In [13]:
# Save the period-0 sentences to disk as a pickle:
# {verb: [(list of tokens, index of the target verb in that list), ...]}.
with open('presov_sents.pickle', 'wb') as f:
    pickle.dump(presov_sents, f)

In [14]:
# The dict has been pickled; drop the name, presumably to free memory
# before the next period is processed.
del presov_sents

In [15]:
# Collect up to 100 sentences with period == 1 for every verb, taken from the
# verb's sampled ids, and store them in sov_sents (presumably the Soviet
# period - confirm against the corpus description).
for verb in tqdm(verbs_with_sents):
    remaining_ids = ids_to_search[verb]
    query = {'_id': {'$in': remaining_ids}, 'period': 1}
    projection = {'token_info': True, '_id': True}
    cursor = sentences.find(query, projection).limit(100)
    for doc in cursor:
        token_info = doc['token_info']
        word_forms = [entry['token'] for entry in token_info]
        lemma_seq = [entry['lemma'] for entry in token_info]
        # Position of the first token whose lemma equals the verb
        # (part of speech is not checked here).
        verb_position = lemma_seq.index(verb)
        sov_sents[verb].append((word_forms, verb_position))
        # Take the used sentence out of the verb's id pool.
        remaining_ids.remove(doc['_id'])

  0%|          | 0/8434 [00:00<?, ?it/s]

In [16]:
# Save the period-1 sentences to disk as a pickle:
# {verb: [(list of tokens, index of the target verb in that list), ...]}.
with open('sov_sents.pickle', 'wb') as f:
    pickle.dump(sov_sents, f)

In [17]:
# The dict has been pickled; drop the name, presumably to free memory
# before the next period is processed.
del sov_sents

In [18]:
# Collect up to 100 sentences with period == 2 for every verb, taken from the
# verb's sampled ids, and store them in postsov_sents (presumably the
# post-Soviet period - confirm against the corpus description).
for verb in tqdm(verbs_with_sents):
    remaining_ids = ids_to_search[verb]
    query = {'_id': {'$in': remaining_ids}, 'period': 2}
    projection = {'token_info': True, '_id': True}
    cursor = sentences.find(query, projection).limit(100)
    for doc in cursor:
        token_info = doc['token_info']
        word_forms = [entry['token'] for entry in token_info]
        lemma_seq = [entry['lemma'] for entry in token_info]
        # Position of the first token whose lemma equals the verb
        # (part of speech is not checked here).
        verb_position = lemma_seq.index(verb)
        postsov_sents[verb].append((word_forms, verb_position))
        # Take the used sentence out of the verb's id pool.
        remaining_ids.remove(doc['_id'])

  0%|          | 0/8434 [00:00<?, ?it/s]

In [19]:
# Save the period-2 sentences to disk as a pickle:
# {verb: [(list of tokens, index of the target verb in that list), ...]}.
with open('postsov_sents.pickle', 'wb') as f:
    pickle.dump(postsov_sents, f)

In [20]:
# The dict has been pickled; drop the name, presumably to free memory.
del postsov_sents