# Микродиахроническое исследование русских приставок методами дистрибутивной семантики
## Автор: Елизавета Клыкова, БКЛ181
### Анализ частотности приставок и глаголов
1. В каждом периоде составить список глаголов для каждой приставки
2. Отранжировать глаголы по частотности
3. Сравнить списки по периодам с использованием расстояния Левенштейна и коэффициента корреляции Спирмена

#### Импорт модулей

In [1]:
%load_ext pycodestyle_magic
%pycodestyle_on

In [2]:
import pymongo
import pandas as pd
from tqdm.auto import tqdm
from textdistance import levenshtein
from scipy.stats import spearmanr

#### Получение списка глаголов

In [3]:
pref_df = pd.read_csv('prefixes_and_lemmas.tsv', sep='\t')
pref_df

Unnamed: 0,prefix,lemma,freq,freq_0,freq_1,freq_2
0,бе[зс],бездействовать,541,155,255,131
1,бе[зс],бездельничать,308,41,169,98
2,бе[зс],безмолвствовать,726,361,191,174
3,бе[зс],безобразить,149,99,43,7
4,бе[зс],безобразничать,306,110,136,60
...,...,...,...,...,...,...
8440,у,ущемляться,62,8,16,38
8441,у,ущипывать,510,147,222,141
8442,у,уязвлять,1305,606,411,288
8443,у,уяснять,1803,677,740,386


In [4]:
# list of dicts
pref_dict = pref_df.to_dict(orient='records')

#### Переводим частотности в абсолютные

In [5]:
presov_vol = 72199757
sov_vol = 93031557
postsov_vol = 81392744
total_vol = 246624058

In [6]:
for verb in pref_dict:
    verb['abs_freq'] = verb['freq']
    verb['abs_freq0'] = verb['freq_0']
    verb['abs_freq1'] = verb['freq_1']
    verb['abs_freq2'] = verb['freq_2']
    del verb['freq']
    del verb['freq_0']
    del verb['freq_1']
    del verb['freq_2']
    verb['freq'] = round(verb['abs_freq'] / total_vol * 1000000, 3)
    verb['freq0'] = round(verb['abs_freq0'] / presov_vol * 1000000, 3)
    verb['freq1'] = round(verb['abs_freq1'] / sov_vol * 1000000, 3)
    verb['freq2'] = round(verb['abs_freq2'] / postsov_vol * 1000000, 3)

In [7]:
pref_df_new = pd.DataFrame(pref_dict)
pref_df_new

Unnamed: 0,prefix,lemma,abs_freq,abs_freq0,abs_freq1,abs_freq2,freq,freq0,freq1,freq2
0,бе[зс],бездействовать,541,155,255,131,2.194,2.147,2.741,1.609
1,бе[зс],бездельничать,308,41,169,98,1.249,0.568,1.817,1.204
2,бе[зс],безмолвствовать,726,361,191,174,2.944,5.000,2.053,2.138
3,бе[зс],безобразить,149,99,43,7,0.604,1.371,0.462,0.086
4,бе[зс],безобразничать,306,110,136,60,1.241,1.524,1.462,0.737
...,...,...,...,...,...,...,...,...,...,...
8440,у,ущемляться,62,8,16,38,0.251,0.111,0.172,0.467
8441,у,ущипывать,510,147,222,141,2.068,2.036,2.386,1.732
8442,у,уязвлять,1305,606,411,288,5.291,8.393,4.418,3.538
8443,у,уяснять,1803,677,740,386,7.311,9.377,7.954,4.742


In [8]:
pref_df_new.to_csv('prefixes_and_lemmas_freq.tsv', sep='\t', index=False)
pref_dict = pref_df_new.to_dict(orient='records')

### Подход №1. Общий список глаголов для всех периодов

In [9]:
prefixes = sorted(list(set([verb['prefix'] for verb in pref_dict])))

In [10]:
prefs_with_verbs = {pref: [verb for verb in pref_dict
                           if verb['prefix'] == pref]
                    for pref in prefixes}

In [11]:
presov_ranged = {pref: [verb['lemma']
                        for verb in sorted(prefs_with_verbs[pref],
                                           key=lambda x: x['freq0'],
                                           reverse=True)]
                 for pref in prefs_with_verbs}

In [12]:
sov_ranged = {pref: [verb['lemma']
                     for verb in sorted(prefs_with_verbs[pref],
                                        key=lambda x: x['freq1'],
                                        reverse=True)]
              for pref in prefs_with_verbs}

In [13]:
postsov_ranged = {pref: [verb['lemma']
                         for verb in sorted(prefs_with_verbs[pref],
                                            key=lambda x: x['freq2'],
                                            reverse=True)]
                  for pref in prefs_with_verbs}

#### Подсчет расстояния Левенштейна и коэффициента корреляции Спирмена

In [14]:
stats = {pref: {'pre_to_sov_lev': levenshtein.distance(presov_ranged[pref],
                                                       sov_ranged[pref]),
                'sov_to_post_lev': levenshtein.distance(sov_ranged[pref],
                                                        postsov_ranged[pref]),
                'pre_to_post_lev': levenshtein.distance(presov_ranged[pref],
                                                        postsov_ranged[pref]),
                'pre_to_sov_sp': spearmanr(presov_ranged[pref],
                                           sov_ranged[pref]),
                'sov_to_post_sp': spearmanr(sov_ranged[pref],
                                            postsov_ranged[pref]),
                'pre_to_post_sp': spearmanr(presov_ranged[pref],
                                            postsov_ranged[pref])}
         for pref in tqdm(prefixes)}

  0%|          | 0/26 [00:00<?, ?it/s]

In [15]:
for pref in stats:
    pre_to_sov_sp = str(round(stats[pref]['pre_to_sov_sp'][0], 3))
    pre_to_sov_conf = stats[pref]['pre_to_sov_sp'][1]
    if pre_to_sov_conf < 0.05:
        stats[pref]['pre_to_sov_sp'] = pre_to_sov_sp + '*'
    else:
        stats[pref]['pre_to_sov_sp'] = pre_to_sov_sp

    sov_to_post_sp = str(round(stats[pref]['sov_to_post_sp'][0], 3))
    sov_to_post_conf = stats[pref]['sov_to_post_sp'][1]
    if sov_to_post_conf < 0.05:
        stats[pref]['sov_to_post_sp'] = sov_to_post_sp + '*'
    else:
        stats[pref]['sov_to_post_sp'] = sov_to_post_sp

    pre_to_post_sp = str(round(stats[pref]['pre_to_post_sp'][0], 3))
    pre_to_post_conf = stats[pref]['pre_to_post_sp'][1]
    if pre_to_post_conf < 0.05:
        stats[pref]['pre_to_post_sp'] = pre_to_post_sp + '*'
    else:
        stats[pref]['pre_to_post_sp'] = pre_to_post_sp

In [16]:
pref_stats = pref_df_new.groupby(['prefix']).size().to_dict()

In [17]:
stats_for_df = []
for pref in pref_stats:
    stats_for_df.append({'prefix': pref,
                         'verbs': pref_stats[pref],
                         'pre_to_sov_lev': stats[pref]['pre_to_sov_lev'],
                         'sov_to_post_lev': stats[pref]['sov_to_post_lev'],
                         'pre_to_post_lev': stats[pref]['pre_to_post_lev'],
                         'pre_to_sov_sp': stats[pref]['pre_to_sov_sp'],
                         'sov_to_post_sp': stats[pref]['sov_to_post_sp'],
                         'pre_to_post_sp': stats[pref]['pre_to_post_sp']
                         })

In [18]:
stats_df = pd.DataFrame(stats_for_df)
stats_df

Unnamed: 0,prefix,verbs,pre_to_sov_lev,sov_to_post_lev,pre_to_post_lev,pre_to_sov_sp,sov_to_post_sp,pre_to_post_sp
0,бе[зс],11,6,6,6,0.536,0.455,0.173
1,в[зс],150,137,132,137,-0.007,0.061,0.123
2,во?,137,128,120,127,0.059,0.105,0.088
3,во[зс],122,111,105,113,0.07,0.066,-0.006
4,вы,444,429,415,433,0.067,0.005,0.034
5,до,153,140,135,141,0.092,-0.014,0.157
6,за,937,915,909,924,0.049,0.049,0.008
7,и[зс],276,265,257,262,0.116,0.121*,-0.086
8,на,524,507,496,510,0.053,0.092*,0.105*
9,над,17,12,12,13,0.093,0.01,-0.118


### Подход №2. Разные глаголы для разных периодов, порог частотности = 100
#### Подключение к базе данных

In [19]:
client = pymongo.MongoClient('localhost', 27017)
db = client['thesis']
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'thesis')

In [20]:
sentences = db.sentences
lemmas = db.lemmas
tokens = db.tokens

#### Извлекаем нужные леммы из базы данных

In [21]:
prefixes = pref_df['prefix'].unique().tolist()

In [22]:
def find_verbs_with_prefix(prefix_name, freq):
    regexp = '^{}'.format(prefix_name)
    found = lemmas.find({'lemma': {'$regex': regexp}, 'pos': 'V',
                         freq: {'$gt': 100}},
                        {'lemma': True, '_id': False}
                        ).sort(freq, -1).limit(100)
    return list(found)

In [23]:
prefix_dict_presov = {prefix: find_verbs_with_prefix(prefix, 'freq_0')
                      for prefix in tqdm(prefixes)}
prefix_dict_sov = {prefix: find_verbs_with_prefix(prefix, 'freq_1')
                   for prefix in tqdm(prefixes)}
prefix_dict_postsov = {prefix: find_verbs_with_prefix(prefix, 'freq_2')
                       for prefix in tqdm(prefixes)}

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

#### Приводим списки к одинаковой длине

In [24]:
for prefix in prefixes:
    presov = len(prefix_dict_presov[prefix])
    sov = len(prefix_dict_sov[prefix])
    postsov = len(prefix_dict_postsov[prefix])
    min_len = min(presov, sov, postsov)

    if min_len < 100:
        del prefix_dict_presov[prefix]
        del prefix_dict_sov[prefix]
        del prefix_dict_postsov[prefix]
    else:
        prefix_dict_presov[prefix] = [verb['lemma']
                                      for verb in prefix_dict_presov[prefix]]
        prefix_dict_sov[prefix] = [verb['lemma']
                                   for verb in prefix_dict_sov[prefix]]
        prefix_dict_postsov[prefix] = [verb['lemma']
                                       for verb in prefix_dict_postsov[prefix]]

In [25]:
stats = {pref: {'pre_to_sov_lev': levenshtein.distance(prefix_dict_presov[pref],
                                                       prefix_dict_sov[pref]),
                'sov_to_post_lev': levenshtein.distance(prefix_dict_sov[pref],
                                                        prefix_dict_postsov[pref]),
                'pre_to_post_lev': levenshtein.distance(prefix_dict_presov[pref],
                                                        prefix_dict_postsov[pref]),
                'pre_to_sov_sp': spearmanr(prefix_dict_presov[pref],
                                           prefix_dict_sov[pref]),
                'sov_to_post_sp': spearmanr(prefix_dict_sov[pref],
                                            prefix_dict_postsov[pref]),
                'pre_to_post_sp': spearmanr(prefix_dict_presov[pref],
                                            prefix_dict_postsov[pref])}
         for pref in tqdm(list(prefix_dict_presov.keys()))}

  0%|          | 0/16 [00:00<?, ?it/s]

1:80: E501 line too long (80 > 79 characters)
4:80: E501 line too long (83 > 79 characters)
5:80: E501 line too long (81 > 79 characters)
6:80: E501 line too long (83 > 79 characters)


In [26]:
for pref in stats:
    pre_to_sov_sp = str(round(stats[pref]['pre_to_sov_sp'][0], 3))
    pre_to_sov_conf = stats[pref]['pre_to_sov_sp'][1]
    if pre_to_sov_conf < 0.05:
        stats[pref]['pre_to_sov_sp'] = pre_to_sov_sp + '*'
    else:
        stats[pref]['pre_to_sov_sp'] = pre_to_sov_sp

    sov_to_post_sp = str(round(stats[pref]['sov_to_post_sp'][0], 3))
    sov_to_post_conf = stats[pref]['sov_to_post_sp'][1]
    if sov_to_post_conf < 0.05:
        stats[pref]['sov_to_post_sp'] = sov_to_post_sp + '*'
    else:
        stats[pref]['sov_to_post_sp'] = sov_to_post_sp

    pre_to_post_sp = str(round(stats[pref]['pre_to_post_sp'][0], 3))
    pre_to_post_conf = stats[pref]['pre_to_post_sp'][1]
    if pre_to_post_conf < 0.05:
        stats[pref]['pre_to_post_sp'] = pre_to_post_sp + '*'
    else:
        stats[pref]['pre_to_post_sp'] = pre_to_post_sp

In [27]:
stats_for_df = []
for pref in list(prefix_dict_presov.keys()):
    stats_for_df.append({'prefix': pref,
                         'verbs': len(prefix_dict_presov[pref]),
                         'pre_to_sov_lev': stats[pref]['pre_to_sov_lev'],
                         'sov_to_post_lev': stats[pref]['sov_to_post_lev'],
                         'pre_to_post_lev': stats[pref]['pre_to_post_lev'],
                         'pre_to_sov_sp': stats[pref]['pre_to_sov_sp'],
                         'sov_to_post_sp': stats[pref]['sov_to_post_sp'],
                         'pre_to_post_sp': stats[pref]['pre_to_post_sp']
                         })

In [28]:
stats_df = pd.DataFrame(stats_for_df)
stats_df

Unnamed: 0,prefix,verbs,pre_to_sov_lev,sov_to_post_lev,pre_to_post_lev,pre_to_sov_sp,sov_to_post_sp,pre_to_post_sp
0,в[зс],100,86,87,91,-0.015,0.219*,0.086
1,во?,100,91,87,95,-0.074,0.108,0.106
2,вы,100,94,89,94,0.101,-0.059,0.017
3,за,100,90,87,95,-0.02,0.004,-0.163
4,и[зс],100,93,90,93,0.107,0.087,-0.018
5,на,100,90,84,91,0.008,0.067,0.191
6,о,100,94,92,97,0.033,-0.157,-0.026
7,от,100,89,85,94,0.07,0.154,-0.101
8,пере,100,95,88,94,-0.064,-0.106,0.213*
9,по,100,90,92,94,0.014,-0.07,-0.141
