# Микродиахроническое исследование русских приставок методами дистрибутивной семантики
## Автор: Елизавета Клыкова, БКЛ181
### Оценка на материале [RuShiftEval](https://github.com/akutuzov/rushifteval_public)

#### Импорт модулей

In [1]:
# %load_ext pycodestyle_magic
# %pycodestyle_on

In [2]:
import gensim
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from gensim.models.fasttext import load_facebook_model
from tqdm.auto import tqdm
from scipy.spatial.distance import cityblock

import warnings
warnings.filterwarnings('ignore')

#### Подготовка датасета

In [None]:
!wget https://raw.githubusercontent.com/akutuzov/rushifteval_public/main/annotated_devset.tsv
!wget https://raw.githubusercontent.com/akutuzov/rushifteval_public/main/annotated_testset.tsv

In [None]:
colnames = ['lemma', 'presov_to_sov', 'sov_to_postsov', 'presov_to_postsov']
part1 = pd.read_csv('annotated_devset.tsv', sep='\t', names=colnames, header=None)
part2 = pd.read_csv('annotated_testset.tsv', sep='\t', names=colnames, header=None)

In [None]:
rushifteval = pd.concat([part1, part2]).sort_values(by='lemma')

In [None]:
rushifteval.to_csv('RuShiftEval_full.tsv', sep='\t', index=None)

#### Считываем заранее подготовленный датасет

In [3]:
rse = pd.read_csv('RuShiftEval_full.tsv', sep='\t')
rse

Unnamed: 0,lemma,presov_to_sov,sov_to_postsov,presov_to_postsov
0,авторитет,3.233333,2.955556,2.844444
1,амбиция,3.111111,3.444444,3.333333
2,апостол,3.494118,3.426966,3.423529
3,благодарность,3.233333,3.566667,3.655556
4,блин,3.213483,1.662921,2.577778
...,...,...,...,...
106,штаб,3.633333,3.388889,3.500000
107,эшелон,2.922222,2.288889,2.333333
108,юбилей,3.688889,3.700000,3.788889
109,ядро,1.554348,1.911111,1.477778


In [4]:
gold_rows = rse.to_dict(orient='records')

In [5]:
gold_pre_to_sov = [row['lemma'] for row in sorted(gold_rows,
                                                  key=lambda x: x['presov_to_sov'])]

In [6]:
gold_sov_to_post = [row['lemma'] for row in sorted(gold_rows,
                                                   key=lambda x: x['sov_to_postsov'])]

In [7]:
gold_pre_to_post = [row['lemma'] for row in sorted(gold_rows,
                                                   key=lambda x: x['presov_to_postsov'])]

#### Загружаем нужные модели

In [8]:
def load_word2vec_model(file_with_model):
    model = gensim.models.KeyedVectors.load_word2vec_format(file_with_model,
                                                            binary=True)
    print('Размер словаря:', len(model.wv.vocab))
    return model

In [9]:
wv_presov = load_word2vec_model('word2vec/word2vec_presov_lemmas.bin')

Размер словаря: 127656


In [10]:
wv_sov = load_word2vec_model('word2vec/word2vec_sov_lemmas.bin')

Размер словаря: 151890


In [11]:
wv_postsov = load_word2vec_model('word2vec/word2vec_postsov_lemmas.bin')

Размер словаря: 149617


In [12]:
wv_pos_presov = load_word2vec_model('word2vec/word2vec_presov_lemmas_pos.bin')

Размер словаря: 123804


In [13]:
wv_pos_sov = load_word2vec_model('word2vec/word2vec_sov_lemmas_pos.bin')

Размер словаря: 147146


In [14]:
wv_pos_postsov = load_word2vec_model('word2vec/word2vec_postsov_lemmas_pos.bin')

Размер словаря: 144650


In [15]:
ft_presov = load_facebook_model('fasttext/fasttext_presov_lemmas.bin')
ft_presov.init_sims(replace=True)
print('Размер словаря:', len(ft_presov.wv.vocab))

Размер словаря: 127657


In [16]:
ft_sov = load_facebook_model('fasttext/fasttext_sov_lemmas.bin')
ft_sov.init_sims(replace=True)
print('Размер словаря:', len(ft_sov.wv.vocab))

Размер словаря: 151891


In [17]:
ft_postsov = load_facebook_model('fasttext/fasttext_postsov_lemmas.bin')
ft_postsov.init_sims(replace=True)
print('Размер словаря:', len(ft_postsov.wv.vocab))

Размер словаря: 149618


#### Выравнивание

In [18]:
def intersection_align_gensim(m1, m2, words=None):
    """
    Intersect two gensim word2vec models, m1 and m2.
    Only the shared vocabulary between them is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Indices are re-organized from 0..N in order of descending frequency (=sum of counts from both m1 and m2).
    These indices correspond to the new syn0 and syn0norm objects in both gensim models:
        -- so that Row 0 of m1.syn0 will be for the same word as Row 0 of m2.syn0
        -- you can find the index of any word on the .index2word list: model.index2word.index(word) => 2
    The .vocab dictionary is also updated for each model, preserving the count but updating the index.
    """

    # Get the vocab for each model
    vocab_m1 = set(m1.wv.vocab.keys())
    vocab_m2 = set(m2.wv.vocab.keys())

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words:
        common_vocab &= set(words)

    # If no alignment necessary because vocab is identical...
    if not vocab_m1-common_vocab and not vocab_m2-common_vocab:
        return (m1, m2)

    # Otherwise sort by frequency (summed for both)
    common_vocab = list(common_vocab)
    common_vocab.sort(key=lambda w: m1.wv.vocab[w].count + m2.wv.vocab[w].count, reverse=True)

    # Then for each model...
    for m in [m1, m2]:
        # Replace old syn0norm array with new one (with common vocab)
        indices = [m.wv.vocab[w].index for w in common_vocab]
        old_arr = m.wv.syn0norm
        new_arr = np.array([old_arr[index] for index in indices])
        m.wv.syn0norm = m.wv.syn0 = new_arr

        # Replace old vocab dictionary with new one (with common vocab)
        # and old index2word with new one
        m.wv.index2word = common_vocab
        old_vocab = m.wv.vocab
        new_vocab = {}
        for new_index, word in enumerate(common_vocab):
            old_vocab_obj = old_vocab[word]
            new_vocab[word] = gensim.models.word2vec.Vocab(index=new_index, count=old_vocab_obj.count)
        m.wv.vocab = new_vocab

    return (m1, m2)

In [19]:
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """
    Procrustes align two gensim word2vec models (to allow for comparison between same word across models).
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.
        (With help from William. Thank you!)

    First, intersect the vocabularies (see `intersection_align_gensim` documentation).
    Then do the alignment on the other_embed model.
    Replace the other_embed model's syn0 and syn0norm numpy matrices with the aligned version.
    Return other_embed.

    If `words` is set, intersect the two models' vocabulary with the vocabulary in words (see `intersection_align_gensim` documentation).
    """

    # patch by Richard So [https://twitter.com/richardjeanso] (thanks!) to update this code for new version of gensim
    base_embed.init_sims()
    other_embed.init_sims()

    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # get the embedding matrices
    base_vecs = in_base_embed.wv.syn0norm
    other_vecs = in_other_embed.wv.syn0norm

    # just a matrix dot product with numpy
    m = other_vecs.T.dot(base_vecs)
    # SVD method from numpy
    u, _, v = np.linalg.svd(m)
    # another matrix operation
    ortho = u.dot(v)
    # Replace original array with modified one
    # i.e. multiplying the embedding matrix (syn0norm)by "ortho"
    other_embed.wv.syn0norm = other_embed.wv.syn0 = (other_embed.wv.syn0norm).dot(ortho)

    return other_embed

In [20]:
wv_sov_to_presov = smart_procrustes_align_gensim(wv_presov, wv_sov)

In [21]:
wv_postsov_to_sov = smart_procrustes_align_gensim(wv_sov, wv_postsov)

In [22]:
wv_pos_sov_to_presov = smart_procrustes_align_gensim(wv_pos_presov, wv_pos_sov)

In [23]:
wv_pos_postsov_to_sov = smart_procrustes_align_gensim(wv_pos_sov, wv_pos_postsov)

In [24]:
ft_sov_to_presov = smart_procrustes_align_gensim(ft_presov, ft_sov)

In [25]:
ft_postsov_to_sov = smart_procrustes_align_gensim(ft_sov, ft_postsov)

#### Получаем нужные векторы

In [26]:
nouns = rse['lemma'].tolist()

In [27]:
noun_dict = {}
for noun in nouns:
    noun_dict[noun] = {}
    # word2vec на леммах
    noun_dict[noun]['wv_presov'] = wv_presov[noun] if noun in wv_presov else None
    noun_dict[noun]['wv_sov'] = wv_sov_to_presov[noun] if noun in wv_sov_to_presov else None
    noun_dict[noun]['wv_postsov'] = wv_postsov_to_sov[noun] if noun in wv_postsov_to_sov else None
    # word2vec с частями речи
    lemma = noun + '_S'
    noun_dict[noun]['wvpos_presov'] = wv_pos_presov[lemma] if lemma in wv_pos_presov else None
    noun_dict[noun]['wvpos_sov'] = wv_pos_sov_to_presov[lemma] if lemma in wv_pos_sov_to_presov else None
    noun_dict[noun]['wvpos_postsov'] = wv_pos_postsov_to_sov[lemma] if lemma in wv_pos_postsov_to_sov else None
    # fasttext
    noun_dict[noun]['ft_presov'] = ft_presov[noun] if noun in ft_presov else None
    noun_dict[noun]['ft_sov'] = ft_sov_to_presov[noun] if noun in ft_sov_to_presov else None
    noun_dict[noun]['ft_postsov'] = ft_postsov_to_sov[noun] if noun in ft_postsov_to_sov else None

In [28]:
noun_clean = {noun: noun_dict[noun] for noun in noun_dict
              if not [item for item in list(noun_dict[noun].values()) if item is None]}

In [29]:
len(noun_clean)

111

#### Рассчитываем изменения

In [30]:
for idx, verb in tqdm(list(noun_clean.items())):
    # word2vec на леммах
    verb['wv_pre_to_sov'] = round(cityblock(verb['wv_presov'], verb['wv_sov']), 3)
    verb['wv_sov_to_post'] = round(cityblock(verb['wv_sov'], verb['wv_postsov']), 3)
    verb['wv_pre_to_post'] = round(cityblock(verb['wv_presov'], verb['wv_postsov']), 3)
    # word2vec с частями речи
    verb['wvpos_pre_to_sov'] = round(cityblock(verb['wvpos_presov'], verb['wvpos_sov']), 3)
    verb['wvpos_sov_to_post'] = round(cityblock(verb['wvpos_sov'], verb['wvpos_postsov']), 3)
    verb['wvpos_pre_to_post'] = round(cityblock(verb['wvpos_presov'], verb['wvpos_postsov']), 3)
    # fasttext
    verb['ft_pre_to_sov'] = round(cityblock(verb['ft_presov'], verb['ft_sov']), 3)
    verb['ft_sov_to_post'] = round(cityblock(verb['ft_sov'], verb['ft_postsov']), 3)
    verb['ft_pre_to_post'] = round(cityblock(verb['ft_presov'], verb['ft_postsov']), 3)

  0%|          | 0/111 [00:00<?, ?it/s]

#### Ранжируем

In [31]:
w2vlem_pre_to_sov = [noun for noun in sorted(noun_clean,
                                             key=lambda x: noun_clean[x]['wv_pre_to_sov'],
                                             reverse=True)]

In [32]:
w2vpos_pre_to_sov = [noun for noun in sorted(noun_clean,
                                             key=lambda x: noun_clean[x]['wvpos_pre_to_sov'],
                                             reverse=True)]

In [33]:
ft_pre_to_sov = [noun for noun in sorted(noun_clean,
                                         key=lambda x: noun_clean[x]['ft_pre_to_sov'],
                                         reverse=True)]

In [34]:
w2vlem_sov_to_post = [noun for noun in sorted(noun_clean,
                                              key=lambda x: noun_clean[x]['wv_sov_to_post'],
                                              reverse=True)]

In [35]:
w2vpos_sov_to_post = [noun for noun in sorted(noun_clean,
                                              key=lambda x: noun_clean[x]['wvpos_sov_to_post'],
                                              reverse=True)]

In [36]:
ft_sov_to_post = [noun for noun in sorted(noun_clean,
                                          key=lambda x: noun_clean[x]['ft_sov_to_post'],
                                          reverse=True)]

In [37]:
w2vlem_pre_to_post = [noun for noun in sorted(noun_clean,
                                              key=lambda x: noun_clean[x]['wv_pre_to_post'],
                                              reverse=True)]

In [38]:
w2vpos_pre_to_post = [noun for noun in sorted(noun_clean,
                                              key=lambda x: noun_clean[x]['wvpos_pre_to_post'],
                                              reverse=True)]

In [39]:
ft_pre_to_post = [noun for noun in sorted(noun_clean,
                                          key=lambda x: noun_clean[x]['ft_pre_to_post'],
                                          reverse=True)]

#### Собираем в датафрейм

In [40]:
ranked_nouns = pd.DataFrame({
    'w2vlem_pre_to_sov': w2vlem_pre_to_sov,
    'w2vpos_pre_to_sov': w2vpos_pre_to_sov,
    'ft_pre_to_sov': ft_pre_to_sov,
    'gold_pre_to_sov': gold_pre_to_sov,
    'w2vlem_sov_to_post': w2vlem_sov_to_post,
    'w2vpos_sov_to_post': w2vpos_sov_to_post,
    'ft_sov_to_post': ft_sov_to_post,
    'gold_sov_to_post': gold_sov_to_post,
    'w2vlem_pre_to_post': w2vlem_pre_to_post,
    'w2vpos_pre_to_post': w2vpos_pre_to_post,
    'ft_pre_to_post': ft_pre_to_post,
    'gold_pre_to_post': gold_pre_to_post
})
ranked_nouns

Unnamed: 0,w2vlem_pre_to_sov,w2vpos_pre_to_sov,ft_pre_to_sov,gold_pre_to_sov,w2vlem_sov_to_post,w2vpos_sov_to_post,ft_sov_to_post,gold_sov_to_post,w2vlem_pre_to_post,w2vpos_pre_to_post,ft_pre_to_post,gold_pre_to_post
0,лох,лох,тупик,лох,лох,лох,дядя,линейка,лох,лох,фаворит,лох
1,монстр,монстр,хрен,радикал,маньяк,монстр,маньяк,полоса,роспись,амбиция,хрен,линейка
2,амбиция,амбиция,бригада,роспись,фаворит,фаворит,ясли,дух,дядька,монстр,маньяк,полоса
3,роспись,маньяк,роспись,ядро,линейка,маньяк,монстр,блин,амбиция,дядька,монстр,ядро
4,маньяк,роспись,эшелон,хрен,монстр,наложение,фаворит,головка,линейка,роспись,бригада,роспись
...,...,...,...,...,...,...,...,...,...,...,...,...
106,земля,земля,проникновение,обоснование,путь,путь,предательство,наволочка,облако,карман,лечение,сверстник
107,путь,путь,лишение,понедельник,стол,стена,стена,издательство,земля,любовь,установление,понедельник
108,любовь,любовь,наложение,стипендия,земля,земля,размышление,понедельник,любовь,стена,проникновение,верховье
109,стена,стена,лечение,сверстник,стена,стол,установление,сверстник,стена,земля,предательство,наволочка


#### Считаем корреляцию с золотым стандартом (RuShiftEval)

In [41]:
spearmanr(w2vlem_pre_to_sov, gold_pre_to_sov)

SpearmanrResult(correlation=-0.004308529308529308, pvalue=0.9642033377767343)

In [42]:
spearmanr(w2vpos_pre_to_sov, gold_pre_to_sov)

SpearmanrResult(correlation=-0.040400140400140394, pvalue=0.6737583232800901)

In [43]:
spearmanr(ft_pre_to_sov, gold_pre_to_sov)

SpearmanrResult(correlation=-0.03484555984555985, pvalue=0.7165486877420728)

In [44]:
spearmanr(w2vlem_sov_to_post, gold_sov_to_post)

SpearmanrResult(correlation=0.03747806247806248, pvalue=0.6961495864808469)

In [45]:
spearmanr(w2vpos_sov_to_post, gold_sov_to_post)

SpearmanrResult(correlation=-0.02733415233415233, pvalue=0.7758154547124656)

In [46]:
spearmanr(ft_sov_to_post, gold_sov_to_post)

SpearmanrResult(correlation=-0.07847490347490348, pvalue=0.4129640066020368)

In [47]:
spearmanr(w2vlem_pre_to_post, gold_pre_to_post)

SpearmanrResult(correlation=-0.13793436293436295, pvalue=0.14882659186922198)

In [48]:
spearmanr(w2vpos_pre_to_post, gold_pre_to_post)

SpearmanrResult(correlation=-0.1158037908037908, pvalue=0.22615067747649165)

In [49]:
spearmanr(ft_pre_to_post, gold_pre_to_post)

SpearmanrResult(correlation=0.004852579852579852, pvalue=0.9596867712809428)