In [1]:
import numpy as np
import pandas as pd

import re
import nltk
import gensim
from pymystem3 import Mystem

from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD



In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import sys
sys.path.append("../modules/simple_elmo/")

from simple_elmo import ElmoModel

### load data

In [4]:
data = pd.read_csv(
    "../data/russe-wsi-kit/data/additional/active-rutenten/train.csv",
    sep="\t",
)

In [5]:
data.head()

Unnamed: 0,context_id,word,gold_sense_id,predict_sense_id,positions,context
0,1,альбом,2,,88-94,достаточно лишь колесиком мышки крутить вниз. ...
1,2,альбом,3,,85-91,"выступал в составе команды с таким названием, ..."
2,3,альбом,2,,81-87,". Работает так себе, поскольку функция заточен..."
3,4,альбом,3,,84-89,одержала победу в двух из пяти номинаций: 'Луч...
4,5,альбом,3,,83-88,встречи с Божественным. Вы испытаете ни с чем ...


### init model

In [6]:
stemmer = Mystem()

embedder = gensim.models.KeyedVectors.load_word2vec_format(
    "../modules/ruscorpora_mean_hs.model.bin.gz",
    binary=True,
)

In [7]:
stopwords = nltk.corpus.stopwords.words("russian")

### find senses with ELMo

In [8]:
def parse_context(
    context,
    target_word=None,
    re_pattern="[\w\-]+$",
    stopwords=stopwords,
    stemmer=stemmer,
):
    
    target_word_lemma = stemmer.lemmatize(target_word)[0]
    context = [
        lemma for lemma in stemmer.lemmatize(context) \
        if re.match('[\w\-]+$', lemma) and (lemma not in stopwords)
    ]
    
    target_word_idx = [i for i, lemma in enumerate(context) if lemma == target_word_lemma]
    
    return context, target_word_idx

In [9]:
parsed_context = data.apply(
    lambda row: parse_context(
        context=row["context"],
        target_word=row["word"],
    ),
    axis=1,
)

In [10]:
sentences = parsed_context.apply(lambda x: x[0])
target_word_idx = parsed_context.apply(lambda x: x[1])

In [11]:
sentences.head()

0    [достаточно, лишь, колесико, мышка, крутить, в...
1    [выступать, состав, команда, название, однако,...
2    [работать, поскольку, функция, заточить, банал...
3    [одерживать, победа, пять, номинация, хороший,...
4    [встреча, божественный, испытывать, сравнимый,...
dtype: object

In [12]:
target_word_idx.head()

0    [9]
1    [7]
2    [7]
3    [8]
4    [6]
dtype: object

In [13]:
target_word_idx.apply(len).value_counts()

1    3117
2     458
3      68
0      20
4       7
5       1
dtype: int64

### ELMo

In [14]:
model = ElmoModel()

In [15]:
model.load("../modules/ruwikiruscorpora_lemmas_elmo_1024_2019/")

2021-07-03 07:09:41,844 : INFO : Loading model from ../modules/ruwikiruscorpora_lemmas_elmo_1024_2019/...
2021-07-03 07:09:41,849 : INFO : We will cache the vocabulary of 100 tokens.


'The model is now loaded.'

In [16]:
sentence_embeddings = model.get_elmo_vectors(texts=sentences)

2021-07-03 07:09:54,757 : INFO : Warming up ELMo on 32 sentences...
2021-07-03 07:09:56,322 : INFO : Warming up finished.
2021-07-03 07:09:56,328 : INFO : Texts in the current batch: 32
2021-07-03 07:09:57,323 : INFO : Texts in the current batch: 32
2021-07-03 07:09:58,352 : INFO : Texts in the current batch: 32
2021-07-03 07:09:59,305 : INFO : Texts in the current batch: 32
2021-07-03 07:10:00,276 : INFO : Texts in the current batch: 32
2021-07-03 07:10:01,327 : INFO : Texts in the current batch: 32
2021-07-03 07:10:02,259 : INFO : Texts in the current batch: 32
2021-07-03 07:10:03,183 : INFO : Texts in the current batch: 32
2021-07-03 07:10:04,102 : INFO : Texts in the current batch: 32
2021-07-03 07:10:05,085 : INFO : Texts in the current batch: 32
2021-07-03 07:10:06,211 : INFO : Texts in the current batch: 32
2021-07-03 07:10:07,169 : INFO : Texts in the current batch: 32
2021-07-03 07:10:08,227 : INFO : Texts in the current batch: 32
2021-07-03 07:10:09,272 : INFO : Texts in the 

In [17]:
sentence_embeddings.shape

(3671, 42, 1024)

In [18]:
sentence_embeddings_average = sentence_embeddings.mean(axis=1)

sentence_embeddings_target_word = np.zeros((
    sentence_embeddings.shape[0],
    sentence_embeddings.shape[2]
))

for i in range(sentence_embeddings.shape[0]):
    if len(target_word_idx[i]) > 0:
        sentence_embeddings_target_word[i, :] = sentence_embeddings[i][target_word_idx[i]].mean(axis=0)
    else:
        sentence_embeddings_target_word[i, :] = sentence_embeddings[i].mean(axis=0)

In [19]:
sentence_embeddings_average.shape  # ari: 0.099504

(3671, 1024)

In [20]:
sentence_embeddings_target_word.shape  # ari: 0.081278

(3671, 1024)

### clustering

In [21]:
n_clusters = 3
n_components_svd = 15

In [22]:
prediction = []

for indices in data.groupby("word").indices.values():
    cluster = KMeans(
        n_clusters=n_clusters,
        random_state=0,
    )
    svd = TruncatedSVD(
        n_components=n_components_svd,
        algorithm="arpack",
        random_state=42,
    )
    prediction.extend(
        cluster.fit_predict(
            svd.fit_transform(
                sentence_embeddings_average[indices]
            )
        )
    )

In [23]:
data["predict_sense_id"] = prediction

In [24]:
data.to_csv(
    "predictions/elmo_prediction.tsv",
    sep="\t",
    index=False,
)

### valudate

In [25]:
!python3 ../data/russe-wsi-kit/evaluate.py predictions/elmo_prediction.tsv

word	ari	count
альбом	0.113449	450
анатомия	0.012595	95
базар	0.018194	90
балет	-0.006860	94
беда	0.004462	93
бездна	0.040665	87
билет	-0.011528	447
блок	0.214035	206
блоха	-0.009533	86
брак	0.060003	96
бритва	-0.004478	85
будущее	-0.006368	83
вешалка	0.014040	390
вилка	0.307757	302
винт	0.223169	358
галерея	0.064159	24
горбуша	0.336936	93
горшок	0.120861	406
гроза	-0.058905	95
группа	0.114872	91
	0.099504	3671
