In [18]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pylab as plot
import sys
from collections import defaultdict
from navec import Navec
from os import path
from pandas import read_csv
from sklearn.cluster import AgglomerativeClustering, Birch
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.decomposition import PCA

In [5]:
path = "C:/Users/mosei/Downloads/navec_hudlit_v1_12B_500K_300d_100q.tar"
navec = Navec.load(path)

In [6]:
# Сброс ограничений на количество выводимых рядов
pd.set_option('display.max_rows', None)

# Сброс ограничений на число столбцов
pd.set_option('display.max_columns', None)
 
# Сброс ограничений на количество символов в записи
pd.set_option('display.max_colwidth', None)

In [7]:
def gold_predict(df):
    """ This method assigns the gold and predict fields to the data frame. """

    df = df.copy()

    df['gold'] = df['word'] + '_' + df['gold_sense_id']
    df['predict'] = df['word'] + '_' + df['predict_sense_id']

    return df

def ari_per_word_weighted(df):
    """ This method computes the ARI score weighted by the number of sentences per word. """

    df = gold_predict(df)

    words = {word: (adjusted_rand_score(df_word.gold, df_word.predict), len(df_word))
             for word in df.word.unique()
             for df_word in (df.loc[df['word'] == word],)}

    cumsum = sum(ari * count for ari, count in words.values())
    total = sum(count for _, count in words.values())

    assert total == len(df), 'please double-check the format of your data'

    return cumsum / total, words

def evaluate(dataset_fpath):
    df = read_csv(dataset_fpath, sep='\t', dtype={'gold_sense_id': str, 'predict_sense_id': str})
    ari, words = ari_per_word_weighted(df)
    vocab.append(words)
    print('{}\t{}\t{}'.format('word', 'ari', 'count'))

    for word in sorted(words.keys()):
        print('{}\t{:.6f}\t{:d}'.format(word, *words[word]))

    print('\t{:.6f}\t{:d}'.format(ari, len(df)))
    return ari

def save(df, corpus):
    """
    :param df: Data Frame with predictions
    :param corpus: dataset name
    :return: path to the saved file
    """
    output_fpath = corpus + "_predictions.csv"
    df.to_csv(output_fpath, sep="\t", encoding="utf-8", index=False)
    print("Generated dataset: {}".format(output_fpath))
    return output_fpath

In [13]:
PATH_BTS='C:/Users/mosei/OneDrive/Desktop/diploma/RUSSE_data/bts-rnc/'
dataset = pd.read_csv(PATH_BTS +'train.csv', sep='\t')

In [9]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mosei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data processing

In [10]:
def average_vector(context):
    """
    paramtres:
        context -- list of words
    output: 
        average vector of word embeddings of words in context
    """
    vocab = set(context)
    # length_con = len(context)
    vectors = []
    for word in vocab:
        # tf = context.count(word) / length_con
        if word in navec:
            vectors.append(navec[word]) # np.linalg.norm((tf * idf[word]) * navec[word])
    if not vectors:
        vectors.append(navec['<pad>'])
    vectors = np.array(vectors)
    rows_num = np.shape(vectors)[0]
    average_vector = np.sum(vectors, axis=0) / rows_num
    return average_vector

In [21]:
def number_of_clusters(X_array):
    clustering = [AgglomerativeClustering(n_clusters=i) for i in range(2, 9, 1)]
    k = [2, 3, 4, 5, 6, 7, 8]
    scores = [] 
    for j in range(7):
        scores.append(silhouette_score(X_array, clustering[j].fit_predict(X_array)))
    return k[np.argmax(scores)]

In [12]:
# Тут хочется взять некоторую окрестность слов вокруг целевого
# Сохраню на память: bad_words = {'горн': 'горный', 'жаба': 'жабой', 'крыло': 'крыть', 'курица': 'кура', 'кура': 'курам'}
def remove_target_word(context, positions):
    positions = [i.split('-') for i in positions.split(',')]
    for position in positions:
        start = int(position[0])
        end = int(position[1])
        return context.replace(context[start:end+1], 'target')

from natasha import (
    MorphVocab, 
    Doc,
    Segmenter,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsEmbedding
)

emb = NewsEmbedding()
segmenter = Segmenter()
morph_vocab = MorphVocab()
morph_tagger = NewsMorphTagger(emb)

def context_lemmatization(text, window_size, min_len): # (text, min_len, remove_word)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:    
        token.lemmatize(morph_vocab)
    list_of_words = [word.lemma for word in doc.tokens if (word.pos != 'PUNCT' 
                                                  and len(word.lemma) > min_len
                                                  and word.lemma not in stopwords.words('russian'))]
    size_list = len(list_of_words)
    if 'target' not in list_of_words:
        print(list_of_words)
    target_index = list_of_words.index('target')
    inf = 0 if (target_index - window_size <= 0) else (target_index - window_size)
    sup = size_list if (target_index + window_size >= size_list) else target_index + window_size
    list_of_words = list_of_words[inf:sup]
    list_of_words.remove('target')
    return list_of_words

In [14]:
# Удалим целевое слово
dataset['context'] = dataset.apply(lambda x: remove_target_word(x.context, x.positions), axis=1)
# Убираем вообще все, что не буквы и не цифры
dataset['context'] = dataset['context'].replace("\W", ' ', regex=True)
# Цифры тоже убираем
# dataset['context'] = dataset['context'].replace("\d", ' ', regex=True)
# Количество пробелов подряд не больше одного
dataset['context'] = dataset['context'].replace(r'\s+', ' ', regex=True)

dataset['context'] = dataset.apply(lambda x: context_lemmatization(x.context, min_len=1, window_size=5), axis=1)
# (lambda x: context_lemmatization(x.context, 2, x.word), axis=1)

In [26]:
%%time
vocab = []
predicted = []
goldsenses = []
model_vector_size = 300
testing = False
pca = PCA(n_components=10)
for word in dataset.word.unique():
    print('Now analyzing', word, '...', file=sys.stderr)
    subset = dataset[dataset.word == word]
    goldsenses.append(len(subset.gold_sense_id.unique()))
    contexts = []
    matrix = np.empty((subset.shape[0], model_vector_size))
    counter = 0
    lengths = []
    for line in subset.iterrows():
        text = line[1].context
        identifier = line[1].context_id
        label = word + str(identifier)
        contexts.append(label)
        fp = average_vector(text)
        lengths.append(len(text))
        matrix[counter, :] = fp
        counter += 1
    matrix = pca.fit_transform(matrix)
    matrix = matrix.copy(order='C')
    clustering = Birch(n_clusters=number_of_clusters(matrix)).fit(matrix)
    cur_predicted = clustering.labels_.tolist()
    predicted += cur_predicted
    gold = subset.gold_sense_id
    print('Gold clusters:', len(set(gold)), file=sys.stderr)
    print('Predicted clusters:', len(set(cur_predicted)), file=sys.stderr)     

dataset.predict_sense_id = predicted
fname = PATH_BTS
save(dataset, fname)

res = evaluate(save(dataset, fname))
print('ARI:', res)
print('Average number of senses:', np.average(goldsenses))
print('Variation of the number of senses:', np.std(goldsenses))
print('Minimum number of senses:', np.min(goldsenses))
print('Maximum number of senses:', np.max(goldsenses))

Now analyzing балка ...
Gold clusters: 2
Predicted clusters: 8
Now analyzing вид ...
Gold clusters: 3
Predicted clusters: 2
Now analyzing винт ...
Gold clusters: 4
Predicted clusters: 4
Now analyzing горн ...
Gold clusters: 3
Predicted clusters: 3
Now analyzing губа ...
Gold clusters: 3
Predicted clusters: 5
Now analyzing жаба ...
Gold clusters: 4
Predicted clusters: 7
Now analyzing клетка ...
Gold clusters: 6
Predicted clusters: 2
Now analyzing крыло ...
Gold clusters: 8
Predicted clusters: 8
Now analyzing купюра ...
Gold clusters: 2
Predicted clusters: 2
Now analyzing курица ...
Gold clusters: 2
Predicted clusters: 8
Now analyzing лавка ...
Gold clusters: 2
Predicted clusters: 4
Now analyzing лайка ...
Gold clusters: 2
Predicted clusters: 2
Now analyzing лев ...
Gold clusters: 4
Predicted clusters: 2
Now analyzing лира ...
Gold clusters: 2
Predicted clusters: 7
Now analyzing мина ...
Gold clusters: 3
Predicted clusters: 7
Now analyzing мишень ...
Gold clusters: 2
Predicted clusters: 

Generated dataset: C:/Users/mosei/OneDrive/Desktop/diploma/RUSSE_data/bts-rnc/_predictions.csv
Generated dataset: C:/Users/mosei/OneDrive/Desktop/diploma/RUSSE_data/bts-rnc/_predictions.csv
word	ari	count
балка	0.113004	119
вид	0.192953	77
винт	0.344404	123
горн	0.222965	51
губа	0.056219	137
жаба	0.130896	121
клетка	0.300575	150
крыло	0.152255	91
купюра	0.131664	150
курица	0.085596	93
лавка	0.139739	149
лайка	0.352284	99
лев	-0.018000	44
лира	0.126502	49
мина	0.027411	65
мишень	0.346067	121
обед	0.031373	100
оклад	0.031408	146
опушка	1.000000	148
полис	-0.031379	142
пост	0.354477	144
поток	0.083191	136
проказа	0.182760	146
пропасть	0.050716	127
проспект	0.031824	139
пытка	-0.063925	143
рысь	0.745667	120
среда	0.213365	144
хвост	0.085758	121
штамп	0.405176	96
	0.202950	3491
ARI: 0.20295026911222272
Average number of senses: 3.2
Variation of the number of senses: 1.4
Minimum number of senses: 2
Maximum number of senses: 8
CPU times: total: 24.9 s
Wall time: 4.96 s


Gold clusters: 4
Predicted clusters: 2
