In [1]:
from evaluate import evaluate

from os import path
from sklearn.cluster import AffinityPropagation, KMeans, SpectralClustering, AgglomerativeClustering, KMeans
from sklearn.metrics import silhouette_score

import argparse
import gensim
import logging
import numpy as np
import os
import pandas as pd
import sklearn
import sys
import zipfile

In [2]:
def save_file(df, name):
    out_path = name + "_pred.csv"
    df.to_csv(out_path, sep="\t", encoding="utf-8", index=False)
    return out_path

In [3]:
def get_vectors(text, model):
    all_words = [word for word in text if word in model]
    word_list = list(set(all_words))
    if len(word_list) >= 1:
        vectors = np.zeros((len(word_list), model.vector_size))
        for i, word in enumerate(word_list):
            vectors[i, :] = model[word]
        context_vectors = np.divide(np.sum(vectors, axis=0), len(word_list))
    else:
        context_vectors = np.zeros(model.vector_size)
    return context_vectors

In [4]:
def load_rusvectores(path):
    if not os.path.isfile('180.zip'):
        !wget http://vectors.nlpl.eu/repository/20/180.zip
    if not os.path.isfile('182.zip'):
        !wget http://vectors.nlpl.eu/repository/20/182.zip
    # Load model
    with zipfile.ZipFile(path, 'r') as archive:
        stream = archive.open('model.bin')
        model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)
    # Initialize
    model.init_sims(replace=True)
    return model

In [5]:
def optimal_clusters_silhoette(X):
    range_n_clusters = range(2,21)
    silhouette_avg = []
    for n_clusters in range_n_clusters:
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)
        scor = silhouette_score(X, cluster_labels)
        if scor > 0:
            silhouette_avg.append(scor)
        else:
            silhouette_avg.append(100)
    print(silhouette_avg)
    return np.argmin(silhouette_avg) + 2

In [6]:
def main(parameters_dict: dict):
    #Load RusVectores model
    vec_model = load_rusvectores(parameters_dict['model'])
    # Load dataset
    data = pd.read_csv(parameters_dict['inputs'], sep="\t", encoding="utf-8")
    #Get vectors for every word
    all_labels = []
    for query in data['word'].unique():
        print(query)
        word_set = data[data['word'] == query]
        matrix = np.empty((word_set.shape[0], vec_model.vector_size))
        for i, line in enumerate(word_set.iterrows()):
            line_context = line[1].context
            if type(line_context) != str:
                con_vectors = np.zeros(vec_model.vector_size)
            else:
                line_words = line_context.split()
                line_words = [w for w in line_words if w != query]
                con_vectors = get_vectors(line_words, vec_model)
            matrix[i, :] = con_vectors
            i += 1
        # Use AffinityPropagation to determine optimal number of clusters
        cluster_algo = AffinityPropagation(
            max_iter = 350,
            convergence_iter = 60,
            preference=parameters_dict['preference'], 
            damping=parameters_dict['damping'], 
            random_state=42
        ).fit(matrix)
        n_iter_first = cluster_algo.n_iter_
        print(n_iter_first)
        nclusters = len(cluster_algo.cluster_centers_indices_)
        if nclusters == 0:
            # If no cluster assign 1
            nclusters = 1
        elif nclusters == word_set.shape[0]:
            # If too much clusters assign some median
            nclusters = 3
        # Recluster second time
        cluster_algo = SpectralClustering(n_clusters=nclusters, n_init=30,
                                        assign_labels='discretize', n_jobs=-1).fit(matrix)
        #cluster_algo = AgglomerativeClustering(n_clusters=nclusters, linkage="complete").fit(matrix)
        if not parameters_dict['testing']:
            clust_num = word_set.gold_sense_id
            print('Number of real clusters:', len(set(clust_num)))
        pred_labels = cluster_algo.labels_.tolist()
        all_labels += pred_labels
        print('Number of predicted clusters', len(set(pred_labels)))
    # Back to dataframe
    data.predict_sense_id = all_labels
    fname = path.splitext(path.basename(parameters_dict['inputs']))[0]
    if parameters_dict['testing']:
        data_test = pd.read_csv(path.dirname(parameters_dict['inputs']) + '/test.csv', sep="\t", encoding="utf-8")
        data_test.predict_sense_id = data.predict_sense_id
        save_file(data_test, fname)
    else:
        res = evaluate(save_file(data, fname))
        print('ARI', res)


## Active Dictionary

In [7]:
args = {
        'inputs':'data/active-dict/train_pos.csv',
        'model':'180.zip', #182.zip,#'ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz',
        'testing': False,
        'preference': -0.65,
        'damping': 0.8
       }
main(args)

дар_NOUN
64
Number of real clusters: 3
Number of predicted clusters 3
двигатель_NOUN
85
Number of real clusters: 2
Number of predicted clusters 4
двойник_NOUN
65
Number of real clusters: 4
Number of predicted clusters 3
дворец_NOUN
74
Number of real clusters: 2
Number of predicted clusters 4
девятка_NOUN
73
Number of real clusters: 9
Number of predicted clusters 7
дедушка_NOUN
69
Number of real clusters: 2
Number of predicted clusters 2
дежурная_ADJ
70
Number of real clusters: 2
Number of predicted clusters 2
дежурный_NOUN
66
Number of real clusters: 2
Number of predicted clusters 2
декабрист_NOUN
94
Number of real clusters: 2
Number of predicted clusters 2
декрет_NOUN
78
Number of real clusters: 2
Number of predicted clusters 2
дело_NOUN
208
Number of real clusters: 17
Number of predicted clusters 19
демобилизация_NOUN
71
Number of real clusters: 2
Number of predicted clusters 3
демократ_NOUN
71
Number of real clusters: 3
Number of predicted clusters 4
демонстрация_NOUN
78
Number of r

In [8]:
args = {
        'inputs':'data/active-dict/test_pos.csv',
        'model':'180.zip',#'ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz',
        'testing': True,
        'preference': -0.65,
        'damping': 0.8
       }
main(args)

давление_NOUN
69
Number of predicted clusters 3
дама_NOUN
86
Number of predicted clusters 2
данные_NOUN
64
Number of predicted clusters 6
дата_NOUN
72
Number of predicted clusters 2
двойка_NOUN
62
Number of predicted clusters 4
двор_NOUN
69
Number of predicted clusters 4
дворник_NOUN
80
Number of predicted clusters 1
девка_NOUN
65
Number of predicted clusters 13
девочка_NOUN
82
Number of predicted clusters 5
девушка_NOUN
68
Number of predicted clusters 5
девчонка_NOUN
68
Number of predicted clusters 7
дед_NOUN
72
Number of predicted clusters 3
дезертир_NOUN
70
Number of predicted clusters 3
действие_NOUN
70
Number of predicted clusters 10
действительность_NOUN
69
Number of predicted clusters 2
декларация_NOUN
76
Number of predicted clusters 3
декорация_NOUN
88
Number of predicted clusters 3
делегат_NOUN
67
Number of predicted clusters 1
деление_NOUN
96
Number of predicted clusters 7
дельфин_NOUN
66
Number of predicted clusters 4
демократия_NOUN
67
Number of predicted clusters 2
день_NO

### BTC-RNC

In [9]:
args = {
        'inputs':'data/bts-rnc/train_pos.csv',
        'model':'180.zip',#'ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz',
        'testing': False,
        'preference': -0.7,
        'damping': 0.6
       }
main(args)

балка_NOUN
67
Number of real clusters: 2
Number of predicted clusters 4
вид_NOUN
66
Number of real clusters: 3
Number of predicted clusters 2
винт_NOUN
67
Number of real clusters: 4
Number of predicted clusters 5
горн_NOUN
67
Number of real clusters: 3
Number of predicted clusters 2
губа_NOUN
69
Number of real clusters: 3
Number of predicted clusters 4
жаба_NOUN
67
Number of real clusters: 4
Number of predicted clusters 4
клетка_NOUN
68
Number of real clusters: 6
Number of predicted clusters 4
крыло_NOUN
72
Number of real clusters: 8
Number of predicted clusters 4
купюра_NOUN
74
Number of real clusters: 2
Number of predicted clusters 3
курица_NOUN
66
Number of real clusters: 2
Number of predicted clusters 3
лавка_NOUN
67
Number of real clusters: 2
Number of predicted clusters 4
лайка_NOUN
65
Number of real clusters: 2
Number of predicted clusters 3
лев_NOUN
65
Number of real clusters: 4
Number of predicted clusters 1
лира_NOUN
62
Number of real clusters: 2
Number of predicted clusters 

In [10]:
args = {
        'inputs':'data/bts-rnc/train_pos.csv',
        'model':'182.zip',#'ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz',
        'testing': False,
        'preference': -0.65,
        'damping': 0.8
       }
main(args)

балка_NOUN
77
Number of real clusters: 2
Number of predicted clusters 2
вид_NOUN
65
Number of real clusters: 3
Number of predicted clusters 2
винт_NOUN
107
Number of real clusters: 4
Number of predicted clusters 3
горн_NOUN
65
Number of real clusters: 3
Number of predicted clusters 1
губа_NOUN
70
Number of real clusters: 3
Number of predicted clusters 2
жаба_NOUN
78
Number of real clusters: 4
Number of predicted clusters 2
клетка_NOUN
72
Number of real clusters: 6
Number of predicted clusters 3
крыло_NOUN
78
Number of real clusters: 8
Number of predicted clusters 2
купюра_NOUN
105
Number of real clusters: 2
Number of predicted clusters 2
курица_NOUN
70
Number of real clusters: 2
Number of predicted clusters 3
лавка_NOUN
65
Number of real clusters: 2
Number of predicted clusters 2
лайка_NOUN
70
Number of real clusters: 2
Number of predicted clusters 2
лев_NOUN
65
Number of real clusters: 4
Number of predicted clusters 1
лира_NOUN
67
Number of real clusters: 2
Number of predicted cluster

In [11]:
args = {
        'inputs':'data/bts-rnc/test_pos.csv',
        'model':'182.zip',#'ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz',
        'testing': True,
        'preference': -0.45,
        'damping': 0.8
       }
main(args)

акция_NOUN
81
Number of predicted clusters 4
баба_NOUN
65
Number of predicted clusters 2
байка_NOUN
67
Number of predicted clusters 4
бум_NOUN
77
Number of predicted clusters 2
бычок_NOUN
90
Number of predicted clusters 3
вал_VERB
75
Number of predicted clusters 2
газ_NOUN
80
Number of predicted clusters 4
гвоздик_NOUN
78
Number of predicted clusters 4
гипербол_NOUN
92
Number of predicted clusters 3
град_NOUN
81
Number of predicted clusters 5
гусеница_NOUN
105
Number of predicted clusters 4
дождь_NOUN
86
Number of predicted clusters 3
домино_NOUN
77
Number of predicted clusters 5
забой_NOUN
91
Number of predicted clusters 3
икра_NOUN
93
Number of predicted clusters 4
кабачок_NOUN
80
Number of predicted clusters 5
капот_NOUN
82
Number of predicted clusters 5
карьера_NOUN
74
Number of predicted clusters 4
кличка_NOUN
66
Number of predicted clusters 3
ключ_NOUN
99
Number of predicted clusters 4
кок_NOUN
65
Number of predicted clusters 1
кольцо_NOUN
89
Number of predicted clusters 4
концер

### WIKI-WIKI

In [12]:
args = {
        'inputs':'data/wiki-wiki/train_pos.csv',
        'model':'182.zip',#'ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz',
        'testing': False,
        'preference': -0.85,
        'damping': 0.9
       }
main(args)

замок_NOUN
99
Number of real clusters: 2
Number of predicted clusters 3
лук_NOUN
91
Number of real clusters: 2
Number of predicted clusters 2
суд_NOUN
108
Number of real clusters: 2
Number of predicted clusters 3
бор_NOUN
97
Number of real clusters: 2
Number of predicted clusters 2
word	ari	count
бор_NOUN	0.925003	56
замок_NOUN	0.454188	138
лук_NOUN	0.759285	110
суд_NOUN	0.478780	135
	0.598257	439
ARI 0.59825666820205


In [13]:
args = {
        'inputs': 'data/wiki-wiki/test_pos.csv',
        'model': '182.zip',#'ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz',
        'testing': True,
        'preference': -0.85,
        'damping': 0.9
       }
main(args)

банк_NOUN
94
Number of predicted clusters 3
белок_NOUN
82
Number of predicted clusters 4
бит_NOUN
83
Number of predicted clusters 2
горе_NOUN
86
Number of predicted clusters 1
граф_NOUN
79
Number of predicted clusters 2
душа_NOUN
88
Number of predicted clusters 1
