In [21]:
from __future__ import print_function

import pandas as pd
import numpy as np
import re
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans, MiniBatchKMeans
from pymystem3 import Mystem
import tqdm

%matplotlib inline

# Data

### The table below summarizes the datasets

|Dataset|Type|Inventory|Corpus|Split|Num. of words|Num. of senses|Avg. num. of senses|Num. of contexts|
|-----|-----|---------|-----|------|:---------:|:----------:|:----------:|:----------:|
|wiki-wiki|main|Wikipedia|Wikipedia|train|4|8|2.0|439
|bts-rnc|main|Gramota.ru|RNC|train|30|96|3.2|3491
|active-dict|main|Active Dict.|Active Dict.|train|85|312|3.7|2073
|active-rnc|additional|Active Dict.|RNC|train|20|71|3.6|1829
|active-rutenten|additional|Active Dict.|ruTenTen|train|21|71|3.4|3671
|bts-rutenten|additional|Gramota.ru|ruTenTen|train|11|25|2.3|956

In [20]:
# Load the tab-separated wiki-wiki training split. Both sense-id columns are
# forced to `str` so pandas does not infer a numeric dtype for them.
data = pd.read_csv(
    "../data/main/wiki-wiki/train.csv",
    sep='\t',
    dtype={'gold_sense_id': str, 'predict_sense_id': str},
)
data.head()

Unnamed: 0,context_id,word,gold_sense_id,predict_sense_id,positions,context
0,1,замок,1,,,замок владимира мономаха в любече . многочисле...
1,2,замок,1,,,"шильонский замок замок шильйон ( ) , известный..."
2,3,замок,1,,,проведения архитектурно - археологических рабо...
3,4,замок,1,,,"топи с . , л . белокуров легенда о завещании м..."
4,5,замок,1,,,великий князь литовский гедимин после успешной...


### Show how many contexts correspond to each sense of each word

In [27]:
# Count the contexts available for every (word, gold sense) pair.
# `.loc` replaces the `.ix` indexer, which was deprecated in pandas 0.20 and
# removed in pandas 1.0, so the original line fails on current pandas.
data.loc[:, ['word', 'gold_sense_id', 'context']].groupby(['word', 'gold_sense_id']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,context
word,gold_sense_id,Unnamed: 2_level_1
бор,1,14
бор,2,42
замок,1,100
замок,2,38
лук,1,65
лук,2,45
суда,1,100
суда,2,35


# TF-IDF + AffinityPropagation baseline

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import AffinityPropagation

In [22]:
mystem = Mystem()

def lemmatized_context(s):
    """Lemmatize a context string and keep only word-like tokens.

    Parameters
    ----------
    s : str
        Context text handed to ``mystem.lemmatize``.

    Returns
    -------
    list of str
        Lower-cased items from the lemmatizer output that are made up of
        word characters and hyphens; every other item is dropped.
    """
    # Raw string: in a plain literal '\w' and '\-' are invalid escape
    # sequences, which newer Python versions warn about. The pattern itself
    # is unchanged, so the filtering behaves exactly as before.
    return [w.lower() for w in mystem.lemmatize(s) if re.match(r'[\w\-]+$', w)]

Installing mystem to /home/fogside/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz


In [41]:
def get_train_test4one_word(word, data=data, test_split_coeff = 0.3):
    """Split the lemmatized contexts of one target word into train and test parts.

    Parameters
    ----------
    word : str
        Value of the ``word`` column whose rows should be selected.
    data : pandas.DataFrame
        Frame with ``word``, ``context`` and ``context_id`` columns. The
        default is the module-level ``data`` object as it existed when this
        function was defined.
    test_split_coeff : float
        Passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the ``X_*`` parts are
        lists of lemmatized contexts (each a list of tokens) and the ``y_*``
        parts are the matching ``context_id`` values.
    """
    # Filter once and use plain column access: the `.ix` indexer the original
    # relied on was deprecated in pandas 0.20 and removed in pandas 1.0.
    word_rows = data.loc[data.word == word]
    all_contexts = [lemmatized_context(s) for s in word_rows['context']]
    context_ids = list(word_rows['context_id'])
    # Fixed random_state keeps the split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(all_contexts, context_ids,
                                                        test_size=test_split_coeff, random_state=42)
    return X_train, X_test, y_train, y_test

In [42]:
# Shared TF-IDF vectorizer for the baseline.
# NOTE(review): the contexts previewed above are Russian, so the built-in
# 'english' stop-word list presumably filters very little - confirm intent.
vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=2,
    stop_words='english',
)

In [43]:
# Distinct target words present in the loaded split.
words = data['word'].unique()
print(words)

['замок' 'лук' 'суда' 'бор']


In [None]:
for word in words:
    X_train, X_test, y_train, y_test = get_train_test4one_word(word)
    # Each element of X_train is a list of lemmas, but the vectorizer's default
    # analyzer expects one string per document, so passing the token lists
    # directly raises AttributeError. Re-join the lemmas with spaces first.
    train_docs = [' '.join(tokens) for tokens in X_train]
    # NOTE(review): train_mtx is overwritten on every iteration; only the
    # matrix for the last word survives the loop.
    train_mtx = vectorizer.fit_transform(train_docs)

In [None]:
# Fill the prediction column by disambiguating every (word, context) row,
# with a tqdm progress bar sized to the frame length.
# NOTE(review): `df`, `model` and `disambiguate` are not defined in any cell
# visible here - confirm they exist before running this cell.
df['predict_sense_id'] = [
    disambiguate(model, target_word, ctx)
    for target_word, ctx in tqdm.tqdm(
        zip(df['word'], df['context']),
        total=len(df),
    )
]

In [None]:
# Report clustering quality of `km.labels_` against the reference `labels`.
# NOTE(review): `labels`, `km` and `X` are not defined in any cell visible
# here - confirm they exist before running this cell.
for report_fmt, score_fn in (
        ("Homogeneity: %0.3f", metrics.homogeneity_score),
        ("Completeness: %0.3f", metrics.completeness_score),
        ("V-measure: %0.3f", metrics.v_measure_score),
        ("Adjusted Rand-Index: %.3f", metrics.adjusted_rand_score),
):
    print(report_fmt % score_fn(labels, km.labels_))

# The silhouette score is computed from the feature matrix rather than the
# reference labels, so it is reported separately.
silhouette = metrics.silhouette_score(X, km.labels_, sample_size=1000)
print("Silhouette Coefficient: %0.3f" % silhouette)

# Evaluation

### Baseline evaluation

In [18]:
# Run the repository's evaluate.py script on the baseline predictions file;
# the output below lists an ARI value and a context count per word.
# NOTE(review): train.baseline-adagram.csv is not written by any cell visible
# in this notebook - presumably produced elsewhere; confirm.
!python3 ../evaluate.py ../data/main/wiki-wiki/train.baseline-adagram.csv

word	ari	count
бор	0.591175	56
замок	0.495386	138
лук	0.637076	110
суда	0.005465	135
	0.392449	439
