Mix labels of several trained models:

* Baseline Ada
* LibRu Ada
* BigDataset Ada
* SensGram wiki+contexts

In [1]:
import numpy as np
import pandas as pd

In [3]:
from testing_interface import make_data,\
                              visualize_pca,\
                              evaluate_weighted_ari,\
                              get_one_word_data,\
                              visualize_pca_one_word,\
                              visualize_tsne_one_word

DATASET = '../data/main/wiki-wiki/train.csv'

contexts, main_words, labels, word_list_uniq = make_data(df_file_name=DATASET,\
                 use_gensim_simple_preproc=False, use_mystem=True,\
                 throw_main_word=False, tokenize=True)

<img src="pics/table.jpg">

# Simple Rule {MAX}:

take labels with max number of clusters

In [19]:
from os.path import join

In [2]:
ls -lh ../data/main/wiki-wiki/

total 4,0M
-rw-r--r-- 1 fogside fogside 515K дек 20 11:53 [0m[00mtest.csv[0m
-rw-r--r-- 1 fogside fogside 348K ноя  7 15:48 [00mtrain.baseline-adagram.csv[0m
-rw-r--r-- 1 fogside fogside 350K ноя 21 16:40 [00mtrain.constant.csv[0m
-rw-r--r-- 1 fogside fogside 358K янв 13 20:52 [00mtrain.csv[0m
-rw-r--r-- 1 fogside fogside 361K янв 22 17:08 [00mtrain.csv_bigText.csv[0m
-rw-r--r-- 1 fogside fogside 361K янв 22 17:01 [00mtrain.csv_libru.csv[0m
-rw-r--r-- 1 fogside fogside 364K янв 22 17:25 [00mtrain.csv_sensegram.csv[0m
-rw-r--r-- 1 fogside fogside 350K ноя 21 16:40 [00mtrain.oracle.csv[0m
-rw-r--r-- 1 fogside fogside 350K ноя 21 16:40 [00mtrain.random-1-3.csv[0m
-rw-r--r-- 1 fogside fogside 350K ноя 21 16:40 [00mtrain.random-1-6.csv[0m
-rw-r--r-- 1 fogside fogside 350K ноя 21 16:40 [00mtrain.unique-dataset-wise.csv[0m


In [62]:
def check_dataset(dataset_folder):
    DATASET = join(dataset_folder+'train.csv')
    
    
    contexts, main_words, labels, word_list_uniq = make_data(df_file_name=DATASET,\
                     use_gensim_simple_preproc=False, use_mystem=True,\
                     throw_main_word=False, tokenize=True)
    
    libru = pd.read_csv(join(dataset_folder, "train.csv_libru.csv"))
    bigText = pd.read_csv(join(dataset_folder,"train.csv_bigText.csv"))
    baseline = pd.read_csv(join(dataset_folder,"train.baseline-adagram.csv"), sep='\t')
    sensegram = pd.read_csv(join(dataset_folder,"train.csv_sensegram.csv"))
    
    datasets = [libru, bigText, baseline, sensegram]
    
    prediction = []

    for w in word_list_uniq:
        clust_num = []
        for df in datasets:
            n = len(set(df[df.word == w].predict_sense_id.values))
            clust_num.append(n)
        chosen = np.argmax(clust_num)
        print("For word {} chosen {}".format(w, chosen))
        d = datasets[chosen] # датасет,из которого берем предсказания
        prediction.extend(d[d.word == w].predict_sense_id)
    print('-'*10)
    print(len(prediction))
    evaluate_weighted_ari(DATASET, prediction)

In [63]:
check_dataset("../data/main/wiki-wiki/")

For word замок chosen 0
For word лук chosen 2
For word суда chosen 2
For word бор chosen 2
----------
439
word	ari	count
бор	0.591175	56
замок	0.337541	138
лук	0.637076	110
суда	0.005465	135
	0.342831	439


In [64]:
check_dataset("../data/main/bts-rnc/")

For word балка chosen 0
For word вид chosen 0
For word винт chosen 0
For word горн chosen 2
For word губа chosen 2
For word жаба chosen 0
For word клетка chosen 2
For word крыло chosen 0
For word купюра chosen 0
For word курица chosen 2
For word лавка chosen 2
For word лайка chosen 2
For word лев chosen 2
For word лира chosen 0
For word мина chosen 2
For word мишень chosen 2
For word обед chosen 2
For word оклад chosen 0
For word опушка chosen 2
For word полис chosen 2
For word пост chosen 0
For word поток chosen 2
For word проказа chosen 0
For word пропасть chosen 2
For word проспект chosen 0
For word пытка chosen 2
For word рысь chosen 2
For word среда chosen 2
For word хвост chosen 0
For word штамп chosen 0
----------
3491
word	ari	count
балка	0.559585	119
вид	0.103725	77
винт	0.458525	123
горн	0.148054	51
губа	0.011532	137
жаба	-0.053646	121
клетка	0.253548	150
крыло	0.251726	91
купюра	-0.026889	150
курица	0.303684	93
лавка	0.275710	149
лайка	-0.028626	99
лев	0.227088	44
лира	0.459

In [65]:
check_dataset("../data/main/active-dict/")

For word дар chosen 2
For word двигатель chosen 0
For word двойник chosen 0
For word дворец chosen 0
For word девятка chosen 3
For word дедушка chosen 2
For word дежурная chosen 2
For word дежурный chosen 2
For word декабрист chosen 0
For word декрет chosen 3
For word дело chosen 2
For word демобилизация chosen 3
For word демократ chosen 0
For word демонстрация chosen 0
For word дерево chosen 2
For word держава chosen 2
For word дерзость chosen 0
For word десятка chosen 2
For word десяток chosen 2
For word деятель chosen 0
For word диалог chosen 0
For word диаметр chosen 2
For word диплом chosen 2
For word директор chosen 2
For word диск chosen 0
For word дичь chosen 2
For word длина chosen 2
For word доброволец chosen 0
For word добыча chosen 0
For word доказательство chosen 2
For word доктор chosen 0
For word долгота chosen 2
For word доля chosen 0
For word дом chosen 2
For word дорога chosen 0
For word достижение chosen 2
For word древесина chosen 2
For word дупло chosen 2
For word 

### Вывод

Итак, улучшились скоры на датасетах 2 и 3, но ухудшился скор на вики.

В целом, улучшения заметные, но скоры все еще не очень высокие.

# Simple Rule {MIN>=2}:

take labels with min number of clusters which >=2

In [55]:
def get_min_bigger2(array):
    mini = 0
    for i, val in enumerate(array):
        if (val>=2) and (val<array[mini]):
            mini = i
    return mini

In [60]:
def check_dataset2(dataset_folder):
    DATASET = join(dataset_folder+'train.csv')
    
    
    contexts, main_words, labels, word_list_uniq = make_data(df_file_name=DATASET,\
                     use_gensim_simple_preproc=False, use_mystem=True,\
                     throw_main_word=False, tokenize=True)
    
    libru = pd.read_csv(join(dataset_folder, "train.csv_libru.csv"))
    bigText = pd.read_csv(join(dataset_folder,"train.csv_bigText.csv"))
    baseline = pd.read_csv(join(dataset_folder,"train.baseline-adagram.csv"), sep='\t')
    sensegram = pd.read_csv(join(dataset_folder,"train.csv_sensegram.csv"))
    
    datasets = [libru, bigText, baseline, sensegram]
    
    prediction = []

    for w in word_list_uniq:
        clust_num = []
        for df in datasets:
            n = len(set(df[df.word == w].predict_sense_id.values))
            clust_num.append(n)
        chosen = get_min_bigger2(clust_num)
        print("For word {} chosen {}".format(w, chosen))
        d = datasets[chosen] # датасет,из которого берем предсказания
        prediction.extend(d[d.word == w].predict_sense_id)
    print('-'*10)
    print(len(prediction))
    prediction = [str(p) for p in prediction]
    evaluate_weighted_ari(DATASET, prediction)

In [61]:
check_dataset2("../data/main/wiki-wiki/")

For word замок chosen 3
For word лук chosen 1
For word суда chosen 0
For word бор chosen 0
----------
439
word	ari	count
бор	0.171375	56
замок	-0.002524	138
лук	0.927898	110
суда	-0.078850	135
	0.229323	439


In [66]:
check_dataset2("../data/main/bts-rnc/")

For word балка chosen 1
For word вид chosen 3
For word винт chosen 3
For word горн chosen 0
For word губа chosen 3
For word жаба chosen 1
For word клетка chosen 1
For word крыло chosen 1
For word купюра chosen 3
For word курица chosen 0
For word лавка chosen 0
For word лайка chosen 0
For word лев chosen 3
For word лира chosen 1
For word мина chosen 3
For word мишень chosen 0
For word обед chosen 0
For word оклад chosen 3
For word опушка chosen 0
For word полис chosen 0
For word пост chosen 3
For word поток chosen 1
For word проказа chosen 2
For word пропасть chosen 0
For word проспект chosen 1
For word пытка chosen 0
For word рысь chosen 0
For word среда chosen 0
For word хвост chosen 3
For word штамп chosen 2
----------
3491
word	ari	count
балка	0.570300	119
вид	-0.007547	77
винт	0.007195	123
горн	-0.013183	51
губа	-0.001137	137
жаба	0.026540	121
клетка	0.670594	150
крыло	0.161214	91
купюра	-0.000079	150
курица	0.000000	93
лавка	0.305352	149
лайка	0.000000	99
лев	0.005695	44
лира	0.76

In [67]:
check_dataset2("../data/main/active-dict/")

For word дар chosen 0
For word двигатель chosen 1
For word двойник chosen 1
For word дворец chosen 3
For word девятка chosen 0
For word дедушка chosen 0
For word дежурная chosen 0
For word дежурный chosen 0
For word декабрист chosen 0
For word декрет chosen 0
For word дело chosen 3
For word демобилизация chosen 0
For word демократ chosen 1
For word демонстрация chosen 1
For word дерево chosen 1
For word держава chosen 0
For word дерзость chosen 1
For word десятка chosen 0
For word десяток chosen 3
For word деятель chosen 1
For word диалог chosen 3
For word диаметр chosen 0
For word диплом chosen 0
For word директор chosen 1
For word диск chosen 1
For word дичь chosen 0
For word длина chosen 3
For word доброволец chosen 0
For word добыча chosen 3
For word доказательство chosen 1
For word доктор chosen 1
For word долгота chosen 0
For word доля chosen 1
For word дом chosen 0
For word дорога chosen 3
For word достижение chosen 3
For word древесина chosen 0
For word дупло chosen 0
For word 

### Вывод

Итак, при такой стратегии стало только хуже или ничего не изменилось