# Statistics

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from glob import glob 
import pickle
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
from sklearn import dummy
import sys
import os
import helper

# Result folder (below data/results/) to analyse; set to None to use the last
# folder matching data/results/2017* instead.
USED_FOLDER = '2017-09-13_21:33'
#USED_FOLDER = None

# glob() returns paths in arbitrary order. Sorting makes "the last folder" well
# defined; the folder names look like timestamps (e.g. '2017-09-13_21:33'), so
# the last one is presumably the most recent run.
result_folders = sorted(x for x in glob('data/results/2017*') if os.path.isdir(x))

folder = 'data/results/{}'.format(USED_FOLDER) if USED_FOLDER else result_folders[-1]

print('Using result folder: {}'.format(folder))

# One DataFrame per result file; they are concatenated once after the loop.
result_frames = []
for result_file in helper.log_progress(glob('{}/*.npy'.format(folder))):
    # NOTE(review): despite the .npy extension the files are read with pickle.
    # pickle.load can execute arbitrary code, so only load trusted result files.
    with open(result_file, 'rb') as f:
        result = pickle.load(f)

    # File name without directory and without its last two dot-separated parts.
    dataset = os.path.basename(result_file).rsplit('.', 2)[0]
    is_graph_dataset = 'graph' in dataset
    is_cooccurrence_dataset = 'cooccurrence' in dataset

    dataset_name = dataset

    # All meta data below is derived from the file name / path.
    if is_graph_dataset:
        is_label_dropped = 'same-label' in result_file
        result['same_label'] = is_label_dropped
        is_relabeled = 'relabeled' in result_file
        result['relabeled'] = is_relabeled
        if is_relabeled:
            # The path contains 'topn-<int>_' and 'threshold-<float>_'.
            topn = result_file.split('topn-')[1].split('_')[0]
            threshold = result_file.split('threshold-')[1].split('_')[0]
            result['topn'] = int(topn)
            result['threshold'] = float(threshold)
        result['wl_iteration'] = dataset.split('.')[-1]
        parts = dataset.split('_')
        if is_cooccurrence_dataset:
            # parts were split on '_', so only the '.'-suffix has to be removed.
            dataset_name = parts[-1].split('.')[0]
            result['words'] = parts[4]
            result['window_size'] = parts[3]
        # GML
        else:
            dataset_name = parts[3].split('.')[0]
            result['words'] = 'concepts'
        result['type'] = 'cooccurrence' if is_cooccurrence_dataset else 'concept-graph'
    else:
        result['type'] = 'text'
        dataset_name = dataset.split('_')[1]
        #result['words'] = ['all' if x['preprocessing'] != None else 'only-nouns'  for x in result['params']]
        result['words'] = ['all' for x in result['params']]
        #result['words'] = result['params']['preprocessing']

    # Store only the classifier's class name and drop the estimator object
    # from each parameter dict.
    classifier_names = []
    for param in result['params']:
        classifier_names.append(type(param.pop('clf')).__name__)
    result['classifier'] = classifier_names

    result['filename'] = result_file
    result['dataset'] = dataset_name

    is_lemmatized = '_lemmatized_' in result_file
    result['lemmatized'] = is_lemmatized

    # NOTE(review): this normalisation runs after result['dataset'] has been
    # stored, so it does not affect the resulting frame. Kept as-is; confirm
    # whether '-single'/'-ana' variants were meant to be merged.
    if dataset_name.endswith('-single') or dataset_name.endswith('-ana'):
        dataset_name = dataset_name.rsplit('-', 1)[0]
    del result['param_clf']

    result_df = pd.DataFrame(result).sort_values(by = 'dataset', ascending = False)
    result_frames.append(result_df)

# Concatenate once: DataFrame.append copied the accumulated frame on every
# iteration and no longer exists in pandas >= 2.0.
df_all_ = pd.concat(result_frames) if result_frames else None

assert df_all_ is not None, 'No result files found in {}'.format(folder)
assert df_all_.shape[0], 'Result files in {} contain no rows'.format(folder)

# The 'threshold' column only exists if at least one relabeled graph result was loaded.
df_all_['threshold'] = pd.to_numeric(df_all_['threshold'])
# Only keep datasets where there are all three types (text, co-occurrence and concept-graph) of results.
# reset_index gives df_all a unique 0..n-1 index.
df_all = df_all_.groupby('dataset').filter(lambda x: len(x.type.value_counts()) == 3).reset_index(drop=True)

In [None]:
# Global seaborn look for all figures of this notebook.
sns.set(context = 'notebook', style = 'whitegrid')

In [None]:
# Columns shown in the summary table of plot_best_by_type. 'accuracy', 'f1',
# 'precision' and 'recall' are the short names the mean_test_* columns are
# renamed to before the table is displayed.
COLUMNS_OF_INTEREST = [
    # what was evaluated
    'classifier', 'dataset', 'filename', 'lemmatized',
    # scores
    'accuracy', 'f1', 'precision', 'recall', 'mean_train_f1_macro',
    # parameters
    'param_preprocessing', 'param_scaler', 'params',
    'relabeled', 'same_label',
    # score deviations
    'std_test_accuracy', 'std_test_f1_macro',
    'std_test_precision_macro', 'std_test_recall_macro',
    # meta data parsed from the file name
    'threshold', 'topn', 'window_size', 'wl_iteration', 'words',
]

def plot_best_by_type(df_all, df, title = '', fontsize = 12, figsize = (6, 3), top = 0.85):
    """Plot, print and display the best result (highest macro f1) of every group.

    For each group of ``df`` the row with the highest ``mean_test_f1_macro`` is
    looked up in ``df_all``. The selected rows are indexed by their ``type``
    column, drawn as horizontal bars (f1, accuracy, precision, recall),
    summarised on stdout and shown as a table restricted to
    ``COLUMNS_OF_INTEREST``.

    Args:
        df_all: DataFrame containing all results. The index labels of the rows
            in ``df`` must refer to the same rows in ``df_all``.
        df: ``DataFrameGroupBy`` over rows of ``df_all``.
        title: Optional figure title; nothing is added if empty or ``None``.
        fontsize: Font size of the title.
        figsize: Figure size passed to ``plt.subplots``.
        top: Top margin passed to ``fig.subplots_adjust`` when a title is set.

    Returns:
        Tuple ``(fig, ax)`` of the created figure and axes.
    """
    fig, ax = plt.subplots(figsize = figsize)

    # idxmax() returns index *labels*, so the rows have to be selected with
    # .loc; positional .iloc is only correct if df_all has a 0..n-1 index.
    best_labels = df['mean_test_f1_macro'].idxmax()
    els = df_all.loc[best_labels]
    els = els.set_index('type')
    els = els.rename(columns = {'mean_test_f1_macro': 'f1', 'mean_test_accuracy': 'accuracy', 'mean_test_precision_macro': 'precision', 'mean_test_recall_macro': 'recall'})

    # Error bars span two times the std_test_* value of each score.
    # NOTE(review): xlim exceeds 1 although the ticks stop at 1 - presumably to
    # leave room for the legend; confirm.
    els[['f1', 'accuracy', 'precision', 'recall']].plot(kind = 'barh', ax = ax, xlim = (0, 1.5), xerr=[els.std_test_f1_macro * 2,  els.std_test_accuracy * 2,  els.std_test_precision_macro * 2,  els.std_test_recall_macro * 2])
    ax.set_xticks(np.linspace(0, 1, 11))
    ax.grid(axis = 'y')

    # One summary line per selected row.
    for set_type, x in els.iterrows():
        out = '{:20}best f1-score: {x.f1:.4f}\twords: {x.words:14}'.format(set_type, x = x)
        if set_type != 'text':
            # '!s' formats the flags as text; without it a bool goes through
            # int formatting and is printed as 1/0.
            out += 'wl iteration: {x.wl_iteration:<8} relabeled: {x.relabeled!s:<6} is_same_label: {x.same_label!s:<6}'.format(x = x)
        if set_type == 'cooccurrence':
            out += 'window_size: {x.window_size}'.format(x = x)
        print(out)
    display(els[COLUMNS_OF_INTEREST])

    # The title has to be set before tight_layout(); the top margin is
    # adjusted afterwards.
    if title:
        fig.suptitle(title, fontsize = fontsize)
    fig.tight_layout()
    if title:
        fig.subplots_adjust(top = top)
    return fig, ax

# Figures are written below tmp/results; create the folder up front so that
# savefig() does not fail when it does not exist yet.
os.makedirs('tmp/results', exist_ok = True)

# Set to True to put the dataset name above each figure.
use_title = False

# One figure per dataset, comparing the best result of each type.
for name, df_dataset in sorted(df_all.groupby('dataset'), key = lambda x: x[0]):
    df_dataset_grouped_by_type = df_dataset.groupby('type')
    print(name)
    fig, ax = plot_best_by_type(df_all, df_dataset_grouped_by_type, 'Dataset: {}'.format(name) if use_title else None)
    fig.savefig('tmp/results/dataset-{}-best.png'.format(name), dpi = 150)
    plt.show()
    plt.close(fig)


In [None]:
# Disabled cell (the condition is never true): best f1 score with and without
# relabeling, one subplot per dataset.
if 0 == 1:
    # savefig() needs the output folder to exist.
    os.makedirs('tmp/results', exist_ok = True)

    df_graphs_grouped = df_all[df_all.type != 'text'].groupby('dataset')
    # The 2x2 grid has room for at most four datasets.
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize = (10, 6))
    for idx, (dataset_name, df_dataset) in enumerate(df_graphs_grouped):
        row, col = divmod(idx, 2)
        ax = axes[row, col]
        grouped = df_dataset.groupby('relabeled')
        # idxmax() returns index labels -> select with .loc, not positional .iloc.
        els = df_all.loc[grouped.mean_test_f1_macro.idxmax()]
        els = els.set_index('relabeled')
        els = els.rename(columns = {'mean_test_f1_macro': 'f1', 'mean_test_accuracy': 'accuracy', 'mean_test_precision_macro': 'precision', 'mean_test_recall_macro': 'recall'})
        els[['f1']].plot(kind = 'barh', ax = ax, legend = False, xlim = (0, 1), title = dataset_name)
        ax.set_xlabel('f1')
        ax.set_ylabel('')

    fig.tight_layout()
    fig.suptitle('Merged labels')
    fig.subplots_adjust(top = 0.9)
    fig.savefig('tmp/results/relabeled.png', dpi = 150)
    plt.show()
    # Release the figure, as the other plotting cells do.
    plt.close(fig)

In [None]:
def plot_distributions(df, df_all, title = None, figsize = (9, 4)):
    """Plot the best scores per 'wl_iteration', 'window_size', 'words' and 'type'.

    A 2x2 grid of subplots is created, one per column listed above. For each
    column ``df`` is grouped by it, the row with the highest
    ``mean_test_f1_macro`` of every group is looked up in ``df_all`` and its
    f1, accuracy, precision and recall are drawn as horizontal bars. A column
    that yields no groups leaves its subplot empty.

    Args:
        df: DataFrame with the rows to analyse.
        df_all: DataFrame containing all results. The index labels of the rows
            in ``df`` must refer to the same rows in ``df_all``.
        title: Optional figure title; no title is set if ``None``.
        figsize: Figure size passed to ``plt.subplots``.

    Returns:
        Tuple ``(fig, axes)`` where ``axes`` is the flat, row-major list of the
        four axes.
    """
    fig, axes_indexed = plt.subplots(nrows = 2, ncols=2, figsize = figsize)

    # Flatten the 2x2 array of axes row by row.
    axes = list(axes_indexed.flat)

    for val, ax in zip(['wl_iteration', 'window_size', 'words', 'type'], axes):
        grouped = df.groupby(val)
        if grouped.ngroups == 0:
            continue
        # idxmax() returns index labels -> select with .loc, not positional .iloc.
        els = df_all.loc[grouped['mean_test_f1_macro'].idxmax()]
        els = els.set_index(val)
        els = els.rename(columns = {'mean_test_f1_macro': 'f1', 'mean_test_accuracy': 'accuracy', 'mean_test_precision_macro': 'precision', 'mean_test_recall_macro': 'recall'})
        els[['f1', 'accuracy', 'precision', 'recall']].plot(kind = 'barh', ax = ax, xlim=(0, 2))

        ax.set_xlabel('f1 macro score')

    # Set the title on this figure explicitly instead of on pyplot's "current"
    # figure, and skip it when no title was given.
    if title is not None:
        fig.suptitle(title, size = 18)
    fig.tight_layout()
    fig.subplots_adjust(top=0.90)
    return fig, axes
    
# Resolution of the saved distribution figures.
dpi = 150

# Disabled cell (the condition is never true): distribution plots over all
# datasets and per dataset.
if 0 == 1:
    # savefig() needs the output folder to exist.
    os.makedirs('tmp/results', exist_ok = True)

    # NOTE(review): plot_distributions picks the best (idxmax) row per group,
    # so the 'Mean over all datasets' title looks misleading - confirm wording.
    fig, _  = plot_distributions(df_all, df_all, title = 'Mean over all datasets')
    fig.savefig('tmp/results/all.png', dpi = dpi)
    plt.show()
    plt.close(fig)
    for name, df_dataset in df_all.groupby('dataset'):
        # Skip datasets that do not have results for all three types.
        if df_dataset['type'].nunique() < 3:
            continue
        fig, _ = plot_distributions(df_dataset, df_all, title = 'Dataset: {}'.format(name))
        fig.savefig('tmp/results/dataset-{}.png'.format(name), dpi = dpi)
        plt.show()
        plt.close(fig)