# Statistics

In [None]:
%%javascript
// Replace Notebook.prototype.scroll_to_bottom with a function that does nothing.
// NOTE(review): presumably this keeps the page from jumping to the end of long outputs - confirm.
require("notebook/js/notebook").Notebook.prototype.scroll_to_bottom = function () {}

In [None]:
%%html
<style>
/* Give every element matching div.output_scroll a fixed height of 34em;
   !important lets this declaration win over competing height rules. */
div.output_scroll {
    height: 34em !important;
}
</style>

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
from glob import glob 
import pickle
from IPython.display import display
import seaborn as sns
import numpy as np
import sklearn
from sklearn import dummy
import sys
import os
import seaborn as sns
from utils import helper
from utils import results_helper

# Export settings shared by the figures of this notebook (sizes are in inches)
EXPORT_DPI = 100
EXPORT_FIG_SIZE = (8, 4)
EXPORT_FIG_SIZE_BIG = (10, 7)
EXPORT_FIG_WIDTH, EXPORT_FIG_HEIGHT = EXPORT_FIG_SIZE
EXPORT_FIG_WIDTH_BIG, EXPORT_FIG_HEIGHT_BIG = EXPORT_FIG_SIZE_BIG

# Default figure size and seaborn theme for all plots
plt.rcParams['figure.figsize'] = EXPORT_FIG_SIZE_BIG
sns.set('notebook', 'whitegrid', palette='deep')

# Show (practically) all columns and never truncate cell contents
pd.options.display.max_columns = 999
try:
    # Current pandas spells "unlimited column width" as None ...
    pd.options.display.max_colwidth = None
except ValueError:
    # ... while older releases only accept integers and use -1 for it
    pd.options.display.max_colwidth = -1

## Retrieve results

In [None]:
# Load the results through the project helper. To load one specific run instead,
# additionally pass its folder name, e.g. folder='2017-10-03_15-28'.
df_all = results_helper.get_results(
    use_already_loaded=False,
    exclude_filter='relabeled',
    filter_out_non_complete_datasets=False
)

In [None]:
# Display the last rows of the frame returned by results_helper.get_result_folder_df()
results_helper.get_result_folder_df().tail()

## DummyClassifier performance per dataset

In [None]:
# Highest macro-F1 reached by the DummyClassifier on each dataset, as a one-column frame
(
    df_all
    .loc[df_all['classifier'] == 'DummyClassifier']
    .groupby('dataset')['mean_test_f1_macro']
    .max()
    .to_frame()
)

In [None]:
# Highest macro-F1 of the 'text' results per dataset, formatted with three decimals
(
    df_all
    .loc[df_all['type'] == 'text']
    .groupby('dataset')['mean_test_f1_macro']
    .max()
    .apply('{:.3f}'.format)
)

In [None]:
# Highest macro-F1 per dataset for the concept-graph results,
# first for wl_casted == False and then for wl_casted == True
df_only_cmaps = df_all.loc[df_all['type'] == 'concept-graph']
tuple(
    df_only_cmaps[df_only_cmaps['wl_casted'] == wl_casted].groupby('dataset')['mean_test_f1_macro'].max()
    for wl_casted in (False, True)
)

## Result distributions

In [None]:
# For every row subset and every dataset, plot how the f1 scores are distributed
# over the values of an attribute (one violin per attribute value) and export the figure.
data_filters = [
    ('only-concept-graphs', df_all.type == 'concept-graph'),
    ('only-coocurrence', df_all.type == 'cooccurrence'),
    # Keeps every row whose type differs from the literal 'YES'
    # NOTE(review): presumably meant as a "select everything" mask - confirm no result uses that type
    ('all', df_all.type != 'YES'),
]

for data_filter_name, data_filter in data_filters:
    for dataset_name, df in df_all[data_filter].groupby('dataset'):
        # Filter out DummyClassifier (does not depend on the attribute, so do it once per dataset)
        df = df[df.classifier != 'DummyClassifier']
        f1_max = df.mean_test_f1_macro.max()

        for attr in ['type', 'kernel']:
            # Ignore entries that have only one category
            if df[attr].nunique() <= 1:
                continue

            fig, ax = plt.subplots(figsize = EXPORT_FIG_SIZE)
            # No hue is used here, so the former `split = True` is dropped: splitting only applies to hue levels
            sns.violinplot(x = attr, y = 'mean_test_f1_macro', data = df, cut = 0, inner = 'quartile', ax = ax)
            ax.set_ylim((0, f1_max + 0.1))
            ax.set_ylabel('f1 macro')
            fig.suptitle('Result distribution ({})'.format(data_filter_name))
            ax.set_title('Dataset: {}, Attribute: {}'.format(dataset_name, attr))
            fig.tight_layout()
            # Leave room for the suptitle above the axes title
            fig.subplots_adjust(top = 0.85)
            fig.savefig('tmp/result-distributions/{}-{}-{}.png'.format(dataset_name, data_filter_name, attr), dpi = EXPORT_DPI)
            plt.show()
            plt.close(fig)

## Best classifiers per type per dataset

In [None]:
# Short display names for the score columns
RENAME_COLS_MAPPING = {
    'mean_test_f1_macro': 'f1',
    'mean_test_accuracy': 'accuracy',
    'mean_test_precision_macro': 'precision',
    'mean_test_recall_macro': 'recall',
}

# Columns of df_all whose name contains one of these fragments are left out of result tables
UNINTERESTING_COLUMNS = [
    column
    for column in df_all.columns.tolist()
    if any(fragment in column for fragment in ('fit_time', 'split', 'std', 'rank'))
]

def plot_best_by_type(df_all, df, df_dataset, title = '', fontsize = 12, figsize = (6, 3), top = 0.85):
    """Plot the scores of the best result of every group as horizontal bars.

    The best result of a group is the row with the highest 'mean_test_f1_macro'.
    The selected rows are also shown as a table (without UNINTERESTING_COLUMNS).

    Args:
        df_all: frame containing all results; the best rows are looked up in it by index label.
        df: groupby object whose 'mean_test_f1_macro' column determines the best row per group.
        df_dataset: not used by this function; kept so existing callers keep working.
        title: optional figure title; nothing is drawn when it is empty or None.
        fontsize: font size of the title.
        figsize: size of the created figure.
        top: top margin passed to subplots_adjust when a title is drawn.

    Returns:
        Tuple of the created figure and axes.
    """
    # idxmax() yields index *labels*, so the rows have to be fetched with .loc (not the positional .iloc)
    els = df_all.loc[df['mean_test_f1_macro'].idxmax()]
    els = els.set_index('type')
    els = els.rename(columns = RENAME_COLS_MAPPING)

    # Plot
    fig, ax = plt.subplots(figsize = figsize)

    # Error bars: twice the standard deviation of each score, in the order of the plotted columns
    std_errs = [els.std_test_f1_macro * 2,  els.std_test_accuracy * 2,  els.std_test_precision_macro * 2,  els.std_test_recall_macro * 2]

    # The x-limit exceeds the tick range (0..1) on purpose, which leaves free space to the right of the bars
    els[['f1', 'accuracy', 'precision', 'recall']].plot(kind = 'barh', ax = ax, xlim = (0, 1.5), xerr = std_errs)
    ax.set_xticks(np.linspace(0, 1, 11))

    ax.grid(axis = 'y')

    display(els[[x for x in els.columns.tolist() if x not in UNINTERESTING_COLUMNS]])

    if title:
        fig.suptitle(title, fontsize = fontsize)

    fig.tight_layout()

    if title:
        # tight_layout does not account for the suptitle, so make room for it afterwards
        fig.subplots_adjust(top = top)

    return fig, ax

# One "best result per type" figure for every dataset, leaving out the 0th WL iteration
use_title = False
for name, df_dataset in sorted(df_all[df_all.wl_iteration != 0].groupby('dataset'), key = lambda group: group[0]):
    print('################# {}'.format(name))
    figure_title = 'Dataset: {}'.format(name) if use_title else None
    fig, ax = plot_best_by_type(df_all, df_dataset.groupby('type'), df_dataset, figure_title)
    fig.savefig('tmp/results/dataset-{}-best.png'.format(name), dpi = 150)
    plt.show()
    plt.close(fig)


## Plot best per parameter value per dataset

In [None]:
def graphs_grouped_by_plot(df_all, groupby):
    """Plot, for every dataset, the f1 distribution per graph type as violins.

    Rows of type 'text' are ignored. Each figure is shown and saved as
    'tmp/results/label-importance-<dataset>.png'.

    Args:
        df_all: frame containing the results.
        groupby: name of the column used as hue; it is only applied to datasets
            in which that column has more than one distinct value.
    """
    df_graphs_grouped = df_all[df_all.type != 'text'].groupby('dataset')

    for dataset_name, df_dataset in df_graphs_grouped:
        fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = EXPORT_FIG_SIZE)
        # Print violinplot of f1, with `groupby` as hue when it distinguishes anything
        hue = groupby if df_dataset[groupby].nunique() > 1 else None
        # Splitting only makes sense with a hue. `title` and `legend` are no longer passed:
        # the title is set on the axes below and `title` is not a violinplot option.
        sns.violinplot(x = 'type', y = 'mean_test_f1_macro', hue = hue, data = df_dataset, cut = 0, split = hue is not None, inner = 'quartile', ax = ax)
        ax.set_title('Dataset: {}'.format(dataset_name))
        ax.set_ylabel('f1')
        ax.set_xlabel('TBD')
        # Pass a real boolean: the string 'off' is not reliably interpreted as "disable" by matplotlib
        ax.grid(False)
        fig.suptitle('TBD')
        fig.tight_layout()
        fig.subplots_adjust(top = 0.86)
        fig.savefig('tmp/results/label-importance-{}.png'.format(dataset_name), dpi = EXPORT_DPI)
        plt.show()
        # Release the figure so the loop does not accumulate open figures
        plt.close(fig)

# Violin plots per dataset, split by the 'combined' column
graphs_grouped_by_plot(df_all, 'combined')

## Correlation matrix

In [None]:
import functools

def add(acc, item):
    """Combine `item` into `acc` with the `+=` operator and return the result.

    Mutable accumulators such as lists are therefore extended in place.
    """
    combined = acc
    combined += item
    return combined

def get_vals_for_col(col):
    """Return the sorted distinct values of column `col` of the notebook-level frame `df_tmp`."""
    distinct_values = df_tmp[col].value_counts().index.tolist()
    distinct_values.sort()
    return distinct_values

# Columns whose values form the rows and columns of the matrix.
# Further columns that were listed as candidates before:
# 'lemmatized', 'relabeled', 'threshold', 'window_size', 'words', 'classifier', 'topn'
cols = ['type', 'combined', 'kernel', 'wl_iteration', 'same_label', 'dataset']

# The matrix is computed for a single dataset
df_tmp = df_all[df_all.dataset == 'ling-spam']

# Distinct values per column, their counts and the flattened list of all values.
# These are notebook-level names that other cells can read, so their spelling is kept as is.
vals = [get_vals_for_col(col) for col in cols]
val_lenghts = [len(vals_) for vals_ in vals]
dim = sum(val_lenghts)
vals_flattened = functools.reduce(add, vals, [])

# best_of_mat[i, j] holds the best f1 among the results that have both value i and value j;
# it is NaN when no result has that combination
best_of_mat = np.zeros((dim, dim), dtype=np.float32)

row_counter = 0
for col1, vals_1 in zip(cols, vals):
    col_counter = 0
    for col2, vals_2 in zip(cols, vals):
        for idx1, val1 in enumerate(vals_1):
            # The row condition is the same for every val2, so build it once
            has_val1 = df_tmp[col1] == val1
            for idx2, val2 in enumerate(vals_2):
                best_of = df_tmp[has_val1 & (df_tmp[col2] == val2)]
                best_f1 = best_of.mean_test_f1_macro.max()
                best_of_mat[row_counter + idx1, col_counter + idx2] = best_f1
        # Move on to the block of the next column / row
        col_counter += len(vals_2)
    row_counter += len(vals_1)

In [None]:
def plot(best_of_mat, vals, cols, ax = None, cmap='Blues', divider_color = '#FFFFFF', divider_linewidth = 6, fontdict = {'fontsize': 14, 'weight': 'bold'}):
    """Draw `best_of_mat` as a heat map with one labelled block per column in `cols`.

    Args:
        best_of_mat: square 2-D array; NaN cells are marked with an 'X'.
        vals: one list of values per entry of `cols`; their concatenation labels
            the rows and columns of the matrix.
        cols: names of the blocks, written to the right of and above the matrix.
        ax: axes to draw on; a new figure is created when omitted.
        cmap: colormap used for the heat map.
        divider_color: color of the lines separating the blocks.
        divider_linewidth: width of the lines separating the blocks.
        fontdict: font properties of the block labels and NaN markers (only read, never modified).
    """
    if ax is None:
        fig, ax = plt.subplots()

    # Derive everything from the arguments instead of relying on notebook-level variables
    vals_lengths = [len(val) for val in vals]
    dim = sum(vals_lengths)
    tick_labels = [val for vals_ in vals for val in vals_]

    # Add dividers and labels to graph
    for idx, s in enumerate(np.cumsum(vals_lengths)):
        # Cells are centered on integer coordinates, so a block ends half a cell after its last index
        ax.axvline(s - 0.5, color = divider_color, linewidth = divider_linewidth)
        ax.axhline(s - 0.5, color = divider_color, linewidth = divider_linewidth)

        text_offset = vals_lengths[idx] / 2

        # Add the col labels to the right
        ax.text(dim + 0.5, s - text_offset - 0.5, cols[idx], horizontalalignment = 'left', verticalalignment = 'center', fontdict=fontdict)
        # Add the col labels to the top
        ax.text(s - text_offset - 0.2, - 1, cols[idx], horizontalalignment = 'center', verticalalignment = 'center', fontdict=fontdict)

    # Add x- and y-ticks
    ticks = list(range(len(tick_labels)))
    ax.set_xticks(ticks)
    ax.set_xticklabels(tick_labels)
    ax.set_yticks(ticks)
    ax.set_yticklabels(tick_labels)

    # Rotate x-ticks
    for tick in ax.get_xticklabels():
        tick.set_rotation(90)

    # Mark cells where no values are available (the column index is x, the row index is y)
    for row, col in zip(*np.where(np.isnan(best_of_mat))):
        ax.text(col, row, 'X', horizontalalignment = 'center', verticalalignment = 'center', fontdict=fontdict)

    ax.grid(False)
    image = ax.imshow(best_of_mat, cmap=cmap)
    ax.figure.colorbar(image, ax = ax, fraction=0.04, pad=0.2)

# Draw the complete matrix on a large canvas and export it
fig, ax = plt.subplots(figsize=(30, 30))
plot(best_of_mat, vals, cols, ax=ax)
fig.tight_layout()
fig.savefig('tmp/correlations.png', dpi=EXPORT_DPI)

## Plot classifier performance

In [None]:
# One bar chart per dataset: best f1 reached by each classifier
for dataset_name, df_dataset in df_all.groupby('dataset'):
    fig, ax = plt.subplots(figsize=(10, 2))
    best_per_classifier = df_dataset.groupby('classifier')['mean_test_f1_macro'].max()
    best_per_classifier.plot(kind='barh', title=dataset_name, ax=ax)
    plt.show()

In [None]:
# One bar chart per dataset: best f1 for each value of 'lemmatized'
for dataset_name, df_dataset in df_all.groupby('dataset'):
    fig, ax = plt.subplots(figsize=(10, 2))
    best_per_lemmatized = df_dataset.groupby('lemmatized')['mean_test_f1_macro'].max()
    best_per_lemmatized.plot(kind='barh', title=dataset_name, ax=ax)
    plt.show()


## Plot performance per dataset and wl_iteration and graph_type

In [None]:
def add_gap_to_violin_plot(ax, delta = 0.03):
    """Move the violin bodies on `ax` apart horizontally.

    Every PolyCollection in `ax.collections` is shifted along x by `delta`:
    collections at an odd position go to the right, those at an even position
    go to the left. The vertices are modified in place.

    Args:
        ax: axes holding the violin plot.
        delta: distance (in x data units) each body is moved.
    """
    import matplotlib.collections

    for ii, item in enumerate(ax.collections):
        # axis contains PolyCollections and PathCollections
        if not isinstance(item, matplotlib.collections.PolyCollection):
            continue

        # Each body is expected to consist of exactly one path
        path, = item.get_paths()
        vertices = path.vertices

        if ii % 2: # -> to right
            vertices[:,0] += delta
        else: # -> to left
            vertices[:,0] -= delta


# One figure per dataset: f1 distribution per WL iteration, split by graph type.
# Text results and lemmatized results are left out.
# The loop variable deliberately has its own name so it does not overwrite the
# notebook-level `df_tmp` that get_vals_for_col reads.
for dataset, df_graphs in df_all[(df_all.type != 'text') & (df_all.lemmatized != True)].sort_values('wl_iteration').groupby('dataset'):
    # The figure size belongs to the figure; violinplot itself has no such option
    fig, ax = plt.subplots(figsize = EXPORT_FIG_SIZE)
    inner = 'quartile'
    # cut = 1 is the numeric value the former `cut = True` evaluated to
    sns.violinplot(x = 'wl_iteration', y = 'mean_test_f1_macro', hue = 'type', split = True, data = df_graphs, cut = 1, inner = inner, ax = ax)

    add_gap_to_violin_plot(ax)

    ax.set_ylabel('f1')
    ax.set_title(dataset)
    fig.tight_layout()
    plt.show()
    plt.close(fig)

## Plot by parameter

In [None]:
def plot_distributions(df, df_all, title = None, figsize = (10, 8)):
    """Plot the best result per parameter value for four parameters in a 2x2 grid.

    For each of 'wl_iteration', 'window_size', 'words' and 'type', the rows of `df`
    are grouped by that parameter and the row with the highest 'mean_test_f1_macro'
    of every group is plotted as horizontal bars (f1, accuracy, precision, recall).

    Args:
        df: frame whose rows are grouped to find the best results.
        df_all: frame in which the best rows are looked up by index label.
        title: optional figure title.
        figsize: size of the created figure.

    Returns:
        Tuple of the figure and the list of its four axes (row by row).
    """
    fig, axes_indexed = plt.subplots(nrows = 2, ncols=2, figsize = figsize)
    axes = list(axes_indexed.ravel())

    for val, ax in zip(['wl_iteration', 'window_size', 'words', 'type'], axes):
        grouped = df.groupby(val)
        if len(grouped.size()) == 0:
            continue
        # idxmax() yields index *labels*, so the rows have to be fetched with .loc (not the positional .iloc)
        els = df_all.loc[grouped['mean_test_f1_macro'].idxmax()]
        els = els.set_index(val)
        els = els.rename(columns = RENAME_COLS_MAPPING)
        els[['f1', 'accuracy', 'precision', 'recall']].plot(kind = 'barh', ax = ax)
        ax.set_xticks(np.linspace(0, 1, 11))
        ax.grid(axis = 'y')
        # The x-limit exceeds the tick range (0..1), leaving free space to the right of the bars
        ax.set_xlim((0, 1.5))

    if title:
        fig.suptitle(title, size = 18)
    fig.tight_layout()
    # tight_layout does not account for the suptitle, so make room for it afterwards
    fig.subplots_adjust(top=0.90)
    return fig, axes
    
dpi = 150

# Overview over the results of all datasets together
fig, _ = plot_distributions(df_all, df_all, title = 'Mean over all datasets')
fig.savefig('tmp/results/all.png', dpi = dpi)
plt.show()
plt.close(fig)

# One overview per dataset; datasets with fewer than three distinct types are skipped
for name, df_dataset in df_all.groupby('dataset'):
    if df_dataset.type.nunique() >= 3:
        fig, _ = plot_distributions(df_dataset, df_all, title = 'Dataset: {}'.format(name))
        fig.savefig('tmp/results/dataset-{}.png'.format(name), dpi = dpi)
        plt.show()
        plt.close(fig)

## DUMP

import json

# Summarize, per embedding and dataset, the percentage of labels reported as not found
with open('data/check-w2v-results.json') as f:
    w2v_results = json.load(f)

per_embedding_type = {}
for dataset, value in w2v_results.items():
    print(dataset)
    for embedding_raw, cache_files in sorted(value.items(), key = lambda x: x[0]):
        # Reduce the path to its file name and strip up to two trailing extensions
        embedding = embedding_raw.split('/')[-1].rsplit('.', 2)[0]
        # Only embeddings with exactly two cache files are considered
        if len(cache_files) != 2: continue
        print('\t{}'.format(embedding))

        ratios = []
        for dataset_file, counts in sorted(cache_files.items(), key = lambda x: x[0]):
            not_found_ratio = int(counts['counts']['not_found'] / counts['num_labels'] * 100)
            if embedding == 'trained' and 'coo' in  dataset_file:
                print('Yes', counts['counts']['not_found'], not_found_ratio, '%', counts['not_found_sample'])
            is_gml = 'dataset_graph_gml' in dataset_file
            ratios.append((is_gml, not_found_ratio))
            # '{:>3}' right-aligns the ratio in three characters
            print('\t\t{:4} missing  {:>3}%'.format('gml' if is_gml else 'co', not_found_ratio))

        # Keep the ratio of the first cache file (in sorted file name order)
        per_embedding_type.setdefault(embedding, {})[dataset] = ratios[0][1]
df = pd.DataFrame(per_embedding_type)
