
# Statistics

In [None]:
%%javascript
// Replace Notebook.prototype.scroll_to_bottom with an empty function (a no-op).
require("notebook/js/notebook").Notebook.prototype.scroll_to_bottom = function () {}

In [None]:
from notebook_prelude import *

## Retrieve results

In [None]:
# All result folders known to the results helper.
all_folders = results_helper.get_result_folders()

# Name (last path component) of the second-to-last folder. It is overridden
# with None right away, so None is what get_results receives below.
folder = all_folders[-2].rsplit('/', 1)[-1]
folder = None

df_all = results_helper.get_results(
    folder=folder,
    use_already_loaded=False,
    exclude_filter='relabeled',
    filter_out_non_complete_datasets=True,
)
gc.collect()
# Keep only the rows whose is_ana flag equals False.
df_all = df_all[df_all['is_ana'] == False]

In [None]:
# Best mean test F1 (macro) per dataset / type / combined / kernel / vectorizer.
(
    df_all
    .groupby(['dataset', 'type', 'combined', 'kernel', 'vectorizer'])['mean_test_f1_macro']
    .max()
    .to_frame()
)

In [None]:
# Non-combined WL results on ng20 concept maps whose filename contains '__graph__'.
df_all.loc[
    (df_all['dataset'] == 'ng20')
    & (df_all['type'] == 'concept_map')
    & (df_all['kernel'] == 'wl')
    & (df_all['combined'] == False)
    & (df_all['filename'].str.contains('__graph__'))
]

In [None]:
# Best non-combined WL F1 score per dataset, one column per graph type.
df__ = (
    df_all[(df_all.kernel == 'wl') & (df_all.combined == False)]
    .groupby(['dataset', 'type'])['mean_test_f1_macro']
    .max()
    .to_frame()
    .unstack()
)
# Drop the outer column level so the graph types become plain column names.
df__.columns = df__.columns.droplevel()
# Ratio of the best concept-map score to the best co-occurrence score.
df__['ratio'] = df__['concept_map'] / df__['cooccurrence']
print(df__.to_latex())
df__

## DummyClassifier performance per dataset

In [None]:
# Best mean test F1 (macro) reached by the rows of type 'dummy', per dataset.
(
    df_all[df_all['type'] == 'dummy']
    .groupby('dataset')['mean_test_f1_macro']
    .max()
    .to_frame()
)

In [None]:
# Short display names for the four mean test metrics.
RENAME_COLS_MAPPING = {
    'mean_test_f1_macro': 'f1',
    'mean_test_accuracy': 'accuracy',
    'mean_test_precision_macro': 'precision',
    'mean_test_recall_macro': 'recall',
}
METRICS_COLUMNS = ['f1', 'accuracy', 'precision', 'recall']

# Columns whose name contains one of these fragments are hidden when result rows are displayed.
UNINTERESTING_COLUMNS = [
    column
    for column in df_all.columns.tolist()
    if any(fragment in column for fragment in ('fit_time', 'split', 'std', 'rank'))
]

## Results for combined classification

In [None]:
# --- Graph kernel parameters -------------------------------------------------
kernel = 'wl'
wl_iteration = "stacked"
to_decimals = -1
use_node_weighting = False
use_edges = True

# --- Co-occurrence graph parameters -----------------------------------------
window_size = '1'
used_words = 'all'

# --- Text vectorizer parameters ---------------------------------------------
# Alternative: text_vectorizer = 'TfidfVectorizer'
text_vectorizer = 'CountVectorizer'
stopwords = 'english'
n_gram_range = (1, 1)
binary = False

# --- Classifier parameters ---------------------------------------------------
classifier = 'LinearSVC'
classifier_c = 0.1

# Row mask shared by the selections below.
default_filter = (
    (df_all.classifier__C == classifier_c)
    & (df_all.classifier == classifier)
    & (df_all.same_label != True)
    & (df_all.vectorizer__vectorizer__binary != False)
)

def get_text_result_for_graph(graph_item, text_vectorizer = text_vectorizer, stop_words = stopwords, classifier=classifier, n_gram_range = n_gram_range):
    """Return the text-only result rows that belong to the dataset of a graph result.

    Args:
        graph_item: result row; only its ``dataset`` attribute is read.
        text_vectorizer: vectorizer name the text rows must have.
        stop_words: accepted but not used by the selection.
        classifier: accepted but not used here; the classifier constraint
            comes from the module-level ``default_filter``.
        n_gram_range: n-gram range the text rows must have.

    Returns:
        The rows of the module-level ``df_all`` matching ``default_filter``,
        the dataset, type 'text', no preprocessing, the vectorizer, the n-gram
        range and the module-level ``binary`` setting.
    """
    without_preprocessing = df_all.preprocessing.apply(lambda value: value is None)
    selection = (
        default_filter &
        (df_all.dataset == graph_item.dataset) &
        (df_all.type == 'text') &
        (without_preprocessing) &
        (df_all.vectorizer == text_vectorizer) &
        (df_all.vectorizer__ngram_range == n_gram_range) &
        (df_all.vectorizer__binary == binary)
    )
    return df_all[selection]

# Select one specific parameter combination:
#  * graph results: chosen kernel / WL iteration / rounding; concept maps always
#    pass, co-occurrence graphs only with the configured window size and words
#  * text results: chosen vectorizer / n-gram range / binary setting, no preprocessing
# The selection is copied because columns are added to df_graphs afterwards;
# assigning to a plain slice of df_all triggers pandas' SettingWithCopyWarning.
df_graphs = df_all[
    (
        default_filter &
        (df_all.kernel == kernel) &
        (df_all.phi_picker__return_iteration == wl_iteration) &
        (df_all.fast_wl__round_to_decimals == to_decimals) &
        #(df_all.fast_wl__use_node_weight_factors == use_node_weighting) &
        # Vectorized form of the former row-wise apply(); missing values compare as False either way.
        ((df_all.type == 'concept_map') | ((df_all.window_size == window_size) & (df_all.words == used_words)))
    ) | (
        default_filter &
        (df_all.type == 'text') &
        (df_all.preprocessing.apply(lambda x: x is None)) &
        (df_all.vectorizer == text_vectorizer) &
        #(df_all.vectorizer__stop_words == stopwords) &
        (df_all.vectorizer__ngram_range == n_gram_range) &
        (df_all.vectorizer__binary == binary)
    )
].copy()

def get_index_name(x):
    """Map a result row to the readable label used as table/plot index.

    Reads ``x.type``, ``x.combined`` and ``x.fast_wl__use_node_weight_factors``.

    Returns:
        'text only' for rows of type 'text';
        'combined' or 'combined (with node weighting)' for combined rows;
        '?' for non-combined rows that use node weight factors;
        'graph only' for every other row.
    """
    if x.type == 'text':
        return 'text only'
    if x.combined:
        return 'combined (with node weighting)' if x.fast_wl__use_node_weight_factors else 'combined'
    if x.fast_wl__use_node_weight_factors:
        return '?'
    # Previously a bare `return` (None), so the 'graph only' category that the
    # callers declare was never produced.
    return 'graph only'

# Add the readable index label and a "mean (+/- std)" string for the F1 score.
# assign() builds a new frame instead of writing into df_graphs in place, which
# avoids pandas' SettingWithCopyWarning when df_graphs is a slice of df_all.
df_graphs = df_graphs.assign(
    index_readable=pd.Categorical(df_graphs.apply(get_index_name, axis = 1), ['text only', 'graph only', 'combined']),
    mean_test_f1_macro_with_std=df_graphs.apply(lambda x: '{:.3f} (+/- {:.3f})'.format(x.mean_test_f1_macro, x.std_test_f1_macro), axis = 1),
)
# max() compares the formatted strings lexicographically.
df_grouped = df_graphs.groupby(['dataset', 'index_readable', 'type']).mean_test_f1_macro_with_std.max()
display(df_grouped.unstack())

# One 2x2 figure per graph type: for each dataset the selected graph results
# are plotted next to the matching text-only result.
for graph_type in ['cooccurrence', 'concept_map']:
    fig, axes = plt.subplots(nrows=2, ncols=2, sharey=True, figsize = EXPORT_FIG_SIZE)
    df_graphs_f = df_graphs[(df_graphs.type == graph_type)]
    # zip() stops at the shorter input, so at most four datasets are drawn.
    for ax, (dataset, df) in zip(axes.flatten(), df_graphs_f.groupby('dataset')):
        el_text = get_text_result_for_graph(df.iloc[0])
        # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
        df = pd.concat([df, el_text])

        df['index_readable'] = df.apply(get_index_name, axis = 1)
        df = df.set_index('index_readable').sort_index()

        df.mean_test_f1_macro.plot(kind = 'barh', ax = ax, title = 'Dataset: {}'.format(dataset))
        ax.set_xlim(0, 1)
        ax.set_ylabel('Type')
        ax.set_xlabel('f1 macro')

    fig.suptitle('Graph: type={} kernel={}, wl_iterations={}\nText: vectorizer={}, stopwords={}, ngram_range={}'.format(graph_type, kernel, wl_iteration, text_vectorizer, stopwords, n_gram_range))
    fig.tight_layout()
    # Leave room for the two-line suptitle and between the subplots.
    fig.subplots_adjust(top=0.78)
    fig.subplots_adjust(wspace = 0.2, hspace = 0.7)
    fig.savefig('tmp/result_combined_{}.png'.format(graph_type))

In [None]:
# Non-combined results, excluding TfidfVectorizer rows and WL iteration 0;
# co-occurrence graphs are restricted to the configured window size / words.
df_graphs_ = df_all[
    (df_all.combined == False) &
    (df_all.apply(lambda row: row.type == 'concept_map' or (row.window_size == window_size and row.words == used_words), axis = 1)) &
    (df_all.vectorizer != 'TfidfVectorizer') &
    (df_all.phi_picker__return_iteration != 0)
    # &(df_all.vectorizer__binary != True)
]

for dataset, df_ in df_graphs_.groupby('dataset'):
    fig, ax = plt.subplots()
    # Best F1 score per (type, kernel) pair of this dataset.
    best_per_type_and_kernel = df_.groupby(['type', 'kernel']).mean_test_f1_macro.max()
    best_per_type_and_kernel.plot(kind = 'barh', title = dataset, ax = ax)
    for (t, k), df in df_.groupby(['type', 'kernel']):
        # Only the co-occurrence / WL group is inspected.
        if (t, k) != ('cooccurrence', 'wl'):
            continue
        best = df.mean_test_f1_macro.idxmax()
        df_best = df.loc[best].to_frame()
        #display(df_best.T)


## Results for "linearized" graphs

In [None]:
def get_right_coo(x):
    """Tell whether row ``x`` uses the configured ``window_size`` and ``used_words``."""
    has_window_size = x.window_size == window_size
    has_words = x.words == used_words
    return has_window_size & has_words

def get_right_g_2(x):
    """Accept every row whose kernel is not 'text'; 'text'-kernel rows must match the configured text settings."""
    is_other_kernel = x.kernel != 'text'
    if is_other_kernel:
        return is_other_kernel
    # 'text' kernel: no preprocessing, and the configured vectorizer,
    # stop words, n-gram range and use_edges setting.
    return (
        x.preprocessing is None and
        x.vectorizer == text_vectorizer and
        x.vectorizer__stop_words == stopwords and
        x.vectorizer__ngram_range == n_gram_range and
        x.graph_to_text__use_edges == use_edges
    )

def get_right_g(x):
    """Accept every row whose kernel is not 'wl'; 'wl' rows must match the configured iteration and rounding."""
    is_other_kernel = x.kernel != 'wl'
    if is_other_kernel:
        return is_other_kernel
    return x.phi_picker__return_iteration == wl_iteration and x.fast_wl__round_to_decimals == to_decimals

# Non-combined graph results (kernel 'wl' or 'text') for the configured
# classifier, restricted by the get_right_* row predicates.
# The selection is copied because a '_label' column is added to df_filtered
# afterwards; assigning to a plain slice of df_all triggers pandas'
# SettingWithCopyWarning.
df_filtered = df_all[
    (df_all.classifier == classifier) &
    (df_all.classifier__C == classifier_c) &
    #((df_all.kernel != 'wl') | (df_all.fast_wl__use_node_weight_factors == use_node_weighting)) &
    (df_all.type != 'text') &
    (df_all.type != 'dummy') &
    ((df_all.kernel == 'wl') | ((df_all.kernel == 'text'))) &
    (df_all.combined == False) &
    #(df_all.same_label == False) & 
    (df_all.apply(get_right_g_2, axis = 1)) & 
    (df_all.apply(lambda x: get_right_g(x) and (x.type == 'concept_map' or get_right_coo(x)), axis = 1))
].copy()

def get_index(df):
    """Build the readable label of a result row.

    Reads ``df.same_label``, ``df.kernel``, ``df.vectorizer__binary`` and
    ``df.fast_wl__use_node_weight_factors``.

    Returns:
        'structure only' when ``same_label`` is set, otherwise 'combined' for
        the 'wl' kernel and 'content' for any other kernel; ' (binary)' and
        ' (node weights)' are appended when the respective flag is set.
    """
    def is_set(flag):
        # Compare against True instead of relying on truthiness: a missing
        # value stored as NaN is truthy and would otherwise count as "set".
        return flag is not None and bool(flag == True)

    # The kernel check used to be a separate `if`, so its if/else always
    # overwrote 'structure only'.
    if is_set(df.same_label):
        index = 'structure only'
    elif df.kernel == 'wl':
        index = 'combined'
    else:
        index = 'content'
    if is_set(df.vectorizer__binary):
        index += ' (binary)'
    if is_set(df.fast_wl__use_node_weight_factors):
        index += ' (node weights)'
    return index
    
# Attach the readable label. assign() builds a new frame instead of writing
# into df_filtered in place, which avoids pandas' SettingWithCopyWarning when
# df_filtered is a slice of df_all.
df_filtered = df_filtered.assign(_label = df_filtered.apply(get_index, axis = 1))
#df_filtered['_label'] = pd.Categorical(df_filtered._label, ['content only (binary)', 'content only (frequencies)', 'structure only', 'combined'])
#, 'fast_wl__use_node_weight_factors'
groups = ['dataset', 'type', '_label', 'vectorizer__binary']

#groups = ['type', 'dataset', '_label']
# Best F1 score per group.
df__ = df_filtered.groupby(groups).mean_test_f1_macro.max().to_frame()
#print(df__.unstack().rename(columns = {'mean_test_f1_macro': 'f1 macro', 'dataset': 'Dataset'}).to_latex())
#display(df_filtered.fast_wl__use_node_weight_factors.value_counts())
display(df_filtered[df_filtered.dataset == 'ng20'].groupby(['type', 'vectorizer__binary']).mean_test_f1_macro.max())

# One 2x2 figure per graph type with the F1 score (and its std as error bar)
# of every label, one subplot per dataset.
for (graph_type, df_) in df_filtered.groupby('type'):
    fig, axes = plt.subplots(nrows = 2, ncols=2, figsize = EXPORT_FIG_SIZE, sharey=True)
    for ax, (dataset, df__) in zip(axes.flatten(), df_.groupby('dataset')):
        title = 'Dataset: {}'.format(dataset)
        df_plot = df__.set_index('_label').sort_index()
        # Take the error bars from the same re-indexed, sorted frame and pass
        # them positionally. A Series carrying the original row index is
        # reindexed by pandas against the '_label' index, which turns the
        # error bars into NaN.
        df_plot.mean_test_f1_macro.plot(kind = 'barh', ax = ax, xerr = df_plot.std_test_f1_macro.values, title = title)
        ax.set_ylabel('')
        ax.set_xlabel('f1 macro')
        
    suptitle = 'graph={}, wl_iteration={}'.format(graph_type, 'stacked' if wl_iteration == -1 else wl_iteration)
    
    suptitle += ' text_vectorizer={}, ngram={}'.format(text_vectorizer, n_gram_range)
    if graph_type == 'concept_map':
        suptitle += ', concept_map_use_edges={}'.format(use_edges)
    else:
        suptitle += ', window_size={}'.format(window_size)
        
    fig.suptitle(suptitle)
    fig.tight_layout()
    fig.subplots_adjust(top = 0.85)
    fig.savefig('tmp/results_kernel_{}.png'.format(graph_type))

In [None]:
# Summary statistics of the mean train F1 (macro) of concept-map results,
# per dataset and combined flag.
(
    df_all[df_all['type'] == 'concept_map']
    .groupby(['dataset', 'combined'])['mean_train_f1_macro']
    .describe()
)

## Results for concept vs. cooccurrence

In [None]:
fig, axes = plt.subplots(nrows = 2, ncols = 2, sharey=True)

# Non-combined rows of df_graphs: every concept map plus the co-occurrence
# graphs built with the configured window size over all words.
df_graph_comparison = df_graphs[(df_graphs.combined == False) & ((df_graphs.type == 'concept_map') | ((df_graphs.window_size == window_size) & (df_graphs.words == 'all')))]

for ax, (dataset, df_) in zip(axes.flatten(), df_graph_comparison.groupby('dataset')):
    df_ = df_.rename(columns=RENAME_COLS_MAPPING)
    df_.set_index('type').sort_index()[METRICS_COLUMNS].plot(kind = 'barh', ax = ax, title = 'Dataset: {}'.format(dataset), legend = False)
    ax.set_xlabel('f1 macro')

# A single legend (on the second subplot) serves the whole figure.
axes.flatten()[1].legend()
fig.suptitle('Concept vs. Co-Occurrence\nkernel={}, co-occurrence window_size={}'.format(kernel, window_size))
fig.tight_layout()
fig.subplots_adjust(top=0.88)
fig.subplots_adjust(wspace = 0.2, hspace = 0.5)
# The file name has no placeholder; the former `.format(graph_type)` only made
# this cell depend on a loop variable leaked from another cell.
fig.savefig('tmp/result_graph_comparison.png')


## Best classifiers per type per dataset

In [None]:
def plot_best_by_type(df_all, df, df_dataset, title = '', fontsize = 12, figsize = (6, 3), top = 0.85):
    """Plot the best result (by mean test F1 macro) of every group as horizontal bars.

    Args:
        df_all: frame the best rows are looked up in; its index must contain
            the labels of the grouped rows.
        df: groupby object; the row with the highest 'mean_test_f1_macro' of
            each group is plotted.
        df_dataset: not used by this function.
        title: optional figure title; falsy values suppress it.
        fontsize: font size of the title.
        figsize: figure size passed to ``plt.subplots``.
        top: top margin applied when a title is shown.

    Returns:
        The ``(fig, ax)`` pair of the created plot.
    """
    # idxmax() returns index *labels*, so the rows have to be fetched with
    # .loc; .iloc would treat the labels as positions and pick wrong rows
    # whenever the index is not 0..n-1. Groups without a valid score give NaN
    # and are skipped.
    best_labels = df['mean_test_f1_macro'].idxmax().dropna()
    els = df_all.loc[best_labels]
    els = els.set_index('type')
    els = els.rename(columns = RENAME_COLS_MAPPING)
    
    # Plot
    fig, ax = plt.subplots(figsize = figsize)
    
    # Error bars of two standard deviations, in the same order as METRICS_COLUMNS.
    std_errs = [els.std_test_f1_macro * 2,  els.std_test_accuracy * 2,  els.std_test_precision_macro * 2,  els.std_test_recall_macro * 2]

    # The x-axis extends to 1.5 while the ticks only cover 0..1.
    els[METRICS_COLUMNS].plot(kind = 'barh', ax = ax, xlim = (0, 1.5), xerr=std_errs)
    ax.set_xticks(np.linspace(0, 1, 11))
    
    ax.grid(axis = 'y')
    
    display(els[[x for x in els.columns.tolist() if x not in UNINTERESTING_COLUMNS]])
    
    if title:
        fig.suptitle(title, fontsize = fontsize)

    fig.tight_layout()
    
    if title:
        # Make room for the title after tight_layout().
        fig.subplots_adjust(top = top)

    return fig, ax

# Non-combined results without the 0th WL iteration, one figure per dataset
# (datasets in alphabetical order).
use_title = False
df_non_combined = df_all[(df_all.phi_picker__return_iteration != 0) & (df_all.combined == False)]
for name, df_dataset in sorted(df_non_combined.groupby('dataset'), key = lambda group: group[0]):
    df_dataset_grouped_by_type = df_dataset.groupby('type')
    print('################# {}'.format(name))
    title = 'Dataset: {}'.format(name) if use_title else None
    fig, ax = plot_best_by_type(df_all, df_dataset_grouped_by_type, df_dataset, title)
    fig.savefig('tmp/results/dataset-{}-best.png'.format(name), dpi = 150)
    plt.show()
    plt.close(fig)

## Results with/without labels

In [None]:
# Plotted metric columns and the matching std columns, in the same order:
# list-like error bars are applied positionally, one entry per plotted column.
metric_columns = ['accuracy', 'f1', 'precision', 'recall']
std_columns = ['std_test_accuracy', 'std_test_f1_macro', 'std_test_precision_macro', 'std_test_recall_macro']

# NOTE(review): this cell filters on `wl_iteration` and
# `param_feature_extraction__fast_wl__round_to_decimals`, while other cells use
# `phi_picker__return_iteration` and `fast_wl__round_to_decimals` - confirm
# that these columns exist in df_all.
for graph_type, df_ in df_all[
    (df_all.combined == False) &
    (df_all.classifier == 'LinearSVC') &
    (df_all.kernel == 'wl') &
    (df_all.wl_iteration == wl_iteration) &
    (df_all.param_feature_extraction__fast_wl__round_to_decimals == to_decimals)
].groupby('type'):
    fig, axes = plt.subplots(nrows=2, ncols=2, sharey=True, figsize = EXPORT_FIG_SIZE_BIG)
    for (dataset, df), ax in zip(df_.groupby('dataset'), axes.flatten()):
        if graph_type == 'cooccurrence':
            df = df[(df.window_size == window_size) & (df.words == used_words)]
        # Sort before extracting the error bars so that rows and errors line up.
        df = df.set_index('same_label').sort_index()
        std_errs = [df[std_column].values for std_column in std_columns]

        df = df.rename(columns = RENAME_COLS_MAPPING)
        df[metric_columns].plot(kind = 'barh', ax = ax, legend = False, xerr = std_errs)
        ax.set_title('Dataset: {}'.format(dataset))
        ax.set_ylabel('Ignoring labels')
    axes.flatten()[0].legend(loc="upper right")
    title = 'graph={}'.format(graph_type)
    if graph_type == 'cooccurrence':
        title += ' window_size={}'.format(window_size)

    fig.suptitle(title)
    fig.tight_layout()
    fig.subplots_adjust(top=0.85)
    # Save before showing so the file is written regardless of how the backend handles show().
    fig.savefig('tmp/classification_same_label_{}.png'.format(graph_type))
    plt.show()
    plt.close(fig)

## Result distributions

In [None]:
# Violin plots of the F1 distribution per dataset, split by 'type' and by
# 'kernel', for three subsets of the non-combined results.
# The 'all' entry uses a comparison that holds for every row.
for data_filter_name, data_filter in [('only-concept-graphs', df_all.type == 'concept_map'), ('only-coocurrence', df_all.type == 'cooccurrence'), ('all', df_all.type != 'YES')]:
    # Both masks are built on df_all, so combine them before indexing instead
    # of indexing an already filtered frame with a mask of different length.
    for dataset_name, df_dataset in df_all[(df_all.combined == False) & data_filter].groupby('dataset'):
        # Filter out DummyClassifier
        df_dataset = df_dataset[(df_dataset.classifier != 'DummyClassifier')]

        for attr in ['type', 'kernel']:
            # Ignore entries that have only one category
            if df_dataset[attr].nunique() <= 1:
                continue

            f1_max = df_dataset.mean_test_f1_macro.max()
            fig, ax = plt.subplots(figsize = EXPORT_FIG_SIZE)
            df = df_dataset.sort_values(attr)
            sns.violinplot(x = attr, y = 'mean_test_f1_macro', data=df, cut = 0, split = True, inner = 'quartile', ax = ax)
            ax.set_ylim((0, f1_max + 0.1))
            ax.set_ylabel('f1 macro')
            fig.suptitle('Result distribution ({})'.format(data_filter_name));
            ax.set_title('Dataset: {}, Attribute: {}'.format(dataset_name, attr))
            fig.tight_layout()
            fig.subplots_adjust(top = 0.85)
            fig.savefig('tmp/result-distributions/{}-{}-{}.png'.format(dataset_name, data_filter_name, attr), dpi = EXPORT_DPI)
            plt.show()
            plt.close(fig)

## Plot best per parameter value per dataset

In [None]:
def graphs_grouped_by_plot(df_all, groupby):
    """Draw one violin plot of the F1 distribution per dataset, split by ``groupby``.

    Rows of type 'text' are excluded. For every dataset the distribution of
    'mean_test_f1_macro' is drawn per 'type'; ``groupby`` is used as hue when
    it has more than one distinct value in that dataset. Each figure is saved
    to 'tmp/results/label-importance-<dataset>.png', shown and closed.

    Args:
        df_all: result frame with the columns 'type', 'dataset',
            'mean_test_f1_macro' and ``groupby``.
        groupby: name of the column used as hue and as x-axis label.
    """
    df_graphs_grouped = df_all[df_all.type != 'text'].groupby('dataset')

    for dataset_name, df_dataset in df_graphs_grouped:
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize = EXPORT_FIG_SIZE)
        # Print violinplot of f1, with the groupby column as hue (only when it varies)
        hue = groupby if df_dataset[groupby].nunique() > 1 else None
        # `title` and `legend` are no longer passed to violinplot; the title is
        # set on the axes below.
        sns.violinplot(x = 'type', y = 'mean_test_f1_macro', hue = hue, data = df_dataset, cut = 0, split = True, inner = 'quartile', ax = ax)
        ax.set_title('{}'.format(dataset_name))
        ax.set_ylabel('f1')
        ax.set_xlabel(groupby)
        # The string 'off' is truthy and switches the grid ON in current
        # matplotlib; pass the boolean instead.
        ax.grid(False)
        fig.suptitle('')
        fig.tight_layout()
        fig.subplots_adjust(top = 0.86)
        fig.savefig('tmp/results/label-importance-{}.png'.format(dataset_name), dpi = EXPORT_DPI)
        plt.show()
        # Release the figure; one is created per dataset.
        plt.close(fig)

# F1 distributions per dataset, split by the 'combined' flag.
graphs_grouped_by_plot(df_all, groupby='combined')

## Correlation matrix

In [None]:
import functools

def add(acc, item):
    """Add ``item`` onto ``acc`` with ``+=`` and return the result.

    Used below as the reducer that concatenates the per-column value lists.
    For mutable accumulators such as lists, ``+=`` modifies ``acc`` in place.
    """
    acc += item
    return acc

def get_vals_for_col(col):
    """Return the sorted distinct values that column ``col`` takes in the module-level ``df_tmp``."""
    distinct_values = df_tmp[col].value_counts().index.tolist()
    distinct_values.sort()
    return distinct_values

# Wider column set, kept for reference:
# cols = ['combined', 'kernel', 'lemmatized', 'relabeled', 'threshold', 'type', 'window_size', 'wl_iteration', 'words', 'classifier', 'same_label', 'topn']
cols = ['type', 'combined', 'kernel', 'wl_iteration', 'same_label', 'dataset']

df_tmp = df_all[df_all.dataset == 'ling-spam']

# Distinct values per column; every value gets one row and one column of the matrix.
vals = [get_vals_for_col(col) for col in cols]
val_lenghts = [len(vals_) for vals_ in vals]
dim = sum(val_lenghts)
vals_flattened = functools.reduce(add, vals, [])

# best_of_mat[i, j] = best F1 score among the rows having both value i and
# value j (NaN when no such row exists).
best_of_mat = np.zeros((dim, dim), dtype=np.float32)

row_counter = 0
for col1, vals_1 in zip(cols, vals):
    col_counter = 0
    for col2, vals_2 in zip(cols, vals):
        for idx1, val1 in enumerate(vals_1):
            # The mask of the row value does not depend on the inner loop.
            has_val1 = df_tmp[col1] == val1
            for idx2, val2 in enumerate(vals_2):
                best_of = df_tmp[has_val1 & (df_tmp[col2] == val2)]
                best_f1 = best_of.mean_test_f1_macro.max()
                best_of_mat[row_counter + idx1, col_counter + idx2] = best_f1
        col_counter += len(vals_2)
    row_counter += len(vals_1)

In [None]:
def plot(best_of_mat, vals, cols, ax = None, cmap='Blues', divider_color = '#FFFFFF', divider_linewidth = 6, fontdict = {'fontsize': 14, 'weight': 'bold'}):
    """Draw the best-of matrix as a heat map with one labelled block per column.

    Args:
        best_of_mat: square matrix with one row/column per value in ``vals``;
            NaN cells are marked with an 'X'.
        vals: list of value lists, one per entry of ``cols``.
        cols: column names, used as block labels at the top and on the right.
        ax: axes to draw on; a new figure is created when omitted.
        cmap: colour map of the heat map.
        divider_color: colour of the lines separating the blocks.
        divider_linewidth: width of the lines separating the blocks.
        fontdict: font properties of the block labels and NaN markers (only read).
    """
    if ax is None:
        fig, ax = plt.subplots()

    # Everything is derived from the arguments, so the function no longer
    # reads the notebook globals val_lenghts, dim and vals_flattened.
    vals_lengths = [len(val) for val in vals]
    dim = sum(vals_lengths)
    tick_labels = [val for vals_ in vals for val in vals_]

    # Add block dividers and block labels
    for idx, s in enumerate(np.cumsum(vals_lengths)):
        ax.axvline(s - 0.5, color = divider_color, linewidth = divider_linewidth)
        ax.axhline(s - 0.5, color = divider_color, linewidth = divider_linewidth)

        text_offset = vals_lengths[idx] / 2

        # Add the col labels to the right
        ax.text(dim + 0.5, s - text_offset - 0.5, cols[idx], horizontalalignment = 'left', verticalalignment = 'center', fontdict=fontdict)
        # Add the col labels to the top
        ax.text(s - text_offset - 0.2, - 1, cols[idx], horizontalalignment = 'center', verticalalignment = 'center', fontdict=fontdict)

    # Add x- and y-ticks
    ax.set_xticks(range(dim))
    ax.set_xticklabels(tick_labels)
    ax.set_yticks(range(dim))
    ax.set_yticklabels(tick_labels)

    # Rotate x-ticks
    for tick in ax.get_xticklabels():
        tick.set_rotation(90)

    # Mark cells where no values are available. np.where yields (row, column)
    # while text() expects (x, y), i.e. (column, row).
    for row, col in zip(*np.where(np.isnan(best_of_mat))):
        ax.text(col, row, 'X', horizontalalignment = 'center', verticalalignment = 'center', fontdict=fontdict)

    # 'off' is a truthy string that enables the grid in current matplotlib.
    ax.grid(False)
    image = ax.imshow(best_of_mat, cmap=cmap)
    ax.figure.colorbar(image, ax = ax, fraction=0.04, pad=0.2)

# Render the full best-of matrix on a large canvas and export it.
# (Alternative: plot only the lower triangle via np.tril(best_of_mat).)
fig, ax = plt.subplots(figsize=(30, 30))
plot(best_of_mat, vals, cols, ax=ax)
fig.tight_layout()
fig.savefig('tmp/correlations.png', dpi=EXPORT_DPI)

## Plot performance per dataset and wl_iteration and graph_type

In [None]:
def add_gap_to_violin_plot(ax, delta = 0.03):
    """Shift the violin polygons on ``ax`` horizontally by ``delta``.

    Every PolyCollection on the axes is moved along the x-axis: to the right
    when its position in ``ax.collections`` is odd, to the left when it is
    even. Positions are counted over all collections of the axes, not just
    the polygons.

    Args:
        ax: axes holding the violin plot.
        delta: horizontal offset applied to each polygon (in either direction).
            It used to be overwritten by a hard-coded 0.03 inside the function.
    """
    # Import the class directly; `import matplotlib` alone does not guarantee
    # that the `collections` submodule is loaded.
    from matplotlib.collections import PolyCollection

    for position, collection in enumerate(ax.collections):
        # axis contains PolyCollections and PathCollections
        if not isinstance(collection, PolyCollection):
            continue
        # Expects exactly one path per polygon collection.
        path, = collection.get_paths()
        vertices = path.vertices

        if position % 2: # -> to right
            vertices[:, 0] += delta
        else: # -> to left
            vertices[:, 0] -= delta

is_combined = df_all.combined == True

# Non-combined, non-lemmatized graph results, ordered by WL iteration.
# The loop variable is named df_dataset (not df_tmp) so that it does not
# overwrite the global df_tmp that get_vals_for_col reads.
for dataset, df_dataset in df_all[(df_all.type != 'text') & (df_all.lemmatized != True) & (is_combined == False)].sort_values('wl_iteration').groupby('dataset'):
    # The figure size belongs to the figure; violinplot has no figsize option.
    fig, ax = plt.subplots(figsize = EXPORT_FIG_SIZE)
    inner = 'quartile'
    # NOTE(review): cut = True acts as cut = 1, while the other violin plots use cut = 0 - confirm.
    sns.violinplot(x = 'wl_iteration', y = 'mean_test_f1_macro', hue = 'type', split = True, data = df_dataset, cut = True, inner = inner, ax = ax)
    
    add_gap_to_violin_plot(ax)
    
    ax.set_ylabel('f1')
    ax.set_title(dataset)
    fig.tight_layout()
    plt.show()
    plt.close(fig)

## Plot by parameter

In [None]:
def plot_distributions(df, df_all, title = None, figsize = (10, 8)):
    """Plot, for four parameters, the best result per parameter value.

    For each of 'wl_iteration', 'window_size', 'words' and 'type' the rows of
    ``df`` are grouped by that parameter; the row with the highest
    'mean_test_f1_macro' of every group is plotted with its f1, accuracy,
    precision and recall on one subplot of a 2x2 grid.

    Args:
        df: frame that is grouped.
        df_all: frame the best rows are looked up in; its index must contain
            the labels of ``df``.
        title: figure title.
        figsize: figure size passed to ``plt.subplots``.

    Returns:
        ``(fig, axes)`` where ``axes`` is the flat list of the four subplots.
    """
    fig, axes_indexed = plt.subplots(nrows = 2, ncols=2, figsize = figsize)

    axes = list(axes_indexed.flatten())
    #, 'relabeled'
    for val, ax in zip(['wl_iteration', 'window_size', 'words', 'type'], axes):
        grouped = df.groupby(val)
        if len(grouped.size()) == 0:
            continue
        # idxmax() returns index *labels*, so look the rows up with .loc;
        # .iloc would treat the labels as positions. Groups without a valid
        # score give NaN and are skipped.
        best_labels = grouped['mean_test_f1_macro'].idxmax().dropna()
        els = df_all.loc[best_labels]
        els = els.set_index(val)
        els = els.rename(columns = RENAME_COLS_MAPPING)
        els[['f1', 'accuracy', 'precision', 'recall']].plot(kind = 'barh', ax = ax)
        ax.set_xticks(np.linspace(0, 1, 11))
        ax.grid(axis = 'y')
        # The x-axis extends to 1.5 while the ticks only cover 0..1.
        ax.set_xlim((0, 1.5))
    
    fig.suptitle(title, size = 18)
    fig.tight_layout()
    fig.subplots_adjust(top=0.90)
    return fig, axes
    
dpi = 150

# Overview over all datasets.
fig, _ = plot_distributions(df_all, df_all, title='Mean over all datasets')
fig.savefig('tmp/results/all.png', dpi=dpi)
plt.show()
plt.close(fig)

# One figure per dataset that has at least three different result types.
for name, df_dataset in df_all.groupby('dataset'):
    if df_dataset.type.nunique() >= 3:
        fig, _ = plot_distributions(df_dataset, df_all, title='Dataset: {}'.format(name))
        fig.savefig('tmp/results/dataset-{}.png'.format(name), dpi=dpi)
        plt.show()
        plt.close(fig)