# Experiment: _Edge label importance_

In [None]:
from notebook_prelude import *

In [None]:
EXPERIMENT_NAME = 'experiment_edge_labels'

# Fetch the base experiment together with its '_without_edge' and
# '_with_edge' variants; predictions are requested as well because the
# significance-test cell reads each run's `prediction_file`.
df = results_helper.get_experiments_by_names(
    [EXPERIMENT_NAME + suffix for suffix in ('', '_without_edge', '_with_edge')],
    fetch_predictions=True,
)

In [None]:
# Table of the best macro-F1 score per dataset, with and without edge labels.
attr = 'prediction_score_f1_macro'
groupby = ['dataset', 'graph__graph_to_text__use_edges']

# Highest score per (dataset, use_edges) pair, pivoted so that every dataset
# is one row and the two use_edges values become the columns.
df_ = (
    df.groupby(groupby)[attr]
    .max()
    .to_frame()
    .unstack()
    .sort_index()
)
df_.columns = df_.columns.droplevel()

# NOTE: `confidences` is filled by the significance-test cell further down in
# this notebook, so that cell has to be executed before this one.
# NOTE(review): the column is labelled 'p-value' but holds the `confidence`
# values returned by results_helper.calculate_significance - confirm the label.
df_['p-value'] = confidences

df_ = df_.rename(columns={True: 'With', False: 'Without'})
# Replace the named column index with a plain one.
df_.columns = df_.columns.values

print(df_.to_latex(float_format='%.3f'))
df_

In [None]:
from utils import significance_test_utils

# For every dataset, take the best-scoring run with edge labels and the
# best-scoring run without them, and pass their prediction files to
# results_helper.calculate_significance. The per-dataset `confidence` values
# are collected in `confidences` (datasets in groupby order) for the score
# table cell.
best = df.loc[df.groupby(groupby)[attr].idxmax()]

print('Score A: With edges')
print('Score B: Without edges')
print()

confidences = []
for dataset, runs in best.groupby('dataset'):
    # Exactly one best run per edge setting is expected for each dataset.
    assert len(runs) == 2
    print(dataset)

    # Order matters: the run with edges is "A", the run without edges is "B".
    file_with_edges, file_without_edges = (
        runs.loc[runs.graph__graph_to_text__use_edges == use_edges].iloc[0].prediction_file
        for use_edges in (True, False)
    )

    _diffs, score_a, score_b, global_difference, confidence = results_helper.calculate_significance(
        file_with_edges, file_without_edges
    )

    report = (
        ('Score A', score_a),
        ('Score B', score_b),
        ('Difference', global_difference),
        ('Confidence', confidence),
    )
    for label, value in report:
        print('\t{:20} {:9.4f}'.format(label, value))
    print()
    confidences.append(confidence)

## Statistics about edge label occurrences

In [None]:
from itertools import chain

# Collect, per dataset that has concept maps, the flat list of the 'name'
# attribute of every edge across all of its graphs.
data = collections.defaultdict(list)
for dataset in helper.log_progress(dataset_helper.get_dataset_names_with_concept_map()):
    # Only the graphs are needed here; the second return value is not used.
    X, _ = dataset_helper.get_concept_map_for_dataset(dataset, graphs_only=True)
    # The edge attribute dict is called `edge_attrs` (not `data`) so that it
    # does not shadow the `data` accumulator defined above.
    flat_edge_labels = list(chain.from_iterable(
        (edge_attrs['name'] for _source, _target, edge_attrs in graph.edges(data=True))
        for graph in X
    ))
    data['dataset'].append(dataset)
    data['edge_labels'].append(flat_edge_labels)

In [None]:
# One row per dataset (from the `data` dict built in the previous cell),
# indexed by the dataset name.
df = pd.DataFrame(data)
df = df.set_index('dataset')

# Default number of labels treated as "most common".
top_n = 50

def get_most_common_labels(c, top=top_n):
    """Return the `top` most frequent labels of `c`, least frequent first.

    Args:
        c: Mapping from label to its number of occurrences
            (e.g. a `collections.Counter`).
        top: Maximum number of labels to return. Values <= 0 yield an
            empty list.

    Returns:
        List of labels ordered by ascending occurrence count; labels with
        equal counts keep the iteration order of `c`.
    """
    # Guard needed because `seq[-0:]` is the whole sequence, not an empty one,
    # and a negative `top` would slice from the front instead.
    if top <= 0:
        return []
    sorted_label_occs = sorted(c.items(), key=lambda item: item[1])
    return [label for label, _ in sorted_label_occs[-top:]]

# Per-dataset statistics derived from the flat `edge_labels` lists.
df['edge_labels_unique'] = df['edge_labels'].apply(set)
df['label_count'] = df['edge_labels'].apply(len)
df['label_counts'] = df['edge_labels'].apply(collections.Counter)
df['label_count_unique'] = df['label_counts'].apply(len)

# Number of distinct labels that occur exactly once in the dataset.
df['label_count_only_once'] = df.apply(
    lambda row: sum(1 for occs in row.label_counts.values() if occs == 1),
    axis=1,
)
# Share of those labels among the distinct labels and among all occurrences.
df['label_ratio_only_once'] = df['label_count_only_once'] / df['label_count_unique']
df['label_ratio_only_once_all'] = df['label_count_only_once'] / df['label_count']

# Occurrences covered by the most common labels, relative to all occurrences
# of labels that appear more than once.
df['most_common_labels'] = df['label_counts'].apply(get_most_common_labels)
df['sum_most_common_labels'] = df.apply(
    lambda row: sum(row.label_counts[label] for label in row.most_common_labels),
    axis=1,
)
df['ratio_most_common_labels'] = df['sum_most_common_labels'] / (
    df['label_count'] - df['label_count_only_once']
)

### Percentage of edge labels occurring only once

In [None]:
# Both "occurs only once" ratios as percentages, one row per dataset
# (sorted by name), followed by a row holding the column means.
df_ = df.sort_index()[['label_ratio_only_once', 'label_ratio_only_once_all']] * 100
df_.loc['mean'] = df_.mean()

print(df_.to_latex(float_format='%.0f'))
df_

### Cumulative edge label occurrences

In [None]:
# Plot the cumulative number of label occurrences for the 'ng20' dataset,
# with the labels ordered from least to most frequent.
df_ = df.loc['ng20']
cum_sum = pd.Series(sorted(df_.label_counts.values())).cumsum()
x = np.arange(len(cum_sum))
y = cum_sum
fig, ax = plt.subplots(figsize=(11, 4))
ax.plot(x, y)

# The counts are sorted ascending, so the labels occurring exactly once come
# first; the red line marks the position where they end.
only_once_labels = df_.label_count_only_once
ax.axvline(only_once_labels, alpha=0.4, color='red')

# Relabel both axes as percentages (0% .. 100% in steps of 10) of their
# maximum. The loop variable is deliberately not called `attr`, so it does not
# overwrite the metric name `attr` used by the score cells of this notebook.
tick_labels = ['{}%'.format(i * 10) for i in range(11)]
for axis_name, axis_max in [('x', max(x)), ('y', max(y))]:
    ticks = np.linspace(0, axis_max, 11)
    getattr(ax, 'set_{}ticks'.format(axis_name))(ticks)
    getattr(ax, 'set_{}ticklabels'.format(axis_name))(tick_labels)

ax.set_ylabel('Cumulative label count')
ax.set_xlabel('Labels')
fig.tight_layout()