# Experiment: _Multi-label splitting_

The node labels of concept maps often (~80% of the time) consist of more than one word. This experiment tests how splitting these multi-word labels before classifying the graphs affects the classification results.

In [None]:
from notebook_prelude import *

In [None]:
NA_VAL = '-'
EXPERIMENT_NAME = 'experiment_split_multi_words'

# The three experiment variants share EXPERIMENT_NAME as a common prefix.
experiment_suffixes = (
    '_with_splitter',
    '_without_splitter',
    '_with_splitter_and_lemmatizer',
)
df = results_helper.get_experiments_by_names(
    [EXPERIMENT_NAME + suffix for suffix in experiment_suffixes],
    fetch_predictions=True
)


def _as_display_name(value):
    """Return strings unchanged and any other object as its class name."""
    if isinstance(value, str):
        return value
    return type(value).__name__


# Normalise the lemmatizer/stemmer column so that every cell holds a string:
# non-string entries are replaced by the name of their type.
lemmatizer_col = 'graph__preprocessing__lemmatizer_or_stemmer'
df[lemmatizer_col] = df[lemmatizer_col].apply(_as_display_name)

print('# Results: {}'.format(df.shape[0]))

In [None]:
attr = 'prediction_score_f1_macro'

# Best score (maximum of `attr`) per dataset / lemmatizer-or-stemmer /
# preprocessing combination.
group_keys = [
    'dataset',
    'graph__preprocessing__lemmatizer_or_stemmer',
    'graph__preprocessing',
]
best_scores = df.groupby(group_keys)[attr].max().to_frame()

# Unstacking twice moves the two innermost index levels into the columns,
# leaving one row per dataset; columns that are entirely NaN are removed.
grouped = best_scores.unstack().unstack().dropna(axis=1, how='all')

# Drop the outermost column level, which only repeats the name of `attr`.
grouped.columns = grouped.columns.droplevel()
grouped

## Confidences

In [None]:
# Confidence values for `attr`, selecting models by the 'graph__preprocessing'
# column with the two values '-' and 'GraphMultiWordLabelSplitter'.
# NOTE(review): '-' presumably marks runs without graph preprocessing - confirm.
df_confidence = significance_test_utils.get_confidences(
    df,
    performance_attr=attr,
    model_selection_attr='graph__preprocessing',
    model_selection_vals=['-', 'GraphMultiWordLabelSplitter'],
    log_progress=log_progress_nb
)

In [None]:
def _four_decimals(value):
    """Format a number with four digits after the decimal point."""
    return '%.4f' % value


# Attach the 'confidence' column to the score table by joining on the index,
# then print the combined table as LaTeX and display it.
confidence_only = df_confidence[['confidence']]
df_with_conf = grouped.merge(confidence_only, left_index=True, right_index=True)
print(df_with_conf.to_latex(float_format=_four_decimals))
df_with_conf

## Statistics about multi-word node labels

In [None]:
# Map each dataset name to the node labels returned by
# graph_helper.get_all_node_labels_uniq for that dataset's concept-map graphs.
labels = {}
concept_map_datasets = dataset_helper.get_dataset_names_with_concept_map()
for dataset in log_progress_nb(concept_map_datasets):
    # Y is loaded together with X but is not used for the label statistics.
    X, Y = dataset_helper.get_concept_map_for_dataset(dataset)
    graphs = graph_helper.get_graphs_only(X)
    labels[dataset] = graph_helper.get_all_node_labels_uniq(graphs)

In [None]:
# Build one record per dataset with the number of single-word and multi-word
# node labels. A label counts as multi-word when splitting it on whitespace
# yields more than one token.
data = []
for dataset, labels_ in labels.items():
    # Both counters start at 0 so that every record has both keys. Otherwise a
    # dataset lacking one of the two kinds would produce a missing key, which
    # pd.DataFrame turns into NaN and which then propagates into the
    # percentages computed from these counts.
    counts = {'single_word': 0, 'multi_word': 0}
    for label in labels_:
        kind = 'multi_word' if len(label.split()) > 1 else 'single_word'
        counts[kind] += 1
    counts['dataset'] = dataset
    data.append(counts)


In [None]:
# One row per dataset, indexed by dataset name in descending order.
# NOTE(review): descending order presumably makes the horizontal bars read
# alphabetically from top to bottom - confirm against the rendered figure.
df = pd.DataFrame(data).set_index('dataset').sort_index(ascending=False)

# Share of single-word and multi-word labels among all labels of a dataset.
label_total = df['single_word'] + df['multi_word']
for share_col, count_col in (('Single', 'single_word'), ('Multi', 'multi_word')):
    df[share_col] = df[count_col] / label_total
percentages = df[['Single', 'Multi']] * 100

fig, ax = plt.subplots(figsize=(8, 2.8))
percentages.plot(kind='barh', stacked=True, ax=ax)

# The x-axis extends past 100 although the stacked shares sum to 100.
# NOTE(review): the extra width presumably leaves room for the legend.
ax.set_xlim(0, 113)
ax.grid(False)
ax.set_xlabel('%')
ax.set_ylabel('')
ax.legend(loc='upper right')

fig.tight_layout()
save_fig(fig, 'statistics_percentage_multi_word_labels')