# Experiment: _Multi-label splitting_

The node labels of concept maps often (~80% of the time) consist of more than one word. This experiment tests the effect of splitting these multi-word labels into single words before the graphs are classified.

In [None]:
from notebook_prelude import *

In [None]:
# NOTE(review): presumably the placeholder string for missing values -- it is
# not referenced anywhere else in the visible cells; confirm before removing.
NA_VAL = '-'
EXPERIMENT_NAME = 'experiment_split_multi_words'

# Every run of this experiment shares the same name prefix and differs only in
# its suffix.
experiment_names = [
    EXPERIMENT_NAME + suffix
    for suffix in (
        '_with_splitter',
        '_without_splitter',
        '_with_splitter_and_lemmatizer',
    )
]
df = results_helper.get_experiments_by_names(experiment_names, fetch_predictions=True)

print('# Results: {}'.format(len(df)))

In [None]:
# Replace every entry of the lemmatizer/stemmer column by the name of its
# Python type, so the column holds plain strings that can be grouped on.
lemmatizer_column = 'graph__preprocessing__lemmatizer_or_stemmer'
df[lemmatizer_column] = df[lemmatizer_column].apply(lambda value: type(value).__name__)

In [None]:
# `attr` is otherwise first assigned in a later cell, so running the notebook
# top to bottom on a fresh kernel raised a NameError here. Define it in this
# cell with the same value the later cell ends up using.
attr = 'prediction_score_f1_macro'

# Best score per (dataset, lemmatizer/stemmer, preprocessing) combination; the
# two unstack() calls move the last two group keys into the columns.
# Append `.dropna(axis=1)` to drop columns that contain missing values.
(
    df.groupby(['dataset', 'graph__preprocessing__lemmatizer_or_stemmer', 'graph__preprocessing'])[attr]
    .max()
    .to_frame()
    .unstack()
    .unstack()
)

In [None]:
# Score column used for the comparison. The second assignment wins; the first
# one is kept so the metric can be switched by swapping the two lines.
attr = 'mean_test_f1_macro'
attr = 'prediction_score_f1_macro'


def get_max_grouped_by(df, groupby, attr=attr):
    """Return the per-group maximum of ``attr`` and the rows that attain it.

    Args:
        df: DataFrame holding the ``groupby`` column(s) and the ``attr`` column.
        groupby: Column name or list of column names to group on.
        attr: Name of the column to maximise. The default is the value the
            module-level ``attr`` had when this function was defined.

    Returns:
        A 2-tuple ``(best_scores, best_rows)``: ``best_scores`` is the
        per-group maximum, converted to a frame and unstacked; ``best_rows``
        is a Series mapping each group to the index label of the row in
        ``df`` that holds the maximum.
    """
    scores_by_group = df.groupby(groupby)[attr]
    best_scores = scores_by_group.max().to_frame().unstack()
    best_rows = scores_by_group.idxmax()
    return best_scores, best_rows

# Compare the best score per dataset with and without label splitting,
# restricted to the concept-map results.
df_ = df[df.type == 'concept_map']
groupby = ['dataset', 'graph__preprocessing']

df_, df__ = get_max_grouped_by(df_, groupby)
df_.columns = df_.columns.droplevel().values
df_ = df_.rename(columns={'-': 'not_split', 'GraphMultiWordLabelSplitter': 'split'})
df_['difference'] = df_['split'] - df_['not_split']

# Index labels (into `df`) of the best-scoring row per dataset and variant.
df__ = df__.to_frame().unstack()

# Pick the index columns by label, not by position: unstack() sorts the column
# labels, so '-' (not split) comes before 'GraphMultiWordLabelSplitter'.
# Taking column 0 as "split" and column 1 as "not split" swapped the two.
best_row_idx = df__.copy()
best_row_idx.columns = best_row_idx.columns.droplevel().values
df_['split_idx'] = best_row_idx['GraphMultiWordLabelSplitter'].values
df_['not_split_idx'] = best_row_idx['-'].values

# Attach the std, cross-validation and prediction scores of the best rows.
for idx_column in ['split_idx', 'not_split_idx']:
    best_rows = df.loc[df_[idx_column]]
    df_[idx_column + '_std'] = best_rows.std_test_f1_macro.values
    # 'split_idx' -> 'split', 'not_split_idx' -> 'not_split'
    name = idx_column.rsplit('_', 1)[0]
    df_[name + '_cv_f1_macro'] = best_rows.mean_test_f1_macro.values
    df_[name + '_pred_f1_macro'] = best_rows.prediction_score_f1_macro.values

df_[['not_split', 'split', 'difference']]

In [None]:
# Best score per dataset without and with label splitting, plus their
# difference (split minus not split).
df_.loc[:, ['not_split', 'split', 'difference']]

In [None]:
# Print the two score columns as a LaTeX table and also show them in the
# notebook.
a = df_.loc[:, ['not_split', 'split']]
latex_table = a.to_latex()
print(latex_table)
a

## Statistics about multi-word node labels

In [None]:
# Map each dataset that has concept maps to the result of
# graph_helper.get_all_node_labels_uniq() on its graphs.
labels = dict()
concept_map_datasets = dataset_helper.get_dataset_names_with_concept_map()
for dataset in log_progress_nb(concept_map_datasets):
    X, Y = dataset_helper.get_concept_map_for_dataset(dataset)
    # Only the graphs are needed here; Y is not used.
    X = graph_helper.get_graphs_only(X)
    labels[dataset] = graph_helper.get_all_node_labels_uniq(X)

In [None]:
# One Counter per dataset with the number of single-word and multi-word node
# labels; a label counts as multi-word when whitespace splitting yields more
# than one token.
data = []
for dataset, labels_ in labels.items():
    counts = collections.Counter(
        'multi_word' if len(label.split()) > 1 else 'single_word'
        for label in labels_
    )
    # Store the dataset name next to the counts; it becomes the index later.
    counts['dataset'] = dataset
    data.append(counts)


In [None]:
# Turn the per-dataset counts into fractions of single- and multi-word labels.
# Note that this rebinds `df`, which held the experiment results before.
df = pd.DataFrame(data).set_index('dataset').sort_index(ascending=False)
total_labels = df.single_word + df.multi_word
df['Single'] = df.single_word / total_labels
df['Multi'] = df.multi_word / total_labels

# Stacked horizontal bars in percent, one bar per dataset.
fig, ax = plt.subplots(figsize=(8, 2.8))
percentages = df[['Single', 'Multi']] * 100
percentages.plot(kind='barh', stacked=True, ax=ax)
# The x-axis runs past 100 so that empty space is left on the right, where
# the legend is placed.
ax.set(xlim=(0, 113), xlabel='%', ylabel='')
ax.grid(False)
ax.legend(loc='upper right')
fig.tight_layout()
save_fig(fig, 'statistics_percentage_multi_word_labels')