# Experiment: _Remove infrequent labels_

Some node labels occur only once in the dataset. This experiment tests whether they can be removed safely.

Note that a label can occur only once in the train set but **multiple** times in the test set! Such labels would not get matched.

In [None]:
from notebook_prelude import *

In [None]:
# Imported locally so the cell does not depend on what notebook_prelude exports.
import pandas as pd

# Placeholder written into missing (NaN) cells of the results frame.
# Presumably runs without graph preprocessing end up with this value in
# `graph_preprocessing` -- a later cell looks the baseline up under it.
NA_VAL = '-'
EXPERIMENT_NAME = 'experiment_remove_infrequent_nodelabels'

# Results of this experiment and the general results, both with predictions.
df = results_helper.get_experiments_by_names([EXPERIMENT_NAME], fetch_predictions=True)
df_ = results_helper.get_results(filter_out_non_complete_datasets=None, fetch_predictions=True)

# NOTE(review): whether the predicate marks datasets to keep or to drop is
# decided by results_helper.filter_out_datasets -- confirm against the helper.
df = results_helper.filter_out_datasets(df, lambda x: 'RemoveInfrequentGraphLabels' in x.graph_preprocessing.values)

# Add the non-combined general results. pd.concat replaces DataFrame.append,
# which was deprecated in pandas 1.4 and removed in 2.0.
df = pd.concat([df, df_[df_.combined == False]]).fillna(NA_VAL).reset_index()

# `== False` (rather than `~`) is deliberate: after fillna the column may hold
# the placeholder string and therefore be of object dtype.
df = df[(df.combined == False) & (df.kernel != 'unknown')]

# Keep concept-map results only. The frame was already NaN-free after the
# fillna above, so this second fillna is not expected to change any value.
df = df[df.type == TYPE_CONCEPT_MAP].fillna('(na)')

## Results when removing single occurrence labels from the dataset

In [None]:
# Show which columns are available in the filtered results frame.
df.columns

In [None]:
# Macro-F1 scores grouped by dataset and graph-preprocessing variant.
attr = 'prediction_score_f1_macro'
groupby = ['dataset', 'graph_preprocessing']
grouped = df.groupby(groupby)[attr]

# Best score per (dataset, graph_preprocessing), pivoted so that each dataset
# is a row and each preprocessing variant is a column.
df_ = grouped.max().unstack()

In [None]:
# For every dataset, compare the best-scoring run without graph preprocessing
# against the best-scoring run with RemoveInfrequentGraphLabels.
# `confidences` gets exactly one entry per dataset, in the iteration order of
# best.groupby('dataset'); NaN marks datasets that could not be compared.
variant_names = [NA_VAL, 'RemoveInfrequentGraphLabels']

# One row per (dataset, graph_preprocessing): the row with the highest score.
best = df.loc[grouped.idxmax()]
confidences = []
for dataset, df__ in best.groupby('dataset'):
    print(dataset)
    # Select each variant's row from the group itself (not via a mask built
    # from the full `best` frame, which only worked through index alignment).
    variant_rows = [df__[df__.graph_preprocessing == name] for name in variant_names]
    # Skip unless the group holds exactly the two expected variants; a group
    # of two rows with some other preprocessing would otherwise raise an
    # IndexError on .iloc[0].
    if len(df__) != 2 or any(rows.empty for rows in variant_rows):
        confidences.append(np.nan)
        print('\tNot enough data. Skipping')
        continue
    prediction_filenames = [rows.iloc[0].prediction_file for rows in variant_rows]
    # A = no preprocessing, B = RemoveInfrequentGraphLabels.
    # NOTE(review): the exact meaning of the returned values is defined by
    # results_helper.calculate_significance -- confirm there.
    diffs, score_a, score_b, global_difference, confidence = results_helper.calculate_significance(prediction_filenames[0], prediction_filenames[1])

    for k, v in [('Score A', score_a), ('Score B', score_b), ('Difference', global_difference), ('Confidence', confidence)]:
        print('\t{:20} {:9.4f}'.format(k, v))
    print()
    confidences.append(confidence)

In [None]:
# Attach the confidence values as a new column. The assignment is positional,
# so `confidences` must hold one entry per row of df_, in row order.
df_['confidence'] = confidences

# Print the table as LaTeX, then display it as the cell's output.
latex_table = df_.to_latex(float_format="%.4f")
print(latex_table)
df_