In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import sys

In [30]:
# Load the pickled softmax results. The context manager closes the file
# handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("german_softmax_folds.pkl","rb") as pickle_in:
    results_german = pickle.load(pickle_in)

In [31]:
def no_label_predicted(predictions, thres):
    """Locate the rows in which every score is strictly below the threshold.

    Args:
        predictions: 2-D collection of scores, one row per sample.
        thres: value every score in a row must stay strictly below.

    Returns:
        The tuple produced by ``np.where``; its first element holds the
        indices of the rows whose scores are all below ``thres``.
    """
    below_threshold = predictions < thres
    row_has_no_label = below_threshold.all(axis=1)
    return np.where(row_has_no_label)

In [32]:
# Read the gold labels from dataset/german/dev.tsv: the file has no header
# row, and only the columns from position 3 onwards are kept as labels.
gold_labels = pd.read_csv(
    "dataset/german/dev.tsv",
    sep='\t',
    header=None,
).iloc[:, 3:]
# Name the nine label columns '1' .. '9'.
gold_labels.columns = [str(label_id) for label_id in range(1, 10)]

In [36]:
# Score every fold's predictions against the gold labels; the metrics of
# fold i are stored in metrics_folds[i].
metrics_folds = {}
size = len(gold_labels)

# Single threshold shared by the binarisation and the "no label" fallback so
# the two can never drift apart.
decision_threshold = 0.5

# The values below depend only on the gold labels, so they are computed once
# instead of once per fold.
# frames which never occurred in the gold labels
never_occurred = np.where(gold_labels.sum(axis=0)==0)[0]
per_class_labels = gold_labels.sum(axis=0)
total_labels = per_class_labels.sum()
# rows whose gold labels contain more than one frame
multiple_frame_articles_bool = np.sum(gold_labels, axis=1) > 1.0

for i in range(5):
    predictions = results_german[i]
    binary_predictions = predictions > decision_threshold
    # rows whose scores are all below the threshold get their highest-scoring
    # frame switched on
    no_label_indices = no_label_predicted(predictions, decision_threshold)
    no_label_max_pred_indices = predictions[no_label_indices].argmax(1)
    binary_predictions[no_label_indices, no_label_max_pred_indices] = True

    exact_match = metrics.accuracy_score(gold_labels, binary_predictions)

    # frames which were never predicted in this fold
    never_predicted = np.where(binary_predictions.sum(axis=0)==0)[0]
    print("never_predicted: ", never_predicted)
    print("never_occurred: ", never_occurred)

    # keep only the frames which both occurred and were predicted
    label_indices = np.setdiff1d(np.arange(9), np.union1d(never_occurred, never_predicted))
    print("label_indices: ", label_indices)

    gold_labels_selected = gold_labels.iloc[:,label_indices]
    binary_predictions_selected = binary_predictions[:,label_indices]

    hamming = metrics.hamming_loss(gold_labels_selected, binary_predictions_selected)
    f1_macro = metrics.f1_score(gold_labels_selected, binary_predictions_selected, average='macro')
    f1_micro = metrics.f1_score(gold_labels_selected, binary_predictions_selected, average='micro')
    f1_weighted = metrics.f1_score(gold_labels_selected, binary_predictions_selected, average='weighted')
    # NOTE(review): AUC is computed from the thresholded predictions rather
    # than from the raw scores -- confirm this is intended.
    try:
        auc = metrics.roc_auc_score(gold_labels_selected, binary_predictions_selected)
    except ValueError:
        # Only the "AUC is undefined for this input" error is mapped to NaN;
        # any other exception is a real problem and must surface.
        auc = np.nan

    # per-frame fraction of rows where prediction and gold agree, then
    # averaged with each frame weighted by its number of gold labels
    exact_match_frame = np.sum(np.equal(gold_labels, binary_predictions), axis=0)/size
    exact_match_frame_weigted_avg = (exact_match_frame @ per_class_labels)/total_labels

    # compute evaluations about multilabeled frame predictions
    results_multiple = np.equal(gold_labels, binary_predictions) \
                                      .loc[multiple_frame_articles_bool]
    number_multiple = len(results_multiple)
    # guard against a dataset without any multi-frame rows
    if number_multiple:
        match_multiple = np.sum(results_multiple.all(1))/number_multiple
    else:
        match_multiple = np.nan

    metrics_folds[i] = {"hamming":hamming,
                        "f1_macro":f1_macro,
                        "f1_micro":f1_micro,
                        "f1_weighted":f1_weighted,
                        "exact_match":exact_match,
                        "auc":auc,
                        "exact_match_multiple":match_multiple,
                        "number_multiple":number_multiple,
                        "exact_match_frame_weigted_avg":exact_match_frame_weigted_avg}


never_predicted:  [3]
never_occurred:  [3]
label_indices:  [0 1 2 4 5 6 7 8]
never_predicted:  [3]
never_occurred:  [3]
label_indices:  [0 1 2 4 5 6 7 8]
never_predicted:  [3]
never_occurred:  [3]
label_indices:  [0 1 2 4 5 6 7 8]
never_predicted:  []
never_occurred:  [3]
label_indices:  [0 1 2 4 5 6 7 8]
never_predicted:  []
never_occurred:  [3]
label_indices:  [0 1 2 4 5 6 7 8]


In [34]:
def string_for_latex_table(rs):
    """Build one '&'-separated LaTeX table row from the metric values.

    Args:
        rs: pandas object indexable by metric name; it is converted to
            strings with ``astype(str)`` before the cells are joined.

    Returns:
        The cells for f1_macro, f1_weighted, f1_micro, auc, hamming,
        exact_match and exact_match_multiple, in that order, separated
        by '&'.
    """
    fields = ['f1_macro', 'f1_weighted', 'f1_micro', 'auc',
              'hamming', 'exact_match', 'exact_match_multiple']
    cells = rs.astype(str)
    row = ""
    for position, field in enumerate(fields):
        # a separator goes before every cell except the first
        if position:
            row = row + '&'
        row = row + cells[field]
    return row

In [35]:
# One column per fold, one row per metric; average each metric across the
# folds and round to three decimals.
metrics_per_fold = pd.DataFrame(metrics_folds)
rs = metrics_per_fold.mean(axis=1).round(3)
print(rs)
# The last expression is the cell output: the LaTeX table row.
string_for_latex_table(rs)


auc                               0.633
exact_match                       0.233
exact_match_frame_weigted_avg     0.740
exact_match_multiple              0.000
f1_macro                          0.381
f1_micro                          0.509
f1_weighted                       0.480
hamming                           0.157
number_multiple                  68.000
dtype: float64


'0.381&0.48&0.509&0.633&0.157&0.233&0.0'

In [107]:
# Show the number of rows in gold_labels (size was set to len(gold_labels)).
size

121

In [110]:
# Per-class number of instances: column-wise sum of the gold labels.
gold_labels.sum(axis=0)

1    18.0
2    51.0
3    50.0
4     0.0
5     7.0
6     2.0
7    52.0
8     6.0
9     3.0
dtype: float64

In [116]:
# Raise NumPy's summarisation threshold to sys.maxsize so that arrays are
# printed in full instead of being abbreviated.
np.set_printoptions(threshold=sys.maxsize)

In [121]:
# Inspect the first 15 rows and first 5 columns of entry 0 of results_german.
results_german[0][:15,:5]

array([[0.01821288, 0.03231776, 0.4294086 , 0.02877392, 0.3435278 ],
       [0.02573443, 0.40315357, 0.2818933 , 0.01444566, 0.12325426],
       [0.08229267, 0.07209018, 0.7866301 , 0.00757782, 0.00781202],
       [0.03506797, 0.16692503, 0.6126149 , 0.0147429 , 0.06963103],
       [0.09715743, 0.10714628, 0.72458094, 0.00848007, 0.01112198],
       [0.01476821, 0.02296535, 0.7054311 , 0.03435485, 0.12602839],
       [0.10307766, 0.09191813, 0.7213165 , 0.00932924, 0.01232137],
       [0.04723834, 0.29686597, 0.39740515, 0.0249335 , 0.01930334],
       [0.02693352, 0.05048189, 0.245083  , 0.05843274, 0.06404084],
       [0.10617355, 0.660077  , 0.14076185, 0.0132264 , 0.01123958],
       [0.1021709 , 0.05261669, 0.64598745, 0.01410179, 0.02828101],
       [0.02262634, 0.02350668, 0.21556696, 0.01192335, 0.02984072],
       [0.09441933, 0.15392938, 0.63778764, 0.006363  , 0.02980804],
       [0.20921466, 0.19762236, 0.47797894, 0.01169711, 0.0210397 ],
       [0.11817447, 0.59554595, 0.

In [129]:
# Show the gold rows whose labels sum to more than one (multi-label rows).
gold_labels[gold_labels.sum(axis=1)>1]

Unnamed: 0,1,2,3,4,5,6,7,8,9
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
13,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
14,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [130]:
# For the rows whose gold labels sum to more than one, show entry 0 of
# results_german thresholded at 0.5 as 0/1 integers.
(results_german[0][gold_labels.sum(axis=1)>1,:]>0.5).astype(int)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0,

In [None]:
# How many of the samples indeed had no prediction?
# In which cases does the model particularly fail?
# What happens if I include 3rd frames?

