# Methods to perform error analysis
# Given the result.txt file created by the official script, extract useful data

In [272]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [402]:
%autoreload
import os
from sys import path
import re
import pandas as pd
#path.append('..')
import numpy as np
from scipy.stats import ttest_rel
# Root directory under which every result folder used below is resolved.
output_path = '/scratch/geeticka/relation-extraction/output/semeval2010/CrossValidation'


def res(path):
    """Return `path` joined onto the module-level `output_path` directory."""
    full_path = os.path.join(output_path, path)
    return full_path

In [403]:
# Folder of the run to analyse; get_file_metrics() looks for a 'result.txt' inside this folder.
result_file_location = res('cnn_72aaf050-50a5-42b1-b7d8-3893b12fd708_2018-12-16dataset_semeval2010-pos_embed_size_25-num_filters_100-filter_sizes_2,3,4,5-keep_prob_0.500000-early_stop_False-patience_100/Fold0')

## Below are the methods that gather the necessary information from the file

In [404]:
def read_confusion_matrix_per_line(cur_line):
    """Parse one confusion-matrix row out of a line of result.txt.

    The line is split on whitespace and the tokens lying between the first
    and the second stand-alone '|' token are converted to floats.

    Returns:
        None when the line contains no '|' character at all; otherwise the
        list of floats found after the first '|' token and before the next
        one (or the end of the line). The list is empty when the line has a
        '|' character but no stand-alone '|' token.

    Raises:
        ValueError: if a token between the pipes is not a valid float.
    """
    if '|' not in cur_line:
        return None

    tokens = cur_line.strip().split()
    try:
        first_value = tokens.index('|') + 1
    except ValueError:
        # A '|' is glued to another token, so nothing counts as "between pipes".
        return []

    try:
        closing_pipe = tokens.index('|', first_value)
    except ValueError:
        # No closing pipe: everything up to the end of the line is collected.
        closing_pipe = len(tokens)

    return [float(token) for token in tokens[first_value:closing_pipe]]

In [405]:
def read_accuracy_per_line(cur_line):
    """Extract the accuracy percentage from an accuracy line of result.txt.

    Only lines starting with 'Accuracy (calculated' are considered. On such a
    line the value captured is the text after '= ' and before the trailing
    '%' (the pattern is greedy, so for '... = 2094/2717 = 77.07%' the captured
    text is '77.07').

    Returns:
        The accuracy as a float, or None when the line is not an accuracy
        line or carries no '= <value>%' part.

    Raises:
        ValueError: if the captured text is not a valid float.
    """
    if not cur_line.startswith('Accuracy (calculated'):
        return None

    match = re.match(r'.*= (.*)%', cur_line)
    if match is None:
        # Prefix present but no percentage on the line: report "no accuracy"
        # instead of failing with AttributeError on None.groups().
        return None
    return float(match.group(1))

In [406]:
def read_precision_recall_f1(cur_line):
    """Parse precision, recall and F1 from a line holding three '= <value>%' parts.

    The caller is expected to invoke this only once the 'Results for the
    individual ...' part of the file has been reached; the function itself
    does not check which section the line belongs to.

    Returns:
        A (precision, recall, f1) tuple of floats, or None when the line does
        not match the three-percentage pattern (the line must end with '%').
    """
    parsed = re.match(r'.*= (.*)%.*= (.*)%.*= (.*)%$', cur_line)
    if parsed is None:
        return None
    return tuple(float(value) for value in parsed.groups())

In [407]:
def get_file_metrics(result_file_location):
    """Collect the official-section metrics from '<result_file_location>/result.txt'.

    Every line before the one starting with the official (9+1)-way header is
    ignored. From that header onwards each line is offered to the confusion
    matrix, accuracy and precision/recall/F1 parsers.

    Returns:
        A 5-tuple (confusion_matrix_official, accuracy, metrics_indiv_relations,
        metrics_micro, metrics_macro) where
        - confusion_matrix_official is a list of rows (lists of floats),
        - accuracy is the last accuracy value parsed, or None if none was found,
        - the three metrics lists hold [precision, recall, f1] entries read
          after the individual-relations, micro and macro headers respectively.
    """
    official_header = '<<< (9+1)-WAY EVALUATION TAKING DIRECTIONALITY INTO ACCOUNT -- OFFICIAL >>>'

    confusion_rows = []
    accuracy = None
    per_relation = []  # one [precision, recall, f1] entry per matching line
    micro = []         # read after the 'Micro-averaged result (excluding Other):' header
    macro = []         # read after the 'MACRO-averaged result (excluding Other):' header

    # Each flag flips to True once and never goes back.
    in_official = seen_indiv = seen_micro = seen_macro = False

    with open(os.path.join(result_file_location, 'result.txt'), 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            in_official = in_official or line.startswith(official_header)
            if not in_official:
                continue

            row = read_confusion_matrix_per_line(line)
            if row is not None:
                confusion_rows.append(row)

            parsed_accuracy = read_accuracy_per_line(line)
            if parsed_accuracy is not None:
                accuracy = parsed_accuracy

            # Work out which sub-section of the official portion starts here.
            if line.startswith('Results for the individual relations:'):
                seen_indiv = True
            elif line.startswith('Micro-averaged result (excluding Other):'):
                seen_micro = True
            elif line.startswith('MACRO-averaged result (excluding Other):'):
                seen_macro = True

            # Pick the list that the current line's metrics (if any) belong to.
            if seen_indiv and not seen_micro:
                target = per_relation
            elif seen_micro and not seen_macro:
                target = micro
            elif seen_macro:
                target = macro
            else:
                continue

            prf = read_precision_recall_f1(line)
            if prf is not None:
                target.append([prf[0], prf[1], prf[2]])

    return confusion_rows, accuracy, per_relation, micro, macro

## Get the file metrics

In [408]:
# Parse result.txt of the selected run into its raw metric pieces.
(
    confusion_matrix_official,
    accuracy,
    metrics_indiv_relations,
    metrics_micro,
    metrics_macro,
) = get_file_metrics(result_file_location)

### Generate the confusion matrix as a pandas dataframe

https://stackoverflow.com/questions/17091769/python-pandas-fill-a-dataframe-row-by-row
and https://stackoverflow.com/questions/35047842/how-to-store-the-name-of-rows-and-column-index-in-pandas-dataframe

In [409]:
# Short relation code -> full relation name.
relation_full_form_dictionary = {
    'C-E': 'Cause-Effect',
    'C-W': 'Component-Whole',
    'C-C': 'Content-Container',
    'E-D': 'Entity-Destination',
    'E-O': 'Entity-Origin',
    'I-A': 'Instrument-Agency',
    'M-C': 'Member-Collection',
    'M-T': 'Message-Topic',
    'P-P': 'Product-Producer',
    '_O': 'Other',
}
# Short codes in the order used to label the rows and columns of the confusion matrix.
relation_as_short_list = [
    'C-E', 'C-W', 'C-C', 'E-D', 'E-O',
    'I-A', 'M-C', 'M-T', 'P-P', '_O',
]

In [410]:
def get_confusion_matrix_as_df(confusion_matrix_official, relations_as_short_list):
    """Wrap the raw confusion-matrix rows in a labelled DataFrame.

    Rows are labelled with `relations_as_short_list` under the axis name
    'gold labels'; columns use the same labels under the axis name 'predicted'.
    """
    return pd.DataFrame(
        data=confusion_matrix_official,
        index=pd.Index(relations_as_short_list, name='gold labels'),
        columns=pd.Index(relations_as_short_list, name='predicted'),
    )

In [411]:
# Label the parsed matrix rows: the index holds the gold labels, the columns the predicted labels.
confusion_matrix_df = get_confusion_matrix_as_df(confusion_matrix_official, relation_as_short_list)

#### Give the confusions for each relation, with a special interest in Other

In [412]:
# We want to go row by row and collect only those column names that have non-zero values,
# together with the count for each such column, rendered as a string.
# We also want to report the number of correct predictions as a separate column.

In [413]:
def generate_confused_with_string(index, row, relation_full_form_dictionary):
    """Describe which other labels a gold relation was predicted as.

    Args:
        index: label of the relation being described (the row's own label).
        row: Series of counts, indexed by predicted label.
        relation_full_form_dictionary: maps each label to its full name.

    Returns:
        A space-separated string of 'FullName(count)' entries, one for every
        label other than `index` whose integer count is above zero, in the
        order of `row.index`. Empty string when there is no such label.
    """
    mistakes = []
    for label in row.index:
        count = int(row.loc[label])
        if label == index or count <= 0:
            continue
        mistakes.append(relation_full_form_dictionary[label] + "(" + str(count) + ")")
    return " ".join(mistakes).strip()

In [414]:
def generate_pretty_summary_confusion_matrix(confusion_matrix_df, relation_full_form_dictionary,
                                             other_label='_O'):
    """Summarise a confusion matrix as one readable row per gold relation.

    Args:
        confusion_matrix_df: square DataFrame whose index holds the gold
            labels and whose columns hold the predicted labels.
        relation_full_form_dictionary: maps each label to its full name.
        other_label: label of the catch-all relation. Defaults to '_O', the
            label used for SemEval; pass another value for other label sets.

    Returns:
        A DataFrame with a default integer index and the columns
        'Gold Relation', 'Confused With(num_examples)', 'Confused with Other'
        and 'Correct Predictions'. 'Confused with Other' is None on the row
        of `other_label` itself.
    """
    data = []
    for index, row in confusion_matrix_df.iterrows():
        actual_label = relation_full_form_dictionary[index]
        confused_with = generate_confused_with_string(index, row, relation_full_form_dictionary)
        # Diagonal cell: gold label and predicted label agree.
        correct_predictions = row[index]
        # The catch-all relation cannot be "confused with" itself.
        confused_with_other = row[other_label] if index != other_label else None
        data.append([actual_label, confused_with, confused_with_other, correct_predictions])

    columns = pd.Index(
        ['Gold Relation', 'Confused With(num_examples)', 'Confused with Other', 'Correct Predictions'],
        name='summary',
    )
    return pd.DataFrame(data=data, columns=columns)

#### Give the individual relation metrics as a dataframe

In [415]:
def create_metrics_indiv_relations_df(metrics_indiv_relations, relation_full_form_dictionary, relation_as_short_list):
    """Build the per-relation metrics table.

    Rows are labelled with the full name of every code in
    `relation_as_short_list` (axis name 'labels'); the columns are
    'Precision', 'Recall' and 'F1' (axis name 'metrics').
    """
    full_names = [relation_full_form_dictionary[code] for code in relation_as_short_list]
    return pd.DataFrame(
        data=metrics_indiv_relations,
        index=pd.Index(full_names, name='labels'),
        columns=pd.Index(['Precision', 'Recall', 'F1'], name='metrics'),
    )

In [416]:
def create_metrics_macro_micro_df(metrics_macro, metrics_micro):
    """Build a two-row table holding the macro and micro averaged metrics.

    The macro rows come first, followed by the micro rows; together they must
    provide exactly the two rows labelled 'macro' and 'micro'. The columns are
    'Precision', 'Recall' and 'F1'.
    """
    return pd.DataFrame(
        data=metrics_macro + metrics_micro,
        index=pd.Index(['macro', 'micro'], name='calculation type'),
        columns=pd.Index(['Precision', 'Recall', 'F1'], name='metrics'),
    )

## Finally, create a large summary function

In [417]:
# Short relation code -> full relation name.
relation_full_form_dictionary = {
    'C-E': 'Cause-Effect',
    'C-W': 'Component-Whole',
    'C-C': 'Content-Container',
    'E-D': 'Entity-Destination',
    'E-O': 'Entity-Origin',
    'I-A': 'Instrument-Agency',
    'M-C': 'Member-Collection',
    'M-T': 'Message-Topic',
    'P-P': 'Product-Producer',
    '_O': 'Other',
}
# Short codes in the order used to label the rows and columns of the confusion matrix.
relation_as_short_list = [
    'C-E', 'C-W', 'C-C', 'E-D', 'E-O',
    'I-A', 'M-C', 'M-T', 'P-P', '_O',
]

In [418]:
def create_summary(result_file_location, relation_full_form_dictionary, relation_as_short_list):
    """Parse one run's result.txt and assemble every summary table for it.

    Returns:
        None (after printing a hint) when `result_file_location` does not
        exist. Otherwise a 6-tuple:
        (confusion matrix DataFrame, pretty per-relation summary DataFrame,
        total of the 'Correct Predictions' column, per-relation metrics
        DataFrame, macro/micro metrics DataFrame, accuracy).
    """
    if not os.path.exists(result_file_location):
        print("Check your path first!")
        return None

    # Raw pieces read from the file.
    raw_confusion, accuracy, per_relation, micro, macro = get_file_metrics(result_file_location)

    # Labelled confusion matrix and the readable summary derived from it.
    confusion_matrix_df = get_confusion_matrix_as_df(raw_confusion, relation_as_short_list)
    summary_df = generate_pretty_summary_confusion_matrix(confusion_matrix_df, relation_full_form_dictionary)
    correct_total = summary_df['Correct Predictions'].sum()

    # Metric tables.
    per_relation_df = create_metrics_indiv_relations_df(
        per_relation, relation_full_form_dictionary, relation_as_short_list
    )
    macro_micro_df = create_metrics_macro_micro_df(macro, micro)

    return (
        confusion_matrix_df,
        summary_df,
        correct_total,
        per_relation_df,
        macro_micro_df,
        accuracy,
    )

In [419]:
# Build every summary table for the selected run in one go.
(
    confusion_matrix_df,
    pretty_summary_confusion_matrix_df,
    total_correct_predictions,
    metrics_indiv_relations_df,
    metrics_macro_micro,
    accuracy,
) = create_summary(result_file_location, relation_full_form_dictionary, relation_as_short_list)

In [420]:
def get_sum_confusion_matrix(confusion_matrix):
    """Return the total of all cells of the confusion matrix.

    The matrix is iterated column by column and the per-column sums are
    added up; an empty matrix yields 0.
    """
    return sum(confusion_matrix[column].sum() for column in confusion_matrix)

In [421]:
def indiv_metric_comparison(metrics_i_model1, metrics_i_model2, model1_name, model2_name):
    """Print a paired t-test between two models for every metric column.

    For each column of `metrics_i_model1` the values of both models are
    compared with `scipy.stats.ttest_rel`, i.e. one test per metric, paired
    across the rows. The last row of each table is dropped before testing;
    it is assumed to be the "Other" relation — the caller must keep it last.

    Args:
        metrics_i_model1: per-relation metrics table of the first model.
        metrics_i_model2: per-relation metrics table of the second model;
            must have the same columns and row order as the first.
        model1_name: display name of the first model.
        model2_name: display name of the second model.
    """
    print("TTest from %s to %s" % (model1_name, model2_name))
    # The two literals are joined with an explicit space so the sentence reads correctly.
    print("Below is the metric comparison across the two models "
          "considering individual relations, excluding 'Other'")
    for column in metrics_i_model1:
        metric_model1 = metrics_i_model1[column].tolist()[:-1]  # excluding "Other"
        metric_model2 = metrics_i_model2[column].tolist()[:-1]
        tt = ttest_rel(metric_model1, metric_model2)
        print("Metric: %s \t statistic %.2f \t p_value %s" %
              (column, tt.statistic, tt.pvalue))

In [422]:
def get_macro_micro_metric_comparison(metrics_ma_mi_model1, metrics_ma_mi_model2, model1_name, model2_name):
    """Print macro/micro gaps per model and the differences between the models.

    Four sections are printed, each with one line per metric column:
    macro minus micro for model 1, macro minus micro for model 2,
    macro of model 1 minus macro of model 2, and micro of model 1 minus
    micro of model 2. Both tables must have rows labelled 'macro' and 'micro'.
    """
    def report_gap(metrics):
        # Macro minus micro inside a single model.
        for metric_name in metrics:
            values = metrics[metric_name]
            gap = values.loc['macro'] - values.loc['micro']
            print("Metric: %s \t Macro-Micro %.2f" % (metric_name, gap))

    def report_between_models(kind):
        # Model 1 minus model 2 for the given averaging kind.
        for metric_name in metrics_ma_mi_model1:
            first = metrics_ma_mi_model1[metric_name].loc[kind]
            second = metrics_ma_mi_model2[metric_name].loc[kind]
            print("Metric: %s \t Difference %.2f" % (metric_name, first - second))

    print("Macro - Micro for the %s model" % (model1_name))
    report_gap(metrics_ma_mi_model1)

    print("\nMacro - Micro for the %s model" % (model2_name))
    report_gap(metrics_ma_mi_model2)

    print("\nMacro_%s - Macro_%s" % (model1_name, model2_name))
    report_between_models('macro')

    print("\nMicro_%s - Micro_%s" % (model1_name, model2_name))
    report_between_models('micro')

In [423]:
def get_accuracy_difference(accuracy_model1, accuracy_model2, model1_name, model2_name):
    """Print the accuracy of model 1 minus the accuracy of model 2, to two decimals."""
    delta = accuracy_model1 - accuracy_model2
    print(f"Accuracy_{model1_name!s} - Accuracy_{model2_name!s} {delta:.2f}")

In nightingale, I created 2 folders inside 
/scratch/geeticka/relation-extraction/output/semeval2010/CrossValidation/error-analysis

- One folder is baseline, the other is elmo-model
- Each folder has result.txt for each Fold
- I am going to generate a summary for each of the Folds

In [429]:
def print_full_summary(model1_loc, model2_loc, model1_name, model2_name, res,
                       relation_full_form_dictionary, relation_as_short_list):
    """Print a side-by-side comparison of two models and return their summaries.

    Args:
        model1_loc, model2_loc: locations handed to `res` to obtain each
            model's result folder.
        model1_name, model2_name: display names used in the printed report.
        res: callable that turns a location into the folder path passed to
            create_summary.
        relation_full_form_dictionary: maps each label to its full name.
        relation_as_short_list: labels in confusion-matrix order.

    Returns:
        (summary of model 1, summary of model 2): the pretty confusion-matrix
        summary tables produced by create_summary.
    """
    first_dir, second_dir = res(model1_loc), res(model2_loc)

    # Only the summary table, the metric tables and the accuracy are used below.
    (_, summary_first, _, per_relation_first,
     macro_micro_first, accuracy_first) = create_summary(
        first_dir, relation_full_form_dictionary, relation_as_short_list)

    (_, summary_second, _, per_relation_second,
     macro_micro_second, accuracy_second) = create_summary(
        second_dir, relation_full_form_dictionary, relation_as_short_list)

    # Paired t-test for each metric, across the relations other than Other.
    indiv_metric_comparison(per_relation_first, per_relation_second, model1_name, model2_name)

    # Gaps between the macro and micro scores, within and between the models.
    get_macro_micro_metric_comparison(macro_micro_first, macro_micro_second, model1_name, model2_name)

    # Accuracy difference as the last line of the report.
    get_accuracy_difference(accuracy_first, accuracy_second, model1_name, model2_name)
    return summary_first, summary_second

In [430]:
# Compare the baseline and ELMo runs on fold 0 and keep both summary tables.
(summary_cm_baseline, summary_cm_elmo) = print_full_summary(
    'error-analysis/baseline/Fold0',
    'error-analysis/elmo-model/Fold0',
    'Baseline',
    'Elmo',
    res,
    relation_full_form_dictionary,
    relation_as_short_list,
)

TTest from Baseline to Elmo
Below is the metric comparsion across the two modelsconsidering individual relations, excluding 'Other'
Metric: Precision 	 statistic -5.38 	 p_value 0.0006647655060868683
Metric: Recall 	 statistic 0.45 	 p_value 0.6644537071409055
Metric: F1 	 statistic -3.34 	 p_value 0.010214137647466863
Macro - Micro for the Baseline model
Metric: Precision 	 Macro-Micro 0.37
Metric: Recall 	 Macro-Micro -0.77
Metric: F1 	 Macro-Micro -0.47

Macro - Micro for the Elmo model
Metric: Precision 	 Macro-Micro 0.71
Metric: Recall 	 Macro-Micro -1.07
Metric: F1 	 Macro-Micro -0.64

Macro_Baseline - Macro_Elmo
Metric: Precision 	 Difference -7.46
Metric: Recall 	 Difference 0.75
Metric: F1 	 Difference -3.21

Micro_Baseline - Micro_Elmo
Metric: Precision 	 Difference -7.12
Metric: Recall 	 Difference 0.45
Metric: F1 	 Difference -3.38
Accuracy_Baseline - Accuracy_Elmo -3.23


In [426]:
summary_cm_baseline

summary,Gold Relation,Confused With(num_examples),Confused with Other,Correct Predictions
0,Cause-Effect,Entity-Origin(6) Message-Topic(1) Product-Prod...,2.0,91.0
1,Component-Whole,Content-Container(1) Entity-Origin(2) Member-C...,12.0,71.0
2,Content-Container,Component-Whole(2) Entity-Destination(4) Other(1),1.0,48.0
3,Entity-Destination,Content-Container(1),0.0,84.0
4,Entity-Origin,Component-Whole(1) Other(4),4.0,67.0
5,Instrument-Agency,Component-Whole(5) Entity-Origin(1) Message-To...,6.0,34.0
6,Member-Collection,Component-Whole(2) Content-Container(2) Other(2),2.0,64.0
7,Message-Topic,Component-Whole(1) Entity-Destination(1) Produ...,8.0,50.0
8,Product-Producer,Cause-Effect(1) Entity-Destination(1) Entity-O...,9.0,57.0
9,Other,Cause-Effect(10) Component-Whole(15) Content-C...,,56.0


In [427]:
summary_cm_elmo

summary,Gold Relation,Confused With(num_examples),Confused with Other,Correct Predictions
0,Cause-Effect,Entity-Origin(3) Product-Producer(1) Other(2),2.0,95.0
1,Component-Whole,Member-Collection(2) Message-Topic(2) Other(21),21.0,68.0
2,Content-Container,Entity-Destination(6) Other(6),6.0,43.0
3,Entity-Destination,Other(2),2.0,83.0
4,Entity-Origin,Component-Whole(1) Other(6),6.0,65.0
5,Instrument-Agency,Component-Whole(3) Message-Topic(1) Other(12),12.0,34.0
6,Member-Collection,Component-Whole(2) Other(7),7.0,61.0
7,Message-Topic,Other(13),13.0,51.0
8,Product-Producer,Entity-Origin(3) Other(7),7.0,63.0
9,Other,Cause-Effect(8) Component-Whole(8) Content-Con...,,85.0
