In [1]:
# General imports
import numpy as np
import pandas as pd

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import statistics as s

# Own functions
from data_preparation import preprocessing_functions_for_final_runs as preproc_final
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the four QA splits; every file is tab-separated with the index in column 0.
qa_data_train, qa_data_tune, qa_data_dev, qa_data_test = (
    pd.read_csv(csv_path, sep='\t', index_col=0)
    for csv_path in (
        'Data/Friends_data/Final_QA_datasets/qa_data_train.csv',
        'Data/Friends_data/Final_QA_datasets/qa_data_tune.csv',
        'Data/Friends_data/Final_QA_datasets/qa_data_dev.csv',
        'Data/Friends_data/Final_QA_datasets/qa_data_test.csv',
    )
)

# Run the project's preprocessing on the four splits and unpack everything it returns.
(
    X_train, Y_train,
    X_dev, Y_dev, Y_dev_original,
    X_test, Y_test,
    vocab, vocab_size, maxlen, tokenizer, number_of_labels,
) = preproc_final.preprocessing(
    qa_data_train,
    qa_data_tune,
    qa_data_dev,
    qa_data_test,
    exclude=[5,6],
    multi_input=False,
    input_def='c',
)

# Accuracy and F1-score for DEV and TEST sets - average of 3 runs

In [4]:
# Saved prediction archives for the DEV set, one per run.
dev1, dev2, dev3 = (
    "./predictions/CNN_bert/bert_loaded_dev1.npz",
    "./predictions/CNN_bert/bert_loaded_dev2.npz",
    "./predictions/CNN_bert/bert_loaded_dev3.npz",
)

# Saved prediction archives for the TEST set, one per run.
test1, test2, test3 = (
    "./predictions/CNN_bert/bert_loaded_test1.npz",
    "./predictions/CNN_bert/bert_loaded_test2.npz",
    "./predictions/CNN_bert/bert_loaded_test3.npz",
)

In [5]:
def _load_predictions(npz_path):
    """Return the array stored under key 'arr_0' in the .npz archive at `npz_path`.

    The archive is opened in a `with` block so that its underlying file
    handle is closed as soon as the array has been read, rather than
    staying open for the rest of the kernel session.
    """
    with np.load(npz_path) as archive:
        # Indexing reads the array into memory, so it stays valid after the close.
        return archive['arr_0']


# Predictions of the three runs on the DEV set.
predictions_dev_1 = _load_predictions(dev1)
predictions_dev_2 = _load_predictions(dev2)
predictions_dev_3 = _load_predictions(dev3)

# Predictions of the three runs on the TEST set.
predictions_test_1 = _load_predictions(test1)
predictions_test_2 = _load_predictions(test2)
predictions_test_3 = _load_predictions(test3)

In [6]:
# Accuracy of each run on the DEV set.
acc_dev1, acc_dev2, acc_dev3 = (
    accuracy_score(Y_dev_original, run_preds)
    for run_preds in (predictions_dev_1, predictions_dev_2, predictions_dev_3)
)

# Accuracy of each run on the TEST set.
acc_test1, acc_test2, acc_test3 = (
    accuracy_score(Y_test, run_preds)
    for run_preds in (predictions_test_1, predictions_test_2, predictions_test_3)
)

# Macro-averaged F1-score of each run on the DEV set.
fsc_dev1, fsc_dev2, fsc_dev3 = (
    f1_score(Y_dev_original, run_preds, average='macro')
    for run_preds in (predictions_dev_1, predictions_dev_2, predictions_dev_3)
)

# Macro-averaged F1-score of each run on the TEST set.
fsc_test1, fsc_test2, fsc_test3 = (
    f1_score(Y_test, run_preds, average='macro')
    for run_preds in (predictions_test_1, predictions_test_2, predictions_test_3)
)

In [7]:
# Mean accuracy over the three runs, for the DEV and TEST sets.
average_dev_acc, average_test_acc = (
    (run_1 + run_2 + run_3) / 3
    for run_1, run_2, run_3 in (
        (acc_dev1, acc_dev2, acc_dev3),
        (acc_test1, acc_test2, acc_test3),
    )
)

# Mean macro F1-score over the three runs, for the DEV and TEST sets.
average_dev_fsc, average_test_fsc = (
    (run_1 + run_2 + run_3) / 3
    for run_1, run_2, run_3 in (
        (fsc_dev1, fsc_dev2, fsc_dev3),
        (fsc_test1, fsc_test2, fsc_test3),
    )
)

In [8]:
def _pct(value):
    """Multiply `value` by 100 and round the result to three decimals."""
    return np.round(value*100, 3)


# Sample standard deviation of each metric across the three runs.
dev_acc_std = s.stdev([acc_dev1, acc_dev2, acc_dev3])
dev_fsc_std = s.stdev([fsc_dev1, fsc_dev2, fsc_dev3])
test_acc_std = s.stdev([acc_test1, acc_test2, acc_test3])
test_fsc_std = s.stdev([fsc_test1, fsc_test2, fsc_test3])

print("DEV: \nAverage accuracy: {} with standard deviation: {}".format(_pct(average_dev_acc), _pct(dev_acc_std)))
print("Average f1-score: {} with standard deviation: {}".format(_pct(average_dev_fsc), _pct(dev_fsc_std)))
print()
print("TEST: \nAverage accuracy: {} with standard deviation: {}".format(_pct(average_test_acc), _pct(test_acc_std)))
print("Average f1-score: {} with standard deviation: {}".format(_pct(average_test_fsc), _pct(test_fsc_std)))


DEV: 
Average accuracy: 64.081 with standard deviation: 0.446
Average f1-score: 49.161 with standard deviation: 3.956

TEST: 
Average accuracy: 61.327 with standard deviation: 0.487
Average f1-score: 45.647 with standard deviation: 3.164


## Class-wise F1-score for DEV set

In [9]:
print("DEV:")
# One full classification report per run, in run order.
for run_preds in (predictions_dev_1, predictions_dev_2, predictions_dev_3):
    print(classification_report(Y_dev_original, run_preds))

DEV:
              precision    recall  f1-score   support

           0       0.69      0.77      0.73       291
           1       0.61      0.44      0.52       153
           2       0.60      0.21      0.32        14
           3       0.55      0.61      0.58       135

    accuracy                           0.64       593
   macro avg       0.61      0.51      0.53       593
weighted avg       0.63      0.64      0.63       593

              precision    recall  f1-score   support

           0       0.68      0.82      0.74       291
           1       0.64      0.42      0.51       153
           2       0.25      0.07      0.11        14
           3       0.58      0.58      0.58       135

    accuracy                           0.65       593
   macro avg       0.54      0.47      0.48       593
weighted avg       0.64      0.65      0.63       593

              precision    recall  f1-score   support

           0       0.67      0.80      0.73       291
           1    

In [10]:
# Per-class F1-scores (no averaging) of each run on the DEV set.
f1_1, f1_2, f1_3 = (
    f1_score(Y_dev_original, run_preds, average=None)
    for run_preds in (predictions_dev_1, predictions_dev_2, predictions_dev_3)
)

In [11]:
# Show the per-class F1 vector of each run, one run per line.
print(f1_1, f1_2, f1_3, sep='\n')

[0.72727273 0.51515152 0.31578947 0.57839721]
[0.74418605 0.50592885 0.11111111 0.57777778]
[0.73125    0.53691275 0.         0.55555556]


In [12]:
# Stack the three per-run F1 vectors row-wise and average over the runs (axis 0).
per_run_class_f1 = np.asarray((f1_1, f1_2, f1_3))
print(per_run_class_f1.mean(axis=0))

[0.73423626 0.51933104 0.14230019 0.57057685]
