In [None]:
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def get_all_results(dataset_name, embeddings, classifiers, y_true):
    '''
    Collect accuracy, balanced accuracy and weighted F1 score for every (embedding, classifier) pair.
    The predictions of each pair are read from
    'predictions/<dataset_name>/model_<embedding>_<classifier>.csv' (parsed without a header row)
    and scored against y_true.
    Parameters:
        dataset_name (str): Dataset name; selects the sub-directory of 'predictions/'.
        embeddings (list): Names of embedding methods, as used in the prediction file names.
        classifiers (list): Names of classifiers, as used in the prediction file names.
        y_true (Series): Real genres of the test data.
    Returns:
        DataFrame: One row per pair (embeddings in the outer loop, classifiers in the inner loop)
        with columns 'nlp_embedding', 'nlp_classifier', 'accuracy', 'balanced_accuracy', 'f1_score'.
    '''
    # One list per output column; the key order fixes the column order of the result.
    table = {
        'nlp_embedding': [],
        'nlp_classifier': [],
        'accuracy': [],
        'balanced_accuracy': [],
        'f1_score': [],
    }
    for embedding in embeddings:
        for classifier in classifiers:
            predicted = pd.read_csv(
                f'predictions/{dataset_name}/model_{embedding}_{classifier}.csv',
                header=None,
            )
            row = (
                embedding,
                classifier,
                metrics.accuracy_score(y_true=y_true, y_pred=predicted),
                metrics.balanced_accuracy_score(y_true=y_true, y_pred=predicted),
                metrics.f1_score(y_true=y_true, y_pred=predicted, average='weighted'),
            )
            # The row tuple follows the same order as the keys of `table`.
            for column_values, value in zip(table.values(), row):
                column_values.append(value)
    return pd.DataFrame(table)

In [None]:
def _save_metric_bar_plot(scores, metric_label, file_stem):
    '''
    Draw one grouped bar chart of a metric table and save it as '<file_stem>.svg' and '<file_stem>.pdf'.
    Parameters:
        scores (DataFrame): Metric values; one bar group per row, one bar per column.
        metric_label (str): Metric name shown as the x-axis label of the chart.
        file_stem (str): Output path without extension.
    '''
    # NOTE(review): the metric name is placed on the x-axis label, exactly as in the
    # original figures, although the metric values run along the y-axis - confirm
    # whether `ylabel` was intended before changing the published plots.
    ax = scores.plot(
        y=scores.columns,
        kind='bar',
        color=['#F6BD60', '#AB4E68', '#F5CAC3', '#84A59D', '#F28482'],
        xlabel=metric_label,
    )
    ax.legend(bbox_to_anchor=(1, 0.5))
    # Save through the figure that owns these axes instead of pyplot's implicit
    # "current figure", so the right figure is written regardless of global state.
    figure = ax.get_figure()
    for extension in ('svg', 'pdf'):
        figure.savefig(f'{file_stem}.{extension}', bbox_inches='tight')
    # The figure is deliberately left open, as before this helper existed.


def save_plots_of_results(dataset_name, embeddings, classifiers, y_true, labels_x, labels_y):
    '''
    Save plots of accuracy, balanced accuracy and F1 score for provided dataset, embedding methods and classifiers.
    Predictions of each (embedding, classifier) pair are read from
    'predictions/<dataset_name>/model_<embedding>_<classifier>.csv' (parsed without a header row).
    Three bar charts are written to the working directory, each as SVG and PDF:
    'accuracy_<dataset_name>', 'bal_accuracy_<dataset_name>' and 'f1_<dataset_name>'.
    Parameters:
        dataset_name (str): Dataset name.
        embeddings (list): Names of embedding methods.
        classifiers (list): Names of classifiers.
        y_true (Series): Real genres of the test data.
        labels_x (list): Labels of embedding methods (same length and order as embeddings).
        labels_y (list): Labels of classifiers (same length and order as classifiers).
    Raises:
        ValueError: If the number of labels does not match the number of embeddings or classifiers.
    '''
    if len(labels_x) != len(embeddings):
        raise ValueError(
            f'labels_x has {len(labels_x)} entries but embeddings has {len(embeddings)}')
    if len(labels_y) != len(classifiers):
        raise ValueError(
            f'labels_y has {len(labels_y)} entries but classifiers has {len(classifiers)}')

    def empty_table():
        # Rows are embeddings, columns are classifiers; every cell is overwritten below.
        return pd.DataFrame(
            np.zeros((len(embeddings), len(classifiers))), columns=labels_y, index=labels_x)

    accuracy = empty_table()
    balanced_accuracy = empty_table()
    f1 = empty_table()

    for i, emb_name in enumerate(embeddings):
        for j, cls_name in enumerate(classifiers):
            fname = f'predictions/{dataset_name}/model_{emb_name}_{cls_name}.csv'
            y_pred = pd.read_csv(fname, header=None)
            # Labels are aligned with the names by position, so fill the cells by position.
            accuracy.iat[i, j] = metrics.accuracy_score(y_true=y_true, y_pred=y_pred)
            balanced_accuracy.iat[i, j] = metrics.balanced_accuracy_score(y_true=y_true, y_pred=y_pred)
            f1.iat[i, j] = metrics.f1_score(y_true=y_true, y_pred=y_pred, average='weighted')

    _save_metric_bar_plot(accuracy, 'Accuracy', f'accuracy_{dataset_name}')
    _save_metric_bar_plot(balanced_accuracy, 'Balanced accuracy', f'bal_accuracy_{dataset_name}')
    _save_metric_bar_plot(f1, 'F1-score', f'f1_{dataset_name}')

In [None]:
# Parameters
# dataset_name selects both the test file ('data/test/<dataset_name>.csv') and the
# prediction directory ('predictions/<dataset_name>/').
# nlp_embeddings and nlp_classifiers are the name parts of the prediction files
# ('model_<embedding>_<classifier>.csv').
# labels_x and labels_y are the display labels used in the plots; they are matched to
# nlp_embeddings and nlp_classifiers by position, so keep the lists in the same order.

dataset_name = 'dataset_proc'
nlp_embeddings = ['smaller-bert', 'glove']
nlp_classifiers = ['naive-bayes', 'svm', 'xgboost', 'cnn']
labels_x = ['Smaller BERT', 'Glove']
labels_y = ['Naive Bayes', 'Linear SVM', 'XGBoost', 'CNN']

In [None]:
# Loading the test split from its CSV file and taking the 'genre' column as the true labels

test_data_path = f'data/test/{dataset_name}.csv'
test_data = pd.read_csv(test_data_path)
y_true = test_data['genre']

In [None]:
# Showing the metrics table (the returned DataFrame is rendered as the cell output)

get_all_results(
    dataset_name=dataset_name,
    embeddings=nlp_embeddings,
    classifiers=nlp_classifiers,
    y_true=y_true,
)

In [None]:
# Writing the accuracy, balanced accuracy and F1-score bar charts (SVG and PDF) to disk

save_plots_of_results(
    dataset_name=dataset_name,
    embeddings=nlp_embeddings,
    classifiers=nlp_classifiers,
    y_true=y_true,
    labels_x=labels_x,
    labels_y=labels_y,
)