# EVALUATION FOR MULTI-LABEL CLASSIFIERS

- Evaluate predictions from multi-label classifier

In [61]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, multilabel_confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [62]:
# Path of the pickle file that the predict.py script uses as both input and output.
# The pickle should contain a DataFrame with columns for the predicted category vectors
# and the confidence scores, e.g. [0,0,0,0,0,0,0,0,0,1] and [0.123534534, ...].
combined_pred_file = '../predictions/combined_test_new_INS_fixed_FP.pkl'
# Alternative input file (swap with the line above to evaluate it instead):
#combined_pred_file = '../predictions/all_data_10.pkl'

In [64]:
# Load the prediction DataFrame from the pickle file configured in combined_pred_file.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df_comb = pd.read_pickle(combined_pred_file)

# Show the column names to confirm which columns hold the predictions and confidence scores
df_comb.columns

In [None]:
# Name of the df_comb column that holds the predictions to evaluate, e.g. 'pred_jenia_M3_ft2'.
# Pick it from the column listing shown by the previous cell.
pred_col_j = 'pred_jenia_M3_ft2' 

In [65]:
# Copy predictions and true labels into the fixed column names 'pred_10' and 'true_10',
# which the classification report, confusion matrix and error export cells read.
# If the true labels live under a different column name, replace 'labels_10' with that column.
df_comb['pred_10'] = df_comb[pred_col_j]
df_comb['true_10'] = df_comb['labels_10']

In [66]:
# If the given prediction dataframe contains label vectors with 9 categories,
# this function turns them into 10-category vectors by appending a 10th "none" value:
# 1 when no category is set, 0 otherwise.
# 9 category none: [0,0,0,0,0,0,0,0,0]
# 10 category none: [0,0,0,0,0,0,0,0,0,1]
def generate_pred_10(pred_vector):
    """
    Append the 'none' flag to a label vector.

    Parameters:
    pred_vector (sequence of int): Label vector without the 'none' position. May be a list,
        tuple or 1-D numpy array; it is copied into a new list and never modified.

    Returns:
    list: The input values followed by 1 if they sum to 0 (no category set), else by 0.
    """
    # Normalise to a list first: `+` on a numpy array would add element-wise instead of
    # appending, and `tuple + list` raises TypeError.
    labels = list(pred_vector)
    none_flag = 1 if sum(labels) == 0 else 0
    return labels + [none_flag]

In [None]:
# Transform a 9-category representation into the 10-category one. Run this cell only if needed.
# NOTE(review): the assignment below is unconditional. It overwrites the 'true_10' column that an
# earlier cell filled from 'labels_10', and it needs a 'labels_9' column in df_comb.
# Skip this cell when the data already carries 10-category true labels.

# Create true_10 from the old 9-category true labels
df_comb['true_10'] = df_comb['labels_9'].apply(generate_pred_10) 

# Create pred_10 from 9-category predictions (enable only if the model outputs 9 categories)
#df_comb['pred_10'] = df_comb[pred_col_j].apply(generate_pred_10) 

### Classification report

This code block calculates per-category precision, recall, and F1 scores using scikit-learn's `classification_report`.

In [None]:
# Notes:
# - df_comb must already hold the 'true_10' and 'pred_10' label-vector columns.
# - `cat` supplies the report's target names; the last entry, 'none', stands for "no category".
# - classification_report (sklearn.metrics) is called twice: once for the plain-text report,
#   which is stored in report_icf but not printed here, and once with output_dict=True to
#   build the DataFrame that this cell prints and displays.

# df is another name for df_comb (same object, not a copy)
df = df_comb

# Category names passed to the report as target_names
cat = ['ADM', 'ATT', 'BER', 'ENR', 'ETN', 'FAC', 'INS', 'MBW', 'STM', 'none']

# True and predicted label vectors as plain Python lists, one entry per row
icf_true, icf_pred = (df[column].tolist() for column in ('true_10', 'pred_10'))

# Plain-text report: precision, recall, f1-score and support
report_icf = classification_report(icf_true, icf_pred, target_names=cat)

# The same report as a dictionary, turned into a DataFrame rounded to 2 decimals
report_icf_dict = classification_report(icf_true, icf_pred, target_names=cat, output_dict=True)
df_report_icf = pd.DataFrame(report_icf_dict).round(2)

# State which predictions file the numbers belong to
print("Combined predictions output file: ", combined_pred_file)
print()

# Plain-text dump of the report DataFrame
print("====== COMBINED (binary + icf) classification report in df: ====== \n", df_report_icf)
print()

# Rich display of the report DataFrame as the cell output
df_report_icf

### Confusion Matrix

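This code block creates a confusion matrix: rows correspond to true labels and columns to predicted labels. A diagonal cell counts the rows where that category is both true and predicted; an off-diagonal cell (row A, column B) counts the rows where A is true but was not predicted, while B was predicted.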

In [None]:
# Builds a multi-label confusion matrix from the 'true_10' / 'pred_10' label vectors.
# pandas (pd) and numpy (np) come from the import cell at the top of the notebook.

# df is another name for df_comb (same object), so the per-category columns added below
# also appear on df_comb.
df = df_comb

# Define the true and predicted category columns
cat = ['ADM', 'ATT', 'BER', 'ENR', 'ETN', 'FAC', 'INS', 'MBW', 'STM', 'none']
pred_cat = ['pred_ADM', 'pred_ATT', 'pred_BER', 'pred_ENR', 'pred_ETN', 'pred_FAC', 'pred_INS', 'pred_MBW', 'pred_STM', 'pred_none']

# Expand the label vectors into one column per category
df[cat] = pd.DataFrame(df.true_10.tolist(), index=df.index)
df[pred_cat] = pd.DataFrame(df.pred_10.tolist(), index=df.index)

# Boolean masks of shape (rows, categories); column i of each mask belongs to cat[i]
true_is_1 = (df[cat] == 1).to_numpy()
pred_is_1 = (df[pred_cat] == 1).to_numpy()
pred_is_0 = (df[pred_cat] == 0).to_numpy()

# False negatives per row and category: true label is 1, predicted label is 0
false_neg = (true_is_1 & pred_is_0).astype(int)

# Off-diagonal cell (i, j): rows where cat[i] is a false negative and cat[j] was predicted.
# The product's own diagonal is always 0 (a prediction cannot be 0 and 1 at once).
cm_counts = false_neg.T @ pred_is_1.astype(int)

# Diagonal cell (i, i): true positives, i.e. both the true and the predicted label are 1
np.fill_diagonal(cm_counts, (true_is_1 & pred_is_1).sum(axis=0))

# Confusion matrix with category names as both rows (true) and columns (predicted)
df_cm = pd.DataFrame(cm_counts, index=cat, columns=cat)

# Update column and index labels for better readability
# Set multi-index for columns with 'PREDICT' as the first level and the category names as the second level
df_cm.columns = pd.MultiIndex.from_tuples([('PREDICT', col) for col in df_cm.columns])

# Set multi-index for rows with 'TRUE' as the first level and the category names as the second level
df_cm.index = pd.MultiIndex.from_tuples([('TRUE', idx) for idx in df_cm.index])

# Style the confusion matrix DataFrame for better visualization
# Center-align text in cells and add a green gradient background to highlight the values
styled_cm = df_cm.style.set_properties(**{
    'text-align': 'center'
}).set_table_styles([
    dict(selector='th.col_heading.level0', props=[('text-align', 'center')]),
    dict(selector='th.row_heading.level0', props=[('text-align', 'center'), ('transform', 'rotate(270deg)'), ('vertical-align', 'middle')]),
    dict(selector='th.row_heading.level1', props=[('text-align', 'right')])
]).background_gradient(cmap='Greens')

# Display the styled confusion matrix
styled_cm


### Exporting Errors

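This code block exports the false positives and false negatives to CSV files in the current working directory, one file per category and error type (`fp_<category>.csv` and `fn_<category>.csv`, using `~` as the separator).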

In [76]:
# Keep only the columns the error export needs: the text, the true and predicted
# label vectors, the source dataset and the sentence ID.
df_red = df_comb[['text', 'true_10', 'pred_10', 'source_dataset', 'pad_sen_id']]

# Define the list of categories
categories = ['ADM', 'ATT', 'BER', 'ENR', 'ETN', 'FAC', 'INS', 'MBW', 'STM', 'none']

# Columns of every error table. Empty tables use them too, so all exported files share one header.
error_columns = ['text', 'true_10', 'pred_10', 'source_dataset', 'pad_sen_id']

# Create dictionaries to store DataFrames for false positives (FP) and false negatives (FN)
fp_df_dict = {category: pd.DataFrame(columns=error_columns) for category in categories}
fn_df_dict = {category: pd.DataFrame(columns=error_columns) for category in categories}

def compute_fp_fn(df, true_col, pred_col, source_col, sen_id, categories):
    """
    Function to compute false positives (FP) and false negatives (FN) for each category and store them in corresponding DataFrames.

    The results are appended to the module-level dictionaries fp_df_dict and fn_df_dict
    (keyed by category name); nothing is returned. Every category that has at least one
    error must already be a key of those dictionaries, otherwise a KeyError is raised.

    Parameters:
    df (DataFrame): The input DataFrame containing text, true labels, predicted labels, source dataset, and sentence IDs.
        The text is always read from the 'text' column.
    true_col (str): Column name for true labels.
    pred_col (str): Column name for predicted labels.
    source_col (str): Column name for the source dataset.
    sen_id (str): Column name for the sentence ID.
    categories (list): List of category names; position i names element i of the label vectors.
    """
    # Collect plain records first and build each DataFrame once at the end;
    # concatenating one row at a time copies the whole table for every error found.
    fp_records = {category: [] for category in categories}
    fn_records = {category: [] for category in categories}

    rows = zip(df['text'], df[true_col], df[pred_col], df[source_col], df[sen_id])
    for text, true_labels, pred_labels, source, sent_id in rows:
        record = {'text': text, 'true_10': true_labels, 'pred_10': pred_labels, 'source_dataset': source, 'pad_sen_id': sent_id}

        # Check every category of this row for an FP or FN
        for idx, category in enumerate(categories):
            if true_labels[idx] == 0 and pred_labels[idx] == 1:
                # False Positive: True label is 0, but predicted label is 1
                fp_records[category].append(record)
            elif true_labels[idx] == 1 and pred_labels[idx] == 0:
                # False Negative: True label is 1, but predicted label is 0
                fn_records[category].append(record)

    for target, collected in ((fp_df_dict, fp_records), (fn_df_dict, fn_records)):
        for category, records in collected.items():
            if not records:
                continue  # nothing new for this category; keep the stored table as it is
            new_rows = pd.DataFrame(records, columns=error_columns)
            existing = target[category]
            # Replace an empty table outright (concatenating onto an empty frame is deprecated);
            # otherwise append so that repeated calls keep accumulating errors.
            if existing.empty:
                target[category] = new_rows
            else:
                target[category] = pd.concat([existing, new_rows], ignore_index=True)

# Fill fp_df_dict / fn_df_dict with the errors found in the reduced DataFrame
compute_fp_fn(df_red, 'true_10', 'pred_10', 'source_dataset', 'pad_sen_id', categories)

# Write one file per category and error type into the working directory:
# no index column, '~' as the field separator
csv_options = {'index': False, 'sep': '~'}
for category in categories:
    fp_df_dict[category].to_csv(f'fp_{category}.csv', **csv_options)
    fn_df_dict[category].to_csv(f'fn_{category}.csv', **csv_options)


In [70]:
# Display the combined DataFrame for a final visual check.
# NOTE(review): the rendered output contains the raw 'text' values of the notes; clear or redact
# the cell output before sharing this notebook if that text is sensitive.
df_comb

Unnamed: 0,pad_sen_id,year,NotitieID,batch,annotator,source_dataset,background_sent,target_sent,text_raw,text,...,pred_ADM,pred_ATT,pred_BER,pred_ENR,pred_ETN,pred_FAC,pred_INS,pred_MBW,pred_STM,pred_none
0,0100001_0000,2020,0100001,fysiotherapie,edwin,ellemijn,False,False,categorie,categorie,...,0,0,0,0,0,0,0,0,0,1
1,0100001_0001,2020,0100001,fysiotherapie,edwin,ellemijn,False,False,Anders,Anders,...,0,0,0,0,0,0,0,0,0,1
2,0100002_0000,2020,0100002,dietetiek,edwin,ellemijn,False,False,),),...,0,0,0,0,0,0,0,0,0,1
3,0100002_0001,2020,0100002,dietetiek,edwin,ellemijn,False,False,valuatie,valuatie,...,0,0,0,0,0,0,0,0,0,1
4,0100002_0002,2020,0100002,dietetiek,edwin,ellemijn,False,False,/,/,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37350,423659267_0125,2020,423659267,ze_batch1,hinke,jenia_test,False,False,dr. Polyukhovych Beleid - TTE na weekend herha...,dr. Polyukhovych Beleid - TTE na weekend herha...,...,0,0,0,0,0,0,0,0,0,1
37351,423659267_0126,2020,423659267,ze_batch1,hinke,jenia_test,False,False,Morgen lab controle - Inplannen PCI na weekend...,Morgen lab controle - Inplannen PCI na weekend...,...,0,0,0,0,0,0,0,0,0,1
37352,423659267_0127,2020,423659267,ze_batch1,hinke,jenia_test,False,False,Na weekend .,Na weekend .,...,0,0,0,0,0,0,0,0,0,1
37353,423659267_0128,2020,423659267,ze_batch1,hinke,jenia_test,False,False,@ - Alvast aanmelden device MDO :,@ - Alvast aanmelden device MDO :,...,0,0,0,0,0,0,0,0,0,1
