# Evaluation of domain classifiers

- Multi-label classification at the sentence level;
- Evaluation at the sentence level, plus aggregation to the note level;
- Each domain is evaluated independently.

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Show all columns and never truncate cell contents when displaying frames.
# The fully qualified option names are used on purpose: `set_option` treats
# its argument as a pattern and raises OptionError when it matches more than
# one key, which happens for the bare 'max_columns' in pandas versions that
# also define 'styler.render.max_columns'.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Config

Define:

- list of domains,
- path to dir where a pickled test file with predictions is located,
- the predictions column to be evaluated.

In [3]:
# Domain codes; every domain is evaluated independently.
domains = [
    'ADM', 'ATT', 'BER',
    'ENR', 'ETN', 'FAC',
    'INS', 'MBW', 'STM',
]

# Directory that holds the pickled test file with predictions.
path = Path('../../data/expr_july/clf_domains/')

# Name of the predictions column to evaluate.
pred_col = 'pred_domains_spacy_default'

# Load and process data

In [4]:
# Raw predictions are a list nested in a list, e.g. [[0, 1, 0, 0]];
# the `preds` column flattens them to the inner list.
# The `domains` column repeats the full domain list on every row; it is
# needed for the explosion in the next cell.
df = pd.read_pickle(path / 'test.pkl')
df = df.assign(
    preds=df[pred_col].str[0],
    domains=[domains] * len(df),
)

In [5]:
# Explode the three list-valued columns in lockstep so that each row holds a
# single (domain, label, prediction) triple, keep only the columns used
# downstream, and turn the former index into a regular column.
results = (
    df
    .explode(['domains', 'labels', 'preds'])
    .loc[:, ['pad_sen_id', 'NotitieID', 'annotator', 'domains', 'labels', 'preds']]
    .reset_index()
)

# Sentence level evaluation

## Precision, recall, F1-score

In [6]:
def make_classreport(domain):
    """Compute sentence-level precision, recall and F1-score for one domain.

    Parameters
    ----------
    domain : str
        Domain code; selects the rows of the module-level `results` frame
        whose `domains` value equals it.

    Returns
    -------
    pd.Series
        The three scores rounded to two decimals, indexed by 'precision',
        'recall' and 'f1_score', and named after `domain`.
    """
    # Filter once and reuse the subset for both gold labels and predictions.
    subset = results.query(f"domains == '{domain}'")
    y_true = subset.labels.astype(int)
    y_pred = subset.preds.astype(int)
    # Rounding is done on the Series rather than by calling `.round()` on each
    # score, so it works whether the metric functions return NumPy scalars or
    # plain Python floats (which have no `.round` method).
    return pd.Series(
        index=['precision', 'recall', 'f1_score'],
        data=[
            metric(y_true, y_pred)
            for metric in (precision_score, recall_score, f1_score)
        ],
        name=domain,
    ).round(2)

# One column per domain, one row per metric.
pd.concat([make_classreport(domain) for domain in domains], axis=1)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
precision,0.69,0.88,1.0,0.69,0.61,0.68,0.59,0.55,0.66
recall,0.61,0.47,0.08,0.73,0.51,0.71,0.17,0.73,0.64
f1_score,0.65,0.61,0.14,0.71,0.56,0.7,0.27,0.63,0.65


## Error analysis

In [7]:
# Attach the per-sentence background/target flags from `df` and derive two
# helper columns:
#   background - True when the sentence is flagged as background or as target
#   correct    - True when the prediction equals the label
results = (
    pd.merge(
        results,
        df[['pad_sen_id', 'background_sent', 'target_sent']],
        on='pad_sen_id',
        how='left',
    )
    .assign(
        background=lambda merged: merged.background_sent | merged.target_sent,
        correct=lambda merged: merged.labels == merged.preds,
    )
)

In [8]:
# All misclassified rows, then split by gold label:
# label 0 with a wrong prediction is a false positive,
# label 1 with a wrong prediction is a false negative.
not_correct = results.loc[~results['correct']]
false_pos = not_correct.loc[not_correct['labels'] == 0]
false_neg = not_correct.loc[not_correct['labels'] == 1]

### False positives

#### Background/target sentences

In [9]:
# Share of background sentences among the false positives, per domain.
# Both tables count `pad_sen_id` per (domain, background) cell; the first is
# normalised per row and expressed as a percentage, the second keeps the raw
# counts with empty cells shown as 0.
crosstab_args = dict(
    index=false_pos.domains,
    columns=false_pos.background,
    values=false_pos.pad_sen_id,
    aggfunc='count',
)
percentages = pd.crosstab(normalize='index', **crosstab_args).mul(100).round(1)
counts = pd.crosstab(**crosstab_args).fillna(0).astype(int)

pd.concat([percentages, counts], keys=['percentage', 'count'], axis=1)

Unnamed: 0_level_0,percentage,percentage,count,count
background,False,True,False,True
domains,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ADM,81.2,18.8,91,21
ATT,100.0,0.0,2,0
ENR,57.6,42.4,19,14
ETN,82.8,17.2,48,10
FAC,82.6,17.4,38,8
INS,68.8,31.2,11,5
MBW,82.1,17.9,32,7
STM,76.0,24.0,38,12


In [10]:
# Draw one random STM false positive among the background/target sentences
# and show its id and text.
q = "domains == 'STM' and background"
example = false_pos.query(q)['pad_sen_id'].sample(n=1)
df.query(f"pad_sen_id == '{example.iloc[0]}'").loc[:, ['pad_sen_id', 'text']]

Unnamed: 0,pad_sen_id,text
890,237672660_0102,Mama was er ook bang voor .


#### Not background/target

In [11]:
# Draw one random MBW false positive that is neither a background nor a
# target sentence and show its id and text.
q = "domains == 'MBW' and not background"
example = false_pos.query(q)['pad_sen_id'].sample(n=1)
df.query(f"pad_sen_id == '{example.iloc[0]}'").loc[:, ['pad_sen_id', 'text']]

Unnamed: 0,pad_sen_id,text
161678,435636274_0059,"Gewicht wisselt altijd , weegt nu 130 wat ook ongeveer het gewicht was in 2016 toen de verhoogde leverwaarden werden gevonden ."


### False negatives

In [12]:
# Draw one random INS false negative and show its id and text.
q = "domains == 'INS'"
example = false_neg.query(q)['pad_sen_id'].sample(n=1)
df.query(f"pad_sen_id == '{example.iloc[0]}'").loc[:, ['pad_sen_id', 'text']]

Unnamed: 0,pad_sen_id,text
20156,415914113_0006,"Ademhaling nu weer normaal , wel nog minder conditie ."


# Note level evaluation

In [13]:
# Aggregate the sentence-level rows to one row per (note, domain): a domain
# counts as labelled / predicted for a note when any of its sentences has it.
# `correct` marks notes where the aggregated label and prediction agree.
# The result stays indexed by NotitieID, with `domains` as a regular column.
notes = (
    results
    .astype({'labels': bool, 'preds': bool})
    .groupby(['NotitieID', 'domains'])[['labels', 'preds']]
    .any()
    .assign(correct=lambda note: note.labels == note.preds)
    .reset_index(level='domains')
)

## Precision, recall, F1-score

In [14]:
def make_classreport(domain):
    """Compute note-level precision, recall and F1-score for one domain.

    Parameters
    ----------
    domain : str
        Domain code; selects the rows of the module-level `notes` frame
        whose `domains` value equals it.

    Returns
    -------
    pd.Series
        The three scores rounded to two decimals, indexed by 'precision',
        'recall' and 'f1_score', and named after `domain`.
    """
    # Filter once and reuse the subset for both gold labels and predictions.
    subset = notes.query(f"domains == '{domain}'")
    y_true = subset.labels.astype(int)
    y_pred = subset.preds.astype(int)
    # Rounding is done on the Series rather than by calling `.round()` on each
    # score, so it works whether the metric functions return NumPy scalars or
    # plain Python floats (which have no `.round` method).
    return pd.Series(
        index=['precision', 'recall', 'f1_score'],
        data=[
            metric(y_true, y_pred)
            for metric in (precision_score, recall_score, f1_score)
        ],
        name=domain,
    ).round(2)

# One column per domain, one row per metric.
pd.concat([make_classreport(domain) for domain in domains], axis=1)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
precision,0.87,0.88,1.0,0.78,0.74,0.78,0.79,0.67,0.76
recall,0.86,0.67,0.09,0.81,0.63,0.79,0.31,0.84,0.76
f1_score,0.87,0.76,0.17,0.79,0.68,0.78,0.44,0.75,0.76


## Impact background/target

To estimate the impact of background/target sentences, I remove them from the test set and repeat the evaluation.

In [15]:
# Note-level aggregation restricted to sentences that are not flagged as
# background/target. The result is bound to `no_bckgrnd` only: it is
# deliberately NOT also assigned to `notes`, so the full note-level table
# stays intact and the note-level report can be re-run with the same numbers.
no_bckgrnd = (
    results
    .query("not background")
    .astype({'labels': bool, 'preds': bool})
    .groupby(['NotitieID', 'domains'])[['labels', 'preds']]
    .any()
    .assign(correct=lambda note: note.labels == note.preds)
    .reset_index(level=1)
)

In [16]:
def make_classreport(domain):
    """Compute note-level scores for one domain, background sentences excluded.

    Parameters
    ----------
    domain : str
        Domain code; selects the rows of the module-level `no_bckgrnd` frame
        whose `domains` value equals it.

    Returns
    -------
    pd.Series
        Precision, recall and F1-score rounded to two decimals, indexed by
        'precision', 'recall' and 'f1_score', and named after `domain`.
    """
    # Filter once and reuse the subset for both gold labels and predictions.
    subset = no_bckgrnd.query(f"domains == '{domain}'")
    y_true = subset.labels.astype(int)
    y_pred = subset.preds.astype(int)
    # Rounding is done on the Series rather than by calling `.round()` on each
    # score, so it works whether the metric functions return NumPy scalars or
    # plain Python floats (which have no `.round` method).
    return pd.Series(
        index=['precision', 'recall', 'f1_score'],
        data=[
            metric(y_true, y_pred)
            for metric in (precision_score, recall_score, f1_score)
        ],
        name=domain,
    ).round(2)

# One column per domain, one row per metric.
pd.concat([make_classreport(domain) for domain in domains], axis=1)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
precision,0.89,0.88,1.0,0.83,0.75,0.79,0.88,0.7,0.78
recall,0.87,0.67,0.05,0.78,0.61,0.79,0.29,0.82,0.75
f1_score,0.88,0.76,0.1,0.8,0.68,0.79,0.44,0.76,0.77
