# Evaluation of domain classifiers

- Multi-label classification at sentence level;
- Evaluation at sentence level, plus aggregation to note level;
- Each domain is evaluated independently.

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.metrics import precision_score, recall_score, f1_score

import sys
sys.path.insert(0, '..')
from utils.config import PATHS

In [2]:
# Show every column and the full cell contents when a dataframe is displayed.
# The fully qualified option keys are used because a bare 'max_columns'
# pattern can match more than one registered option (display.* and styler.*),
# in which case pandas refuses to set it.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Config

Define:

- list of domains,
- path to dir where a pickled test file with predictions is located,
- the predictions column to be evaluated.

In [3]:
# Domain codes; this list is paired positionally with each sentence's
# `labels` and `preds` lists when the dataframe is exploded further down.
domains = ['ADM', 'ATT', 'BER', 'ENR', 'ETN', 'FAC', 'INS', 'MBW', 'STM']

# Column of the test dataframe that holds the predictions to evaluate.
pred_col = 'pred_domains_add_pilot'

# Key passed to PATHS.getpath, and the sub-directory below it that contains
# the pickled test set.
datapath = 'data_expr_july'
testdir = 'clf_domains'
testpath = PATHS.getpath(datapath) / testdir / 'test.pkl'

# Load and process data

In [4]:
# Raw predictions are stored as a list inside a list, e.g. [[0, 1, 0, 0]];
# the `preds` column holds the inner list only.
# The `domains` column repeats the full domain list on every row so that it
# can be exploded together with `labels` and `preds` in the next cell.

df = pd.read_pickle(testpath)
df = df.assign(
    preds=df[pred_col].str[0],
    domains=[domains] * len(df),
)

In [5]:
# Long format: one row per (sentence, domain) with that domain's label and
# prediction. reset_index() keeps the original row index as a column.
result_cols = ['pad_sen_id', 'NotitieID', 'annotator', 'domains', 'labels', 'preds']
results = df.explode(['domains', 'labels', 'preds'])
results = results[result_cols].reset_index()

# Sentence level evaluation

## Precision, recall, F1-score

In [6]:
def make_classreport(domain):
    """Sentence-level precision, recall, F1-score and support for one domain.

    Reads the module-level `results` dataframe, keeps the rows of `domain`
    and compares their `labels` and `preds` columns (both cast to int).

    Returns a pandas Series named after the domain with the entries
    'precision', 'recall', 'f1_score' (rounded to two decimals) and
    'support' (number of positive labels).
    """
    rows = results.query(f"domains == '{domain}'")
    y_true = rows.labels.astype(int)
    y_pred = rows.preds.astype(int)
    # zero_division=0 is passed to all three metrics so that a domain without
    # positives scores 0 everywhere instead of warning for F1 only.
    # The built-in round() works for NumPy scalars and plain floats alike.
    precision = round(precision_score(y_true, y_pred, zero_division=0), 2)
    recall = round(recall_score(y_true, y_pred, zero_division=0), 2)
    f1 = round(f1_score(y_true, y_pred, zero_division=0), 2)
    support = y_true.sum()
    return pd.Series(
        index = ['precision', 'recall', 'f1_score', 'support'],
        data = [precision, recall, f1, support],
        name = domain,
        dtype = object,
    )

pd.concat([make_classreport(domain) for domain in domains], axis=1)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
precision,0.65,0.83,0.31,0.72,0.51,0.49,0.42,0.58,0.53
recall,0.6,0.47,0.62,0.66,0.66,0.82,0.4,0.83,0.76
f1_score,0.63,0.6,0.42,0.69,0.57,0.62,0.41,0.68,0.62
support,405.0,32.0,26.0,98.0,178.0,136.0,133.0,64.0,155.0


## Error analysis

In [7]:
# Attach the background/target flags of every sentence to its result rows,
# then derive two helper columns:
#   background - the sentence is flagged as background or as target
#   correct    - the prediction equals the label for this (sentence, domain)
sentence_flags = df[['pad_sen_id', 'background_sent', 'target_sent']]
results = results.merge(sentence_flags, how='left', on='pad_sen_id')
results = results.assign(
    background=results.background_sent | results.target_sent,
    correct=results.labels == results.preds,
)

In [8]:
# All wrong (sentence, domain) rows, split by the value of the label:
# label 0 -> false positive, label 1 -> false negative.
not_correct = results.query("not correct")
false_pos = not_correct.query("labels == 0")
false_neg = not_correct.query("labels == 1")

### Confusion matrix

In [9]:
def find_confusion(df, domain):
    """Count, per other domain, the false positives on sentences that `domain` missed.

    First collects the sentences with a false negative for `domain`
    (label 1, not correct). Within those sentences, counts the rows of every
    other domain that are wrong with a prediction of 1.

    Returns a Series indexed by domain (only domains with at least one such
    row) and named `domain`.
    """
    is_domain = df['domains'] == domain
    is_wrong = ~df['correct']
    missed_ids = set(df.loc[is_domain & is_wrong & (df['labels'] == 1), 'pad_sen_id'])
    spurious_elsewhere = (
        df['pad_sen_id'].isin(missed_ids)
        & ~is_domain
        & is_wrong
        & (df['preds'] == 1)
    )
    counts = df[spurious_elsewhere].groupby('domains').size()
    return counts.rename(domain)

def align_series(s, doms):
    """Return the values of `s` laid out on the index `doms`, as floats.

    Labels of `doms` that are missing from `s` become NaN; labels of `s`
    that are not in `doms` are dropped. The name of `s` is kept.
    """
    known = s.to_dict()
    values = [known.get(label, float('nan')) for label in doms]
    return pd.Series(values, index=doms, name=s.name, dtype='float')

# One confusion Series per domain, each laid out on the full domain list so
# that they can be concatenated into a square matrix later.
confusions = [
    align_series(find_confusion(results, domain), domains)
    for domain in domains
]

In [10]:
def find_truepos(df, domain):
    """Return the number of rows of `domain` that are correct with label 1."""
    is_truepos = (df['domains'] == domain) & df['correct'] & (df['labels'] == 1)
    return int(is_truepos.sum())

# Number of true positives per domain; these end up on the matrix diagonal.
truepos = dict((d, find_truepos(results, d)) for d in domains)

In [11]:
def find_fneg(df, domain):
    """Count sentences missed for `domain` that have a correct row elsewhere.

    A sentence is counted when it has a false negative for `domain`
    (label 1, not correct) and at least one row of another domain that is
    marked correct.

    NOTE(review): if `df` holds one row per (sentence, domain), nearly every
    missed sentence has some correct row in another domain, so this is
    presumably close to the total number of false negatives - confirm intent.
    """
    is_domain = df['domains'] == domain
    missed = is_domain & ~df['correct'] & (df['labels'] == 1)
    missed_ids = set(df.loc[missed, 'pad_sen_id'])
    correct_elsewhere = df['pad_sen_id'].isin(missed_ids) & ~is_domain & df['correct']
    return df.loc[correct_elsewhere, 'pad_sen_id'].nunique()
# Per-domain counts from find_fneg, named so that they form the
# '&lt;none&gt;' row of the matrix.
fneg = pd.Series(
    {d: find_fneg(results, d) for d in domains},
    name='&lt;none&gt;',
)

In [12]:
def find_fpos(df, domain):
    """Count sentences wrongly flagged for `domain` that have a correct row elsewhere.

    A sentence is counted when it has a false positive for `domain`
    (label 0, not correct) and at least one row of another domain that is
    marked correct.
    """
    is_domain = df['domains'] == domain
    spurious = is_domain & ~df['correct'] & (df['labels'] == 0)
    fpos_ids = set(df.loc[spurious, 'pad_sen_id'])
    correct_elsewhere = df['pad_sen_id'].isin(fpos_ids) & ~is_domain & df['correct']
    return df.loc[correct_elsewhere, 'pad_sen_id'].nunique()
# Per-domain counts from find_fpos, named so that they form the
# '&lt;none&gt;' column of the matrix.
fpos = pd.Series(
    {d: find_fpos(results, d) for d in domains},
    name='&lt;none&gt;',
)

In [13]:
# Assemble the confusion matrix: one column per domain from `confusions`,
# true-positive counts on the diagonal, `fneg` as an extra row and `fpos`
# as an extra column.
matrix = pd.concat(confusions, axis=1).sort_index()
for key, val in truepos.items():
    matrix.loc[key, key] = val
# DataFrame.append no longer exists in pandas 2.x; concatenating the Series
# as a one-row frame gives the same result (row label = Series name).
matrix = pd.concat([matrix, fneg.to_frame().T]).join(fpos)


In [14]:
# Missing cells mean "no such case": show them as 0 and shade by count.
matrix_counts = matrix.fillna(0).astype(int)
matrix_counts.style.background_gradient(cmap='Greens')

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM,<none>
ADM,245,0,0,1,1,1,3,0,2,131
ATT,0,15,0,0,0,0,0,0,1,3
BER,0,0,16,1,0,0,1,0,0,35
ENR,1,0,0,65,0,0,0,0,0,25
ETN,0,0,0,0,117,0,0,0,0,113
FAC,2,1,0,1,1,112,9,0,0,116
INS,2,1,0,2,0,1,53,1,1,72
MBW,0,0,0,0,0,0,0,53,1,38
STM,0,0,0,1,0,0,0,0,118,105
<none>,160,17,10,33,61,24,80,11,37,0


### False positives

#### Background/target sentences

In [15]:
# Share of background sentences among the false positives, per domain.
# The row percentages and the raw counts come from the same crosstab
# arguments; only `normalize` differs.

fp_by_background = dict(
    index=false_pos.domains,
    columns=false_pos.background,
    aggfunc='count',
    values=false_pos.pad_sen_id,
)
row_percentages = pd.crosstab(normalize='index', **fp_by_background).mul(100).round(1)
raw_counts = pd.crosstab(**fp_by_background).fillna(0).astype(int)

pd.concat([row_percentages, raw_counts], keys=['percentage', 'count'], axis=1)

Unnamed: 0_level_0,percentage,percentage,count,count
background,False,True,False,True
domains,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ADM,74.0,26.0,97,34
ATT,66.7,33.3,2,1
BER,77.1,22.9,27,8
ENR,60.0,40.0,15,10
ETN,72.6,27.4,82,31
FAC,66.4,33.6,77,39
INS,76.4,23.6,55,17
MBW,78.9,21.1,30,8
STM,82.9,17.1,87,18


In [16]:
# Random false positive for STM on a background/target sentence.

q = "domains == 'STM' and background"
example = false_pos.query(q).pad_sen_id.sample(1)
df.loc[df.pad_sen_id == example.iloc[0], ['pad_sen_id', 'text']]

Unnamed: 0,pad_sen_id,text
149762,407195723_0058,Voelt zich opgelucht .


#### Not background/target

In [17]:
# Random false positive for MBW on a sentence that is neither background nor target.

q = "domains == 'MBW' and not background"
example = false_pos.query(q).pad_sen_id.sample(1)
df.loc[df.pad_sen_id == example.iloc[0], ['pad_sen_id', 'text']]

Unnamed: 0,pad_sen_id,text
212244,253217006_0027,Niet afgevallen .


### False negatives

In [18]:
# Random false negative for INS.

q = "domains == 'INS'"
example = false_neg.query(q).pad_sen_id.sample(1)
df.loc[df.pad_sen_id == example.iloc[0], ['pad_sen_id', 'text']]

Unnamed: 0,pad_sen_id,text
24474,413727342_0038,Zit nog niet op uit bed .


# Note level evaluation

In [19]:
# Note-level aggregation: a domain counts as labelled / predicted for a note
# when any sentence of that note has it. The result is indexed by NotitieID
# with `domains` as a regular column.
notes = (
    results
    .astype({'labels': bool, 'preds': bool})
    .groupby(['NotitieID', 'domains'])[['labels', 'preds']]
    .any()
)
notes = notes.assign(correct=notes.labels == notes.preds).reset_index(level=1)

## Precision, recall, F1-score

In [20]:
def make_classreport(domain):
    """Note-level precision, recall, F1-score and support for one domain.

    Reads the module-level `notes` dataframe, keeps the rows of `domain`
    and compares their `labels` and `preds` columns (both cast to int).

    Returns a pandas Series named after the domain with the entries
    'precision', 'recall', 'f1_score' (rounded to two decimals) and
    'support' (number of positive labels).
    """
    rows = notes.query(f"domains == '{domain}'")
    y_true = rows.labels.astype(int)
    y_pred = rows.preds.astype(int)
    # zero_division=0 is passed to all three metrics so that a domain without
    # positives scores 0 everywhere instead of warning for F1 only.
    # The built-in round() works for NumPy scalars and plain floats alike.
    precision = round(precision_score(y_true, y_pred, zero_division=0), 2)
    recall = round(recall_score(y_true, y_pred, zero_division=0), 2)
    f1 = round(f1_score(y_true, y_pred, zero_division=0), 2)
    support = y_true.sum()
    return pd.Series(
        index = ['precision', 'recall', 'f1_score', 'support'],
        data = [precision, recall, f1, support],
        name = domain,
        dtype = object,
    )

pd.concat([make_classreport(domain) for domain in domains], axis=1)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
precision,0.86,0.88,0.41,0.81,0.71,0.68,0.64,0.69,0.65
recall,0.87,0.67,0.73,0.8,0.75,0.92,0.6,0.93,0.91
f1_score,0.87,0.76,0.52,0.8,0.73,0.78,0.62,0.79,0.76
support,199.0,21.0,22.0,69.0,123.0,80.0,75.0,45.0,85.0


## Impact background/target

To estimate the impact of background/target sentences, I remove those that carry no domain label from the test set and repeat the note-level evaluation.

In [21]:
# Ids of the background/target sentences that carry no domain label at all.
# The all-zero pattern is derived from the number of domains instead of being
# spelled out, so it stays valid if the `domains` list changes.
all_zero_labels = str([0] * len(domains))
is_background = df.background_sent | df.target_sent
is_unlabeled = df.labels.astype('str') == all_zero_labels
background_sents = df.loc[is_background & is_unlabeled, 'pad_sen_id'].to_list()

In [22]:
# Same note-level aggregation as for `notes`, after dropping the sentences
# listed in `background_sents`.
kept_rows = results.loc[~results.pad_sen_id.isin(background_sents)]
no_bckgrnd = (
    kept_rows
    .astype({'labels': bool, 'preds': bool})
    .groupby(['NotitieID', 'domains'])[['labels', 'preds']]
    .any()
)
no_bckgrnd = no_bckgrnd.assign(
    correct=no_bckgrnd.labels == no_bckgrnd.preds,
).reset_index(level=1)

In [23]:
def make_classreport(domain):
    """Note-level metrics for one domain, computed on `no_bckgrnd`.

    Reads the module-level `no_bckgrnd` dataframe, keeps the rows of `domain`
    and compares their `labels` and `preds` columns (both cast to int).

    Returns a pandas Series named after the domain with the entries
    'precision', 'recall', 'f1_score' (rounded to two decimals) and
    'support' (number of positive labels).
    """
    rows = no_bckgrnd.query(f"domains == '{domain}'")
    y_true = rows.labels.astype(int)
    y_pred = rows.preds.astype(int)
    # zero_division=0 is passed to all three metrics so that a domain without
    # positives scores 0 everywhere instead of warning for F1 only.
    # The built-in round() works for NumPy scalars and plain floats alike.
    precision = round(precision_score(y_true, y_pred, zero_division=0), 2)
    recall = round(recall_score(y_true, y_pred, zero_division=0), 2)
    f1 = round(f1_score(y_true, y_pred, zero_division=0), 2)
    support = y_true.sum()
    return pd.Series(
        index = ['precision', 'recall', 'f1_score', 'support'],
        data = [precision, recall, f1, support],
        name = domain,
        dtype = object,
    )

pd.concat([make_classreport(domain) for domain in domains], axis=1)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
precision,0.87,0.93,0.48,0.86,0.74,0.73,0.7,0.71,0.68
recall,0.85,0.67,0.68,0.78,0.75,0.92,0.59,0.93,0.89
f1_score,0.86,0.78,0.57,0.82,0.74,0.81,0.64,0.81,0.77
support,199.0,21.0,22.0,69.0,123.0,80.0,75.0,45.0,85.0
