# Evaluation of domain classifiers

- Multi-label classification at the sentence level;
- Evaluation at the sentence level, plus aggregation to the note level;
- Each domain is evaluated independently.

In [1]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

import sys
sys.path.insert(0, '..')
from utils.config import PATHS
from utils.data_process import flatten_preds_if_necessary

In [2]:
# Show every column and never truncate cell contents when a frame is displayed.
# The fully qualified keys are used on purpose: option names are matched as
# patterns, and the short form 'max_columns' can match more than one option
# (e.g. a Styler option as well), which makes pandas raise an OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Config

Define:

- list of domains,
- path to dir where a pickled test file with predictions is located,
- the predictions column to be evaluated.

In [3]:
# Domain codes. The order matters: the list is exploded position-by-position
# together with the per-sentence label and prediction vectors further down.
domains = [
    'ADM', 'ATT', 'BER',
    'ENR', 'ETN', 'FAC',
    'INS', 'MBW', 'STM',
]

# Name of the column in the test set that holds the predictions to evaluate.
pred_col = 'pred_domains_eb_ap_mod1'

# The pickled test set lives at <path registered for datapath>/<testdir>/test.pkl
datapath, testdir = 'data_expr_sept', 'clf_domains'
test_root = PATHS.getpath(datapath) / testdir
testpath = test_root / 'test.pkl'

# Load and process data

In [4]:
# Read the pickled test set and pass it through the project helper
# `flatten_preds_if_necessary`.
df = flatten_preds_if_necessary(pd.read_pickle(testpath))

# Every row gets the complete domain list, so that `domains`, `labels` and
# `preds` can be exploded in lockstep in the next cell.
df = df.assign(domains=[domains] * len(df))

# Evaluate the configured prediction column under the generic name 'preds'.
df = df.rename(columns={pred_col: 'preds'})

In [5]:
# Long format: one row per (sentence, domain) pair. The three list-valued
# columns are exploded together so that position i of each list ends up on
# the same row; the original row index is kept as a column by reset_index().
keep_cols = ['pad_sen_id', 'NotitieID', 'annotator', 'domains', 'labels', 'preds']
results = (
    df
    .explode(['domains', 'labels', 'preds'])
    .loc[:, keep_cols]
    .reset_index()
)

# Sentence level evaluation

## Precision, recall, F1-score

In [6]:
def make_classreport(domain):
    """Return sentence-level precision, recall, F1 and support for one domain.

    Reads the module-level ``results`` frame (one row per sentence/domain
    pair) and scores the rows whose ``domains`` value equals ``domain``.

    Parameters
    ----------
    domain : str
        Domain code to evaluate.

    Returns
    -------
    pandas.Series
        Object-dtype series named ``domain`` with the entries ``precision``,
        ``recall``, ``f1_score`` (each rounded to two decimals) and
        ``support`` (number of positive gold labels).
    """
    # Select the domain's rows once, so y_true and y_pred come from the very
    # same selection.
    rows = results[results.domains == domain]
    y_true = rows.labels.astype(int)
    y_pred = rows.preds.astype(int)

    # zero_division=0 is passed to all three metrics so that a domain without
    # positive labels/predictions scores 0 consistently instead of emitting
    # an UndefinedMetricWarning for F1 only. The built-in round() is used so
    # the code works whether the metric returns a numpy or a Python float.
    scores = [
        round(float(metric(y_true, y_pred, zero_division=0)), 2)
        for metric in (precision_score, recall_score, f1_score)
    ]
    support = int(y_true.sum())

    return pd.Series(
        index=['precision', 'recall', 'f1_score', 'support'],
        data=[*scores, support],
        name=domain,
        dtype=object,
    )

pd.concat([make_classreport(domain) for domain in domains], axis=1)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
precision,0.98,1.0,0.48,0.98,0.93,0.82,0.82,0.78,0.68
recall,0.47,0.36,0.41,0.56,0.49,0.72,0.29,0.62,0.76
f1_score,0.64,0.53,0.44,0.71,0.64,0.76,0.43,0.69,0.72
support,775.0,39.0,54.0,160.0,382.0,253.0,287.0,125.0,181.0


## Error analysis

In [7]:
# Attach the per-sentence flags from `df` to every (sentence, domain) row.
sentence_flags = df[['pad_sen_id', 'background_sent', 'target_sent']]
merged = results.merge(sentence_flags, how='left', on='pad_sen_id')

# background: element-wise `|` of the two flag columns.
merged['background'] = merged.background_sent | merged.target_sent
# correct: the prediction for this domain equals the gold label.
merged['correct'] = merged.labels == merged.preds

results = merged

In [8]:
# Misclassified (sentence, domain) rows, then split by gold label:
# gold 0 -> the domain was predicted wrongly (false positive),
# gold 1 -> the domain was missed (false negative).
not_correct = results[~results.correct]
false_pos = not_correct[not_correct.labels == 0]
false_neg = not_correct[not_correct.labels == 1]

### Confusion matrix

In [9]:
def find_confusion(df, domain):
    """Count, per other domain, wrong predictions on sentences where `domain` was missed.

    First collects the sentences in which `domain` is a false negative, then,
    within those sentences, counts the incorrect rows of every *other* domain
    that were predicted as 1.

    Returns a Series indexed by domain code (index name 'domains') and named
    after `domain`; domains without such rows are absent.
    """
    is_wrong = ~df.correct
    is_target = df.domains == domain

    # Sentences where the target domain is a false negative.
    missed_ids = set(df.loc[is_target & is_wrong & (df.labels == 1), 'pad_sen_id'])

    # Spurious predictions of other domains inside those sentences.
    spurious = df[
        df.pad_sen_id.isin(missed_ids)
        & ~is_target
        & is_wrong
        & (df.preds == 1)
    ]
    counts = spurious.groupby('domains').size()
    return counts.rename(domain)

def align_series(s, doms):
    """Return `s` laid out on the index `doms` as a float Series.

    Labels of `doms` that are missing from `s` become NaN; labels of `s`
    that are not in `doms` are dropped. The name of `s` is kept.
    """
    missing = float('nan')
    values = [s.get(label, missing) for label in doms]
    return pd.Series(values, index=doms, name=s.name, dtype='float')

# One confusion column per domain, each laid out on the full domain index so
# the columns can be concatenated into a square table later on.
confusions = [
    align_series(find_confusion(results, domain), domains)
    for domain in domains
]

In [10]:
def find_truepos(df, domain):
    """Return the number of rows of `domain` that are correct with gold label 1."""
    hits = (df.domains == domain) & df.correct & (df.labels == 1)
    return int(hits.sum())

# True-positive count per domain; these values fill the diagonal of the
# confusion table assembled below.
truepos = dict(
    (name, find_truepos(results, name))
    for name in domains
)

In [11]:
def find_fneg(df, domain):
    """Count sentences where `domain` was missed and no other domain was wrongly predicted.

    A sentence is counted when `domain` is a false negative in it and the
    sentence has no incorrect row of another domain with prediction 1.
    """
    is_wrong = ~df.correct
    is_target = df.domains == domain

    # Sentences in which the target domain is a false negative.
    missed_ids = set(df.loc[is_target & is_wrong & (df.labels == 1), 'pad_sen_id'])
    # Sentences in which some other domain was predicted although it is wrong.
    confused_ids = set(df.loc[~is_target & is_wrong & (df.preds == 1), 'pad_sen_id'])

    return len(missed_ids - confused_ids)
# Per-domain count of false negatives without a competing wrong prediction.
# The series name is the HTML-escaped form of "<none>" (presumably so the
# label survives HTML rendering of the styled table); it becomes the label
# of the extra row in the confusion table.
fneg = pd.Series(
    {name: find_fneg(results, name) for name in domains},
    name='&lt;none&gt;',
)

In [12]:
def find_fpos(df, domain):
    """Count false-positive sentences of `domain` that have a correct row for another domain.

    A sentence is counted when `domain` is a false positive in it (incorrect
    with gold label 0) and at least one other domain is classified correctly
    in the same sentence.
    """
    is_target = df.domains == domain

    # Sentences in which the target domain is a false positive.
    fpos_ids = set(df.loc[is_target & ~df.correct & (df.labels == 0), 'pad_sen_id'])

    # Rows of the other domains, within those sentences, that are correct.
    others_correct = df[df.pad_sen_id.isin(fpos_ids) & ~is_target & df.correct]
    return others_correct.pad_sen_id.nunique()
# Per-domain false-positive count as computed by find_fpos. The series name
# is the HTML-escaped form of "<none>" (presumably so the label survives HTML
# rendering of the styled table); it becomes the label of the extra column
# in the confusion table.
fpos = pd.Series(
    {name: find_fpos(results, name) for name in domains},
    name='&lt;none&gt;',
)

In [13]:
# Assemble the confusion table. Each column is a domain that was missed
# (false negative); each row is another domain that was wrongly predicted in
# the same sentence.
matrix = pd.concat(confusions, axis=1).sort_index()

# True positives go on the diagonal.
for key, val in truepos.items():
    matrix.loc[key, key] = val

# Add the `fneg` series as an extra row and the `fpos` series as an extra
# column. DataFrame.append was removed in pandas 2.0, so the row is added
# with pd.concat on a one-row frame instead.
matrix = pd.concat([matrix, fneg.to_frame().T]).join(fpos)

In [14]:
# Render the table: empty cells become 0, counts are shown as integers and
# the cells are shaded with a green gradient.
(
    matrix
    .fillna(0)
    .astype(int)
    .style
    .background_gradient(cmap='Greens')
)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM,<none>
ADM,368,0,0,0,0,0,0,0,0,9
ATT,0,14,0,0,0,0,0,0,0,0
BER,0,0,22,1,0,0,4,0,0,24
ENR,0,0,0,90,0,0,0,0,0,2
ETN,0,0,0,0,186,0,0,1,0,15
FAC,2,0,0,1,1,182,5,0,0,41
INS,1,0,1,0,0,0,83,0,0,18
MBW,0,0,0,1,4,0,0,77,0,22
STM,1,0,1,1,1,0,1,0,138,65
<none>,403,25,30,66,190,71,194,47,43,0


In [79]:
# Example: print the sentences in which BER was missed (false negative)
# while INS was wrongly predicted (false positive).
missed_domain, domain = 'BER', 'INS'

false_neg_dom = false_neg.query(f"domains == '{missed_domain}'").pad_sen_id.unique()
q = f"domains == '{domain}'"
false_pos_dom = false_pos.query(q).pad_sen_id.unique()

# Walk the false positives in their original order and keep those that are
# also in the false-negative selection.
missed_ids = set(false_neg_dom)
for pad_sen_id in false_pos_dom:
    if pad_sen_id in missed_ids:
        print(pad_sen_id)

399996679_0072


In [80]:
# Look up the sentence printed above: its text plus the gold and predicted vectors.
sents = ['399996679_0072']
df.loc[df.pad_sen_id.isin(sents), ['pad_sen_id', 'text', 'labels', 'preds']]

Unnamed: 0,pad_sen_id,text,labels,preds
13606,399996679_0072,"In het dagelijks leven gaat het redelijk , kan een trap oplopen , helpt haar dochter in de groothandel naar kunnen .","[0, 0, 1, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 1, 1, 0, 0]"


### False positives

In [76]:
# Show one randomly drawn false positive of the chosen domain.
domain = 'STM'
q = f"domains == '{domain}'"
example = false_pos.loc[false_pos.domains == domain, 'pad_sen_id'].sample(1)
df.loc[df.pad_sen_id == example.iloc[0], ['pad_sen_id', 'text', 'labels', 'preds']]

Unnamed: 0,pad_sen_id,text,labels,preds
30854,444008207_0025,"Bij psychiatrisch onderzoek is er sprake van een geagiteerd angstig-depressief toestandsbeeld met suicidaliteit , geen psychose .","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1]"


### False negatives

In [61]:
# Show one randomly drawn false negative of the chosen domain.
domain = 'STM'
q = f"domains == '{domain}'"
example = false_neg.loc[false_neg.domains == domain, 'pad_sen_id'].sample(1)
df.loc[df.pad_sen_id == example.iloc[0], ['pad_sen_id', 'text', 'labels', 'preds']]

Unnamed: 0,pad_sen_id,text,labels,preds
192672,408924107_0038,Emotineel laag belastbaar en onzeker tijdens mobiliseren .,"[0, 0, 0, 0, 0, 0, 0, 0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"


# Note level evaluation

In [19]:
# Aggregate to note level: within a note, a domain counts as labelled or
# predicted when at least one of the note's sentences has it.
as_bool = results.astype({'labels': bool, 'preds': bool})
per_note = as_bool.groupby(['NotitieID', 'domains'])[['labels', 'preds']].any()
per_note['correct'] = per_note.labels == per_note.preds

# Keep NotitieID as the index and turn `domains` back into a regular column.
notes = per_note.reset_index(level=1)

## Precision, recall, F1-score

In [20]:
def make_classreport(domain):
    """Return note-level precision, recall, F1 and support for one domain.

    Reads the module-level ``notes`` frame (one row per note/domain pair)
    and scores the rows whose ``domains`` value equals ``domain``.

    Parameters
    ----------
    domain : str
        Domain code to evaluate.

    Returns
    -------
    pandas.Series
        Object-dtype series named ``domain`` with the entries ``precision``,
        ``recall``, ``f1_score`` (each rounded to two decimals) and
        ``support`` (number of notes with a positive gold label).
    """
    # Select the domain's rows once, so y_true and y_pred come from the very
    # same selection.
    rows = notes[notes.domains == domain]
    y_true = rows.labels.astype(int)
    y_pred = rows.preds.astype(int)

    # zero_division=0 is passed to all three metrics so that a domain without
    # positive labels/predictions scores 0 consistently instead of emitting
    # an UndefinedMetricWarning for F1 only. The built-in round() is used so
    # the code works whether the metric returns a numpy or a Python float.
    scores = [
        round(float(metric(y_true, y_pred, zero_division=0)), 2)
        for metric in (precision_score, recall_score, f1_score)
    ]
    support = int(y_true.sum())

    return pd.Series(
        index=['precision', 'recall', 'f1_score', 'support'],
        data=[*scores, support],
        name=domain,
        dtype=object,
    )

pd.concat([make_classreport(domain) for domain in domains], axis=1)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
precision,0.99,1.0,0.62,0.98,0.94,0.83,0.94,0.88,0.79
recall,0.87,0.48,0.62,0.67,0.73,0.89,0.53,0.88,0.89
f1_score,0.93,0.65,0.62,0.8,0.82,0.86,0.67,0.88,0.84
support,231.0,27.0,34.0,92.0,165.0,95.0,116.0,64.0,94.0
