# Evaluation of domain classifiers

- Multi-label classification at the sentence level;
- Evaluation at the sentence level, followed by aggregation to the note level;
- Each domain is evaluated independently.

In [1]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

import sys
sys.path.insert(0, '..')
from utils.config import PATHS
from utils.data_process import flatten_preds_if_necessary

In [2]:
# Show every column and the full cell contents when displaying dataframes.
# Fully qualified option names are used because abbreviated keys are resolved
# by pattern matching and raise OptionError as soon as more than one option
# matches (e.g. 'max_columns' vs. 'styler.render.max_columns').
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Config

Define:

- list of domains,
- path to dir where a pickled test file with predictions is located,
- the predictions column to be evaluated.

In [3]:
# Location of the pickled evaluation set that already contains predictions.
# NOTE(review): the variables are called "test*", but the file loaded is
# 'dev.pkl' -- confirm that this is the intended split.
datapath = 'data_expr_sept'
testdir = 'clf_domains'
testpath = PATHS.getpath(datapath) / testdir / 'dev.pkl'

# Column with the predictions that are evaluated in this notebook.
pred_col = 'pred_domains_eb_ap_mod1'

# Domain codes; they are exploded in lockstep with the label/prediction
# lists further down, so position i here pairs with position i in those lists.
domains = [
    'ADM', 'ATT', 'BER',
    'ENR', 'ETN', 'FAC',
    'INS', 'MBW', 'STM',
]

# Load and process data

In [4]:
# Load the pickled sentences and normalise the prediction format.
df = flatten_preds_if_necessary(pd.read_pickle(testpath))

# Every row gets the full list of domains (needed for the explosion in the
# next cell), and the evaluated prediction column is exposed as 'preds'.
df = df.assign(
    domains=[domains] * len(df),
).rename(
    columns={pred_col: 'preds'},
)

In [5]:
# One row per (sentence, domain): the three list columns are exploded in
# lockstep so each domain is paired with its own label and prediction.
id_cols = ['pad_sen_id', 'NotitieID', 'annotator']
list_cols = ['domains', 'labels', 'preds']
results = df[id_cols + list_cols].explode(list_cols).reset_index()

# Sentence level evaluation

## Precision, recall, F1-score

In [6]:
def make_classreport(domain):
    """Return sentence-level precision, recall, F1 and support for one domain.

    Reads the module-level ``results`` frame (one row per sentence/domain
    pair) and scores the rows that belong to ``domain``.

    Parameters
    ----------
    domain : str
        Domain code as found in ``results.domains``.

    Returns
    -------
    pd.Series
        Object-dtype series named ``domain`` with the entries 'precision',
        'recall', 'f1_score' (rounded to 2 decimals) and 'support' (number
        of positive gold labels).
    """
    # Select the rows of this domain once instead of once per column.
    rows = results.query(f"domains == '{domain}'")
    y_true = rows.labels.astype(int)
    y_pred = rows.preds.astype(int)
    # All three metrics get zero_division=0 so that a domain without positive
    # labels/predictions is scored 0 consistently and without warnings.
    # Built-in round() is used because it works for any float-like scalar.
    scores = [
        round(metric(y_true, y_pred, zero_division=0), 2)
        for metric in (precision_score, recall_score, f1_score)
    ]
    return pd.Series(
        index=['precision', 'recall', 'f1_score', 'support'],
        data=[*scores, y_true.sum()],
        name=domain,
        dtype=object,
    )

pd.concat([make_classreport(domain) for domain in domains], axis=1)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
precision,0.72,0.91,0.44,0.72,0.54,0.47,0.42,0.7,0.53
recall,0.66,0.45,0.62,0.69,0.57,0.76,0.39,0.8,0.73
f1_score,0.69,0.61,0.51,0.7,0.56,0.58,0.4,0.75,0.61
support,411.0,22.0,29.0,105.0,225.0,119.0,127.0,96.0,147.0


## Error analysis

In [7]:
# Attach the background/target flags of each sentence to its per-domain rows.
sentence_flags = df[['pad_sen_id', 'background_sent', 'target_sent']]
results = results.merge(sentence_flags, how='left', on='pad_sen_id')

# A sentence counts as "background" when either flag is set;
# a row is "correct" when the prediction equals the gold label.
results = results.assign(
    background=results.background_sent | results.target_sent,
    correct=results.labels == results.preds,
)

In [8]:
# All misclassified (sentence, domain) rows, split by gold label:
# gold 0 -> false positive, gold 1 -> false negative.
not_correct = results.query("not correct")
false_pos = not_correct.query("labels == 0")
false_neg = not_correct.query("labels == 1")

### Confusion matrix

In [9]:
def find_confusion(df, domain):
    """Count which other domains were wrongly predicted where `domain` was missed.

    Takes the sentences in which `domain` is a false negative and, within
    those sentences, counts per other domain the rows that are incorrect
    with a positive prediction.

    Returns a Series indexed by domain (only domains with at least one such
    row), named after `domain`.
    """
    missed = df.query(f"domains == '{domain}' and not correct and labels == 1")
    # The name `fneg_sents` is referenced from inside the query string below.
    fneg_sents = set(missed.pad_sen_id)
    same_sentences = df.query("pad_sen_id in @fneg_sents")
    wrongly_predicted = same_sentences.query(
        f"domains != '{domain}' and not correct and preds == 1"
    )
    counts = wrongly_predicted.groupby('domains').size()
    return counts.rename(domain)

def align_series(s, doms):
    """Return `s` laid out on the index `doms` as floats.

    Labels of `doms` that are absent from `s` become NaN; labels of `s`
    that are not in `doms` are dropped. The name of `s` is kept.
    """
    missing = float('nan')
    values = [s.get(label, missing) for label in doms]
    return pd.Series(values, index=doms, name=s.name, dtype='float')

# One aligned confusion column per domain, all sharing the same domain index.
confusions = [
    align_series(find_confusion(results, domain), domains)
    for domain in domains
]

In [10]:
def find_truepos(df, domain):
    """Return the number of rows of `domain` that are correct with gold label 1."""
    hits = df.query(f"domains == '{domain}' and correct and labels == 1")
    return hits.shape[0]

# True-positive count per domain (used for the diagonal of the matrix).
truepos = dict(
    (name, find_truepos(results, name))
    for name in domains
)

In [11]:
def find_fneg(df, domain):
    """Count sentences where `domain` is a false negative.

    A sentence is counted when `domain` was missed (gold 1, incorrect) and
    at least one row of another domain in the same sentence is correct.

    NOTE(review): because a single correct other-domain row suffices, a
    sentence counted here may also show up in `find_confusion` -- confirm
    that this overlap is intended.
    """
    missed = df.query(f"domains == '{domain}' and not correct and labels == 1")
    # The name `fneg_sents` is referenced from inside the query string below.
    fneg_sents = set(missed.pad_sen_id)
    same_sentences = df.query("pad_sen_id in @fneg_sents")
    others_correct = same_sentences.query(f"domains != '{domain}' and correct")
    return others_correct.pad_sen_id.nunique()
# False-negative counts per domain, labelled so they can be added to the
# confusion matrix under the '&lt;none&gt;' heading.
fneg = pd.Series(
    {name: find_fneg(results, name) for name in domains},
    name='&lt;none&gt;',
)

In [12]:
def find_fpos(df, domain):
    """Count sentences where `domain` is a false positive.

    A sentence is counted when `domain` was wrongly predicted (gold 0,
    incorrect) and at least one row of another domain in the same sentence
    is correct.
    """
    spurious = df.query(f"domains == '{domain}' and not correct and labels == 0")
    # Despite its name this set holds the false-positive sentences; the name
    # is kept because the query string below refers to it.
    fneg_sents = set(spurious.pad_sen_id)
    same_sentences = df.query("pad_sen_id in @fneg_sents")
    others_correct = same_sentences.query(f"domains != '{domain}' and correct")
    return others_correct.pad_sen_id.nunique()
# False-positive counts per domain, labelled so they can be added to the
# confusion matrix under the '&lt;none&gt;' heading.
fpos = pd.Series(
    {name: find_fpos(results, name) for name in domains},
    name='&lt;none&gt;',
)

In [13]:
# Assemble the confusion matrix. Each series in `confusions` is named after
# the domain that was missed and indexed by the domain predicted instead, so
# columns are gold domains and rows are predicted domains.
matrix = pd.concat(confusions, axis=1).sort_index()

# True positives go on the diagonal.
for key, val in truepos.items():
    matrix.loc[key, key] = val

# Add the false negatives as an extra row and the false positives as an extra
# column. DataFrame.append no longer exists in pandas 2.x, so the row is
# added with pd.concat on the transposed one-column frame instead.
matrix = pd.concat([matrix, fneg.to_frame().T]).join(fpos)


In [14]:
# Empty cells mean "no cases": show them as 0 and shade by magnitude.
matrix_counts = matrix.fillna(0).astype(int)
matrix_counts.style.background_gradient(cmap='Greens')

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM,<none>
ADM,273,0,0,0,0,0,0,0,0,107
ATT,0,10,0,0,0,0,0,0,0,1
BER,0,0,18,0,0,0,0,0,0,23
ENR,0,0,0,72,0,0,2,0,0,28
ETN,0,0,0,2,128,0,0,2,0,107
FAC,0,0,0,0,0,91,1,0,0,103
INS,1,0,0,3,0,1,49,0,0,68
MBW,1,0,0,0,0,0,0,77,0,33
STM,0,0,0,3,1,0,0,0,107,95
<none>,138,12,11,33,97,28,78,19,40,0


### False positives

#### Background/target sentences

In [15]:
# Share (row-wise percentage) and absolute number of background sentences
# among the false positives of each domain.

crosstab_args = dict(
    index=false_pos.domains,
    columns=false_pos.background,
    values=false_pos.pad_sen_id,
    aggfunc='count',
)
percentage = pd.crosstab(normalize='index', **crosstab_args).mul(100).round(1)
count = pd.crosstab(**crosstab_args).fillna(0).astype(int)

pd.concat([percentage, count], keys=['percentage', 'count'], axis=1)

Unnamed: 0_level_0,percentage,percentage,count,count
background,False,True,False,True
domains,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ADM,63.6,36.4,68,39
ATT,100.0,0.0,1,0
BER,87.0,13.0,20,3
ENR,78.6,21.4,22,6
ETN,64.5,35.5,69,38
FAC,77.7,22.3,80,23
INS,77.9,22.1,53,15
MBW,87.9,12.1,29,4
STM,89.5,10.5,85,10


In [16]:
# Show the text of one randomly drawn false positive for STM
# that is a background/target sentence.

q = "domains == 'STM' and background"
example = false_pos.query(q).pad_sen_id.sample(1)
example_id = example.iloc[0]
df.query(f"pad_sen_id == '{example_id}'").loc[:, ['pad_sen_id', 'text']]

Unnamed: 0,pad_sen_id,text
199528,440256911_0055,Angsten : ja -


#### Not background/target

In [17]:
# Show the text of one randomly drawn false positive for MBW
# that is not a background/target sentence.

q = "domains == 'MBW' and not background"
example = false_pos.query(q).pad_sen_id.sample(1)
example_id = example.iloc[0]
df.query(f"pad_sen_id == '{example_id}'").loc[:, ['pad_sen_id', 'text']]

Unnamed: 0,pad_sen_id,text
181972,241987526_0010,6% gewichtsverlies


### False negatives

In [18]:
# Show the text of one randomly drawn false negative for INS.

q = "domains == 'INS'"
example = false_neg.query(q).pad_sen_id.sample(1)
example_id = example.iloc[0]
df.query(f"pad_sen_id == '{example_id}'").loc[:, ['pad_sen_id', 'text']]

Unnamed: 0,pad_sen_id,text
179104,411163806_0017,Activiteiten en Participatie ADL functioneren :


# Note level evaluation

In [19]:
# Aggregate to note level: a domain is present in a note (gold or predicted)
# when it is present in at least one of the note's sentences.
sentence_flags = results.astype({'labels': bool, 'preds': bool})
notes = sentence_flags.groupby(['NotitieID', 'domains'])[['labels', 'preds']].any()
notes = notes.assign(correct=notes.labels == notes.preds)
# Keep NotitieID as the index and turn 'domains' back into a column.
notes = notes.reset_index(level=1)

## Precision, recall, F1-score

In [20]:
def make_classreport(domain):
    """Return note-level precision, recall, F1 and support for one domain.

    Reads the module-level ``notes`` frame (one row per note/domain pair)
    and scores the rows that belong to ``domain``.

    Parameters
    ----------
    domain : str
        Domain code as found in ``notes.domains``.

    Returns
    -------
    pd.Series
        Object-dtype series named ``domain`` with the entries 'precision',
        'recall', 'f1_score' (rounded to 2 decimals) and 'support' (number
        of notes with a positive gold label).
    """
    # Select the rows of this domain once instead of once per column.
    rows = notes.query(f"domains == '{domain}'")
    y_true = rows.labels.astype(int)
    y_pred = rows.preds.astype(int)
    # All three metrics get zero_division=0 so that a domain without positive
    # labels/predictions is scored 0 consistently and without warnings.
    # Built-in round() is used because it works for any float-like scalar.
    scores = [
        round(metric(y_true, y_pred, zero_division=0), 2)
        for metric in (precision_score, recall_score, f1_score)
    ]
    return pd.Series(
        index=['precision', 'recall', 'f1_score', 'support'],
        data=[*scores, y_true.sum()],
        name=domain,
        dtype=object,
    )

pd.concat([make_classreport(domain) for domain in domains], axis=1)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
precision,0.87,1.0,0.55,0.8,0.83,0.7,0.61,0.81,0.67
recall,0.89,0.53,0.72,0.79,0.77,0.85,0.53,0.87,0.84
f1_score,0.88,0.69,0.62,0.79,0.8,0.77,0.57,0.84,0.74
support,188.0,17.0,25.0,71.0,128.0,75.0,78.0,71.0,83.0


## Impact background/target

To estimate the impact of background/target sentences, I remove those that carry no domain label from the test set and repeat the note-level evaluation.

In [21]:
# Ids of background/target sentences that carry no positive domain label.
# The "no label" check inspects the label values directly rather than
# comparing the list's string form with a hard-coded row of nine zeros, so
# it does not depend on the number of domains or on how the labels print.
is_context = df.background_sent | df.target_sent
is_unlabeled = df.labels.map(lambda sentence_labels: not any(sentence_labels))
background_sents = df.loc[is_context & is_unlabeled, 'pad_sen_id'].to_list()

In [22]:
# Same note-level aggregation as before, but without the unlabeled
# background/target sentences.
is_kept = ~results.pad_sen_id.isin(background_sents)
kept_flags = results.loc[is_kept].astype({'labels': bool, 'preds': bool})
no_bckgrnd = kept_flags.groupby(['NotitieID', 'domains'])[['labels', 'preds']].any()
no_bckgrnd = no_bckgrnd.assign(correct=no_bckgrnd.labels == no_bckgrnd.preds)
# Keep NotitieID as the index and turn 'domains' back into a column.
no_bckgrnd = no_bckgrnd.reset_index(level=1)

In [23]:
def make_classreport(domain):
    """Return note-level scores for one domain, background sentences excluded.

    Reads the module-level ``no_bckgrnd`` frame (one row per note/domain
    pair) and scores the rows that belong to ``domain``.

    Parameters
    ----------
    domain : str
        Domain code as found in ``no_bckgrnd.domains``.

    Returns
    -------
    pd.Series
        Object-dtype series named ``domain`` with the entries 'precision',
        'recall', 'f1_score' (rounded to 2 decimals) and 'support' (number
        of notes with a positive gold label).
    """
    # Select the rows of this domain once instead of once per column.
    rows = no_bckgrnd.query(f"domains == '{domain}'")
    y_true = rows.labels.astype(int)
    y_pred = rows.preds.astype(int)
    # All three metrics get zero_division=0 so that a domain without positive
    # labels/predictions is scored 0 consistently and without warnings.
    # Built-in round() is used because it works for any float-like scalar.
    scores = [
        round(metric(y_true, y_pred, zero_division=0), 2)
        for metric in (precision_score, recall_score, f1_score)
    ]
    return pd.Series(
        index=['precision', 'recall', 'f1_score', 'support'],
        data=[*scores, y_true.sum()],
        name=domain,
        dtype=object,
    )

pd.concat([make_classreport(domain) for domain in domains], axis=1)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
precision,0.89,1.0,0.55,0.84,0.86,0.73,0.66,0.81,0.67
recall,0.88,0.53,0.68,0.79,0.75,0.84,0.51,0.86,0.84
f1_score,0.89,0.69,0.61,0.81,0.8,0.78,0.58,0.84,0.75
support,188.0,17.0,25.0,71.0,128.0,75.0,78.0,71.0,83.0
