# Evaluation of domain classifiers
## ====== 9-CLASS FORMAT ======

- Multi-label classification on sentence-level;
- Evaluation on sentence-level + aggregation on note-level;
- Each domain is evaluated independently.

In [1]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import sys
sys.path.insert(0, '..')
from utils.config import PATHS

from utils.data_process import flatten_preds_if_necessary

# Config

In [18]:
###### USER TO UPDATE ######
# UPDATE PREDICTION & GOLD LABEL COLUMN NAME

# list of domains
domains = ['ADM', 'ATT', 'BER', 'ENR', 'ETN', 'FAC', 'INS', 'MBW', 'STM']

# filename and path to .pkl with predictions
testpath = '../data/output/e0_output_sharona_data_all_labels.pkl'

# name of column with predictions
pred_col = 'pred_jenia'

# Load and process data

In [20]:
# domains column is needed for the explosion in the next cell
df = pd.read_pickle(testpath).assign(domains = lambda df: [domains] * len(df),).rename(columns={pred_col: 'preds'})
#.pipe(flatten_preds_if_necessary)

df

In [21]:
results = df.explode(
    ['domains', 'labels_9', 'preds']
)[['pad_sen_id', 'NotitieID', 'annotator', 'domains', 'labels_9', 'preds']].reset_index()

# Sentence level evaluation

## Precision, recall, F1-score

In [22]:
def make_classreport(domain):
    y_true = results.query(f"domains == '{domain}'").labels_9.astype(int)
    y_pred = results.query(f"domains == '{domain}'").preds.astype(int)
    precision = precision_score(y_true, y_pred, zero_division=0).round(2)
    recall = recall_score(y_true, y_pred, zero_division=0).round(2)
    accuracy = accuracy_score(y_true, y_pred).round(2)
    f1 = f1_score(y_true, y_pred).round(2)
    support = sum(y_true)
    return pd.Series(
        index = ['precision', 'recall', 'f1_score', 'accuracy', 'support'],
        data = [precision, recall, f1, accuracy, support],
        name = domain,
        dtype = object,
    )

#pd.concat([make_classreport(domain) for domain in domains], axis=1)
report = (pd.concat([make_classreport(domain) for domain in domains], axis=1))
print(report)
report_latex = report.to_latex()
print(report_latex)

            ADM   ATT   BER   ENR   ETN   FAC   INS   MBW   STM
precision  0.71   1.0   0.7   0.8  0.46  0.63  0.49  0.73  0.82
recall     0.72   0.1  0.48  0.76  0.65  0.81  0.44  0.79  0.63
f1_score   0.72  0.18  0.57  0.78  0.54  0.71  0.46  0.76  0.71
accuracy   0.99   1.0   1.0   1.0  0.97  0.99  0.99  0.99  0.99
support     116    10    40    68   200    63   101   161    89
\begin{tabular}{llllllllll}
\toprule
{} &   ADM &   ATT &   BER &   ENR &   ETN &   FAC &   INS &   MBW &   STM \\
\midrule
precision &  0.71 &   1.0 &   0.7 &   0.8 &  0.46 &  0.63 &  0.49 &  0.73 &  0.82 \\
recall    &  0.72 &   0.1 &  0.48 &  0.76 &  0.65 &  0.81 &  0.44 &  0.79 &  0.63 \\
f1\_score  &  0.72 &  0.18 &  0.57 &  0.78 &  0.54 &  0.71 &  0.46 &  0.76 &  0.71 \\
accuracy  &  0.99 &   1.0 &   1.0 &   1.0 &  0.97 &  0.99 &  0.99 &  0.99 &  0.99 \\
support   &   116 &    10 &    40 &    68 &   200 &    63 &   101 &   161 &    89 \\
\bottomrule
\end{tabular}



  report_latex = report.to_latex()


## Error analysis

In [23]:
results = results.merge(
    df[['pad_sen_id', 'background_sent', 'target_sent']],
    how='left',
    on='pad_sen_id',
).assign(
    background = lambda df: df.background_sent | df.target_sent,
    correct = lambda df: df.labels_9 == df.preds,
)

In [24]:
not_correct = results.query("not correct")
false_pos = results.query("not correct and labels_9 == 0")
false_neg = results.query("not correct and labels_9 == 1")

### Confusion matrix

In [25]:
def find_confusion(df, domain):
    query1 = f"domains == '{domain}' and not correct and labels_9 == 1"
    query2 = f"domains != '{domain}' and not correct and preds == 1"
    fneg_sents = set(df.query(query1).pad_sen_id)
    select_sents = "pad_sen_id in @fneg_sents"
    s = df.query(select_sents).query(query2).groupby('domains').size()
    return s.rename(domain)

def align_series(s, doms):
    output = pd.Series(index=doms, name=s.name, dtype='float')
    output.update(s)
    return output

confusions = [find_confusion(results, domain) for domain in domains]
confusions = [align_series(s, domains) for s in confusions]

In [26]:
def find_truepos(df, domain):
    query = f"domains == '{domain}' and correct and labels_9 == 1"
    return len(df.query(query))

truepos = {domain:find_truepos(results, domain) for domain in domains}

In [27]:
def find_fneg(df, domain):
    query1 = f"domains == '{domain}' and not correct and labels_9 == 1"
    query2 = f"domains != '{domain}' and not correct and preds == 1"

    # false negative sentence ids of ADM
    fneg_sents = set(df.query(query1).pad_sen_id)
    # false positive sentence ids of domains other than ADM
    confusions = set(df.query(query2).pad_sen_id)

#     print('===fneg_sents', fneg_sents)
#     print('===confusions', confusions)

    select_sents = "pad_sen_id in @fneg_sents"
    select_sents2 = "not pad_sen_id in @confusions"
    
    return df.query(select_sents).query(select_sents2).pad_sen_id.nunique()

fneg = {domain:find_fneg(results, domain) for domain in domains}
fneg = pd.Series(fneg, name='&lt;none&gt;')

In [28]:
def find_fpos(df, domain):
    
    query1 = f"domains == '{domain}' and not correct and labels_9 == 0"
    query2 = f"domains != '{domain}' and correct"
    fneg_sents = set(df.query(query1).pad_sen_id)
    select_sents = "pad_sen_id in @fneg_sents"
    return df.query(select_sents).query(query2).pad_sen_id.nunique()
fpos = {domain:find_fpos(results, domain) for domain in domains}
fpos = pd.Series(fpos, name='&lt;none&gt;')

In [29]:
matrix = pd.concat(confusions, axis=1).sort_index()
for key, val in truepos.items():
    matrix.loc[key, key] = val

# print(type(matrix))
# print(type(fneg), type(fpos))
# #.join(fpos)
matrix.loc['none'] = fneg
matrix = matrix.join(fpos)
#matrix = pd.concat([matrix,fneg],axis = 1)
#matrix = matrix.join(fpos)
print(matrix)


       ADM  ATT   BER   ENR    ETN   FAC   INS    MBW   STM  &lt;none&gt;
ADM   84.0  NaN   NaN   NaN    1.0   NaN   2.0    NaN   NaN          34.0
ATT    NaN  1.0   NaN   NaN    NaN   NaN   NaN    NaN   NaN           0.0
BER    NaN  NaN  19.0   NaN    NaN   NaN   NaN    NaN   1.0           8.0
ENR    1.0  NaN   NaN  52.0    1.0   NaN   3.0    NaN   1.0          13.0
ETN    1.0  NaN   NaN   NaN  130.0   NaN   1.0    2.0   NaN         150.0
FAC    1.0  NaN   NaN   NaN    NaN  51.0   1.0    NaN   1.0          30.0
INS    NaN  NaN   NaN   NaN    NaN   1.0  44.0    1.0   NaN          46.0
MBW    NaN  NaN   NaN   NaN    2.0   NaN   3.0  127.0   NaN          48.0
STM    NaN  NaN   NaN   NaN    NaN   NaN   NaN    NaN  56.0          12.0
none  29.0  9.0  21.0  16.0   67.0  11.0  48.0   31.0  31.0           NaN


In [31]:
matrix.fillna(0).astype(int).style.background_gradient(cmap='GnBu')

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM,<none>
ADM,84,0,0,0,1,0,2,0,0,34
ATT,0,1,0,0,0,0,0,0,0,0
BER,0,0,19,0,0,0,0,0,1,8
ENR,1,0,0,52,1,0,3,0,1,13
ETN,1,0,0,0,130,0,1,2,0,150
FAC,1,0,0,0,0,51,1,0,1,30
INS,0,0,0,0,0,1,44,1,0,46
MBW,0,0,0,0,2,0,3,127,0,48
STM,0,0,0,0,0,0,0,0,56,12
none,29,9,21,16,67,11,48,31,31,0


In [79]:
# example

domain = 'BER'
q = f"domains == '{domain}'"
false_neg_dom = false_neg.query(q).pad_sen_id.unique()
domain = 'INS'
q = f"domains == '{domain}'"
false_pos_dom = false_pos.query(q).pad_sen_id.unique()

for pad_sen_id in false_pos_dom:
    if pad_sen_id in false_neg_dom:
        print(pad_sen_id)

399996679_0072


In [80]:
sents = ['399996679_0072']
df.query(f"pad_sen_id == @sents")[['pad_sen_id', 'text', 'labels', 'preds']]

Unnamed: 0,pad_sen_id,text,labels,preds
13606,399996679_0072,"In het dagelijks leven gaat het redelijk , kan een trap oplopen , helpt haar dochter in de groothandel naar kunnen .","[0, 0, 1, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 1, 1, 0, 0]"


### False positives

In [76]:
# random example

domain = 'STM'
q = f"domains == '{domain}'"
example = false_pos.query(q).pad_sen_id.sample(1)
df.query(f"pad_sen_id == '{example.iloc[0]}'")[['pad_sen_id', 'text', 'labels', 'preds']]

Unnamed: 0,pad_sen_id,text,labels,preds
30854,444008207_0025,"Bij psychiatrisch onderzoek is er sprake van een geagiteerd angstig-depressief toestandsbeeld met suicidaliteit , geen psychose .","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1]"


### False negatives

In [61]:
# random example

domain = 'STM'
q = f"domains == '{domain}'"
example = false_neg.query(q).pad_sen_id.sample(1)
df.query(f"pad_sen_id == '{example.iloc[0]}'")[['pad_sen_id', 'text', 'labels', 'preds']]

Unnamed: 0,pad_sen_id,text,labels,preds
192672,408924107_0038,Emotineel laag belastbaar en onzeker tijdens mobiliseren .,"[0, 0, 0, 0, 0, 0, 0, 0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"


# Note level evaluation

In [33]:
notes = results.astype({
    'labels': bool, 'preds': bool
}).groupby([
    'NotitieID', 'domains',
])[['labels', 'preds']].any().assign(
    correct = lambda df: df.labels == df.preds,
).reset_index(level=1)

## Precision, recall, F1-score

In [34]:
def make_classreport(domain):
    y_true = notes.query(f"domains == '{domain}'").labels.astype(int)
    y_pred = notes.query(f"domains == '{domain}'").preds.astype(int)
    precision = precision_score(y_true, y_pred, zero_division=0).round(2)
    recall = recall_score(y_true, y_pred, zero_division=0).round(2)
    f1 = f1_score(y_true, y_pred).round(2)
    support = sum(y_true)
    return pd.Series(
        index = ['precision', 'recall', 'f1_score', 'support'],
        data = [precision, recall, f1, support],
        name = domain,
        dtype = object,
    )

pd.concat([make_classreport(domain) for domain in domains], axis=1)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
precision,0.72,0.18,0.12,0.43,0.57,0.49,0.6,0.19,0.51
recall,0.99,0.81,1.0,0.86,0.98,0.91,0.76,0.98,0.89
f1_score,0.83,0.29,0.22,0.57,0.72,0.63,0.67,0.31,0.65
support,231.0,27.0,34.0,92.0,165.0,95.0,116.0,64.0,94.0
