# Results Analysis

## Utils link

- [Multi Label Model Evaluation](https://www.kaggle.com/code/kmkarakaya/multi-label-model-evaluation)

## Code prepare

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import roc_auc_score, multilabel_confusion_matrix, classification_report, accuracy_score, jaccard_score, f1_score
import os
import warnings; warnings.filterwarnings('ignore')
from IPython.display import display

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
plt.style.use([s for s in plt.style.available if 'whitegrid' in s][0])
plt.rcParams['figure.figsize'] = [16, 9]
plt.rcParams['figure.dpi'] = 100

### Utils functions

In [3]:
def list_folders(path):
    # Check if the path exists
    if not os.path.exists(path):
        return "The specified path does not exist."

    # Get a list of all items in the path
    contents = os.listdir(path)

    # Filter only the folders
    folders = [item for item in contents if os.path.isdir(os.path.join(path, item))]

    return folders

def experiments(name:str):
    experiments = [
        ('exp0', 'all'),
        ('exp1', 'diagnostic'),
        ('exp1.1', 'subdiagnostic'),
        ('exp1.1.1', 'superdiagnostic'),
        ('exp2', 'form'),
        ('exp3', 'rhythm')
       ]
    for exp in experiments:
        if name == exp[0]:
            return exp[1]

### Variables

In [4]:
experiments = {
        'exp0': 'all',
        'exp1': 'diagnostic',
        'exp1.1': 'subdiagnostic',
        'exp1.1.1': 'superdiagnostic',
        'exp2': 'form',
        'exp3': 'rhythm'
       }

datas = {
"test": None, 
"train": None, 
"val": None
}

predictions = {
"test": None, 
"train": None, 
"val": None
}

path = "./"

threshold = 0.5

## Analysis

In [5]:
for exp in list_folders(path):
    print("_"*100)
    print(exp)
    for dt in datas:
        datas[dt] = np.load(f'{exp}/data/y_{dt}.npy', allow_pickle=True)
    for model in list_folders(f'{exp}/models'):
        print(f'\n\t{model}')
        for pred in predictions:
            predictions[pred] = np.load(f'{exp}/models/{model}/y_{pred}_pred.npy', allow_pickle=True)
            print(f'\t\t{pred}')
            print(f'\t\t\t- Shape: {predictions[pred].shape}')
            print(f'\t\t\t- Binary values: {len(np.unique(predictions[pred]))==2}')
            print(f'\t\t\t- Min value: {np.min(predictions[pred])}')
            print(f'\t\t\t- Max value: {np.max(predictions[pred])}')
            print(f'\t\t\t- Range 0-1: {(np.min(predictions[pred]) >= 0) and (np.max(predictions[pred]) <= 1)}')
            


    print("\n\n")

____________________________________________________________________________________________________
exp1.1

	fastai_xresnet1d101
		test
			- Shape: (2158, 23)
			- Binary values: False
			- Min value: 3.480590815563289e-11
			- Max value: 0.9999078512191772
			- Range 0-1: True
		train
			- Shape: (17084, 23)
			- Binary values: False
			- Min value: 4.276196282785921e-15
			- Max value: 1.0
			- Range 0-1: True
		val
			- Shape: (2146, 23)
			- Binary values: False
			- Min value: 1.7381232629391785e-13
			- Max value: 0.9999195337295532
			- Range 0-1: True

	fastai_lstm
		test
			- Shape: (2158, 23)
			- Binary values: False
			- Min value: 1.116526454625344e-10
			- Max value: 0.9998555183410645
			- Range 0-1: True
		train
			- Shape: (17084, 23)
			- Binary values: False
			- Min value: 5.954940967417555e-12
			- Max value: 0.999969482421875
			- Range 0-1: True
		val
			- Shape: (2146, 23)
			- Binary values: False
			- Min value: 2.215562039387109e-10
			- Max value: 0.9999210

## Metrics

In [8]:
results = []

res = {}

for exp in sorted(list_folders(path)):
    res[exp] = {}
    for dt in datas:
        datas[dt] = np.load(f'{exp}/data/y_{dt}.npy', allow_pickle=True)
    res[exp]["data"] = datas.copy()
    for model in list_folders(f'{exp}/models'):
        for pred in predictions:
            predictions[pred] = np.load(f'{exp}/models/{model}/y_{pred}_pred.npy', allow_pickle=True)
            res[exp][model]=predictions.copy()

            # AUC score
            results.append({
                "experiment" : experiments[exp],
                "model" : model,
                "metric" : "AUC",
                "set": pred,
                "value": roc_auc_score(datas[pred], predictions[pred], average="weighted")
            })

            # Accuracy score
            results.append({
                "experiment" : experiments[exp],
                "model" : model,
                "metric" : "Accuracy",
                "set": pred,
                "value": accuracy_score(datas[pred], (predictions[pred]>threshold))
            })

            # Jaccard score
            results.append({
                "experiment" : experiments[exp],
                "model" : model,
                "metric" : "Jaccard",
                "set": pred,
                "value": jaccard_score(datas[pred], (predictions[pred]>threshold), average="weighted")
            })

            # F1-score
            results.append({
                "experiment" : experiments[exp],
                "model" : model,
                "metric" : "F1",
                "set": pred,
                "value": f1_score(datas[pred], (predictions[pred]>threshold), average="weighted")
            })

            # Precision score
            results.append({
                "experiment" : experiments[exp],
                "model" : model,
                "metric" : "Precision",
                "set": pred,
                "value": metrics.precision_score(datas[pred], (predictions[pred]>threshold), average="weighted")
            })

            # Recall score
            results.append({
                "experiment" : experiments[exp],
                "model" : model,
                "metric" : "Recall",
                "set": pred,
                "value": metrics.recall_score(datas[pred], (predictions[pred]>threshold), average="weighted")
            })

results = pd.DataFrame(results)
results

Unnamed: 0,experiment,model,metric,set,value
0,all,fastai_fcn_wang,AUC,test,0.905179
1,all,fastai_fcn_wang,Accuracy,test,0.375341
2,all,fastai_fcn_wang,Jaccard,test,0.547143
3,all,fastai_fcn_wang,F1,test,0.650552
4,all,fastai_fcn_wang,Precision,test,0.675138
...,...,...,...,...,...
391,subdiagnostic,fastai_resnet1d_wang,Accuracy,val,0.526561
392,subdiagnostic,fastai_resnet1d_wang,Jaccard,val,0.544909
393,subdiagnostic,fastai_resnet1d_wang,F1,val,0.676185
394,subdiagnostic,fastai_resnet1d_wang,Precision,val,0.667169


In [9]:
res

{'exp0': {'data': {'test': array([[0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          ...,
          [0, 0, 0, ..., 0, 1, 0],
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0]]),
   'train': array([[0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          ...,
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0]]),
   'val': array([[0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          ...,
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0]])},
  'fastai_fcn_wang': {'test': array([[3.6528323e-02, 6.8741779e-06, 7.1852342e-06, ..., 3.1988477e-04,
           3.2268898e-04, 1.2024202e-05],
          [1.2492822e-03, 2.1575242e-06, 8.3010104e-07, ..., 9.7902412e-05,
           1.0494200e-04, 9.6862259e-06],
          [1.12

In [10]:
results[(results["experiment"]=="all") & (results["metric"]=="AUC") & (results["set"]=="test")]

Unnamed: 0,experiment,model,metric,set,value
0,all,fastai_fcn_wang,AUC,test,0.905179
18,all,fastai_xresnet1d101,AUC,test,0.916907
36,all,fastai_lstm,AUC,test,0.904195
54,all,fastai_inception1d,AUC,test,0.912341
72,all,ensemble,AUC,test,0.916652
90,all,Wavelet+NN,AUC,test,0.853179
108,all,fastai_lstm_bidir,AUC,test,0.907127
126,all,naive,AUC,test,0.5
144,all,fastai_resnet1d_wang,AUC,test,0.910896


In [11]:
results[(results["experiment"]=="diagnostic") & (results["metric"]=="Accuracy") & (results["set"]=="test")]

Unnamed: 0,experiment,model,metric,set,value
163,diagnostic,fastai_fcn_wang,Accuracy,test,0.53012
181,diagnostic,fastai_xresnet1d101,Accuracy,test,0.530584
199,diagnostic,fastai_lstm,Accuracy,test,0.524096
217,diagnostic,fastai_inception1d,Accuracy,test,0.517146
235,diagnostic,ensemble,Accuracy,test,0.544486
253,diagnostic,Wavelet+NN,Accuracy,test,0.418906
271,diagnostic,fastai_lstm_bidir,Accuracy,test,0.525023
289,diagnostic,naive,Accuracy,test,0.0
307,diagnostic,fastai_resnet1d_wang,Accuracy,test,0.531047


## Baseline MI vs NORM

In [12]:
agg_df = pd.read_csv('../data/ptbxl/scp_statements.csv')
agg_df

Unnamed: 0.1,Unnamed: 0,description,diagnostic,form,rhythm,diagnostic_class,diagnostic_subclass,Statement Category,SCP-ECG Statement Description,AHA code,aECG REFID,CDISC Code,DICOM Code
0,NDT,non-diagnostic T abnormalities,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,non-diagnostic T abnormalities,,,,
1,NST_,non-specific ST changes,1.0,1.0,,STTC,NST_,Basic roots for coding ST-T changes and abnorm...,non-specific ST changes,145.0,MDC_ECG_RHY_STHILOST,,
2,DIG,digitalis-effect,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,suggests digitalis-effect,205.0,,,
3,LNGQT,long QT-interval,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,long QT-interval,148.0,,,
4,NORM,normal ECG,1.0,,,NORM,NORM,Normal/abnormal,normal ECG,1.0,,,F-000B7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,BIGU,"bigeminal pattern (unknown origin, SV or Ventr...",,,1.0,,,Statements related to ectopic rhythm abnormali...,"bigeminal pattern (unknown origin, SV or Ventr...",,,,
67,AFLT,atrial flutter,,,1.0,,,Statements related to impulse formation (abnor...,atrial flutter,51.0,MDC_ECG_RHY_ATR_FLUT,,
68,SVTAC,supraventricular tachycardia,,,1.0,,,Statements related to impulse formation (abnor...,supraventricular tachycardia,55.0,MDC_ECG_RHY_SV_TACHY,,D3-31290
69,PSVT,paroxysmal supraventricular tachycardia,,,1.0,,,Statements related to impulse formation (abnor...,paroxysmal supraventricular tachycardia,,MDC_ECG_RHY_SV_TACHY_PAROX,,


In [67]:
baseline = {}
diagnosis = {
    "MI": None,
    "NORM": None
    }
exp = "exp0"

# find diagnostic index
dia_glob = []
for dia in diagnosis:
    diagnosis[dia] = agg_df.diagnostic_class == dia
    diagnosis[dia] = np.where(diagnosis[dia])[0]
    dia_glob.extend(diagnosis[dia])

dia_glob = sorted(dia_glob)
for dia in diagnosis:
    diagnosis[dia] = [dia_glob.index(idx) for idx in diagnosis[dia]]

# prepare baseline
for model in res[exp]:
    baseline[model] = {}
    for set in res[exp][model]:
        baseline[model][set] = res[exp][model][set][:, (agg_df.diagnostic_class.isin(diagnosis.keys()))]

# prepare target
y = baseline.pop("data")
for set in y:
    tmp = []
    for dia in diagnosis:
        tmp.append(np.any(y[set][:, diagnosis[dia]], axis=1).astype(int))
    y[set] = np.transpose(np.array(tmp))

# find utils rows
utils = {}
for set in y:
    utils[set] = np.sum(y[set], axis=1) == 1

# prepare prediction values
for model in baseline:
    for set in baseline[model]:
        baseline[model][set] = np.where(np.isin(np.argmax(baseline[model][set], axis = 1), diagnosis["MI"]), 1, 0)

# keep only utils rows in predictions
for model in baseline:
    for set in baseline[model]:
        baseline[model][set] = baseline[model][set][utils[set]]

# keep only utils rows in target & prepare it
for set in y:
    y[set] = y[set][utils[set]]
    y[set] = y[set][:, 0]

In [82]:
model = "naive"
for set in baseline[model]:
    print(f'{set} - {np.sum(baseline[model][set])} / {len(baseline[model][set])}')

test - 608 / 608
train - 5057 / 5057
val - 597 / 597


In [81]:
for set in y:
    print(f'{set} - {np.sum(y[set])} / {len(y[set])}')

test - 537 / 608
train - 4408 / 5057
val - 524 / 597


In [68]:
base_res = []

for model in baseline:
    for set in baseline[model]:

        # AUC score
        base_res.append({
            "model" : model,
            "metric" : "AUC",
            "set": set,
            "value": roc_auc_score(y[set], baseline[model][set], average="weighted")
        })

        # Accuracy score
        base_res.append({
            "model" : model,
            "metric" : "Accuracy",
            "set": set,
            "value": accuracy_score(y[set], baseline[model][set])
        })

        # Jaccard score
        base_res.append({
            "model" : model,
            "metric" : "Jaccard",
            "set": set,
            "value": jaccard_score(y[set], baseline[model][set], average="weighted")
        })

        # F1-score
        base_res.append({
            "model" : model,
            "metric" : "F1",
            "set": set,
            "value": f1_score(y[set], baseline[model][set], average="weighted")
        })

        # Precision score
        base_res.append({
            "model" : model,
            "metric" : "Precision",
            "set": set,
            "value": metrics.precision_score(y[set], baseline[model][set], average="weighted")
        })

        # Recall score
        base_res.append({
            "model" : model,
            "metric" : "Recall",
            "set": set,
            "value": metrics.recall_score(y[set], baseline[model][set], average="weighted")
        })

base_res = pd.DataFrame(base_res)
base_res

Unnamed: 0,model,metric,set,value
0,fastai_fcn_wang,AUC,test,0.938193
1,fastai_fcn_wang,Accuracy,test,0.955592
2,fastai_fcn_wang,Jaccard,test,0.921811
3,fastai_fcn_wang,F1,test,0.957399
4,fastai_fcn_wang,Precision,test,0.961333
...,...,...,...,...
157,fastai_resnet1d_wang,Accuracy,val,0.964824
158,fastai_resnet1d_wang,Jaccard,val,0.936559
159,fastai_resnet1d_wang,F1,val,0.966041
160,fastai_resnet1d_wang,Precision,val,0.968958


In [71]:
base_res[(base_res["set"]=="test") & (base_res["metric"]=="F1")]

Unnamed: 0,model,metric,set,value
3,fastai_fcn_wang,F1,test,0.957399
21,fastai_xresnet1d101,F1,test,0.949635
39,fastai_lstm,F1,test,0.9549
57,fastai_inception1d,F1,test,0.956145
75,ensemble,F1,test,0.951563
93,Wavelet+NN,F1,test,0.895826
111,fastai_lstm_bidir,F1,test,0.953013
129,naive,F1,test,0.828456
147,fastai_resnet1d_wang,F1,test,0.958661
