# Model Evaluation

## Four-fold Cross Evaluation

```
Fold 1: trained on FRH01, FRH02, FRH03, tested on FRH04
Fold 2: trained on FRH01, FRH02, FRH04, tested on FRH03
Fold 3: trained on FRH01, FRH03, FRH04, tested on FRH02
Fold 4: trained on FRH02, FRH03, FRH04, tested on FRH01
```

## Download and unzip evaluation runs

The evaluation archives (`L1C.zip` and `L2A.zip`) are downloaded and extracted to `/tmp`.


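Folder structure of the extracted archives (`<level>` is `L1C` or `L2A`, `<fold>` is `1`–`4`, and `<model>` is the name of the evaluated model):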
```
<level>/
    <fold>/
        <model>/
            y_pred.npy
            y_true.npy
            classification_report.txt
            model.pth
            ...
```

In [22]:
# L1C runs: --timestamping only re-downloads when the remote archive is newer than
# the local copy; unzip -o overwrites already extracted files without prompting.
!wget --timestamping --directory-prefix=/tmp https://syncandshare.lrz.de/dl/fi6zpj2xQ24mM3VxLewj87eQ/L1C.zip
!unzip -o /tmp/L1C.zip -d /tmp

# L2A runs: same procedure.
!wget --timestamping --directory-prefix=/tmp https://syncandshare.lrz.de/dl/fi5D238TjzBYFr78ERmeUbCd/L2A.zip
!unzip -o /tmp/L2A.zip -d /tmp

--2020-05-02 22:15:35--  https://syncandshare.lrz.de/dl/fi6zpj2xQ24mM3VxLewj87eQ/L1C.zip
Resolving syncandshare.lrz.de (syncandshare.lrz.de)... 129.187.255.213
Connecting to syncandshare.lrz.de (syncandshare.lrz.de)|129.187.255.213|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 284560207 (271M) [application/x-zip-compressed]
Saving to: ‘/tmp/L1C.zip’


2020-05-02 22:16:00 (11,2 MB/s) - ‘/tmp/L1C.zip’ saved [284560207/284560207]

--2020-05-02 22:16:00--  https://syncandshare.lrz.de/dl/fi5D238TjzBYFr78ERmeUbCd/L2A.zip
Resolving syncandshare.lrz.de (syncandshare.lrz.de)... 129.187.255.213
Connecting to syncandshare.lrz.de (syncandshare.lrz.de)|129.187.255.213|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 279045345 (266M) [application/x-zip-compressed]
Saving to: ‘/tmp/L2A.zip’


2020-05-02 22:16:24 (11,2 MB/s) - ‘/tmp/L2A.zip’ saved [279045345/279045345]



In [None]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import sys
sys.path.append("..")
from examples.train import metrics as calc_metrics

def load(run):
    """Load the stored predictions of a single evaluation run.

    Parameters
    ----------
    run : str
        Directory containing ``y_pred.npy``, ``y_true.npy``, ``y_score.npy``
        and ``field_ids.npy``.

    Returns
    -------
    rs : pandas.DataFrame
        Columns ``y_pred`` and ``y_true``, indexed by ``field_ids``.
    y_score : numpy.ndarray
        Contents of ``y_score.npy``, returned unchanged.

    Raises
    ------
    ValueError
        If ``y_pred``, ``y_true`` and ``field_ids`` differ in length.
    """
    y_pred = np.load(os.path.join(run, "y_pred.npy"))
    y_true = np.load(os.path.join(run, "y_true.npy"))
    y_score = np.load(os.path.join(run, "y_score.npy"))
    field_ids = np.load(os.path.join(run, "field_ids.npy"))

    # Fail loudly on inconsistent artefacts instead of producing a misaligned table.
    if not (len(y_pred) == len(y_true) == len(field_ids)):
        raise ValueError(
            f"inconsistent array lengths in {run!r}: "
            f"y_pred={len(y_pred)}, y_true={len(y_true)}, field_ids={len(field_ids)}"
        )

    # Build the frame column-wise: this keeps each array's dtype and avoids creating
    # (and transposing) a frame with one column per field.
    rs = pd.DataFrame(
        {"y_pred": y_pred, "y_true": y_true},
        index=pd.Index(field_ids, name="field_ids"),
    )
    return rs, y_score

def load_table(logdir):
    """Compile a metrics table for all model runs stored in ``logdir``.

    Every sub-directory of ``logdir`` is treated as one run; its name is used as
    the model name.

    Parameters
    ----------
    logdir : str
        Directory with one sub-directory per model run, each readable by ``load``.

    Returns
    -------
    pandas.DataFrame
        One row per reported metric (human-readable labels) and one column per model.
    """
    # os.listdir yields entries in arbitrary order and also lists plain files;
    # keep only sub-directories and sort them so the column order is deterministic.
    runs = sorted(
        entry for entry in os.listdir(logdir)
        if os.path.isdir(os.path.join(logdir, entry))
    )

    stats = []
    for run in runs:
        rs, _ = load(os.path.join(logdir, run))
        # calc_metrics is expected to return a dict-like of metric name -> value
        # (it is item-assigned below and fed to pd.DataFrame).
        stat = calc_metrics(rs.y_true, rs.y_pred)
        stat["model"] = run
        stats.append(stat)
    stats = pd.DataFrame(stats).set_index("model")

    # Metric keys selected from calc_metrics' result, mapped to the row labels
    # used in the report.
    row_labels = {
        "accuracy": "overall accuracy",
        "recall_macro": "average accuracy",
        "f1_macro": "class-mean f-score",
        "f1_weighted": "weighted f-score",
        "kappa": "kappa-metric",
    }
    return stats.T.loc[list(row_labels)].rename(index=row_labels)

### Compile L1C Results

In [6]:
# Root directory that holds the unzipped evaluation runs.
logdir = "/tmp"

# One (model x metric) table per cross-validation fold of the L1C runs; every row
# is tagged with the fold it was computed on.
l1tables = []
for fold in range(1, 5):
    table = load_table(f"{logdir}/L1C/{fold}").T.assign(fold=fold)
    l1tables.append(table)


  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Mean and standard deviation (pandas' default groupby std) of every metric over
# the folds, per model. The concatenation/grouping is done once and reused.
per_model = pd.concat(l1tables).groupby(["model"])
meantable = per_model.mean()
stdtable = per_model.std()

metrics = ["overall accuracy","average accuracy","class-mean f-score","weighted f-score","kappa-metric"]
models = ["OmniScaleCNN","TempCNN","MSResNet", "InceptionTime", "LSTM", "StarRNN", "TransformerEncoder"]

# Print one LaTeX table row per metric: "<metric> & & $mean^{\pm std}$ & ... \\ ".
for metric in metrics:
    entries = [
        f"${meantable.loc[model, metric]:.2f}^{{\\pm {stdtable.loc[model, metric]:.2f}}}$"
        for model in models
    ]
    # "\\\\" spells the two backslashes of the LaTeX row terminator explicitly;
    # the former "\\\ " relied on the invalid escape sequence "\ ".
    row = f"{metric} & & " + " & ".join(entries) + " \\\\ "
    print(row)

overall accuracy & & $0.78^{\pm 0.02}$ & $0.78^{\pm 0.03}$ & $0.70^{\pm 0.06}$ & $0.73^{\pm 0.06}$ & $0.79^{\pm 0.03}$ & $0.79^{\pm 0.02}$ & $0.80^{\pm 0.01}$ \\ 
average accuracy & & $0.54^{\pm 0.02}$ & $0.55^{\pm 0.01}$ & $0.54^{\pm 0.05}$ & $0.53^{\pm 0.04}$ & $0.56^{\pm 0.02}$ & $0.56^{\pm 0.01}$ & $0.57^{\pm 0.01}$ \\ 
class-mean f-score & & $0.55^{\pm 0.02}$ & $0.54^{\pm 0.03}$ & $0.55^{\pm 0.04}$ & $0.50^{\pm 0.04}$ & $0.56^{\pm 0.03}$ & $0.56^{\pm 0.02}$ & $0.58^{\pm 0.01}$ \\ 
weighted f-score & & $0.77^{\pm 0.02}$ & $0.77^{\pm 0.04}$ & $0.69^{\pm 0.06}$ & $0.71^{\pm 0.07}$ & $0.77^{\pm 0.04}$ & $0.76^{\pm 0.04}$ & $0.80^{\pm 0.01}$ \\ 
kappa-metric & & $0.71^{\pm 0.02}$ & $0.72^{\pm 0.04}$ & $0.63^{\pm 0.07}$ & $0.64^{\pm 0.08}$ & $0.72^{\pm 0.05}$ & $0.72^{\pm 0.03}$ & $0.74^{\pm 0.02}$ \\ 


In [7]:
# Per-fold L1C metrics, displayed with a (model, fold) row index. Each fold table has
# one row per run directory, so every (model, fold) group should hold a single row
# and first() merely arranges the rows under that index.
pd.concat(l1tables).groupby(["model","fold"]).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,overall accuracy,average accuracy,class-mean f-score,weighted f-score,kappa-metric
model,fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
InceptionTime,1,0.79,0.561291,0.550248,0.782731,0.725212
InceptionTime,2,0.760876,0.536852,0.510457,0.732667,0.684464
InceptionTime,3,0.660557,0.477702,0.477964,0.624437,0.54902
InceptionTime,4,0.691921,0.526863,0.448783,0.696238,0.617201
LSTM,1,0.803098,0.57344,0.580049,0.78838,0.740102
LSTM,2,0.779778,0.556539,0.55652,0.765028,0.70915
LSTM,3,0.74324,0.523667,0.522199,0.722079,0.664367
LSTM,4,0.820651,0.577291,0.580665,0.820059,0.771737
MSResNet,1,0.776795,0.508189,0.521983,0.761183,0.704569
MSResNet,2,0.639566,0.483825,0.503944,0.623503,0.561879


### Compile L2A Results

In [8]:
# One (model x metric) table per cross-validation fold of the L2A runs; every row
# is tagged with the fold it was computed on.
l2tables = []
for fold in range(1, 5):
    table = load_table(f"{logdir}/L2A/{fold}").T.assign(fold=fold)
    l2tables.append(table)

In [11]:

# Mean and standard deviation (pandas' default groupby std) of every metric over
# the folds, per model. The concatenation/grouping is done once and reused.
per_model = pd.concat(l2tables).groupby(["model"])
meantable = per_model.mean()
stdtable = per_model.std()

metrics = ["overall accuracy","average accuracy","class-mean f-score","weighted f-score","kappa-metric"]
models = ["OmniScaleCNN","TempCNN","MSResNet", "InceptionTime", "LSTM", "StarRNN", "TransformerEncoder"]

# Print one LaTeX table row per metric: "<metric> & & $mean^{\pm std}$ & ... \\ ".
for metric in metrics:
    entries = [
        f"${meantable.loc[model, metric]:.2f}^{{\\pm {stdtable.loc[model, metric]:.2f}}}$"
        for model in models
    ]
    # "\\\\" spells the two backslashes of the LaTeX row terminator explicitly;
    # the former "\\\ " relied on the invalid escape sequence "\ ".
    row = f"{metric} & & " + " & ".join(entries) + " \\\\ "
    print(row)

overall accuracy & & $0.75^{\pm 0.06}$ & $0.79^{\pm 0.03}$ & $0.76^{\pm 0.02}$ & $0.73^{\pm 0.03}$ & $0.78^{\pm 0.05}$ & $0.78^{\pm 0.04}$ & $0.79^{\pm 0.03}$ \\ 
average accuracy & & $0.54^{\pm 0.04}$ & $0.55^{\pm 0.04}$ & $0.55^{\pm 0.04}$ & $0.51^{\pm 0.02}$ & $0.56^{\pm 0.03}$ & $0.55^{\pm 0.02}$ & $0.58^{\pm 0.03}$ \\ 
class-mean f-score & & $0.54^{\pm 0.04}$ & $0.55^{\pm 0.03}$ & $0.57^{\pm 0.04}$ & $0.49^{\pm 0.03}$ & $0.56^{\pm 0.03}$ & $0.56^{\pm 0.02}$ & $0.58^{\pm 0.02}$ \\ 
weighted f-score & & $0.73^{\pm 0.06}$ & $0.78^{\pm 0.04}$ & $0.76^{\pm 0.02}$ & $0.70^{\pm 0.06}$ & $0.78^{\pm 0.05}$ & $0.78^{\pm 0.04}$ & $0.79^{\pm 0.04}$ \\ 
kappa-metric & & $0.68^{\pm 0.08}$ & $0.72^{\pm 0.05}$ & $0.70^{\pm 0.03}$ & $0.65^{\pm 0.04}$ & $0.72^{\pm 0.06}$ & $0.72^{\pm 0.05}$ & $0.73^{\pm 0.05}$ \\ 


In [13]:
# Per-fold L2A metrics, displayed with a (model, fold) row index. Each fold table has
# one row per run directory, so every (model, fold) group should hold a single row
# and first() merely arranges the rows under that index.
pd.concat(l2tables).groupby(["model","fold"]).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,overall accuracy,average accuracy,class-mean f-score,weighted f-score,kappa-metric
model,fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
InceptionTime,1,0.77105,0.5338,0.536306,0.766904,0.703828
InceptionTime,2,0.730403,0.476804,0.502645,0.717395,0.643124
InceptionTime,3,0.693775,0.523857,0.46287,0.614834,0.603513
InceptionTime,4,0.734605,0.507277,0.47122,0.701496,0.659076
LSTM,1,0.800983,0.566342,0.576579,0.799199,0.739637
LSTM,2,0.722138,0.517356,0.527994,0.720954,0.63946
LSTM,3,0.781208,0.562395,0.560205,0.764094,0.714349
LSTM,4,0.829336,0.580922,0.587238,0.827328,0.782179
MSResNet,1,0.766837,0.536133,0.549065,0.76938,0.697171
MSResNet,2,0.742245,0.508646,0.531518,0.737054,0.660065
