# Collating Traning Results

The training results are parsed and summarized in this notebook

In [1]:
import re
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
from typing import *

### Get the hyperparameters from the training logs.

In [2]:
# Path to train.log
path = "outputs/2024-04-11/07-26-56/train.log"

In [3]:
def read_log(path: Union[str, Path]):
    with open(path, "r") as f:
        lines = f.readlines()
    return lines

In [4]:
train_log = read_log(path)

In [5]:
tokens = ['src_data', 'model', 'embed_model', 'value_mode', 'task', 'transfer']

In [6]:
def create_pattern(token: str):
    return re.compile(r"'{}': '(\w+)'".format(token))

In [7]:
def parse_log(train_log: List[str], tokens: List[str]):
    hyperparams = {}
    for line in train_log:
        for tok in tokens:
            pattern = create_pattern(tok)
            match = pattern.search(line)
            if match:
                #print(match.group(1))
                hyperparams[tok] = match.group(1)
    return hyperparams


In [8]:
hyperparams = parse_log(train_log, tokens)

In [9]:
def get_trainlog_paths(root: Union[str, Path]):
    root = Path(root)
    return list(root.glob("*/*/train.log"))

In [None]:
def get_last_auprc(train_log: List[str], patterns: Dict[str, re.Pattern]):
    for line in train_log[::-1]:
        hit = patterns['auprc'].search(line)
        if hit:
            return hit.group(1)

In [None]:
def create_pattern_numerical(token: str):
    return re.compile(r"{}: (\d+(\.\d+)?)".format(token))

In [None]:
metrics = ['epoch','auroc', 'auprc', 'loss']
# create a pattern dictionay with the metrics as keys
patterns = {metric: create_pattern_numerical(metric) for metric in metrics}

In [53]:
def parse_experiment(root: Union[str, Path], tokens: List[str], patterns: Dict[str, re.Pattern]=None):
    l_trainlogs = get_trainlog_paths(root)
    df = pd.DataFrame()
    for log in l_trainlogs:
        train_log = read_log(log)
        hyperparams = parse_log(train_log, tokens)
        run = log.parent.name
        hyperparams['run'] = log.parent#/run
        if "done training" in train_log[-1]:
            hyperparams['done'] = True
        else:
            hyperparams['done'] = False
        if patterns:
            hyperparams['auprc'] = get_last_auprc(train_log, patterns)
        df = pd.concat([df, pd.DataFrame(hyperparams, index=[0])])
    if patterns:
        df['auprc'] = df['auprc'].astype(float)
        order = ['run', 'done', 'src_data', 'task', 'embed_model', 'model', 'value_mode', 'auprc']
    else:
        order = ['run', 'done', 'src_data', 'task', 'embed_model', 'model', 'value_mode']
    return df[order]

In [54]:
df_experiments = parse_experiment("outputs", tokens, patterns)

In [55]:
df_experiments.head(5)

Unnamed: 0,run,done,src_data,task,embed_model,model,value_mode,auprc
0,outputs/2024-04-13/07-13-12,False,mimiciii,mlm,,descemb_rnn,NV,
0,outputs/2024-04-13/07-13-22,False,eicu,mlm,,descemb_rnn,NV,
0,outputs/2024-04-13/07-17-36,False,mimiciii,mlm,,descemb_rnn,NV,
0,outputs/2024-04-13/07-26-31,False,mimiciii,mlm,,descemb_rnn,NV,
0,outputs/2024-04-13/07-44-35,False,mimiciii,mlm,,descemb_rnn,NV,


In [57]:
df_experiments.to_excel("outputs/experiments.xlsx", index=False)

### Parsing the train.log file for metrics

In [16]:
def extract_metrics(line: str, df_metrics: pd.DataFrame, patterns: Dict[str, re.Pattern], fold: str):
    epoch = patterns['epoch'].search(line).group(1)
    loss = patterns['loss'].search(line).group(1)
    auroc = patterns['auroc'].search(line).group(1)
    auprc = patterns['auprc'].search(line).group(1)
    df = pd.DataFrame({'fold': fold, 'epoch': int(epoch), 'loss': float(loss), 'auroc': float(auroc), 'auprc': float(auprc)}, index=[0])
    df_metrics = pd.concat([df_metrics, df])
    return df_metrics

df_metrics = pd.DataFrame(columns=['run', 'fold', ]+metrics)

In [48]:
get_last_auprc(train_log, patterns)

'0.089'

In [17]:
def parse_experiment_metrics(root: Union[str, Path], patterns: Dict[str, re.Pattern]):
    l_trainlogs = get_trainlog_paths(root)
    df_metrics = pd.DataFrame(columns=['run', 'fold', 'epoch', 'loss', 'auroc', 'auprc'])
    for log in l_trainlogs:
        train_log = read_log(log)
        run = log.parent
        for line in train_log:
            if "[INFO]" in line:
                if "[train]" in line:
                    df_metrics = extract_metrics(line, df_metrics, patterns, "train")
                    df_metrics['run'] = run
                elif "[valid]" in line:
                    df_metrics = extract_metrics(line, df_metrics, patterns, "valid")
                    df_metrics['run'] = run
                elif "[test]" in line:
                    df_metrics = extract_metrics(line, df_metrics, patterns, "test")
                    df_metrics['run'] = run
    return df_metrics

In [18]:
df_metrics = parse_experiment_metrics("outputs", patterns)

In [21]:
df_metrics.run.unique()

array([PosixPath('outputs/2024-04-11/07-26-56')], dtype=object)

In [23]:
len(df_metrics)

3028

In [24]:
df_metrics.head(100)

Unnamed: 0,run,fold,epoch,loss,auroc,auprc
0,outputs/2024-04-11/07-26-56,train,1,0.654,0.509,0.045
0,outputs/2024-04-11/07-26-56,train,1,0.654,0.509,0.045
0,outputs/2024-04-11/07-26-56,valid,1,0.647,0.502,0.045
0,outputs/2024-04-11/07-26-56,test,1,1.462,0.505,0.043
0,outputs/2024-04-11/07-26-56,train,2,0.645,0.509,0.045
...,...,...,...,...,...,...
0,outputs/2024-04-11/07-26-56,valid,5,0.622,0.503,0.045
0,outputs/2024-04-11/07-26-56,test,5,1.404,0.504,0.043
0,outputs/2024-04-11/07-26-56,train,6,0.619,0.509,0.045
0,outputs/2024-04-11/07-26-56,valid,6,0.618,0.502,0.045


In [58]:
df_metrics.to_excel("outputs/metrics.xlsx", index=False)

In [None]:
def plot_metrics(df_metrics: pd.DataFrame, run: str):
    fig, ax = plt.subplots()
    for fold in ['train', 'valid', 'test']:
        df_metrics.loc[(df_metrics['fold']==fold) & (df_metrics['run']==run)].plot(x='epoch', y='loss', ax=ax, label=fold)
    plt.title(run)
    plt.show()