In [1]:
import mlflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Name of the MLflow experiment whose runs are looked up later in the notebook.
experiment_name = 'Baseline-async'
# Point the MLflow client at the tracking server on localhost.
mlflow.set_tracking_uri('http://localhost/')

In [2]:
# Resolve the experiment by name. MLflow returns None (rather than raising)
# when no experiment matches, so fail early with a readable message.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    raise ValueError(f"MLflow experiment {experiment_name!r} was not found on the tracking server")

# Only runs that completed successfully are analysed.
df_runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string='attributes.status="FINISHED"',
)

# Run attributes that are kept under their original names.
fixed_cols = ['run_id', 'start_time', 'end_time']

# Decide in a single pass which columns survive and what they are called:
#   metrics.<name> / params.<name> -> <name>
#   tags.mlflow.runName            -> run_name
#   fixed_cols                     -> unchanged
# Every other column is dropped.
column_names = {}
for col in df_runs.columns:
    if col.startswith(('metrics.', 'params.')):
        column_names[col] = col.split('.', 1)[1]
    elif col == 'tags.mlflow.runName':
        column_names[col] = 'run_name'
    elif col in fixed_cols:
        column_names[col] = col

# One select + one rename (original column order is preserved) instead of
# mutating the frame in place once per column.
df_exp = df_runs[list(column_names)].rename(columns=column_names)
df_exp.head(n=3)

Unnamed: 0,run_id,start_time,end_time,test_balanced_accuracy,train_f1_weighted,train_time_sec,test_count_observations,train_certainty_neg,test_f1_weighted,train_balanced_accuracy,...,test_predict_time_sec,test_accuracy,train_recall_macro,train_precision_micro,test_precision_weighted,zimp_mechanism,model_type,dataset,random_seed,run_name
0,32c6c37a0bab47a0aa9ac7781b7bf6f1,2022-02-18 22:34:21.414000+00:00,2022-02-18 22:34:34.875000+00:00,0.560406,0.678478,1.303403,500.0,0.489957,0.621646,0.614225,...,0.439959,0.634,0.614225,0.670213,0.690311,,DECISION_TREE,TREC-6,36755,DECISION_TREE-105
1,98835f555a34424082c805cabdd44d15,2022-02-18 22:28:22.842000+00:00,2022-02-18 22:34:20.916000+00:00,0.713443,0.719426,119.630484,70000.0,0.471007,0.713335,0.719116,...,21.584111,0.713443,0.719116,0.719116,0.718528,,DECISION_TREE,DBP-14,45710,DECISION_TREE-110
2,e477a4ba4dac4717b2d51da6beb0bc9f,2022-02-18 22:15:49.078000+00:00,2022-02-18 22:28:22.030000+00:00,0.3546,0.342824,264.185552,50000.0,0.342243,0.340802,0.357672,...,24.278904,0.3546,0.357672,0.357672,0.34466,,DECISION_TREE,YELP-5,139949,DECISION_TREE-118


In [3]:
# Per (dataset, model_type) mean and 95% confidence-interval half-width
# for every numeric column of df_exp.
group_keys = ['dataset', 'model_type']

# Aggregate numeric columns only: pandas >= 2.0 raises TypeError when asked for
# the mean/std of string columns such as run_id or run_name instead of silently
# dropping them.
metric_cols = [
    col for col in df_exp.select_dtypes(include='number').columns
    if col not in group_keys
]

# fillna(0) turns the undefined std of single-run groups into 0, i.e. a zero-width interval.
df_agg = df_exp.groupby(group_keys)[metric_cols].agg(['mean', 'std', 'count']).fillna(0)

mean = df_agg.xs('mean', axis=1, level=1)
std = df_agg.xs('std', axis=1, level=1)
count = df_agg.xs('count', axis=1, level=1)

# Normal-approximation 95% interval: 1.96 * std / sqrt(n), computed for all metrics at once.
ci95 = 1.96 * std / count ** .5

# Columns become (metric, 'ci95' | 'mean'), sorted by metric name.
# random_seed is not a metric; errors='ignore' because it is only present
# when the seed column happens to be numeric.
df_stats = (
    pd.concat({'mean': mean, 'ci95': ci95}, axis=1)
    .swaplevel(axis=1)
    .drop(columns='random_seed', level=0, errors='ignore')
    .sort_index(axis=1)
)
df_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,test_accuracy,test_accuracy,test_balanced_accuracy,test_balanced_accuracy,test_certainty_neg,test_certainty_neg,test_certainty_pos,test_certainty_pos,test_count_observations,test_count_observations,...,train_predict_time_sec,train_predict_time_sec,train_recall_macro,train_recall_macro,train_recall_micro,train_recall_micro,train_recall_weighted,train_recall_weighted,train_time_sec,train_time_sec
Unnamed: 0_level_1,Unnamed: 1_level_1,ci95,mean,ci95,mean,ci95,mean,ci95,mean,ci95,mean,...,ci95,mean,ci95,mean,ci95,mean,ci95,mean,ci95,mean
dataset,model_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
10K-GNAD,DECISION_TREE,0.0,0.535019,0.0,0.504087,2.3e-05,0.437288,5.832332e-05,0.673475,0.0,1028.0,...,0.346151,18.262609,0.0,0.511002,0.0,0.55457,0.0,0.55457,23.698053,24.269801
10K-GNAD,FASTTEXT,0.002575,0.268191,0.00236,0.19396,0.00141,0.179484,0.00316078,0.202325,0.0,1028.0,...,0.659668,18.590105,0.000755,0.201125,0.001203,0.276084,0.001203,0.276084,1.601432,13.923119
10K-GNAD,GERMAN_BERT,0.00418,0.717412,0.004732,0.683768,0.00252,0.344503,0.002272291,0.5583,0.0,1028.0,...,14.307742,123.677541,0.001453,0.670587,0.001367,0.706847,0.001367,0.706847,13.197393,451.582112
10K-GNAD,RANDOM_FOREST,0.007675,0.403307,0.006283,0.294119,0.000462,0.169741,0.0008805727,0.186234,0.0,1028.0,...,0.467231,17.914,0.00451,0.308493,0.005358,0.422282,0.005358,0.422282,0.387675,8.082151
10K-GNAD,SVM,0.0,0.824903,0.0,0.796318,0.0,0.645624,0.0,0.65688,0.0,1028.0,...,0.0,172.324178,0.0,0.996576,0.0,0.996214,0.0,0.996214,0.0,335.192722
DBP-14,BERT,6.9e-05,0.988867,6.9e-05,0.988867,0.001948,0.727565,2.903959e-05,0.985383,0.0,70000.0,...,91.665461,1100.59246,3.2e-05,0.987916,3.2e-05,0.987916,3.2e-05,0.987916,47.20936,4086.110599
DBP-14,DECISION_TREE,0.0,0.713443,0.0,0.713443,0.0,0.476763,5.606241e-07,0.815013,0.0,70000.0,...,17.166313,213.833416,0.0,0.719116,0.0,0.719116,0.0,0.719116,20.918244,152.728414
DBP-14,FASTTEXT,7.6e-05,0.9836,7.6e-05,0.9836,0.000883,0.740631,3.268359e-05,0.987506,0.0,70000.0,...,0.919464,152.819417,2.7e-05,0.994813,2.7e-05,0.994813,2.7e-05,0.994813,14.488742,152.080461
DBP-14,RANDOM_FOREST,0.025222,0.318983,0.025222,0.318983,0.000171,0.072949,0.0006452829,0.07714,0.0,70000.0,...,37.106974,596.228237,0.024787,0.319086,0.024787,0.319086,0.024787,0.319086,31.224789,102.120383
DBP-14,SVM,0.0,0.9841,0.0,0.9841,0.0,0.632003,0.0,0.65335,0.0,70000.0,...,0.0,14915.279169,0.0,0.999443,0.0,0.999443,0.0,0.999443,0.0,8247.112962


In [4]:
# Render every metric as the LaTeX-ready string "<mean> \pm <ci95>"
# (mean with three decimals, interval with two).
# The strings are collected in a new frame rather than written back into
# df_stats, so df_stats keeps matching what the previous cell displayed and
# this cell can be re-run safely.
report_columns = {}
for metric in df_stats.columns.levels[0]:
    # levels[0] may still list random_seed after it was dropped from the columns.
    if metric == 'random_seed':
        continue
    metric_stats = df_stats[metric]
    report_columns[metric] = [
        f"{mean:.3f} \\pm {ci95:.2f}"
        for mean, ci95 in zip(metric_stats['mean'], metric_stats['ci95'])
    ]

# One column per metric, indexed by (dataset, model_type) like df_stats.
df_report = pd.DataFrame(report_columns, index=df_stats.index)
df_report.head(n=3)

Unnamed: 0_level_0,Unnamed: 1_level_0,test_accuracy,test_balanced_accuracy,test_certainty_neg,test_certainty_pos,test_count_observations,test_f1_macro,test_f1_micro,test_f1_weighted,test_precision_macro,test_precision_micro,...,train_f1_micro,train_f1_weighted,train_precision_macro,train_precision_micro,train_precision_weighted,train_predict_time_sec,train_recall_macro,train_recall_micro,train_recall_weighted,train_time_sec
dataset,model_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10K-GNAD,DECISION_TREE,0.535 \pm 0.00,0.504 \pm 0.00,0.437 \pm 0.00,0.673 \pm 0.00,1028.000 \pm 0.00,0.510 \pm 0.00,0.535 \pm 0.00,0.527 \pm 0.00,0.537 \pm 0.00,0.535 \pm 0.00,...,0.555 \pm 0.00,0.548 \pm 0.00,0.560 \pm 0.00,0.555 \pm 0.00,0.560 \pm 0.00,18.263 \pm 0.35,0.511 \pm 0.00,0.555 \pm 0.00,0.555 \pm 0.00,24.270 \pm 23.70
10K-GNAD,FASTTEXT,0.268 \pm 0.00,0.194 \pm 0.00,0.179 \pm 0.00,0.202 \pm 0.00,1028.000 \pm 0.00,0.117 \pm 0.00,0.268 \pm 0.00,0.160 \pm 0.00,0.150 \pm 0.01,0.268 \pm 0.00,...,0.276 \pm 0.00,0.168 \pm 0.00,0.167 \pm 0.00,0.276 \pm 0.00,0.194 \pm 0.00,18.590 \pm 0.66,0.201 \pm 0.00,0.276 \pm 0.00,0.276 \pm 0.00,13.923 \pm 1.60
10K-GNAD,GERMAN_BERT,0.717 \pm 0.00,0.684 \pm 0.00,0.345 \pm 0.00,0.558 \pm 0.00,1028.000 \pm 0.00,0.687 \pm 0.00,0.717 \pm 0.00,0.707 \pm 0.00,0.727 \pm 0.01,0.717 \pm 0.00,...,0.707 \pm 0.00,0.702 \pm 0.00,0.724 \pm 0.00,0.707 \pm 0.00,0.715 \pm 0.00,123.678 \pm 14.31,0.671 \pm 0.00,0.707 \pm 0.00,0.707 \pm 0.00,451.582 \pm 13.20


In [5]:
# Load the dummy-baseline scores and format every value with three decimals.
# Column-wise Series.map replaces the deprecated DataFrame.applymap and works
# on both old and new pandas versions.
df_dummy = pd.read_csv('dummy_baseline.csv', index_col=0)
df_dummy = df_dummy.apply(lambda column: column.map("{0:.3f}".format))

datasets = df_report.index.levels[0]

# The baseline rows are attached to the datasets by position below.
# NOTE(review): presumably the CSV's first column holds the dataset names - confirm.
# When it does, reorder the rows to the dataset order first so that a CSV saved in
# a different row order cannot silently assign baselines to the wrong dataset.
# Otherwise the original positional behaviour is kept.
if df_dummy.index.is_unique and set(df_dummy.index) == set(datasets):
    df_dummy = df_dummy.reindex(datasets)

# Give the baseline rows the same (dataset, model_type) index shape as df_report.
df_dummy.index = pd.MultiIndex.from_tuples([(dataset, 'DUMMY') for dataset in datasets])

df_merged = pd.concat([df_report, df_dummy]).sort_index()
df_merged.head(n=3)

Unnamed: 0_level_0,Unnamed: 1_level_0,test_accuracy,test_balanced_accuracy,test_certainty_neg,test_certainty_pos,test_count_observations,test_f1_macro,test_f1_micro,test_f1_weighted,test_precision_macro,test_precision_micro,...,train_f1_micro,train_f1_weighted,train_precision_macro,train_precision_micro,train_precision_weighted,train_predict_time_sec,train_recall_macro,train_recall_micro,train_recall_weighted,train_time_sec
dataset,model_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10K-GNAD,DECISION_TREE,0.535 \pm 0.00,0.504 \pm 0.00,0.437 \pm 0.00,0.673 \pm 0.00,1028.000 \pm 0.00,0.510 \pm 0.00,0.535 \pm 0.00,0.527 \pm 0.00,0.537 \pm 0.00,0.535 \pm 0.00,...,0.555 \pm 0.00,0.548 \pm 0.00,0.560 \pm 0.00,0.555 \pm 0.00,0.560 \pm 0.00,18.263 \pm 0.35,0.511 \pm 0.00,0.555 \pm 0.00,0.555 \pm 0.00,24.270 \pm 23.70
10K-GNAD,DUMMY,0.163,0.111,0.163,0.163,1028.000,0.031,0.163,0.046,0.018,0.163,...,0.163,0.046,0.018,0.163,0.027,,0.111,0.163,0.163,
10K-GNAD,FASTTEXT,0.268 \pm 0.00,0.194 \pm 0.00,0.179 \pm 0.00,0.202 \pm 0.00,1028.000 \pm 0.00,0.117 \pm 0.00,0.268 \pm 0.00,0.160 \pm 0.00,0.150 \pm 0.01,0.268 \pm 0.00,...,0.276 \pm 0.00,0.168 \pm 0.00,0.167 \pm 0.00,0.276 \pm 0.00,0.194 \pm 0.00,18.590 \pm 0.66,0.201 \pm 0.00,0.276 \pm 0.00,0.276 \pm 0.00,13.923 \pm 1.60


In [6]:
def get_latex_method_name(model_type):
    """Return the label used for a model type in the LaTeX tables.

    Known model types map to a fixed abbreviation. Any other name is returned
    with its underscores escaped (``_`` -> ``\\_``) so it is safe in LaTeX text.

    Args:
        model_type: Model type identifier, e.g. ``'DECISION_TREE'``.

    Returns:
        The abbreviation, or the LaTeX-escaped identifier when no abbreviation exists.
    """
    short_names = {
        'GERMAN_BERT': 'BERT',
        'DECISION_TREE': 'DT',
        'RANDOM_FOREST': 'RF',
        'FASTTEXT': 'FT',
        'DUMMY': 'BASE',
    }
    if model_type in short_names:
        return short_names[model_type]
    # Explicitly escaped backslash: '\_' is an invalid escape sequence that
    # newer Python versions warn about.
    return model_type.replace('_', '\\_')

def get_latex_metric(metric_str, max_score, method):
    """Format one table cell as LaTeX math content.

    Args:
        metric_str: Cell text, either ``"<mean> \\pm <ci>"`` or a bare ``"<mean>"``.
        max_score: Best cell text of the column; a cell equal to it is set in bold.
        method: Model type of the row. For ``'SVM'`` only the first
            whitespace-separated token (the mean) is kept.

    Returns:
        The cell text with an interval of exactly ``0.00`` replaced by
        ``\\epsilon``, wrapped in ``\\mathbf{...}`` when it is the column maximum.
    """
    # Compare before any rewriting: max_score holds the raw cell text.
    is_max = metric_str == max_score

    if method == 'SVM':
        metric_str = metric_str.split()[0]

    # Match the interval token exactly (note the leading space). A bare
    # endswith('0.00') would also hit intervals such as 10.00 and turn them
    # into "1\epsilon".
    if metric_str.endswith(' 0.00'):
        metric_str = metric_str[:-len('0.00')] + '\\epsilon'

    if is_max:
        return '\\mathbf{' + metric_str + '}'
    return metric_str

In [7]:
# Metrics (table columns) and model types (table rows), in print order.
reported_metrics = ['train_accuracy', 'train_balanced_accuracy', 'train_f1_weighted', 'test_accuracy', 'test_balanced_accuracy', 'test_f1_weighted']
reported_methods = ['DUMMY', 'DECISION_TREE', 'RANDOM_FOREST', 'SVM', 'FASTTEXT', 'BERT', 'GERMAN_BERT']

print("""

""")

# Print one LaTeX table per dataset. The LaTeX fragments are raw strings so
# that backslashes reach the output verbatim; the non-raw originals relied on
# invalid escape sequences (\s, \c, \h, \e, \l) that newer Python versions warn about.
for dataset in df_merged.index.levels[0]:
    df_dataset = df_merged.loc[dataset]
    # Best cell per metric, used for bold-facing.
    # NOTE(review): the cells are strings, so max() compares them lexicographically;
    # this matches numeric order only for equally formatted values - confirm this
    # holds for any metric added to reported_metrics.
    max_scores = df_dataset.fillna('0').max()
    print(r"""
\begin{table}[]
    \small
    \centering
    \begin{tabular}{l|ccc|ccc}
        Method & $ACC_{train}$ & $BAC_{train}$ & $F_{train}$ & $ACC_{test}$ & $BAC_{test}$ & $F_{test}$ \\ \hline
    """)

    for method in reported_methods:
        # Not every model type was run on every dataset.
        if method not in df_dataset.index:
            continue
        cells = [
            f'${get_latex_metric(df_dataset.loc[method, metric], max_scores[metric], method)}$'
            for metric in reported_metrics
        ]
        row = ' ' * 4 + r'\textit{' + get_latex_method_name(method) + '} & '
        row += ' & '.join(cells)
        row += r' \\'
        print(row)

    print(r"""
    \end{tabular}
    \caption{Baseline results for \texttt{""" + dataset + r"""}}
    \label{tab:results_""" + dataset + r"""}
\end{table}
    """)





\begin{table}[]
    \small
    \centering
    \begin{tabular}{l|ccc|ccc}
        Method & $ACC_{train}$ & $BAC_{train}$ & $F_{train}$ & $ACC_{test}$ & $BAC_{test}$ & $F_{test}$ \\ \hline
    
    \textit{BASE} & $0.163$ & $0.111$ & $0.046$ & $0.163$ & $0.111$ & $0.046$ \\
    \textit{DT} & $0.555 \pm \epsilon$ & $0.511 \pm \epsilon$ & $0.548 \pm \epsilon$ & $0.535 \pm \epsilon$ & $0.504 \pm \epsilon$ & $0.527 \pm \epsilon$ \\
    \textit{RF} & $0.422 \pm 0.01$ & $0.308 \pm \epsilon$ & $0.338 \pm 0.01$ & $0.403 \pm 0.01$ & $0.294 \pm 0.01$ & $0.319 \pm 0.01$ \\
    \textit{SVM} & $\mathbf{0.996}$ & $\mathbf{0.997}$ & $\mathbf{0.996}$ & $\mathbf{0.825}$ & $\mathbf{0.796}$ & $\mathbf{0.825}$ \\
    \textit{FT} & $0.276 \pm \epsilon$ & $0.201 \pm \epsilon$ & $0.168 \pm \epsilon$ & $0.268 \pm \epsilon$ & $0.194 \pm \epsilon$ & $0.160 \pm \epsilon$ \\
    \textit{BERT} & $0.707 \pm \epsilon$ & $0.671 \pm \epsilon$ & $0.702 \pm \epsilon$ & $0.717 \pm \epsilon$ & $0.684 \pm \epsilon$ & $0.