In [1]:
import mlflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os

# MLflow connection: tracking server that holds the run metadata.
mlflow.set_tracking_uri('http://localhost/')
# Name of the MLflow experiment whose runs are looked up below.
experiment_name = 'Baseline-async'

# Local directories:
#   artifact_dir -- root below which each run's artifacts (prediction CSVs) are searched
#   source_dir   -- root of the per-dataset CSVs that provide the `target` column
artifact_dir = 'H:\\mlruns_2\\'
source_dir = '../../zimp_orchestrator/orch/resources'

In [2]:
# Look up the experiment by name. MLflow returns None (it does not raise) when
# no experiment with this name exists, so fail early with a readable message
# instead of an AttributeError on `.experiment_id` below.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    raise ValueError(f'MLflow experiment {experiment_name!r} was not found on the configured tracking server')

# Only runs that finished successfully are considered.
df_exp = mlflow.search_runs(experiment_ids=[experiment.experiment_id], filter_string='attributes.status="FINISHED"')

# Keep the run id plus the two parameters used for grouping, under shorter column names.
df_exp = df_exp[['run_id', 'params.model_type', 'params.dataset']].rename(columns={'params.model_type': 'model_type', 'params.dataset': 'dataset'})
df_exp

Unnamed: 0,run_id,model_type,dataset
0,32c6c37a0bab47a0aa9ac7781b7bf6f1,DECISION_TREE,TREC-6
1,98835f555a34424082c805cabdd44d15,DECISION_TREE,DBP-14
2,e477a4ba4dac4717b2d51da6beb0bc9f,DECISION_TREE,YELP-5
3,abebe1eb2560436f9a5a4db6a79e10fe,DECISION_TREE,DBP-14
4,5c5ed70970754259a0538395a85a1a61,DECISION_TREE,TREC-6
...,...,...,...
241,52b3ce034aa64cef9c70b30c91a75405,BERT,YELP-5
242,8af23752f0a040dcb93939e0926a6c8f,BERT,YELP-5
243,3594b51440bc449b9129a27199488c1c,BERT,YELP-5
244,8de36d8d26ec4bcb9c62983934487f65,BERT,YELP-5


In [3]:
def get_confidence_scores(run_id):
    '''
    Load the prediction CSVs stored as artifacts of a single run.

    Every file matching ``<artifact_dir>/<run_id>/artifacts/predictions*.csv``
    is read and reduced to its ``prediction`` and ``certainty`` columns.

    Parameters
    ----------
    run_id : str
        Run id; used as the sub-directory name below ``artifact_dir``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per matching file, sorted by descending number of rows.
        In our case the training set is always bigger and therefore the
        first to be returned. The list is empty when no file matches.
    '''
    # Join path components instead of concatenating strings so the lookup also
    # works when `artifact_dir` has no trailing separator.
    pattern = os.path.join(artifact_dir, run_id, 'artifacts', 'predictions*.csv')
    # glob yields matches in arbitrary order; sort the names so that files with
    # equally many rows are always returned in the same order.
    prediction_files = sorted(glob.glob(pattern))
    dfs = [pd.read_csv(file)[['prediction', 'certainty']] for file in prediction_files]
    # sorted() is stable, so ties keep the file-name order established above.
    return sorted(dfs, key=lambda df: df.shape[0], reverse=True)
           
def compute_confidence_scores(run_ids, model_type, dataset):
    '''
    Aggregate per-sample certainty and prediction agreement over several runs
    and write the result to ``confidence/confidence_<model_type>_<dataset>_<label>.csv``
    for ``label`` in ``train`` and ``test``.

    Written columns (one row per sample):
      mean       -- mean certainty over the runs
      std        -- standard deviation of the certainty over the runs
      ci95       -- 1.96 * std / sqrt(count)
      p_true     -- share of runs whose prediction equals the `target` column
      p_mf       -- share of runs that predicted the most frequent label
      cnt_unique -- number of distinct labels predicted over the runs

    Parameters
    ----------
    run_ids : iterable of str
        Runs to aggregate; each must provide exactly two prediction files.
    model_type : str
        Used in the output file name; the value 'SVM' selects the
        single-run code path.
    dataset : str
        Dataset name; selects ``<source_dir>/<dataset>/<label>.csv`` as the
        source of the true labels and is used in the output file name.

    Raises
    ------
    ValueError
        If a run does not provide exactly two prediction files.
    '''
    train_dfs = []
    test_dfs = []

    for run_id in run_ids:
        run_dfs = get_confidence_scores(run_id)
        # Name the offending run instead of failing with a bare unpacking error.
        if len(run_dfs) != 2:
            raise ValueError(f'expected 2 prediction files (train and test) for run {run_id}, found {len(run_dfs)}')
        train_df, test_df = run_dfs
        train_dfs.append(train_df)
        test_dfs.append(test_df)

    # Column-wise concat: the result carries one 'prediction' and one
    # 'certainty' column per run (duplicate column labels), so selecting either
    # label yields all runs at once.
    df_train = pd.concat(train_dfs, axis=1)
    df_test = pd.concat(test_dfs, axis=1)

    # to_csv does not create missing directories.
    os.makedirs('confidence', exist_ok=True)

    for df, label in zip([df_train, df_test], ['train', 'test']):
        df_true = pd.read_csv(f'{source_dir}/{dataset}/{label}.csv').target
        predictions = df['prediction']
        certainty = df['certainty']
        if model_type == 'SVM':
            # NOTE(review): this branch treats the data as one single run
            # (count 1, std 0) -- presumably SVM is only ever run once per
            # dataset; confirm against the experiment setup.
            df_c = pd.DataFrame({'mean': certainty, 'count': 1, 'std': 0})
            df_c['p_true'] = (predictions == df_true).astype(float)
            df_c['cnt_unique'] = 1.0
            df_c['p_mf'] = 1.0
        else:
            # Vectorised row-wise reductions over the per-run certainty columns.
            df_c = pd.DataFrame({
                'mean': certainty.mean(axis=1),
                'count': certainty.count(axis=1),
                'std': certainty.std(axis=1),
            })
            df_c['p_true'] = predictions.apply(lambda s: s.eq(df_true)).sum(axis=1)/df_c['count']
            df_c['cnt_unique'] = predictions.apply(lambda row: len(np.unique(row.values)), axis=1)
            # probability of most frequent prediction
            df_c['p_mf'] = predictions.apply(lambda row: np.unique(row.values, return_counts=True)[1].max(), axis=1)/df_c['count']
        df_c['ci95'] = 1.96*df_c['std']/df_c['count']**.5
        df_c[['mean', 'std', 'ci95', 'p_true', 'p_mf', 'cnt_unique']].to_csv(f'confidence/confidence_{model_type}_{dataset}_{label}.csv', index=False)

In [4]:
# Set to True to recompute every (model_type, dataset) pair even when both of
# its output files already exist.
is_rerun = False

# One iteration per distinct (model_type, dataset) pair found among the runs.
combinations = df_exp[['model_type', 'dataset']].drop_duplicates()
for idx, row in combinations.iterrows():
    model_type, dataset = row['model_type'], row['dataset']
    train_csv = f'confidence/confidence_{model_type}_{dataset}_train.csv'
    test_csv = f'confidence/confidence_{model_type}_{dataset}_test.csv'

    # Skip pairs whose train and test outputs are both present.
    if not is_rerun and os.path.exists(train_csv) and os.path.exists(test_csv):
        print(f'{model_type}--{dataset} already processed')
        continue

    print(f'Processing {model_type}--{dataset}')
    in_combination = (df_exp.model_type == model_type) & (df_exp.dataset == dataset)
    run_ids = df_exp.loc[in_combination, 'run_id'].values
    compute_confidence_scores(run_ids, model_type, dataset)

DECISION_TREE--TREC-6 already processed
DECISION_TREE--DBP-14 already processed
DECISION_TREE--YELP-5 already processed
DECISION_TREE--GERMEVAL-2020 already processed
DECISION_TREE--GERMEVAL-2018 already processed
DECISION_TREE--10K-GNAD already processed
RANDOM_FOREST--YELP-5 already processed
RANDOM_FOREST--DBP-14 already processed
RANDOM_FOREST--TREC-6 already processed
RANDOM_FOREST--10K-GNAD already processed
RANDOM_FOREST--GERMEVAL-2018 already processed
RANDOM_FOREST--GERMEVAL-2020 already processed
FASTTEXT--DBP-14 already processed
FASTTEXT--TREC-6 already processed
FASTTEXT--YELP-5 already processed
FASTTEXT--10K-GNAD already processed
FASTTEXT--GERMEVAL-2018 already processed
FASTTEXT--GERMEVAL-2020 already processed
GERMAN_BERT--10K-GNAD already processed
GERMAN_BERT--GERMEVAL-2018 already processed
GERMAN_BERT--GERMEVAL-2020 already processed
SVM--YELP-5 already processed
SVM--DBP-14 already processed
SVM--TREC-6 already processed
SVM--10K-GNAD already processed
SVM--GERME