In [23]:
import sys
import mlflow
import pandas as pd
from mlflow.tracking import MlflowClient
sys.path.append('../src/')
from helpers.mlflow_utils import mlflow_tracking_uri
import matplotlib.pyplot as plt

client = MlflowClient(tracking_uri=mlflow_tracking_uri)
experiment_id = '206'
unlearning_methods = ['finetune', 'neggrad', 'relabel', 'badT', 'scrub', 'ssd', 'unsir', 'our']
runs = client.search_runs(experiment_id)
# Convert runs to a DataFrame
metrics = ["mia", "acc_forget", "acc_retain", "t", "acc_test", "js", "js_proxy"]
runs_df = pd.DataFrame([{k: v for k, v in run.data.metrics.items() if k in metrics} for run in runs])
runs_df['method'] = [run.data.tags.get('mlflow.runName') for run in runs]
runs_df['seed'] = [run.data.params.get('seed') for run in runs]
runs_df = runs_df.set_index(['method', 'seed'])

gap_metrics = ['mia', 'acc_forget', 'acc_retain', 'acc_test']
unlearning_methods = ['finetune', 'neggrad', 'relabel', 'badT', 'scrub', 'ssd', 'unsir', 'our', 'retrained']
# Calculate the difference in 't' between the unlearning methods and 'retrain' of the same seed
for method in unlearning_methods:
    for metric in gap_metrics:
        for seed in runs_df.index.get_level_values('seed').unique():
            runs_df.loc[method, f'{metric}_gap'] = abs(runs_df.loc[(method, seed), metric] - runs_df.loc[('retrained', seed), metric])

grouped_df = runs_df.groupby('method').aggregate(['mean', 'std'])
grouped_df['js'] = grouped_df['js'].apply(lambda x: x*1e6)
grouped_df = grouped_df.round(2)

# Display the updated DataFrame
grouped_df


Unnamed: 0_level_0,acc_retain,acc_retain,acc_test,acc_test,mia,mia,t,t,js,js,...,acc_forget,acc_forget,mia_gap,mia_gap,acc_forget_gap,acc_forget_gap,acc_retain_gap,acc_retain_gap,acc_test_gap,acc_test_gap
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
badT,0.71,0.18,0.66,0.16,0.43,0.37,0.33,0.0,238.67,203.46,...,0.71,0.18,0.73,0.0,0.37,0.0,0.44,0.0,0.4,0.0
finetune,0.98,0.01,0.89,0.02,0.3,0.26,0.43,0.0,103.33,24.21,...,0.97,0.01,0.27,0.0,0.08,0.0,0.01,0.0,0.01,0.0
neggrad,0.97,0.01,0.88,0.02,0.48,0.5,0.49,0.0,105.67,21.01,...,0.97,0.01,0.29,0.0,0.08,0.0,0.01,0.0,0.01,0.0
original,1.0,0.0,0.93,0.01,0.94,0.0,6.5,0.13,,,...,1.0,0.0,,,,,,,,
our,0.85,0.07,0.77,0.07,0.67,0.58,0.29,0.0,147.33,50.65,...,0.78,0.07,0.73,0.0,0.04,0.0,0.03,0.0,0.04,0.0
relabel,0.97,0.01,0.89,0.02,0.48,0.5,0.57,0.0,100.33,25.54,...,0.96,0.01,0.27,0.0,0.06,0.0,0.01,0.0,0.01,0.0
retrained,0.99,0.02,0.91,0.02,0.76,0.03,5.32,1.18,0.0,0.0,...,0.91,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
scrub,1.0,0.0,0.93,0.01,0.94,0.01,0.58,0.0,41.33,9.24,...,1.0,0.0,0.2,0.0,0.11,0.0,0.04,0.0,0.03,0.0
ssd,0.87,0.16,0.8,0.15,0.81,0.11,0.54,0.0,82.33,57.27,...,0.86,0.16,0.09,0.0,0.03,0.0,0.03,0.0,0.04,0.0
unsir,0.93,0.01,0.86,0.01,0.46,0.03,0.45,0.0,65.0,4.58,...,0.93,0.01,0.25,0.0,0.03,0.0,0.04,0.0,0.04,0.0


In [None]:
# # Calculate average and standard deviation per seed
# grouped_df = runs_df.groupby('method')[metrics].agg(['mean', 'std'])
# for method in unlearning_methods:
#     for metric in ["mia", "acc_forget", "acc_retain", "acc_test"]:
#         grouped_df.loc[method, (metric, 'gap')] = abs(grouped_df.loc[method, (metric, 'mean')] - grouped_df.loc['retrained', (metric, 'mean')])
# grouped_df = grouped_df.sort_index(axis=1, level=0)
# grouped_df['avg_gap'] = grouped_df[[col for col in grouped_df.columns if col[1] == 'gap']].mean(axis=1)

# # Format the values to have only 2 decimals, apart from js and js_proxy that would have 6
# for col in grouped_df.columns:
#     if col[0] in ['js', 'js_proxy']:
#         grouped_df[col] = grouped_df[col].apply(lambda x: f"{x:.6f}")
#     else:
#         grouped_df[col] = grouped_df[col].apply(lambda x: f"{x:.2f}")

# grouped_df

# time_means = grouped_df[('t', 'mean')].astype(float)
# time_stds = grouped_df[('t', 'std')].astype(float)
# time_means = time_means.drop(['original', 'retrained'])
# time_stds = time_stds.drop(['original', 'retrained'])

# # Sort the means and standard deviations
# time_means_sorted = time_means.sort_values()
# time_stds_sorted = time_stds[time_means_sorted.index]

# plt.figure(figsize=(6, 4))
# plt.bar(time_means_sorted.index, time_means_sorted, yerr=time_stds_sorted, capsize=5, color='skyblue')
# plt.title('Execution Time per Method')
# plt.xlabel('Method')
# plt.ylabel('Execution Time (min)')
# plt.xticks(rotation=45)
# plt.show()