In [2]:
import pandas as pd
import wandb
from tqdm import tqdm
import requests
import os
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import metrics
import scipy.stats as stats
import matplotlib.colors as mcolors
from read_wandb import wandb_results
import warnings 

In [3]:
# Create the output directory for the sweep CSVs; exist_ok makes the cell
# safe to re-run (a plain `mkdir` fails once the directory is there).
os.makedirs("sweeps_csvs", exist_ok=True)

mkdir: sweeps_csvs: File exists


In [6]:
# --- Weights & Biases coordinates -------------------------------------------
username = 'eden-hindi'  # enter group name here
project_name = 'NLP2024_PROJECT'  # enter project name here (change to NLP2024_PROJECT_edenhindi when you run it)

# Default value of the `metric` argument of the helper functions below.
BASE_METRIC = "accuracy_per_mean_user_and_bot"

# Handle used by the helpers to download sweep results.
api = wandb_results(project_name, wandb_username=username)

Helper functions

In [7]:
def get_final_results(df, n_epochs=25):
    """Average each run's best per-epoch test accuracy over seeds.

    For every row (one run) the maximum over the columns
    ``ENV_Test_accuracy_per_mean_user_and_bot_epoch0 .. epoch{n_epochs-1}``
    is taken; those maxima are then averaged within each
    ``(config_eps_incorrect, config_learning_rate_gb)`` pair.

    Args:
        df: DataFrame holding the epoch accuracy columns above plus
            ``config_eps_incorrect``, ``config_learning_rate_gb`` and
            ``config_seed``.
        n_epochs: number of epoch columns to consider (defaults to 25).

    Returns:
        Series indexed by ``(config_eps_incorrect, config_learning_rate_gb)``
        with the seed-averaged best accuracy.

    Raises:
        KeyError: if any of the required columns is missing from ``df``.
    """
    index_cols = ['config_eps_incorrect', 'config_learning_rate_gb', 'config_seed']
    epoch_acc = [f'ENV_Test_accuracy_per_mean_user_and_bot_epoch{i}' for i in range(n_epochs)]

    # Best epoch per run: after set_index only the epoch columns remain,
    # so the row-wise max runs over epochs.
    best_per_run = df[epoch_acc + index_cols].set_index(index_cols).max(axis=1)

    # Collapse the seed level by averaging.
    return best_per_run.groupby(['config_eps_incorrect', 'config_learning_rate_gb']).mean()

def result_metric(sweeps, group_name, drop_list=(0,), drop_HPT=False, metric=BASE_METRIC, epoch="best"):
    """Summarise sweep runs per hyper-parameter configuration.

    Runs are grouped by every ``config_*`` column that takes more than one
    value (``config_seed`` excluded). For each group one metric column is
    selected and its values over the group's runs are reported, multiplied
    by 100.

    Args:
        sweeps: sweep ids forwarded to ``api.get_sweeps_results``.
        group_name: name given to the index of the returned frame when the
            index has a single level.
        drop_list: index labels removed from the summary when present.
        drop_HPT: if true, every varying config column other than
            ``config_eps_incorrect``, ``config_learning_rate_gb`` and
            ``config_seed`` is dropped before grouping.
        metric: metric name forwarded to ``api.get_sweeps_results`` and used
            to pick the candidate result columns.
        epoch: ``"best"`` selects, per group, the candidate column with the
            highest group mean; any other value selects among the columns
            ending in ``f"{metric}_epoch{epoch}"``.

    Returns:
        DataFrame indexed by the grouping columns with the columns ``mean``,
        ``std``, ``values``, ``epoch`` (only when ``epoch == "best"``) and
        ``CI`` (taken from ``bootstrap_ci``).

    Raises:
        ValueError: if no column matches the requested metric/epoch.
    """
    df = api.get_sweeps_results(sweeps, metric=metric)

    config_cols = [c for c in df.columns
                   if "config_" in c and c != "config_wandb_run_id" and c != "config_online_simulation_size"]
    HPT_cols = [col for col in config_cols if df[col].nunique() > 1]
    print(HPT_cols)
    if drop_HPT:
        kept = ["config_eps_incorrect", "config_learning_rate_gb", "config_seed"]
        df = df.drop([c for c in HPT_cols if c not in kept], axis=1)
        HPT_cols = kept

    # Only numeric columns can be averaged.
    df_numeric = df[df.select_dtypes(include=np.number).columns]
    grouped = df_numeric.groupby([c for c in HPT_cols if c != "config_seed"])
    mean_df = grouped.mean()

    if epoch == "best":
        candidates = [c for c in mean_df.columns
                      if (metric in c and metric[-4:] == c.split("_epoch")[0][-4:])]
    else:
        # Match on the suffix: a substring test would let epoch 1 also
        # select epoch 10, 11, ... columns.
        wanted = f"{metric}_epoch{epoch}"
        candidates = [c for c in mean_df.columns if c.endswith(wanted)]
    if not candidates:
        raise ValueError(f"no result column found for metric={metric!r}, epoch={epoch!r}")

    # Per group: name of the candidate column with the highest mean.
    best_col = mean_df[candidates].idxmax(axis=1)

    def _selected(group):
        # Values of the group's chosen column, one per run.
        return group[best_col.loc[group.name]]

    result = 100*grouped.apply(lambda g: _selected(g).values)
    means = 100*grouped.apply(lambda g: _selected(g).mean())
    stds = 100*grouped.apply(lambda g: _selected(g).std())

    df_cols = {'mean': np.round(means,4), 'std': np.round(stds,4), 'values': result.values}
    if epoch == "best":
        df_cols['epoch'] = best_col.apply(
            lambda name: int(name.split("epoch")[1]) if "epoch" in name else "last")

    df_cols['CI'] = result.apply(lambda vals: tuple(round(bound, 4) for bound in bootstrap_ci(vals)))

    summary_df = pd.DataFrame(df_cols, index=best_col.index)
    for d in drop_list:
        if d in summary_df.index:
            summary_df = summary_df.drop(d)
    if len(summary_df.index.names) == 1:
        return summary_df.rename_axis(group_name)
    return summary_df

def bootstrap_ci(data, n_bootstrap=1000, ci=0.95, seed=None):
    """Percentile-bootstrap confidence interval for the mean of ``data``.

    Args:
        data: 1-D sequence of numbers.
        n_bootstrap: number of resamples (each the same length as ``data``,
            drawn with replacement).
        ci: confidence level, e.g. 0.95 for a 95% interval.
        seed: optional seed. When given, a dedicated
            ``np.random.default_rng(seed)`` generator is used so the interval
            is reproducible; when ``None`` the global NumPy random state is
            used.

    Returns:
        ``(lower_bound, upper_bound)`` — the ``(1 - ci) / 2`` and
        ``(1 + ci) / 2`` percentiles of the resampled means. Both are NaN for
        empty input.
    """
    n = len(data)
    if n == 0:
        # Nothing to resample; the mean of an empty sample is undefined.
        return float("nan"), float("nan")

    sampler = np.random if seed is None else np.random.default_rng(seed)
    # One row per bootstrap replicate.
    resamples = sampler.choice(data, size=(n_bootstrap, n), replace=True)
    bootstrapped_means = resamples.mean(axis=1)

    lower_bound = np.percentile(bootstrapped_means, (1 - ci) / 2 * 100)
    upper_bound = np.percentile(bootstrapped_means, (1 + ci) / 2 * 100)
    return lower_bound, upper_bound

def result_metric_test(sweeps, drop_HPT=False, metric=BASE_METRIC):
    """Mean and bootstrap CI of the best per-run test score over a set of sweeps.

    For every run the maximum over its ``ENV_Test_<metric>_epoch*`` columns is
    taken; the function reports the mean of those maxima and the interval
    returned by ``bootstrap_ci``, both multiplied by 100.

    Args:
        sweeps: sweep ids forwarded to ``api.get_sweeps_results``.
        drop_HPT: if true, every varying config column other than
            ``config_LLM_SIM_SIZE`` and ``config_seed`` is dropped first.
        metric: metric name forwarded to ``api.get_sweeps_results`` and used
            to locate the per-epoch test columns.

    Returns:
        dict with the keys ``"mean"`` (float) and ``"Confidence interval"``
        (array holding the lower and upper bound).

    Raises:
        ValueError: if no per-epoch test column for ``metric`` is present.
    """
    df = api.get_sweeps_results(sweeps, metric=metric)
    config_cols = [c for c in df.columns if
                   "config_" in c and c != "config_wandb_run_id" and c != "config_online_simulation_size"]
    HPT_cols = [col for col in config_cols
                if (df[col].nunique() > 1) and (col not in ["config_input_dim", "config_use_user_vector"])]
    if drop_HPT:
        kept = ["config_LLM_SIM_SIZE", "config_seed"]
        df = df.drop([c for c in HPT_cols if c not in kept], axis=1)
        HPT_cols = kept

    # Drop the non-numeric columns that are not in HPT_cols. The dtype test
    # uses pandas' helper: comparing a dtype with the abstract `np.number`
    # does not test "is numeric".
    drop_names = [c for c in df.columns
                  if (c not in HPT_cols) and (c not in [metric, "epoch"])
                  and not pd.api.types.is_numeric_dtype(df[c])]
    df = df.drop(drop_names, axis=1)

    # Per-epoch test columns for the requested metric.
    # NOTE(review): assumes columns are named "ENV_Test_<metric>_epoch<N>" — confirm against read_wandb.
    cols = [c for c in df.columns if f"ENV_Test_{metric}_epoch" in c]
    if not cols:
        raise ValueError(f"no 'ENV_Test_{metric}_epoch*' columns found in the sweep results")

    # Best epoch per run.
    best_per_run = np.max(df[cols].to_numpy(), axis=1)
    return {"mean": round(100*np.mean(best_per_run),4),
            "Confidence interval": np.round(100*np.array(bootstrap_ci(best_per_run)),4)}

Test Results

In [8]:
sweep_ids = ["q651v7dc","6nb7in1g"] #enter sweep id here
warnings.filterwarnings("ignore")
# (eps_incorrect, learning_rate) of each sweep above, in the same order.
configurations = [[0.3,0.1],[0.2,0.1]]
for sweep, conf in zip(sweep_ids,configurations):
    print(f"Eps {conf[0]} Learning Rate {conf[1]}")

    # The second parameter of result_metric_test is the boolean `drop_HPT`;
    # pass it by keyword instead of relying on a truthy positional string.
    print(result_metric_test([sweep], drop_HPT=True, metric=BASE_METRIC))
    print("\n")



Eps 0.3 Learning Rate 0.1
Total number of sweeps: 1
Download sweep_id='q651v7dc' data...
{'mean': 83.7419, 'Confidence interval': array([83.6274, 83.8564])}


Eps 0.2 Learning Rate 0.1
Total number of sweeps: 1
Download sweep_id='6nb7in1g' data...
{'mean': 83.6139, 'Confidence interval': array([83.4552, 83.772 ])}




Hyperparameter tuning (HPT) results

In [9]:
BASE_METRIC = "accuracy_per_mean_user_and_bot"

# Sweeps to summarise (enter sweep ids here).
sweep_ids = [
    "8k21y23j",
    "yv6zhyv3",
    "bhjscedc",
    "paqdt76o",
    "ir4nzq36",
    "z6h1n8mf",
    "1ztqh2s0",
    "2985wxrz",
]

# Per-configuration summary at the best epoch; the label -1 is dropped if present.
results = result_metric(
    sweep_ids,
    "eps_incorrect",
    drop_list=[-1],
    drop_HPT=False,
    metric="accuracy_per_mean_user_and_bot",
    epoch="best",
)

# Show only mean / std / CI.
results.drop(columns=["values", "epoch"])

Total number of sweeps: 8
Download sweep_id='8k21y23j' data...
Download sweep_id='yv6zhyv3' data...
Download sweep_id='bhjscedc' data...
Download sweep_id='paqdt76o' data...
Download sweep_id='ir4nzq36' data...
Download sweep_id='z6h1n8mf' data...
Download sweep_id='1ztqh2s0' data...
Download sweep_id='2985wxrz' data...
['config_seed', 'config_eps_incorrect', 'config_learning_rate_gb']


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,CI
config_eps_incorrect,config_learning_rate_gb,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.02,79.4657,0.6138,"(78.976, 79.9554)"
0.0,0.1,79.7322,1.2455,"(78.832, 80.7816)"
0.0,0.5,79.7785,0.5472,"(79.3627, 80.2105)"
0.1,0.02,79.5465,1.039,"(78.7213, 80.3053)"
0.1,0.1,79.7984,0.6748,"(79.2274, 80.2163)"
0.1,0.5,79.3615,0.6846,"(78.8358, 79.8923)"
0.2,0.02,79.4516,1.0483,"(78.7786, 80.2922)"
0.2,0.1,79.8441,1.0829,"(78.9862, 80.5944)"
0.2,0.5,79.5435,0.9113,"(78.8106, 80.2635)"
0.3,0.02,79.1722,0.9025,"(78.4719, 79.8976)"
