In [252]:
from pathlib import Path

In [258]:
# pandas is imported locally because the notebook's main import cell
# (which contains `import pandas as pd`) only appears further down; without
# this line a fresh-kernel top-to-bottom run fails here with a NameError.
import pandas as pd

# Directory holding the microbenchmark CSV files, relative to the notebook.
path = Path("../data/microbenchmark")
df = pd.read_csv(path.joinpath("impressions_bias.csv"))

In [259]:
# Sampling 100% of the rows with a fixed seed yields a reproducible row
# permutation; the index is then renumbered from zero.
shuffled_df = (
    df.sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# Persist the shuffled copy next to the original file, without the index column.
shuffled_df.to_csv(path / "impressions_bias_shuffled.csv", index=False)


In [238]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [273]:
import mlflow
import pandas as pd

from cookiemonster.utils import LOGS_PATH

In [274]:
from mlflow.tracking import MlflowClient

# Both the module-level mlflow API and the explicit client are pointed at the
# same "mlflow" location under LOGS_PATH.
tracking_uri = str(LOGS_PATH.joinpath("mlflow"))
mlflow.set_tracking_uri(tracking_uri)

client = MlflowClient(tracking_uri=tracking_uri)

In [275]:
# Experiment to analyse (alternative kept for reference):
# experiment_name="bias_detection_09-09_11-39"
experiment_name = "bias_detection_09-09_18-06"

# Fetch every run of the experiment and keep only the run identifiers.
runs = mlflow.search_runs(experiment_names=[experiment_name])
run_ids = runs["run_id"].tolist()
run_ids

['20b7291d300d438f90bc61170b163de5',
 '5921038127a7494c9c4a8ceb033b295c',
 '5fca17087f3f461cb045ba00ff95d3c5',
 'ae42eca11ac74d21b4f5f71a5ff533bd',
 'c7dc8b6a5a4b4eba8a83f5f4f6147313',
 '35d5e8d8e9944adab893d52a4899264c',
 'f2d5edc7951f4939b6c17f02df0a9848',
 '144899f9e5cc4e16b3182af937cda7b1',
 'f2e5917a642245d1a709c4cab4185de7',
 '9281bb3213ed48c9a284835537c1b9f8',
 'f832a815cccc461c8bcc4b10ff0f0d1b',
 '0121013b88d64a13bb8b013a1cc168b9',
 'ea472cc86f9f4b5c9f8bc671d845b588',
 '3a23b25628f24c48adc3be9d0fbef1fa',
 '920b54647b2342aeb2721eb6938839aa',
 '68e45ef9e1aa4deb9bd611d9620ed9af',
 'b179c92f82d4426a9ad77548985fab46',
 '0295b8508c2e452ea2e25b525783ebd6']

In [276]:
# Metrics whose per-step history is pulled from each run, and the run
# parameters copied onto every resulting row.
metric_names = ["rmsre", "rmsre_prediction"]
param_names = ["user.baseline", "user.bias_detection_knob", "dataset.num_days_per_epoch"]

data = []
for run_id in run_ids:
    run_params = client.get_run(run_id).data.params
    metric_histories = {
        name: client.get_metric_history(run_id, name) for name in metric_names
    }

    # The first metric's history length determines how many rows the run yields.
    # NOTE(review): histories are aligned by list position, not by the step
    # recorded on each metric entry -- confirm every metric is logged once per
    # step, otherwise values from different steps end up on the same row.
    num_steps = len(metric_histories[metric_names[0]])

    for step in range(num_steps):
        step_data = {"step": step}

        for name in metric_names:
            history = metric_histories[name]
            # A shorter history is padded with None (shown as NaN in the frame).
            step_data[name] = history[step].value if step < len(history) else None

        for param_name in param_names:
            step_data[param_name] = run_params[param_name]

        data.append(step_data)

# One row per (run, step). Note that this rebinds `df`.
df = pd.DataFrame(data)
df

Unnamed: 0,step,rmsre,rmsre_prediction,user.baseline,user.bias_detection_knob,dataset.num_days_per_epoch
0,0,0.086262,,cookiemonster,0,30
1,1,0.084832,,cookiemonster,0,30
2,2,0.086749,,cookiemonster,0,30
3,3,0.086262,,cookiemonster,0,30
4,4,0.070759,,cookiemonster,0,30
...,...,...,...,...,...,...
8995,495,0.654350,3.081646,cookiemonster,1,7
8996,496,0.662951,3.480346,cookiemonster,1,7
8997,497,0.648393,3.252572,cookiemonster,1,7
8998,498,0.653141,3.528841,cookiemonster,1,7


In [277]:
# df["baseline"] = df["user.baseline"]
# Expose the dotted parameter columns under short names that can be used
# directly inside DataFrame.query expressions.
for alias, source in (
    ("bias_detection_knob", "user.bias_detection_knob"),
    ("num_days_per_epoch", "dataset.num_days_per_epoch"),
):
    df[alias] = df[source]

def get_baseline_name(row):
    """Return the display name of the baseline for one result row.

    The row's ``"user.baseline"`` value is returned unchanged when
    ``"user.bias_detection_knob"`` equals the string ``"0"``; for any other
    knob value the suffix ``"_bias"`` is appended.
    """
    knob_is_off = row["user.bias_detection_knob"] == "0"
    name = row["user.baseline"]
    return name if knob_is_off else name + "_bias"

# Derive the per-row baseline label (row-wise apply over the columns axis).
df["baseline"] = df.apply(get_baseline_name, axis="columns")



In [278]:
df.baseline.unique()

array(['cookiemonster', 'ipa', 'cookiemonster_base', 'cookiemonster_bias'],
      dtype=object)

In [279]:
df.bias_detection_knob.unique()

array(['0', '0.5', '5', '1'], dtype=object)

In [280]:
from plotting.macros import *
from plotting.plot_template import *


In [281]:
# Keyword arguments later unpacked into the `cdf` plotting helper.
# The frame is restricted to 7-day epochs with the knob at '0' or '0.5'
# (parameter values are compared as strings).
cdf_args = dict(
    df=df.query("(bias_detection_knob == '0' or bias_detection_knob == '0.5') and num_days_per_epoch == '7'"),
    metric="rmsre",
    x_axis=None,
    x_axis_title=RMSRE_CDF_X,
    y_axis_title=RMSRE_Y,
    ordering=None,
    log_y=False,
    x_range=[1, 100],
    showlegend=False,
    marker_pos=0.98,
    baselines_order=BIAS_BASELINES_ORDER,
)

In [282]:
# Figure-level settings (fonts, legend placement, size, output file).
# Currently only referenced by the commented-out make_plots call below.
figs_args = dict(
    axis_title_font_size=dict(x=18, y=18),
    axis_tick_font_size=dict(x=14, y=14),
    legend=dict(
        yanchor="top",
        y=1.2,
        xanchor="left",
        x=0.2,
        orientation="h",
    ),
    output_path=f"cdf_rmsre_{experiment_name}.png",
    height=300,
    width=1500,
)
# make_plots([(cdf, cdf_args)], cols=1, **figs_args)


In [283]:
# Draw every trace produced by `cdf` for the current arguments on one figure.
fig = go.Figure()
fig.add_traces(list(cdf(**cdf_args)))
fig

In [284]:
def apply_bias_detection(row, threshold=0.2):
    """Drop a row's error value when its predicted error exceeds a threshold.

    Args:
        row: Mapping-like row providing ``"rmsre_prediction"`` and
            ``"rmsre_original"``.
        threshold: Cut-off applied to ``"rmsre_prediction"``. Defaults to
            0.2, the value previously hard-coded here.

    Returns:
        ``None`` when a prediction is present and strictly greater than
        ``threshold``; otherwise the row's ``"rmsre_original"`` value.
    """
    prediction = row["rmsre_prediction"]
    # A missing prediction never triggers the cut-off: None is checked
    # explicitly, and NaN compares False against any threshold. The explicit
    # check (rather than truthiness) keeps a prediction of exactly 0 from
    # being mistaken for "missing".
    if prediction is not None and prediction > threshold:
        return None
    return row["rmsre_original"]

# Work on a copy that keeps the untouched error under `rmsre_original`, then
# overwrite `rmsre` with the filtered value and discard the prediction column.
chopped_df = df.assign(rmsre_original=df["rmsre"])
chopped_df["rmsre"] = chopped_df.apply(apply_bias_detection, axis=1)
chopped_df = chopped_df.drop(columns=["rmsre_prediction"])

In [285]:
# Re-point the shared plot arguments at the filtered frame (7-day epochs,
# knob '0' or '1'). This mutates `cdf_args` in place, so re-running the
# earlier plotting cell afterwards will also use this frame.
cdf_args["df"] = chopped_df.query("(bias_detection_knob == '0' or bias_detection_knob == '1') and num_days_per_epoch == '7'")

fig = go.Figure()
fig.add_traces(list(cdf(**cdf_args)))
fig