In [None]:
# This notebook evaluates all trained models. For the thesis, this is phase 2.
# It also shows many statistics about different aspects of all trained models.
# It also prints the successful combinations ("erfolgreiche Kombinationen").
# Much more could be done with the data.

In [None]:
# Evaluate all 2400 models (reward + criticality score) - either read in already created file or create file and use df
import pandas as pd
import os
import glob
from src.main.rl.evaluation.phase2_evaluation import create_evaluation_df_phase2

# Location of the cached evaluation results and the output folders used by this notebook
path_to_save = "src/main/rl/evaluation/output/phase2_evaluation_results.csv"
os.makedirs("src/main/rl/evaluation/output/", exist_ok=True)
os.makedirs("src/main/rl/evaluation/plot_results/", exist_ok=True)
pd.options.display.max_colwidth = 300

# Reuse the cached CSV when it can be read. Only an unreadable, empty or malformed file
# falls through to re-evaluation; anything else (e.g. a keyboard interrupt) propagates.
try:
    df = pd.read_csv(path_to_save)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    df = pd.DataFrame()
if df.empty:
    # glob.glob already returns a list; the pattern has no "**", so recursive=True is not needed
    all_files = glob.glob("src/main/rl/models/*/*/*/*.zip")
    create_evaluation_df_phase2(path_to_save, all_files)
    df = pd.read_csv(path_to_save)
len(df)

In [None]:
# Split the evaluation results by whether the NPPAutomationWrapper was active
df_wo_automation = df.query("automation_wrapper.isna() == True")
df_w_automation = df[df["automation_wrapper"] == "NPPAutomationWrapper"]
# Both halves are checked; comparing df_wo_automation with itself would leave
# df_w_automation unverified
assert len(df_wo_automation) == len(df_w_automation) == 1200

In [None]:
# Create aggregated metrics with NPPAutomation activated and deactivated
from scipy.stats import iqr

def _aggregate_per_combination(results):
    """Collapse the per-model rows of ``results`` into one row per combination.

    The frame is indexed by ``full_path``, grouped by ``combination`` and reduced
    with the named aggregations below (return and criticality statistics plus the
    first value of each descriptive column).
    """
    named_aggregations = {
        "return_mean": ("cum_reward", "mean"),
        "return_max": ("cum_reward", "max"),
        "return_std": ("cum_reward", "std"),
        "return_iqr": ("cum_reward", iqr),
        "timesteps_min": ("total_timesteps", "min"),
        "criticality_score_max": ("criticality_score", "max"),
        "criticality_score_std": ("criticality_score", "std"),
        "criticality_score__iqr": ("criticality_score", iqr),
        "scenario": ("scenario", "first"),
        "alg": ("alg", "first"),
        "action_wrapper": ("action_wrapper", "first"),
        "obs_wrapper": ("obs_wrapper", "first"),
        "automation_wrapper": ("automation_wrapper", "first"),
    }
    grouped = results.set_index("full_path").groupby("combination")
    return grouped.agg(**named_aggregations)


statistics_wo = _aggregate_per_combination(df_wo_automation)
statistics_w = _aggregate_per_combination(df_w_automation)

# Each half is expected to contain 120 distinct combinations
assert len(statistics_wo) == 120
assert len(statistics_w) == 120

statistics_wo.merge(statistics_w, how="outer").sort_values("return_mean")

In [None]:
# Create scatter plots with std and max return per combination
from src.main.rl.evaluation.plots.phase2_plots import (
    create_multi_object_plot,
    create_phase_2_counts_plots,
)

# Build the overview figures from both statistics frames and store each one as a PNG
merged_statistics = statistics_wo.merge(statistics_w, how="outer")
figures = create_multi_object_plot(merged_statistics)
for plot_number, figure in enumerate(figures):
    target_file = f"src/main/rl/evaluation/plot_results/phase2_summary_plots{plot_number}.png"
    figure.savefig(target_file, format="png", dpi=300)

In [None]:
# Rows holding the highest cumulative reward, separately without and with automation
columns_to_show = ["cum_reward", "combination"]

best_reward_wo = df_wo_automation["cum_reward"].max()
highest_return_wo_automation = df_wo_automation[
    df_wo_automation["cum_reward"] == best_reward_wo
]
print(highest_return_wo_automation[columns_to_show])

best_reward_w = df_w_automation["cum_reward"].max()
highest_return_w_automation = df_w_automation[
    df_w_automation["cum_reward"] == best_reward_w
]
print(highest_return_w_automation[columns_to_show])

In [None]:
# Highest return reached by any combination of scenario 2
df_all = statistics_wo.merge(statistics_w, how="outer")
is_scenario2 = df_all["scenario"] == "scenario2"
df_all.loc[is_scenario2, "return_max"].max()

In [None]:
# Number of combinations with a return STD under 15
len(df_all[df_all["return_std"] < 15])

In [None]:
# Number of combinations with a return STD over 80
len(df_all[df_all["return_std"] > 80])

In [None]:
# Combination(s) with the smallest return STD
min_std = df_all["return_std"].min()
df_all[df_all["return_std"] == min_std]

In [None]:
# Largest return STD over all combinations
max_std = df_all["return_std"].max()
max_std

In [None]:
# Create statistics on the total df for different groups, e.g. obs_wrapper, scenario.
# groupby() leaves out rows whose key is NaN by default, so missing automation wrappers
# are labelled "NaN" to get their own group. This is done on a copy: `df` keeps its real
# NaN values, which other cells still rely on when filtering with
# `df["automation_wrapper"].isna()`.
df_grouping = df.assign(automation_wrapper=df["automation_wrapper"].fillna("NaN"))
for item in ["obs_wrapper", "scenario", "action_wrapper", "alg", "automation_wrapper"]:
    df_special = (
        df_grouping.groupby(item)
        .agg(
            return_mean=("cum_reward", "mean"),
            return_max=("cum_reward", "max"),
            return_min=("cum_reward", "min"),
            return_std=("cum_reward", "std"),
            return_timestep_mean=("total_timesteps", "mean"),
        )
        .round(2)
    )
    print(df_special)
# print(df_special.to_latex())

In [None]:
# Return statistics of the models without automation, grouped by one component at a time
return_aggregations = dict(
    return_mean=("cum_reward", "mean"),
    return_max=("cum_reward", "max"),
    return_std=("cum_reward", "std"),
    return_iqr=("cum_reward", iqr),
)
for group_column in ("obs_wrapper", "scenario", "action_wrapper", "alg"):
    grouped_rewards = df_wo_automation.groupby(group_column)
    df_special = grouped_rewards.agg(**return_aggregations).round(2)
    print(df_special)

In [None]:
# Return statistics of the models with automation, grouped by one component at a time
return_aggregations = dict(
    return_mean=("cum_reward", "mean"),
    return_max=("cum_reward", "max"),
    return_std=("cum_reward", "std"),
    return_iqr=("cum_reward", iqr),
)
for group_column in ("obs_wrapper", "scenario", "action_wrapper", "alg"):
    grouped_rewards = df_w_automation.groupby(group_column)
    df_special = grouped_rewards.agg(**return_aggregations).round(2)
    print(df_special)

In [None]:
# T-tests (Welch) - no normality of most data therefore needs to be taken with care + no correction factor for multiple tests here
from scipy.stats import ttest_ind
from scipy import stats
from scipy.stats import shapiro

# Cumulative rewards of all models, one Series per scenario
scen1 = df.query("scenario=='scenario1'")["cum_reward"]
scen2 = df.query("scenario=='scenario2'")["cum_reward"]
scen3 = df.query("scenario=='scenario3'")["cum_reward"]

# Shapiro-Wilk normality check on scenario 1. The test statistic gets its own name so
# that the `stats` module imported from scipy in this cell is not overwritten.
shapiro_statistic, p = shapiro(scen1)
print(p)

# Pairwise Welch t-tests (unequal variances) between the scenarios
print(ttest_ind(scen1, scen3, equal_var=False))
print(ttest_ind(scen1, scen2, equal_var=False))
print(ttest_ind(scen3, scen2, equal_var=False))

In [None]:
# Welch t-tests between the action space options. Most of the data is not normally
# distributed, so the results need to be taken with care; no correction for multiple
# testing is applied here.
act1, act2, act3 = [
    df.loc[df["action_wrapper"] == wrapper_name, "cum_reward"]
    for wrapper_name in (
        "ActionSpaceOption1Wrapper",
        "ActionSpaceOption2Wrapper",
        "ActionSpaceOption3Wrapper",
    )
]
for first_sample, second_sample in ((act1, act2), (act1, act3), (act2, act3)):
    print(ttest_ind(first_sample, second_sample, equal_var=False))

In [None]:
# Combinations without automation that are "successful" by definition:
# best return above 200, return STD below 15 and every model reaching 250 timesteps
is_successful_wo = (
    (statistics_wo["return_max"] > 200)
    & (statistics_wo["return_std"] < 15)
    & (statistics_wo["timesteps_min"] == 250)
)
paths_that_fulfil_condition_wo_automation = statistics_wo[is_successful_wo]
len(paths_that_fulfil_condition_wo_automation)

In [None]:
# Combinations with automation that are "successful" by definition:
# best return above 200, return STD below 15 and every model reaching 250 timesteps
is_successful_w = (
    (statistics_w["return_max"] > 200)
    & (statistics_w["return_std"] < 15)
    & (statistics_w["timesteps_min"] == 250)
)
paths_that_fulfil_condition_w_automation = statistics_w[is_successful_w]
len(paths_that_fulfil_condition_w_automation)

In [None]:
# Create a plot with all the counts of successful combinations
from src.main.rl.evaluation.plots.phase2_plots import (
    create_multi_object_plot,
    create_phase_2_counts_plots,
)

# Combine the successful combinations of both halves, plot their counts and save the figure
successful_combinations = paths_that_fulfil_condition_wo_automation.merge(
    paths_that_fulfil_condition_w_automation, how="outer"
)
fig = create_phase_2_counts_plots(successful_combinations)
fig.savefig(
    "src/main/rl/evaluation/plot_results/phase2_count_plots.png", format="png", dpi=300
)

In [None]:
# Count how often each component value occurs among the successful combinations
cols_to_count = [
    "alg",
    "scenario",
    "action_wrapper",
    "obs_wrapper",
    "automation_wrapper",
]
# One concat over the per-column value_counts() results. This avoids seeding the result
# with a dtype-less empty pd.Series() (deprecated in pandas) and re-concatenating the
# growing Series on every loop iteration.
statistics_wo_value_counts = pd.concat(
    [paths_that_fulfil_condition_wo_automation[col].value_counts() for col in cols_to_count]
)
statistics_w_value_counts = pd.concat(
    [paths_that_fulfil_condition_w_automation[col].value_counts() for col in cols_to_count]
)
print(statistics_wo_value_counts)
print(statistics_w_value_counts)

In [None]:
# Successful combinations that use action space option 3, split by scenario and NPPAutomation
scenario_conditions = (
    "scenario=='scenario1' and action_wrapper=='ActionSpaceOption3Wrapper' and timesteps_min==250",
    "scenario=='scenario2' and action_wrapper=='ActionSpaceOption3Wrapper' and timesteps_min==250",
    "scenario=='scenario3' and action_wrapper=='ActionSpaceOption3Wrapper' and timesteps_min==250",
)
(
    wo_automation_scenario1_action_space3,
    wo_automation_scenario2_action_space3,
    wo_automation_scenario3_action_space3,
) = [
    paths_that_fulfil_condition_wo_automation.query(condition)
    for condition in scenario_conditions
]
(
    w_automation_scenario1_action_space3,
    w_automation_scenario2_action_space3,
    w_automation_scenario3_action_space3,
) = [
    paths_that_fulfil_condition_w_automation.query(condition)
    for condition in scenario_conditions
]

# Print the best and the mean return of every selection under its heading
report_columns = ["return_max", "return_mean"]
for heading, selection in [
    ("Only ActionSpace3 and scenario1 without automation:", wo_automation_scenario1_action_space3),
    ("Only ActionSpace3 and scenario2 without automation:", wo_automation_scenario2_action_space3),
    ("Only ActionSpace3 and scenario3 without automation:", wo_automation_scenario3_action_space3),
    ("Only ActionSpace3 and scenario1 with automation:", w_automation_scenario1_action_space3),
    ("Only ActionSpace3 and scenario2 with automation:", w_automation_scenario2_action_space3),
    ("Only ActionSpace3 and scenario3 with automation:", w_automation_scenario3_action_space3),
]:
    print(f"{heading}\n {selection[report_columns]}")

In [None]:
# All TD3 combinations in the statistics without automation
is_td3 = statistics_wo["alg"] == "<class 'stable_baselines3.td3.td3.TD3'>"
statistics_wo[is_td3]

In [None]:
# Average return STD per combination, computed separately for each algorithm.
# The reward column is selected before aggregating so that the string columns never
# reach std(): older pandas versions silently dropped them, newer ones (2.0+) raise a
# TypeError instead.
sac = (
    df[df["alg"] == "<class 'stable_baselines3.sac.sac.SAC'>"]
    .groupby("combination")["cum_reward"]
    .std()
    .mean()
    .round(2)
)
td3 = (
    df[df["alg"] == "<class 'stable_baselines3.td3.td3.TD3'>"]
    .groupby("combination")["cum_reward"]
    .std()
    .mean()
    .round(2)
)
a2c = (
    df[df["alg"] == "<class 'stable_baselines3.a2c.a2c.A2C'>"]
    .groupby("combination")["cum_reward"]
    .std()
    .mean()
    .round(2)
)
ppo = (
    df[df["alg"] == "<class 'stable_baselines3.ppo.ppo.PPO'>"]
    .groupby("combination")["cum_reward"]
    .std()
    .mean()
    .round(2)
)
print(f"SAC: {sac}, TD3: {td3}, A2C: {a2c}, PPO: {ppo}")

In [None]:
# Spread (STD) of the per-combination return STD, computed separately for each scenario.
# NOTE(review): the corresponding cells for algorithm and NPPAutomation average the
# per-combination STD with .mean(), while this cell takes .std() of it - confirm that
# the spread (and not the average) is what is wanted here.
# The reward column is selected before aggregating so that the string columns never
# reach std(): older pandas versions silently dropped them, newer ones (2.0+) raise a
# TypeError instead.
scen1 = (
    df[df["scenario"] == "scenario1"]
    .groupby("combination")["cum_reward"]
    .std()
    .std()
)
scen2 = (
    df[df["scenario"] == "scenario2"]
    .groupby("combination")["cum_reward"]
    .std()
    .std()
)
scen3 = (
    df[df["scenario"] == "scenario3"]
    .groupby("combination")["cum_reward"]
    .std()
    .std()
)
print(f"Scenario 1: {scen1}, Scenario 2: {scen2}, Scenario 3: {scen3}")

In [None]:
# Average return STD per combination, with and without NPPAutomation.
# The frames split off right after loading are used instead of filtering `df` again:
# an earlier cell replaces the missing automation wrappers in `df` with the string
# "NaN", after which `df["automation_wrapper"].isna()` no longer selects any row.
# The reward column is selected before aggregating so that the string columns never
# reach std(): older pandas versions silently dropped them, newer ones (2.0+) raise a
# TypeError instead.
w_auto = df_w_automation.groupby("combination")["cum_reward"].std().mean()
wo_auto = df_wo_automation.groupby("combination")["cum_reward"].std().mean()
print(w_auto, wo_auto)

In [None]:
# Show the individual results of one combination (one row per trained model)
combination_pattern = "scenario1_ActionSpaceOption3Wrapper_ObservationOption5Wrapper_NPPAutomationWrapper_RewardOption2Wrapper_TD3"
matches_combination = df["combination"].str.contains(combination_pattern)
df[matches_combination]

In [None]:
# How often all 250 timesteps were completed, per scenario - keep in mind that
# scenario 1 had twice as many combinations
# scenario 1
(df.loc[df["scenario"] == "scenario1", "total_timesteps"] == 250).sum()

In [None]:
# scenario 2: number of models that completed all 250 timesteps
(df.loc[df["scenario"] == "scenario2", "total_timesteps"] == 250).sum()

In [None]:
# scenario 3: number of models that completed all 250 timesteps
(df.loc[df["scenario"] == "scenario3", "total_timesteps"] == 250).sum()

In [None]:
# How often the return was above 200, per scenario
# scenario 1
len(df[(df["scenario"] == "scenario1") & (df["cum_reward"] > 200)])

In [None]:
# scenario 2: number of models with a return above 200
len(df[(df["scenario"] == "scenario2") & (df["cum_reward"] > 200)])

In [None]:
# scenario 3: number of models with a return above 200
len(df[(df["scenario"] == "scenario3") & (df["cum_reward"] > 200)])

In [None]:
# Count combinations with a return STD under 15, per scenario
# scenario 1
# The reward column is selected before aggregating so that the string columns never
# reach std(): older pandas versions silently dropped them, newer ones (2.0+) raise a
# TypeError instead.
scen = df[df["scenario"] == "scenario1"].groupby("combination")["cum_reward"].std()
(scen < 15).sum()

In [None]:
# scenario 2: number of combinations with a return STD under 15
# The reward column is selected before aggregating so that the string columns never
# reach std(): older pandas versions silently dropped them, newer ones (2.0+) raise a
# TypeError instead.
scen = df[df["scenario"] == "scenario2"].groupby("combination")["cum_reward"].std()
(scen < 15).sum()

In [None]:
# scenario 3: number of combinations with a return STD under 15
# The reward column is selected before aggregating so that the string columns never
# reach std(): older pandas versions silently dropped them, newer ones (2.0+) raise a
# TypeError instead.
scen = df[df["scenario"] == "scenario3"].groupby("combination")["cum_reward"].std()
(scen < 15).sum()