In [None]:
import analyze_results_code
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import shap
from sklearn.model_selection import train_test_split
import pickle

# GLOBAL ANALYSIS

## Load ranked contingencies

In [None]:
# Load the contingency results through the project helper; later cells use the
# result as a pandas DataFrame (column access, .drop, .sort_values).
# NOTE(review): the directory is an absolute, machine-specific path — consider
# deriving it from a configurable base directory so the notebook runs elsewhere.
df_contg = analyze_results_code.load_df("/home/guiu/Projects/CONT_SCR_CRV_REC/Data/Results_TAPS_Test/")

In [None]:
# Load the pickled model (presumably a gradient-boosting regressor, judging by
# the file name). The context manager closes the file handle as soon as the
# model has been deserialized instead of leaking it for the kernel's lifetime.
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
with open(
    "/home/guiu/Projects/CONT_SCR_CRV_REC/Data/FINAL_MODELS/TAPS/GBR_model.pkl", "rb"
) as model_file:
    ML_model = pickle.load(model_file)

# Re-score every row with the loaded model. The feature matrix is the frame
# minus the bookkeeping/target columns; the existing PREDICTED_SCORE column is
# dropped from the inputs and then overwritten with the fresh predictions.
df_contg["PREDICTED_SCORE"] = ML_model.predict(
    df_contg.drop(columns=["PREDICTED_SCORE", "STATUS", "REAL_SCORE", "DATE"])
)

## MAE (Real score vs Predicted score)

In [None]:
# Keep only the rows whose STATUS is "BOTH" before comparing the two scores.
status_is_both = df_contg["STATUS"] == "BOTH"
df_filtered = df_contg[status_is_both]

# Mean absolute error between the real and the model-predicted score.
mae = mean_absolute_error(
    df_filtered["REAL_SCORE"],
    df_filtered["PREDICTED_SCORE"],
)
print("Mean Absolute Error:", mae)

## RMSE (Real score vs Predicted score)

In [None]:
# Root mean squared error on the same STATUS == "BOTH" subset used for the MAE.
rmse = root_mean_squared_error(
    df_filtered["REAL_SCORE"],
    df_filtered["PREDICTED_SCORE"],
)
print("Root Mean Squared Error:", rmse)

In [None]:
# Pass the STATUS == "BOTH" subset and the MAE value to the project helper.
# The helper body is not visible here; presumably it plots real vs. predicted
# scores and uses the MAE for annotation — confirm in analyze_results_code.
analyze_results_code.real_vs_predicted_score(df_filtered, mae)

In [None]:
# Pass the STATUS == "BOTH" subset to the project helper (body not visible
# here; presumably plots the distribution of real-minus-predicted residuals).
analyze_results_code.plot_residual_distribution(df_filtered)

## Hour boxplot of real scores

In [None]:
# Project helper called with the full results frame and the column to plot.
# NOTE(review): the time window and filtering it applies are defined inside
# analyze_results_code and are not visible from this notebook.
analyze_results_code.hour_boxplot(df_contg, "REAL_SCORE")

## Hour boxplot of predicted scores

In [None]:
# Plot window: one calendar day, one box per distinct timestamp label.
str_date_1 = "2024-12-21 00:00:00"
str_date_2 = "2024-12-21 23:59:59"
start_ts = datetime.strptime(str_date_1, "%Y-%m-%d %H:%M:%S")
end_ts = datetime.strptime(str_date_2, "%Y-%m-%d %H:%M:%S")

# Chronological order so the boxes appear left-to-right in time.
df_contg = df_contg.sort_values(by="DATE", ascending=True)

# NOTE(review): the lower bound is exclusive, so rows stamped exactly at
# str_date_1 (midnight) are left out of the window — confirm this is intended.
mask = df_contg["DATE"].gt(start_ts) & df_contg["DATE"].le(end_ts)

# Rows inside the window whose STATUS is "BOTH".
df_filtered = df_contg.loc[mask & (df_contg["STATUS"] == "BOTH")]

if df_filtered.empty:
    print("The DataFrame is empty, the plot cannot be generated.")
else:
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.set_facecolor("white")

    # One category per "date, hour:minute" label on the x-axis.
    time_labels = df_filtered["DATE"].dt.strftime("%Y/%m/%d, %H:%M")
    predicted_scores = pd.to_numeric(df_filtered["PREDICTED_SCORE"])
    sns.boxplot(x=time_labels, y=predicted_scores, ax=ax)
    ax.set(xlabel="DATE", ylabel="PREDICTED_SCORE")
    plt.xticks(rotation=45, ha="right")

    # Clip the y-axis to the 5th-95th percentile band of the predicted scores
    # so extreme values do not flatten the boxes.
    lower_limit = df_filtered["PREDICTED_SCORE"].quantile(0.05)
    upper_limit = df_filtered["PREDICTED_SCORE"].quantile(0.95)
    if pd.notna(lower_limit) and pd.notna(upper_limit):
        plt.ylim(lower_limit, upper_limit)

    plt.grid(color="grey", linewidth=0.5)
    plt.title("Boxplot of PREDICTED_SCORE")
    plt.tight_layout()
    plt.show()

## Day boxplot of real scores

In [None]:
# Project helper called with the full results frame and the column to plot.
# NOTE(review): the date range and grouping it applies live inside
# analyze_results_code and are not visible from this notebook.
analyze_results_code.day_boxplot(df_contg, "REAL_SCORE")

## Day boxplot of predicted scores

In [None]:
# Plot window: the whole month, one box per calendar day.
str_date_1 = "2024-12-01 00:00:00"
str_date_2 = "2024-12-31 23:59:59"
df_contg = df_contg.sort_values(by="DATE", ascending=True)

# NOTE(review): the lower bound is exclusive, so rows stamped exactly at
# str_date_1 are left out of the window — confirm this is intended.
mask = (df_contg["DATE"] > datetime.strptime(str_date_1, "%Y-%m-%d %H:%M:%S")) & (
    df_contg["DATE"] <= datetime.strptime(str_date_2, "%Y-%m-%d %H:%M:%S")
)

df_filtered = df_contg.loc[mask]

# Take an explicit copy: the DATE column is overwritten just below, and doing
# that on a frame derived from df_contg by boolean indexing is chained
# assignment (SettingWithCopyWarning, ambiguous under copy-on-write).
df_filtered = df_filtered[df_filtered["STATUS"] == "BOTH"].copy()

# Collapse timestamps to calendar dates so each day becomes one box.
df_filtered["DATE"] = pd.to_datetime(df_filtered["DATE"], format="%Y-%m-%d %H:%M:%S").dt.date

if not df_filtered.empty:
    plt.figure(figsize=(12, 6))  # Set the size of the figure
    ax = plt.axes()
    ax.set_facecolor("white")
    sns.boxplot(x=df_filtered["DATE"], y=pd.to_numeric(df_filtered["PREDICTED_SCORE"])).set(
        xlabel="DATE", ylabel="PREDICTED_SCORE"
    )
    plt.xticks(rotation=45, ha="right")  # Rotate the x-axis labels and align them to the right

    # Clip the y-axis to the 5th-95th percentile band of the predicted scores
    # so extreme values do not flatten the boxes.
    lower_limit = df_filtered["PREDICTED_SCORE"].quantile(0.05)
    upper_limit = df_filtered["PREDICTED_SCORE"].quantile(0.95)
    if not pd.isna(lower_limit) and not pd.isna(upper_limit):
        plt.ylim(lower_limit, upper_limit)

    plt.grid(color="grey", linewidth=0.5)
    plt.title("Boxplot of PREDICTED_SCORE")  # Add a title to the plot
    plt.tight_layout()  # Adjust the layout so that elements do not overlap
    plt.show()
else:
    print("The DataFrame is empty, the plot cannot be generated.")

## Real score histogram

In [None]:
# Project helper called with the full results frame and the REAL_SCORE column
# name (helper body not visible here; presumably draws a histogram of it).
analyze_results_code.score_histogram(df_contg, "REAL_SCORE")

## Predicted score histogram

In [None]:
# Same project helper as for the real scores, now for the PREDICTED_SCORE
# column (helper body not visible here; presumably draws a histogram of it).
analyze_results_code.score_histogram(df_contg, "PREDICTED_SCORE")

## GBM explanation

In [None]:
# Build the SHAP explainer for the loaded tree model.
#
# Compatibility shim: some shap releases still reference the `np.bool` alias
# that NumPy 1.24 removed. Restore it only when the name is absent from the
# numpy namespace, so that NumPy 2.x — where `np.bool` is again a real NumPy
# scalar type — is not overwritten with the Python builtin.
if "bool" not in vars(np):
    np.bool = bool

# NOTE(review): link="logit" is a classification-style link, while the model
# file name suggests a regressor; confirm whether the installed shap version
# actually honours this argument for TreeExplainer and whether it is wanted.
explainer = shap.TreeExplainer(
    ML_model,
    link="logit",
)

In [None]:
# Feature matrix: the results frame without bookkeeping/target columns.
feature_frame = df_contg.drop(columns=["PREDICTED_SCORE", "STATUS", "REAL_SCORE", "DATE"])

# Keep a reproducible 20% sample of the rows for the SHAP analysis; the other
# part of the split is discarded.
# NOTE(review): which rows land in the sample depends on the current row order
# of df_contg, which earlier cells re-sort by DATE.
_, X_test = train_test_split(feature_frame, test_size=0.2, random_state=42)

shap_values = explainer.shap_values(X_test)

# Start from a clean figure, let shap draw without showing, then display.
plt.clf()
shap.summary_plot(shap_values, X_test, show=False)
plt.show()

In [None]:
# SHAP dependence plot for the MIN_VOLT feature over the X_test sample.
shap.dependence_plot("MIN_VOLT", shap_values, X_test)

In [None]:
# SHAP dependence plot for the MAX_VOLT feature over the X_test sample.
shap.dependence_plot("MAX_VOLT", shap_values, X_test)

In [None]:
# SHAP dependence plot for the MAX_FLOW feature over the X_test sample.
shap.dependence_plot("MAX_FLOW", shap_values, X_test)

In [None]:
# SHAP dependence plot for the N_ITER feature over the X_test sample.
shap.dependence_plot("N_ITER", shap_values, X_test)

In [None]:
# SHAP dependence plot for the AFFECTED_ELEM feature over the X_test sample.
shap.dependence_plot("AFFECTED_ELEM", shap_values, X_test)

In [None]:
# SHAP dependence plot for the CONSTR_GEN_Q feature over the X_test sample.
shap.dependence_plot("CONSTR_GEN_Q", shap_values, X_test)

In [None]:
# SHAP dependence plot for the CONSTR_VOLT feature over the X_test sample.
shap.dependence_plot("CONSTR_VOLT", shap_values, X_test)

In [None]:
# SHAP dependence plot for the CONSTR_FLOW feature over the X_test sample.
shap.dependence_plot("CONSTR_FLOW", shap_values, X_test)

In [None]:
# SHAP dependence plot for the RES_NODE feature over the X_test sample.
shap.dependence_plot("RES_NODE", shap_values, X_test)

In [None]:
# SHAP dependence plot for the COEF_REPORT feature over the X_test sample.
shap.dependence_plot("COEF_REPORT", shap_values, X_test)

In [None]:
# SHAP dependence plot for the TAP_CHANGERS feature over the X_test sample.
shap.dependence_plot("TAP_CHANGERS", shap_values, X_test)