In [1]:
%load_ext nb_black

import pandas as pd
import os
import glob
import numpy as np

<IPython.core.display.Javascript object>

In [3]:
import warnings

# Suppress specific FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn._oldcore")


<IPython.core.display.Javascript object>

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

<IPython.core.display.Javascript object>

In [5]:
from sklearn.preprocessing import StandardScaler

<IPython.core.display.Javascript object>

In [6]:
from collections import deque, defaultdict


<IPython.core.display.Javascript object>

In [7]:
def read_csv_files_grouped(csv_files, plant, header=[0, 1]):
    results = []
    for filepath in csv_files:
        df = pd.read_csv(filepath, header=header, low_memory=False)
        df["plant"] = plant  # Add the 'plant' column
        results.append(df)
    df = pd.concat(results, ignore_index=True)
    return df

<IPython.core.display.Javascript object>

In [8]:
def read_csv_files_full(csv_files, plant):
    results = []
    for filepath in csv_files:
        df = pd.read_csv(filepath, low_memory=False)
        df["plant"] = plant  # Add the 'plant' column
        results.append(df)
    df = pd.concat(results, ignore_index=True)
    return df

<IPython.core.display.Javascript object>

### Renaming the Columns

In [9]:
def preprocess_rename_columns(df):
    df.columns = pd.MultiIndex.from_tuples(
        list(
            {
                col: (col[0], "") if "Unnamed" in col[1] else col for col in df.columns
            }.values()
        )
    )
    return df

<IPython.core.display.Javascript object>

### Changing the order_of columns Columns

In [10]:
def preprocess_change_columns_order(df, column, pos):
    # Get the list of columns
    cols = df.columns.tolist()
    # Remove the specified column
    cols.remove(column)
    # Insert the column at the desired position
    cols.insert(pos, column)
    # Reorder the DataFrame columns
    return df[cols]

<IPython.core.display.Javascript object>

In [11]:
def read_csv_files_path(csv_files_path_dict, path, plant):
    csv_files = glob.glob(os.path.join(path, "*.csv"))
    csv_files_path_dict[plant] = csv_files
    return csv_files_path_dict

<IPython.core.display.Javascript object>

# Topological Analysis - NEW

## Definitions

In [12]:
# Function to compare two models strictly based on means only
def dominates(model1, model2):
    """Return True if model1 strictly dominates model2, False otherwise."""
    # Check if model1 is at least as good as model2 in all metrics
    at_least_as_good = (
        model1["RMSE_mean"] <= model2["RMSE_mean"]
        and model1["MAE_mean"] <= model2["MAE_mean"]
        and model1["MAPE_mean"] <= model2["MAPE_mean"]
        and model1["R2_mean"] >= model2["R2_mean"]
    )

    # Check if model1 is strictly better in at least one metric
    strictly_better_in_one = (
        model1["RMSE_mean"] < model2["RMSE_mean"]
        or model1["MAE_mean"] < model2["MAE_mean"]
        or model1["MAPE_mean"] < model2["MAPE_mean"]
        or model1["R2_mean"] > model2["R2_mean"]
    )

    # Model1 strictly dominates Model2 if it is at least as good in all metrics and
    # strictly better in at least one metric
    return at_least_as_good and strictly_better_in_one

<IPython.core.display.Javascript object>

In [13]:
def get_dominance_matrix_and_graph(df):
    n = len(df)
    dominance_matrix = np.zeros((n, n), dtype=bool)

    for i in range(n):
        for j in range(n):
            if i != j:
                dominance_matrix[i, j] = dominates(df.iloc[i], df.iloc[j])

    # Create the dominance graph
    dominance_graph = {i: [] for i in range(n)}
    for i in range(n):
        for j in range(n):
            if dominance_matrix[i, j]:
                dominance_graph[i].append(j)

    return dominance_matrix, dominance_graph

<IPython.core.display.Javascript object>

In [14]:
# Topological Sorting
def topological_sort(graph):
    """Perform topological sorting on the given graph."""
    in_degree = {u: 0 for u in graph}
    for u in graph:
        for v in graph[u]:
            in_degree[v] += 1

    queue = deque([u for u in graph if in_degree[u] == 0])
    topo_order = []

    while queue:
        u = queue.popleft()
        topo_order.append(u)

        for v in graph[u]:
            in_degree[v] -= 1
            if in_degree[v] == 0:
                queue.append(v)

    return topo_order

<IPython.core.display.Javascript object>

In [15]:
# Function to find the top models in each group
def find_top_models(group):
    max_net_dominance = group["Net_Dominance"].max()
    top_models = group[group["Net_Dominance"] == max_net_dominance]
    return top_models

<IPython.core.display.Javascript object>

### Dominance Analysis and SCPM

In [16]:
def compute_scpm(df):
    scaler = StandardScaler()
    standardized_metrics = scaler.fit_transform(
        df[["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"]]
    )

    # Creating a new dataframe with standardized values
    standardized_df = pd.DataFrame(
        standardized_metrics,
        columns=["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"],
    )

    # Summing all metrics and subtracting R2_mean
    standardized_df["Result"] = (
        standardized_df[["RMSE_mean", "MAE_mean", "MAPE_mean"]].sum(axis=1)
        - standardized_df["R2_mean"]
    )

    df["SCPM"] = standardized_df["Result"]
    return df

<IPython.core.display.Javascript object>

In [17]:
def make_dominance_analysis(df):
    # Compute dominance matrix and graph
    dominance_matrix, dominance_graph = get_dominance_matrix_and_graph(df)

    # Get the topological order
    topo_order = topological_sort(dominance_graph)
    df = df.copy()
    n = len(topo_order)

    # Identify Dominant Models
    dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if not np.any(dominance_matrix[:, i])
    ]

    # Identify Non-Dominant Models
    non_dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i])
    ]

    # Identify Models that are both dominated and dominate at least one
    intermediate_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i]) and np.any(dominance_matrix[i, :])
    ]

    # Add count of times each model is dominated by others
    df["Dominated_Count"] = np.sum(dominance_matrix, axis=0)

    # Classify models
    df["Classification"] = "intermediate_model"  # default to intermediate_model
    df.loc[
        ~np.any(dominance_matrix, axis=1) & np.any(dominance_matrix, axis=0),
        "Classification",
    ] = "non_dominant_model"
    df.loc[~np.any(dominance_matrix, axis=0), "Classification"] = "dominant_model"

    # Map the topological order to the model names and get dominance counts
    sorted_models = [df["Model"].iloc[i] for i in topo_order]

    df_sorted = df.iloc[topo_order].copy()
    df_sorted["Dominates_Count"] = dominance_matrix.sum(axis=1)[topo_order]
    df_sorted_topo = df_sorted.copy()

    df_sorted = df_sorted.sort_values(by="Dominates_Count", ascending=False)
    df_sorted_count = df_sorted.copy()

    return {
        "dominance_matrix": dominance_matrix,
        "dominance_graph": dominance_graph,
        "df_sorted_topo": df_sorted_topo,
        "df_sorted_count": df_sorted_count,
    }

<IPython.core.display.Javascript object>

# Pre Training Analysis

# Reading The files

In [18]:
csv_files_path_fine_tuning = dict()
csv_files_path_pre_train = dict()

<IPython.core.display.Javascript object>

## 207

### Plant AT

In [19]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/207/at/pre_training/full/"
plant = "at"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

### Reading all data - pre training

#### Full

In [20]:
dfs_plant = dict()
column = ("plant", "")
pos = 2

for plant, csv_files in csv_files_path_pre_train.items():
    df = read_csv_files_full(csv_files, plant)
    df["plant"] = plant
    dfs_plant[plant] = df

<IPython.core.display.Javascript object>

In [21]:
df_pre_train = pd.concat(dfs_plant.values(), ignore_index=True)

<IPython.core.display.Javascript object>

In [22]:
df_pre_train.shape

(219, 23)

<IPython.core.display.Javascript object>

## Preprocessing

In [23]:
patterns = [
    "Chemical + Mineralogical + Feature Engineering",
    "Chemical + Mineralogical + CS7 + One-Hot",
    "Chemical + Mineralogical + Physical + One-Hot",
    "Chemical + Mineralogical + CS3 + One-Hot",
    "Chemical + Mineralogical + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + One-Hot",
    "Chemical + Mineralogical + CS1 + One-Hot",
    "Chemical + Mineralogical + CS2 + One-Hot",
    "Chemical + Feature Engineering",
]

FEATURES_TO_REPLACE_1 = {
    "Chemical + Mineralogical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS2": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS7": "Chemical + Mineralogical + Early CS",
}

FEATURES_TO_REPLACE_2 = {
    "Chemical": "Chemical + Mineralogical",
    "Chemical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Mineralogical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Physical": "Chemical + Mineralogical + Physical",
}

df_pre_train = df_pre_train[
    ~df_pre_train["Features"].apply(lambda x: x in patterns)
].reset_index(drop=True)

df_pre_train = df_pre_train[
    ~df_pre_train["Cross Validation"].eq("Out of time Split")
].reset_index(drop=True)

# removing afterwards
df_pre_train = df_pre_train[
    ~df_pre_train["Cross Validation"].eq("Repeated KFold")
].reset_index(drop=True)

df_pre_train["Features_bkp"] = df_pre_train["Features"].copy()
df_pre_train["Features"] = df_pre_train["Features"].replace(FEATURES_TO_REPLACE_1)

df_pre_train["Features"] = df_pre_train["Features"].replace(FEATURES_TO_REPLACE_2)


# removing afterwards
# df_pre_train = df_pre_train[
#     ~df_pre_train["Features"].eq("Chemical + Mineralogical + Early CS")
# ].reset_index(drop=True)

metrics_names = {
    "RMSE Test": "RMSE_mean",
    "MAE Test": "MAE_mean",
    "MAPE Test": "MAPE_mean",
    "R2 Test": "R2_mean",
}

df_pre_train = df_pre_train.rename(metrics_names, axis=1)

<IPython.core.display.Javascript object>

In [24]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [25]:
df_pre_train["MAPE Train"] = df_pre_train["MAPE Train"] * 100
df_pre_train["MAPE_mean"] = df_pre_train["MAPE_mean"] * 100

df_pre_train["Model_bkp"] = df_pre_train["Model"].copy()
df_pre_train["Model_bkp_2"] = df_pre_train["Model"].apply(
    lambda x: "" if pd.isna(x) else "_" + str(x)
)

<IPython.core.display.Javascript object>

In [26]:
df_pre_train["Model_bkp_2"] = df_pre_train["Model_bkp_2"].apply(
    lambda x: "MLP"
    if "MLP" in x
    else "Bi-LSTM"
    if "Bi-LSTM" in x
    else "LSTM"
    if "LSTM" in x
    else "Conv1D"
    if "Conv1D" in x
    else "Transformer"
)

<IPython.core.display.Javascript object>

In [27]:
df_pre_train["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [28]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [29]:
df_pre_train["Features_bkp"].unique()

array(['Chemical + Properties CS Less', 'Chemical + Physical', 'Chemical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [30]:
df_pre_train.shape

(219, 26)

<IPython.core.display.Javascript object>

In [31]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [32]:
df_results_cm = (
    df_pre_train[df_pre_train["Features"].eq("Chemical + Mineralogical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [33]:
df_results_cm["Features"].unique()

array(['Chemical + Mineralogical'], dtype=object)

<IPython.core.display.Javascript object>

In [34]:
df_results_cm.shape

(73, 26)

<IPython.core.display.Javascript object>

# Global Analysis (pre train results)

## ECICS - 207

### Variable Grouping: CM

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM were from:**

*BTiSS:*
    Dominance analysis: Plant AT
    SCPM:Plant AT

*TiSS:*
    Dominance analysis: Plant AT
    SCPM:Plant AT

### Plant AT

#### Hold Out

In [35]:
df_results_cm = (
    df_pre_train[df_pre_train["Features"].eq("Chemical + Mineralogical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [36]:
df_results_cm["Features"].unique()

array(['Chemical + Mineralogical'], dtype=object)

<IPython.core.display.Javascript object>

In [37]:
df_results_cm.shape

(73, 26)

<IPython.core.display.Javascript object>

In [38]:
df_results_cm = df_results_cm[df_results_cm["Plant"].eq("AT")].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [39]:
df_results_cm

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE Train,R2 Train,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2
0,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",,MLP_1,,Standard Scaler,,...,3.682350,0.900878,2.834218,2.096492,4.984425,0.783618,at,Chemical,MLP_1,MLP
1,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",,MLP_2,,Standard Scaler,,...,3.626862,0.901784,2.679632,1.969823,4.681624,0.806578,at,Chemical,MLP_2,MLP
2,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",,MLP_3,,Standard Scaler,,...,3.353553,0.918862,2.621882,1.896587,4.526861,0.814825,at,Chemical,MLP_3,MLP
3,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",,MLP_4,,Standard Scaler,,...,3.208367,0.927840,2.665905,1.928414,4.633699,0.808555,at,Chemical,MLP_4,MLP
4,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",,MLP_5,,Standard Scaler,,...,3.155657,0.925626,2.639911,1.889546,4.481479,0.812270,at,Chemical,MLP_5,MLP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",7.0,Conv1D_11,,Standard Scaler,,...,4.229581,0.878012,2.490204,1.878121,4.453265,0.832981,at,Chemical,Conv1D_11,Conv1D
69,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",14.0,Conv1D_12,,Standard Scaler,,...,4.781645,0.840024,2.534123,1.938795,4.530434,0.827234,at,Chemical,Conv1D_12,Conv1D
70,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",1.0,Conv1D_13,,Standard Scaler,,...,4.278021,0.861300,2.816559,2.058521,4.894430,0.786323,at,Chemical,Conv1D_13,Conv1D
71,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",7.0,Conv1D_14,,Standard Scaler,,...,3.617555,0.910126,2.387082,1.827734,4.329657,0.846527,at,Chemical,Conv1D_14,Conv1D


<IPython.core.display.Javascript object>

In [40]:
df_results_cm["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [41]:
df_results_cm_ho = (
    df_results_cm[df_results_cm["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [42]:
df_results_cm_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [43]:
df_results_cm_ho.shape

(73, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [44]:
df_results_cm_ho = compute_scpm(df_results_cm_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [45]:
dominance_dict = make_dominance_analysis(df_results_cm_ho)
dominance_matrix_cm_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [46]:
dominance_matrix_cm_ho.shape, len(dominance_graph_cm_ho)

((73, 73), 73)

<IPython.core.display.Javascript object>

In [47]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    70
dominant_model         2
non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [48]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
38,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",7.0,Bi-LSTM11,,Standard Scaler,,...,4.101545,0.845872,at,Chemical,Bi-LSTM11,Bi-LSTM,-0.772459,0,dominant_model,67
42,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",14.0,Bi-LSTM15,,Standard Scaler,,...,4.142043,0.853383,at,Chemical,Bi-LSTM15,Bi-LSTM,-0.784721,0,dominant_model,70
17,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",7.0,LSTM5,,Standard Scaler,,...,4.139438,0.844656,at,Chemical,LSTM5,LSTM,-0.759506,1,intermediate_model,65
21,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",14.0,LSTM9,,Standard Scaler,,...,4.171853,0.848335,at,Chemical,LSTM9,LSTM,-0.762867,1,intermediate_model,67
23,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",7.0,LSTM11,,Standard Scaler,,...,4.250362,0.849726,at,Chemical,LSTM11,LSTM,-0.752402,1,intermediate_model,66


<IPython.core.display.Javascript object>

In [49]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [50]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
42,Bi-LSTM15,14.0,2.33,1.77,4.14,0.85,-0.78,0,dominant_model,70,AT
21,LSTM9,14.0,2.37,1.79,4.17,0.85,-0.76,1,intermediate_model,67,AT
38,Bi-LSTM11,7.0,2.39,1.76,4.1,0.85,-0.77,0,dominant_model,67,AT


<IPython.core.display.Javascript object>

##### Top intermediate models

In [51]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
21,LSTM9,14.0,2.37,1.79,4.17,0.85,-0.76,1,intermediate_model,67,AT
23,LSTM11,7.0,2.36,1.8,4.25,0.85,-0.75,1,intermediate_model,66,AT


<IPython.core.display.Javascript object>

##### Top non dominant models

In [52]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
39,Bi-LSTM12,14.0,33.38,32.81,75.57,-28.97,33.86,72,non_dominant_model,0,AT


<IPython.core.display.Javascript object>

In [53]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [54]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
Bi-LSTM        15
LSTM           15
Conv1D         15
Transformer    15
MLP            13
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [55]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [56]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [57]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [58]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,15.0,31.67,28.06,0.0,6.5,21.0,58.0,72.0,15.0,34.73,26.76,0.0,7.5,38.0,58.0,70.0
Conv1D,15.0,22.6,14.2,4.0,13.0,16.0,39.0,43.0,15.0,41.6,11.66,27.0,30.0,41.0,49.5,62.0
LSTM,15.0,26.0,20.18,1.0,10.0,25.0,48.0,54.0,15.0,38.33,19.18,11.0,21.5,36.0,51.5,67.0
MLP,13.0,21.46,12.27,4.0,15.0,20.0,30.0,43.0,13.0,37.69,6.45,26.0,35.0,36.0,41.0,49.0
Transformer,15.0,57.4,7.14,47.0,52.0,55.0,63.5,70.0,15.0,8.93,5.39,1.0,5.0,8.0,12.0,19.0


<IPython.core.display.Javascript object>

In [59]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,15,3,intermediate_model,12
Conv1D,15,1,intermediate_model,15
LSTM,15,1,intermediate_model,15
MLP,13,1,intermediate_model,13
Transformer,15,1,intermediate_model,15


<IPython.core.display.Javascript object>

In [60]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    12
             dominant_model         2
             non_dominant_model     1
Conv1D       intermediate_model    15
LSTM         intermediate_model    15
MLP          intermediate_model    13
Transformer  intermediate_model    15
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [61]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,80.0
Bi-LSTM,dominant_model,13.33
Bi-LSTM,non_dominant_model,6.67
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,100.0
MLP,intermediate_model,100.0
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [62]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
38,Bi-LSTM,0,67,dominant_model
42,Bi-LSTM,0,70,dominant_model
17,LSTM,1,65,intermediate_model
21,LSTM,1,67,intermediate_model
23,LSTM,1,66,intermediate_model
...,...,...,...,...
28,Bi-LSTM,67,1,intermediate_model
36,Bi-LSTM,68,2,intermediate_model
57,Transformer,68,2,intermediate_model
47,Transformer,70,1,intermediate_model


<IPython.core.display.Javascript object>

In [63]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [64]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,15,34.733333,38.0,475,521,7.136986
Conv1D,15,41.6,41.0,339,624,8.547945
LSTM,15,38.333333,36.0,390,575,7.876712
MLP,13,37.692308,36.0,279,490,6.712329
Transformer,15,8.933333,8.0,861,134,1.835616


<IPython.core.display.Javascript object>

In [65]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,4.797401,...,58.0,72.0,15.0,34.733333,26.759422,0.0,7.5,38.0,58.0,70.0
Conv1D,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.595402,...,39.0,43.0,15.0,41.6,11.660679,27.0,30.0,41.0,49.5,62.0
LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.665356,...,48.0,54.0,15.0,38.333333,19.182085,11.0,21.5,36.0,51.5,67.0
MLP,0.0,,,,,,,,13.0,2.651934,...,30.0,43.0,13.0,37.692308,6.447023,26.0,35.0,36.0,41.0,49.0
Transformer,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.126165,...,63.5,70.0,15.0,8.933333,5.391351,1.0,5.0,8.0,12.0,19.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [66]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [67]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [68]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [69]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [70]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
42,Bi-LSTM15,Bi-LSTM,14.0,2.33,1.77,4.14,0.85,-0.78,0,70,dominant_model,AT,False
38,Bi-LSTM11,Bi-LSTM,7.0,2.39,1.76,4.1,0.85,-0.77,0,67,dominant_model,AT,False
21,LSTM9,LSTM,14.0,2.37,1.79,4.17,0.85,-0.76,1,67,intermediate_model,AT,False
17,LSTM5,LSTM,7.0,2.4,1.78,4.14,0.84,-0.76,1,65,intermediate_model,AT,False
23,LSTM11,LSTM,7.0,2.36,1.8,4.25,0.85,-0.75,1,66,intermediate_model,AT,True
29,Bi-LSTM2,Bi-LSTM,7.0,2.36,1.8,4.27,0.85,-0.75,1,66,intermediate_model,AT,True
63,Conv1D_6,Conv1D,14.0,2.39,1.83,4.3,0.85,-0.73,5,62,intermediate_model,AT,False


<IPython.core.display.Javascript object>

In [71]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].head(1)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
42,Bi-LSTM15,Bi-LSTM,14.0,2.334487,1.76689,4.142043,0.853383,-0.784721,0,70,dominant_model,AT,False


<IPython.core.display.Javascript object>

### Variable Grouping: CM-P

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P were from:**

*BTiSS:*
    Dominance analysis: Plant AT
    SCPM:Plant AT

*TiSS:*
    Dominance analysis: Plant AT
    SCPM:Plant AT

In [72]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [73]:
df_results_cm_p = (
    df_pre_train[
        df_pre_train["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

### Plant AT

In [74]:
df_results_cm_p_am = df_results_cm_p[df_results_cm_p["Plant"].eq("AT")].reset_index(
    drop=True
)

<IPython.core.display.Javascript object>

In [75]:
df_results_cm_p_am.shape

(73, 26)

<IPython.core.display.Javascript object>

#### Hold Out

In [76]:
df_results_cm_p_am["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [77]:
df_results_cm_p_am_ho = (
    df_results_cm_p_am[df_results_cm_p_am["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [78]:
df_results_cm_p_am_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [79]:
df_results_cm_p_am_ho.shape

(73, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [80]:
df_results_cm_p_am_ho = compute_scpm(df_results_cm_p_am_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [81]:
dominance_dict = make_dominance_analysis(df_results_cm_p_am_ho)
dominance_matrix_cm_p_am_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_am_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [82]:
dominance_matrix_cm_p_am_ho.shape, len(dominance_graph_cm_p_am_ho)

((73, 73), 73)

<IPython.core.display.Javascript object>

In [83]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    70
dominant_model         2
non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [84]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
66,Global Model,207,AT,Chemical + Mineralogical + Physical - Early CS,"(62749, 16)",,MLP_7,,Standard Scaler,,...,3.514956,0.891546,at,Chemical + Properties CS Less,MLP_7,MLP,-2.796924,0,dominant_model,71
70,Global Model,207,AT,Chemical + Mineralogical + Physical - Early CS,"(62749, 16)",,MLP_11,,Standard Scaler,,...,3.517973,0.893385,at,Chemical + Properties CS Less,MLP_11,MLP,-2.812821,0,dominant_model,71
69,Global Model,207,AT,Chemical + Mineralogical + Physical - Early CS,"(62749, 16)",,MLP_10,,Standard Scaler,,...,3.535726,0.885287,at,Chemical + Properties CS Less,MLP_10,MLP,-2.730082,2,intermediate_model,69
71,Global Model,207,AT,Chemical + Mineralogical + Physical - Early CS,"(62749, 16)",,MLP_12,,Standard Scaler,,...,3.635425,0.887517,at,Chemical + Properties CS Less,MLP_12,MLP,-2.695926,2,intermediate_model,67
65,Global Model,207,AT,Chemical + Mineralogical + Physical - Early CS,"(62749, 16)",,MLP_6,,Standard Scaler,,...,3.616965,0.885106,at,Chemical + Properties CS Less,MLP_6,MLP,-2.687823,3,intermediate_model,68


<IPython.core.display.Javascript object>

In [85]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [86]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
66,MLP_7,,2.01,1.49,3.51,0.89,-2.8,0,dominant_model,71,AT
70,MLP_11,,1.99,1.49,3.52,0.89,-2.81,0,dominant_model,71,AT
69,MLP_10,,2.06,1.51,3.54,0.89,-2.73,2,intermediate_model,69,AT


<IPython.core.display.Javascript object>

##### Top intermediate models

In [87]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
69,MLP_10,,2.06,1.51,3.54,0.89,-2.73,2,intermediate_model,69,AT
65,MLP_6,,2.07,1.54,3.62,0.89,-2.69,3,intermediate_model,68,AT


<IPython.core.display.Javascript object>

##### Top non dominant models

In [88]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
38,Bi-LSTM9,14.0,13.72,12.29,26.98,-4.06,27.86,72,non_dominant_model,0,AT


<IPython.core.display.Javascript object>

In [89]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [90]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
Conv1D         15
LSTM           15
Bi-LSTM        15
Transformer    15
MLP            13
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [91]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [92]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [93]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [94]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,15.0,50.73,15.26,29.0,34.5,53.0,63.5,72.0,15.0,18.87,14.89,0.0,6.0,15.0,33.5,43.0
Conv1D,15.0,18.93,4.45,13.0,15.5,18.0,22.5,28.0,15.0,50.4,4.87,44.0,45.5,51.0,54.0,59.0
LSTM,15.0,49.53,15.8,21.0,39.5,45.0,62.0,70.0,15.0,19.87,14.82,1.0,7.0,22.0,30.0,45.0
MLP,13.0,5.54,4.01,0.0,2.0,6.0,9.0,12.0,13.0,65.54,3.71,60.0,63.0,66.0,68.0,71.0
Transformer,15.0,45.53,8.88,30.0,39.5,46.0,52.0,60.0,15.0,23.6,8.59,11.0,17.5,22.0,29.0,41.0


<IPython.core.display.Javascript object>

In [95]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,15,2,intermediate_model,14
Conv1D,15,1,intermediate_model,15
LSTM,15,1,intermediate_model,15
MLP,13,2,intermediate_model,11
Transformer,15,1,intermediate_model,15


<IPython.core.display.Javascript object>

In [96]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    14
             non_dominant_model     1
Conv1D       intermediate_model    15
LSTM         intermediate_model    15
MLP          intermediate_model    11
             dominant_model         2
Transformer  intermediate_model    15
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [97]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,93.33
Bi-LSTM,non_dominant_model,6.67
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,100.0
MLP,intermediate_model,84.62
MLP,dominant_model,15.38
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [98]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
66,MLP,0,71,dominant_model
70,MLP,0,71,dominant_model
69,MLP,2,69,intermediate_model
71,MLP,2,67,intermediate_model
65,MLP,3,68,intermediate_model
...,...,...,...,...
2,LSTM,68,4,intermediate_model
41,Bi-LSTM,69,3,intermediate_model
7,LSTM,70,1,intermediate_model
14,LSTM,70,1,intermediate_model


<IPython.core.display.Javascript object>

In [99]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [100]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,15,18.866667,15.0,761,283,3.876712
Conv1D,15,50.4,51.0,284,756,10.356164
LSTM,15,19.866667,22.0,743,298,4.082192
MLP,13,65.538462,66.0,72,852,11.671233
Transformer,15,23.6,22.0,683,354,4.849315


<IPython.core.display.Javascript object>

In [101]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,4.442209,...,63.5,72.0,15.0,18.866667,14.89423,0.0,6.0,15.0,33.5,43.0
Conv1D,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.475234,...,22.5,28.0,15.0,50.4,4.866797,44.0,45.5,51.0,54.0,59.0
LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.94346,...,62.0,70.0,15.0,19.866667,14.82212,1.0,7.0,22.0,30.0,45.0
MLP,0.0,,,,,,,,13.0,2.123608,...,9.0,12.0,13.0,65.538462,3.710691,60.0,63.0,66.0,68.0,71.0
Transformer,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.2606,...,52.0,60.0,15.0,23.6,8.592355,11.0,17.5,22.0,29.0,41.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [102]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [103]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [104]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [105]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [106]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
70,MLP_11,MLP,,1.99,1.49,3.52,0.89,-2.81,0,71,dominant_model,AT,False
66,MLP_7,MLP,,2.01,1.49,3.51,0.89,-2.8,0,71,dominant_model,AT,False
69,MLP_10,MLP,,2.06,1.51,3.54,0.89,-2.73,2,69,intermediate_model,AT,True
71,MLP_12,MLP,,2.04,1.54,3.64,0.89,-2.7,2,67,intermediate_model,AT,True
65,MLP_6,MLP,,2.07,1.54,3.62,0.89,-2.69,3,68,intermediate_model,AT,True
64,MLP_5,MLP,,2.09,1.54,3.63,0.88,-2.66,4,67,intermediate_model,AT,True
72,MLP_13,MLP,,2.1,1.57,3.7,0.88,-2.61,6,66,intermediate_model,AT,True


<IPython.core.display.Javascript object>

### Variable Grouping: CM-P-CS

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P-CS were from:**

*BTiSS:*
    Dominance analysis: Plant AT
    SCPM:Plant AT

*TiSS:*
    Dominance analysis: Plant AT
    SCPM:Plant AT

In [107]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

### Plant AT

In [108]:
df_results_cm_p_cs = (
    df_pre_train[df_pre_train["Features"].eq("Chemical + Mineralogical + Physical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [109]:
df_results_cm_p_cs_s = df_results_cm_p_cs[
    df_results_cm_p_cs["Plant"].eq("AT")
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [110]:
df_results_cm_p_cs_s.shape

(73, 26)

<IPython.core.display.Javascript object>

#### Hold Out

In [111]:
df_results_cm_p_cs_s["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [112]:
df_results_cm_p_cs_s_ho = (
    df_results_cm_p_cs_s[df_results_cm_p_cs_s["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [113]:
df_results_cm_p_cs_s_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [114]:
df_results_cm_p_cs_s_ho.shape

(73, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [115]:
df_results_cm_p_cs_s_ho = compute_scpm(df_results_cm_p_cs_s_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [116]:
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_s_ho)
dominance_matrix_cm_p_cs_s_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_s_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [117]:
dominance_matrix_cm_p_cs_s_ho.shape, len(dominance_graph_cm_p_cs_s_ho)

((73, 73), 73)

<IPython.core.display.Javascript object>

In [118]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    71
dominant_model         1
non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [119]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
20,Global Model,207,AT,Chemical + Mineralogical + Physical,"(62749, 18)",,MLP_6,,Standard Scaler,,...,2.505854,0.944895,at,Chemical + Physical,MLP_6,MLP,-1.639541,0,dominant_model,72
25,Global Model,207,AT,Chemical + Mineralogical + Physical,"(62749, 18)",,MLP_11,,Standard Scaler,,...,2.525874,0.944668,at,Chemical + Physical,MLP_11,MLP,-1.632624,1,intermediate_model,71
21,Global Model,207,AT,Chemical + Mineralogical + Physical,"(62749, 18)",,MLP_7,,Standard Scaler,,...,2.541619,0.942447,at,Chemical + Physical,MLP_7,MLP,-1.613816,2,intermediate_model,69
26,Global Model,207,AT,Chemical + Mineralogical + Physical,"(62749, 18)",,MLP_12,,Standard Scaler,,...,2.59103,0.94271,at,Chemical + Physical,MLP_12,MLP,-1.600683,2,intermediate_model,68
19,Global Model,207,AT,Chemical + Mineralogical + Physical,"(62749, 18)",,MLP_5,,Standard Scaler,,...,2.574999,0.940148,at,Chemical + Physical,MLP_5,MLP,-1.58855,3,intermediate_model,66


<IPython.core.display.Javascript object>

In [120]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [121]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
20,MLP_6,,1.43,1.07,2.51,0.94,-1.64,0,dominant_model,72,AT
25,MLP_11,,1.43,1.07,2.53,0.94,-1.63,1,intermediate_model,71,AT
21,MLP_7,,1.46,1.08,2.54,0.94,-1.61,2,intermediate_model,69,AT


<IPython.core.display.Javascript object>

##### Top intermediate models

In [122]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
25,MLP_11,,1.43,1.07,2.53,0.94,-1.63,1,intermediate_model,71,AT
21,MLP_7,,1.46,1.08,2.54,0.94,-1.61,2,intermediate_model,69,AT


<IPython.core.display.Javascript object>

##### Top non dominant models

In [123]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
54,Bi-LSTM12,14.0,23.21,22.39,50.89,-13.49,32.12,72,non_dominant_model,0,AT


<IPython.core.display.Javascript object>

In [124]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [125]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
Conv1D         15
LSTM           15
Bi-LSTM        15
Transformer    15
MLP            13
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [126]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [127]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [128]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [129]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,15.0,51.4,16.36,25.0,38.0,57.0,66.5,72.0,15.0,18.8,15.19,0.0,5.5,13.0,32.0,42.0
Conv1D,15.0,18.93,7.63,11.0,13.5,15.0,24.5,34.0,15.0,48.27,8.0,35.0,41.0,52.0,52.5,59.0
LSTM,15.0,42.47,17.21,20.0,26.0,43.0,54.0,71.0,15.0,27.8,16.14,1.0,16.5,29.0,41.0,50.0
MLP,13.0,6.46,6.35,0.0,2.0,4.0,9.0,24.0,13.0,63.85,7.66,42.0,63.0,66.0,68.0,72.0
Transformer,15.0,51.2,8.58,31.0,47.0,52.0,57.5,63.0,15.0,19.4,8.16,8.0,12.5,19.0,23.5,37.0


<IPython.core.display.Javascript object>

In [130]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,15,2,intermediate_model,14
Conv1D,15,1,intermediate_model,15
LSTM,15,1,intermediate_model,15
MLP,13,2,intermediate_model,12
Transformer,15,1,intermediate_model,15


<IPython.core.display.Javascript object>

In [131]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    14
             non_dominant_model     1
Conv1D       intermediate_model    15
LSTM         intermediate_model    15
MLP          intermediate_model    12
             dominant_model         1
Transformer  intermediate_model    15
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [132]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,93.33
Bi-LSTM,non_dominant_model,6.67
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,100.0
MLP,intermediate_model,92.31
MLP,dominant_model,7.69
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [133]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
20,MLP,0,72,dominant_model
25,MLP,1,71,intermediate_model
21,MLP,2,69,intermediate_model
26,MLP,2,68,intermediate_model
19,MLP,3,66,intermediate_model
...,...,...,...,...
48,Bi-LSTM,68,4,intermediate_model
50,Bi-LSTM,69,2,intermediate_model
51,Bi-LSTM,69,2,intermediate_model
8,LSTM,71,1,intermediate_model


<IPython.core.display.Javascript object>

In [134]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [135]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,15,18.8,13.0,771,282,3.863014
Conv1D,15,48.266667,52.0,284,724,9.917808
LSTM,15,27.8,29.0,637,417,5.712329
MLP,13,63.846154,66.0,84,830,11.369863
Transformer,15,19.4,19.0,768,291,3.986301


<IPython.core.display.Javascript object>

In [136]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,4.576509,...,66.5,72.0,15.0,18.8,15.190222,0.0,5.5,13.0,32.0,42.0
Conv1D,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,1.799253,...,24.5,34.0,15.0,48.266667,7.995237,35.0,41.0,52.0,52.5,59.0
LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.811719,...,54.0,71.0,15.0,27.8,16.138685,1.0,16.5,29.0,41.0,50.0
MLP,0.0,,,,,,,,13.0,1.523827,...,9.0,24.0,13.0,63.846154,7.657743,42.0,63.0,66.0,68.0,72.0
Transformer,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.88561,...,57.5,63.0,15.0,19.4,8.157381,8.0,12.5,19.0,23.5,37.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [137]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [138]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [139]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [140]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [141]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
20,MLP_6,MLP,,1.43,1.07,2.51,0.94,-1.64,0,72,dominant_model,AT,True
25,MLP_11,MLP,,1.43,1.07,2.53,0.94,-1.63,1,71,intermediate_model,AT,True
21,MLP_7,MLP,,1.46,1.08,2.54,0.94,-1.61,2,69,intermediate_model,AT,True
26,MLP_12,MLP,,1.46,1.1,2.59,0.94,-1.6,2,68,intermediate_model,AT,True
19,MLP_5,MLP,,1.49,1.1,2.57,0.94,-1.59,3,66,intermediate_model,AT,True
16,MLP_2,MLP,,1.48,1.11,2.62,0.94,-1.58,4,66,intermediate_model,AT,True
24,MLP_10,MLP,,1.49,1.11,2.61,0.94,-1.58,4,66,intermediate_model,AT,True


<IPython.core.display.Javascript object>

# Global Analysis (fine tuning results)

# Definitions

# Topological Analysis - NEW

## Definitions

In [142]:
# Function to compare two models strictly based on means only
def dominates(model1, model2):
    """Return True if model1 strictly dominates model2, False otherwise."""
    # Check if model1 is at least as good as model2 in all metrics
    at_least_as_good = (
        model1["RMSE_mean"] <= model2["RMSE_mean"]
        and model1["MAE_mean"] <= model2["MAE_mean"]
        and model1["MAPE_mean"] <= model2["MAPE_mean"]
        and model1["R2_mean"] >= model2["R2_mean"]
    )

    # Check if model1 is strictly better in at least one metric
    strictly_better_in_one = (
        model1["RMSE_mean"] < model2["RMSE_mean"]
        or model1["MAE_mean"] < model2["MAE_mean"]
        or model1["MAPE_mean"] < model2["MAPE_mean"]
        or model1["R2_mean"] > model2["R2_mean"]
    )

    # Model1 strictly dominates Model2 if it is at least as good in all metrics and
    # strictly better in at least one metric
    return at_least_as_good and strictly_better_in_one

<IPython.core.display.Javascript object>

In [143]:
def get_dominance_matrix_and_graph(df):
    n = len(df)
    dominance_matrix = np.zeros((n, n), dtype=bool)

    for i in range(n):
        for j in range(n):
            if i != j:
                dominance_matrix[i, j] = dominates(df.iloc[i], df.iloc[j])

    # Create the dominance graph
    dominance_graph = {i: [] for i in range(n)}
    for i in range(n):
        for j in range(n):
            if dominance_matrix[i, j]:
                dominance_graph[i].append(j)

    return dominance_matrix, dominance_graph

<IPython.core.display.Javascript object>

In [144]:
# Topological Sorting
def topological_sort(graph):
    """Perform topological sorting on the given graph."""
    in_degree = {u: 0 for u in graph}
    for u in graph:
        for v in graph[u]:
            in_degree[v] += 1

    queue = deque([u for u in graph if in_degree[u] == 0])
    topo_order = []

    while queue:
        u = queue.popleft()
        topo_order.append(u)

        for v in graph[u]:
            in_degree[v] -= 1
            if in_degree[v] == 0:
                queue.append(v)

    return topo_order

<IPython.core.display.Javascript object>

In [145]:
# Function to find the top models in each group
def find_top_models(group):
    max_net_dominance = group["Net_Dominance"].max()
    top_models = group[group["Net_Dominance"] == max_net_dominance]
    return top_models

<IPython.core.display.Javascript object>

### Dominance Analysis and SCPM

In [146]:
def compute_scpm(df):
    scaler = StandardScaler()
    standardized_metrics = scaler.fit_transform(
        df[["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"]]
    )

    # Creating a new dataframe with standardized values
    standardized_df = pd.DataFrame(
        standardized_metrics,
        columns=["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"],
    )

    # Summing all metrics and subtracting R2_mean
    standardized_df["Result"] = (
        standardized_df[["RMSE_mean", "MAE_mean", "MAPE_mean"]].sum(axis=1)
        - standardized_df["R2_mean"]
    )

    df["SCPM"] = standardized_df["Result"]
    return df

<IPython.core.display.Javascript object>

In [147]:
def make_dominance_analysis(df):
    # Compute dominance matrix and graph
    dominance_matrix, dominance_graph = get_dominance_matrix_and_graph(df)

    # Get the topological order
    topo_order = topological_sort(dominance_graph)
    df = df.copy()
    n = len(topo_order)

    # Identify Dominant Models
    dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if not np.any(dominance_matrix[:, i])
    ]

    # Identify Non-Dominant Models
    non_dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i])
    ]

    # Identify Models that are both dominated and dominate at least one
    intermediate_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i]) and np.any(dominance_matrix[i, :])
    ]

    # Add count of times each model is dominated by others
    df["Dominated_Count"] = np.sum(dominance_matrix, axis=0)

    # Classify models
    df["Classification"] = "intermediate_model"  # default to intermediate_model
    df.loc[
        ~np.any(dominance_matrix, axis=1) & np.any(dominance_matrix, axis=0),
        "Classification",
    ] = "non_dominant_model"
    df.loc[~np.any(dominance_matrix, axis=0), "Classification"] = "dominant_model"

    # Map the topological order to the model names and get dominance counts
    sorted_models = [df["Model"].iloc[i] for i in topo_order]

    df_sorted = df.iloc[topo_order].copy()
    df_sorted["Dominates_Count"] = dominance_matrix.sum(axis=1)[topo_order]
    df_sorted_topo = df_sorted.copy()

    df_sorted = df_sorted.sort_values(by="Dominates_Count", ascending=False)
    df_sorted_count = df_sorted.copy()

    return {
        "dominance_matrix": dominance_matrix,
        "dominance_graph": dominance_graph,
        "df_sorted_topo": df_sorted_topo,
        "df_sorted_count": df_sorted_count,
    }

<IPython.core.display.Javascript object>

# Reading The files

In [148]:
csv_files_path_fine_tuning_full = dict()
csv_files_path_fine_tuning_grouped = dict()

<IPython.core.display.Javascript object>

## 207

### Plant AT

In [149]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/207/at/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/207/at/fine_tuning/grouped/"
plant = "at"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

### Reading all data - pre training

#### Full

In [150]:
dfs_plant = dict()
column = ("plant", "")
pos = 2

for plant, csv_files in csv_files_path_fine_tuning_full.items():
    df = read_csv_files_full(csv_files, plant)
    df["plant"] = plant
    dfs_plant[plant] = df

<IPython.core.display.Javascript object>

In [151]:
df_fine_tuning_full = pd.concat(dfs_plant.values(), ignore_index=True)

<IPython.core.display.Javascript object>

In [152]:
df_fine_tuning_full.shape

(720, 23)

<IPython.core.display.Javascript object>

#### Grouped

In [153]:
dfs_plant = dict()
column = ("plant", "")
pos = 2

for plant, csv_files in csv_files_path_fine_tuning_grouped.items():
    df = read_csv_files_grouped(csv_files, plant)
    df["plant"] = plant
    df = preprocess_rename_columns(df)
    df = preprocess_change_columns_order(df, column, pos)
    dfs_plant[plant] = df

<IPython.core.display.Javascript object>

In [154]:
df_fine_tuning_grouped = pd.concat(dfs_plant.values(), ignore_index=True)

<IPython.core.display.Javascript object>

In [155]:
df_fine_tuning_grouped.shape

(60, 25)

<IPython.core.display.Javascript object>

In [156]:
df_copy = df_fine_tuning_grouped.copy()
df_copy = (
    df_copy.reset_index(level=0)
    .rename({"level_0": "Plant"}, axis=1)
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

## Preprocessing steps

In [157]:
patterns = [
    "Chemical + Mineralogical + Feature Engineering",
    "Chemical + Mineralogical + CS7 + One-Hot",
    "Chemical + Mineralogical + Physical + One-Hot",
    "Chemical + Mineralogical + CS3 + One-Hot",
    "Chemical + Mineralogical + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + One-Hot",
    "Chemical + Mineralogical + CS1 + One-Hot",
    "Chemical + Mineralogical + CS2 + One-Hot",
    "Chemical + Feature Engineering",
]

replace_dict = {
    "Chemical + Mineralogical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS2": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS7": "Chemical + Mineralogical + Early CS",
}

replace_dict_2 = {
    "Chemical": "Chemical + Mineralogical",
    "Chemical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Mineralogical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Physical": "Chemical + Mineralogical + Physical",
}

df_copy = df_copy[~df_copy["Cross Validation"].eq("Out of time Split")].reset_index(
    drop=True
)

# Removing afterwards
df_copy = df_copy[~df_copy["Cross Validation"].eq("Repeated KFold")].reset_index(
    drop=True
)

df_copy = df_copy[~df_copy["Features"].apply(lambda x: x in patterns)].reset_index(
    drop=True
)
df_copy["Features_bkp"] = df_copy["Features"].copy()
df_copy["Features"] = df_copy["Features"].replace(replace_dict)

df_copy["Features"] = df_copy["Features"].replace(replace_dict_2)

# Removing afterwards
# df_copy = df_copy[
#     ~df_copy["Features"].eq("Chemical + Mineralogical + Early CS")
# ].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [158]:
df_copy[("MAPE Train", "mean")] = df_copy[("MAPE Train", "mean")] * 100
df_copy[("MAPE Train", "std")] = df_copy[("MAPE Train", "std")] * 100
df_copy[("MAPE Test", "mean")] = df_copy[("MAPE Test", "mean")] * 100
df_copy[("MAPE Test", "std")] = df_copy[("MAPE Test", "std")] * 100

df_copy["Model_bkp"] = df_copy["Model"].copy()
df_copy["Model_bkp_2"] = df_copy["Model"] + df_copy["Timesteps"].apply(
    lambda x: "" if pd.isna(x) else "_" + str(x)
)
df_copy["Model"] = df_copy["Model"].replace(
    {
        "MLP": "Neural Networks",
        "LSTM": "Neural Networks",
        "GRU": "Neural Networks",
        "BidirectionalLSTM": "Neural Networks",
        "BidirectionalGRU": "Neural Networks",
        "Transformer": "Neural Networks",
        "Decision Tree": "Trees",
        "Random Forest": "Trees",
        "XGBoost": "Trees",
    }
)

<IPython.core.display.Javascript object>

In [159]:
df_copy["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [160]:
df_copy["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [161]:
df_copy["Model"].unique()

array(['Neural Networks', 'Conv1D'], dtype=object)

<IPython.core.display.Javascript object>

In [162]:
df_copy["Model_bkp"].unique()

array(['Transformer', 'MLP', 'BidirectionalLSTM', 'Conv1D', 'LSTM'],
      dtype=object)

<IPython.core.display.Javascript object>

In [163]:
df_copy["Model_bkp_2"].unique()

array(['Transformer_14.0', 'MLP', 'BidirectionalLSTM_14.0', 'Conv1D_14.0',
       'BidirectionalLSTM_1.0', 'LSTM_1.0', 'LSTM_14.0', 'Conv1D_7.0',
       'Transformer_1.0'], dtype=object)

<IPython.core.display.Javascript object>

In [164]:
df_copy.shape

(30, 29)

<IPython.core.display.Javascript object>

In [165]:
df_copy_grouped = df_copy.copy()

<IPython.core.display.Javascript object>

In [166]:
df_copy[
    [
        ("RMSE Test", "mean"),
        ("MAE Test", "mean"),
        ("MAPE Test", "mean"),
        ("R2 Test", "mean"),
    ]
].describe().round(2).T

Unnamed: 0,Unnamed: 1,count,mean,std,min,25%,50%,75%,max
RMSE Test,mean,30.0,3.45,2.51,1.95,2.25,2.73,3.37,13.73
MAE Test,mean,30.0,2.56,1.51,1.55,1.81,2.07,2.52,8.0
MAPE Test,mean,30.0,5.73,3.34,3.45,4.01,4.73,5.68,17.92
R2 Test,mean,30.0,-2.51,8.33,-37.59,-0.74,-0.11,0.17,0.39


<IPython.core.display.Javascript object>

In [167]:
df_copy["plant"].unique()

array(['at'], dtype=object)

<IPython.core.display.Javascript object>

# Global Analysis (fine tuning results)

## ECICS - 207

In [168]:
df_ecics_grouped = df_copy.copy()

<IPython.core.display.Javascript object>

In [169]:
df_results = df_ecics_grouped[
    [
        ("Company", ""),
        ("plant", ""),
        ("Cross Validation", ""),
        ("Features", ""),
        ('Features_bkp', ''),
        ("Model", ""),
        ("Model_bkp", ""),         
        ('Model_bkp_2',''),
        ("Timesteps", ""),
        ("RMSE Test", "mean"),
        ("MAE Test", "mean"),
        ("MAPE Test", "mean"),
        ("R2 Test", "mean"),
        ("RMSE Test", "std"),
        ("MAE Test", "std"),
        ("MAPE Test", "std"),
        ("R2 Test", "std"),
    ]
].copy()
df_results = df_results.reset_index(drop=True)

# Rename columns
new_column_names = [
    "Company",
    "Plant",
    "Cross Validation",
    "Features",
    "Features_bkp",
    "Model",
    "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "RMSE_std",
    "MAE_std",
    "MAPE_std",
    "R2_std",
]

df_results.columns = new_column_names


<IPython.core.display.Javascript object>

In [170]:
df_results.shape

(30, 17)

<IPython.core.display.Javascript object>

In [171]:
df_results["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

## Variable Grouping: CM

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM were from:**

*BTiSS:*
    Dominance analysis: Plant AT
    SCPM:Plant AT

*TiSS:*
    Dominance analysis: Plant AT
    SCPM:Plant AT

In [172]:
df_results["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

### Plant AT

In [173]:
df_results_cm = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [174]:
df_results_cm["Features"].unique()

array(['Chemical + Mineralogical'], dtype=object)

<IPython.core.display.Javascript object>

In [175]:
df_results_cm.shape

(10, 17)

<IPython.core.display.Javascript object>

In [176]:
df_results_cm = df_results_cm[df_results_cm["Plant"].eq("at")].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [177]:
df_results_cm.shape

(10, 17)

<IPython.core.display.Javascript object>

### Blocking time series

In [178]:
df_results_cm["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [179]:
df_results_cm_btss = (
    df_results_cm[df_results_cm["Cross Validation"].eq("Blocking Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [180]:
df_results_cm_btss["Cross Validation"].unique()

array(['Blocking Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [181]:
df_results_cm_btss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [182]:
df_results_cm_btss = compute_scpm(df_results_cm_btss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [183]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_btss)
dominance_matrix_cm_btss = dominance_dict["dominance_matrix"]
dominance_graph_cm_btss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [184]:
dominance_matrix_cm_btss.shape, len(dominance_graph_cm_btss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [185]:
df_sorted_topo["Classification"].value_counts()

Classification
dominant_model        2
intermediate_model    2
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [186]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/ecics_207_at_dominance_analysis_cm_btss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [187]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,207,at,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,Transformer,Transformer_14.0,14.0,2.060204,...,3.623526,0.237924,0.480383,0.467227,0.908381,0.449357,-3.103841,0,dominant_model,3
3,207,at,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,MLP,MLP,,2.257964,...,4.092964,0.240324,0.3594,0.314438,0.711985,0.434475,-2.869078,0,dominant_model,3
4,207,at,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,LSTM,LSTM_14.0,14.0,3.001613,...,5.245664,-0.430635,0.727365,0.595234,1.27291,0.85393,-2.067374,2,intermediate_model,2
2,207,at,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Conv1D,Conv1D,Conv1D_14.0,14.0,4.999674,...,8.756619,-4.828465,2.04869,1.434962,3.209838,6.779544,0.4933,3,intermediate_model,1
1,207,at,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_14.0,14.0,10.4679,...,16.136962,-27.489537,3.873571,2.42628,5.24296,31.353197,7.546993,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [188]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [189]:
# Top 4
df_sorted_topo_cols.sort_values(
    by=["Dominates_Count", "SCPM"], ascending=[False, True]
).iloc[0:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,Transformer_14.0,2.06,0.48,1.67,0.47,3.62,0.91,0.24,0.45,-3.1,0,dominant_model,3,at
3,MLP,2.26,0.36,1.82,0.31,4.09,0.71,0.24,0.43,-2.87,0,dominant_model,3,at


<IPython.core.display.Javascript object>

In [190]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(2, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [191]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
4,LSTM_14.0,3.0,0.73,2.37,0.6,5.25,1.27,-0.43,0.85,-2.07,2,intermediate_model,2,at
2,Conv1D_14.0,5.0,2.05,3.99,1.43,8.76,3.21,-4.83,6.78,0.49,3,intermediate_model,1,at


<IPython.core.display.Javascript object>

In [192]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(2, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [193]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
1,BidirectionalLSTM_14.0,10.47,3.87,7.42,2.43,16.14,5.24,-27.49,31.35,7.55,4,non_dominant_model,0,at


<IPython.core.display.Javascript object>

In [194]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [195]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [196]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [197]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [198]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [199]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [200]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,dominant_model,50.0
Neural Networks,intermediate_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [201]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,3.0,1.5
Dominated_Count,std,,1.91
Dominated_Count,min,3.0,0.0
Dominated_Count,25%,3.0,0.0
Dominated_Count,50%,3.0,1.0
Dominated_Count,75%,3.0,2.5
Dominated_Count,max,3.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,1.0,2.0


<IPython.core.display.Javascript object>

In [202]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [203]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,dominant_model,2


<IPython.core.display.Javascript object>

In [204]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  dominant_model        2
                 intermediate_model    1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [205]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [206]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [207]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,1.0,1.0,3,1,0.2
Neural Networks,4,2.0,2.5,6,8,1.6


<IPython.core.display.Javascript object>

In [208]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,4.999674,,4.999674,4.999674,4.999674,4.999674,4.999674,1.0,2.04869,...,3.0,3.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
Neural Networks,4.0,4.446921,4.034396,2.060204,2.208524,2.629789,4.868185,10.4679,4.0,1.36018,...,2.5,4.0,4.0,2.0,1.414214,0.0,1.5,2.5,3.0,3.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [209]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [210]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [211]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [212]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,Transformer_14.0,2.06,0.48,1.67,0.47,3.62,0.91,0.24,0.45,-3.1,0,3,dominant_model,at,True
3,MLP,2.26,0.36,1.82,0.31,4.09,0.71,0.24,0.43,-2.87,0,3,dominant_model,at,True
4,LSTM_14.0,3.0,0.73,2.37,0.6,5.25,1.27,-0.43,0.85,-2.07,2,2,intermediate_model,at,True
2,Conv1D_14.0,5.0,2.05,3.99,1.43,8.76,3.21,-4.83,6.78,0.49,3,1,intermediate_model,at,True
1,BidirectionalLSTM_14.0,10.47,3.87,7.42,2.43,16.14,5.24,-27.49,31.35,7.55,4,0,non_dominant_model,at,True


<IPython.core.display.Javascript object>

### Time Series Split

In [213]:
df_results_cm = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [214]:
df_results_cm["Features"].unique()

array(['Chemical + Mineralogical'], dtype=object)

<IPython.core.display.Javascript object>

In [215]:
df_results_cm.shape

(10, 17)

<IPython.core.display.Javascript object>

In [216]:
df_results_cm.shape

(10, 17)

<IPython.core.display.Javascript object>

In [217]:
df_results_cm["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [218]:
df_results_cm_tss = (
    df_results_cm[df_results_cm["Cross Validation"].eq("Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [219]:
df_results_cm_tss["Cross Validation"].unique()

array(['Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [220]:
df_results_cm_tss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [221]:
df_results_cm_tss = compute_scpm(df_results_cm_tss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [222]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_tss)
dominance_matrix_cm_tss = dominance_dict["dominance_matrix"]
dominance_graph_cm_tss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [223]:
dominance_matrix_cm_tss.shape, len(dominance_graph_cm_tss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [224]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    3
dominant_model        1
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [225]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/ecics_207_at_dominance_analysis_cm_tss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [226]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
3,207,at,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,MLP,MLP,,2.74879,...,4.454845,-0.072376,0.563044,0.303241,0.565309,0.615426,-2.667642,0,dominant_model,4
0,207,at,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,Transformer,Transformer_14.0,14.0,2.916147,...,5.38092,-0.189517,0.293796,0.250324,0.823294,0.636073,-2.263931,1,intermediate_model,3
4,207,at,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,LSTM,LSTM_14.0,14.0,3.653403,...,6.025757,-0.58757,1.031342,0.574545,1.849335,0.669439,-1.84518,2,intermediate_model,2
2,207,at,Time Series Split,Chemical + Mineralogical,Chemical,Conv1D,Conv1D,Conv1D_14.0,14.0,4.242589,...,7.178119,-1.239602,0.744986,0.464501,1.349709,0.836809,-1.143118,3,intermediate_model,1
1,207,at,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_14.0,14.0,13.725374,...,17.917479,-37.589314,6.50983,2.775789,5.967878,47.748064,7.919872,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [227]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [228]:
# Top 4
df_sorted_topo_cols.iloc[0:2].sort_values(
    by=["Dominates_Count", "SCPM"], ascending=[False, True]
).round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
3,MLP,2.75,0.56,2.01,0.3,4.45,0.57,-0.07,0.62,-2.67,0,dominant_model,4,at
0,Transformer_14.0,2.92,0.29,2.38,0.25,5.38,0.82,-0.19,0.64,-2.26,1,intermediate_model,3,at


<IPython.core.display.Javascript object>

In [229]:
# Top 4
df_sorted_topo_cols.iloc[0:2].sort_values(
    by=["Dominates_Count", "SCPM"], ascending=[False, True]
).iloc[:4].round(2).round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
3,MLP,2.75,0.56,2.01,0.3,4.45,0.57,-0.07,0.62,-2.67,0,dominant_model,4,at
0,Transformer_14.0,2.92,0.29,2.38,0.25,5.38,0.82,-0.19,0.64,-2.26,1,intermediate_model,3,at


<IPython.core.display.Javascript object>

In [230]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [231]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:4].round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,Transformer_14.0,2.92,0.29,2.38,0.25,5.38,0.82,-0.19,0.64,-2.26,1,intermediate_model,3,at
4,LSTM_14.0,3.65,1.03,2.57,0.57,6.03,1.85,-0.59,0.67,-1.85,2,intermediate_model,2,at
2,Conv1D_14.0,4.24,0.74,3.2,0.46,7.18,1.35,-1.24,0.84,-1.14,3,intermediate_model,1,at


<IPython.core.display.Javascript object>

In [232]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(3, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [233]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
1,BidirectionalLSTM_14.0,13.725374,6.50983,7.997681,2.775789,17.917479,5.967878,-37.589314,47.748064,7.919872,4,non_dominant_model,0,at


<IPython.core.display.Javascript object>

In [234]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [235]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [236]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [237]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [238]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [239]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,intermediate_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [240]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,3.0,1.75
Dominated_Count,std,,1.71
Dominated_Count,min,3.0,0.0
Dominated_Count,25%,3.0,0.75
Dominated_Count,50%,3.0,1.5
Dominated_Count,75%,3.0,2.5
Dominated_Count,max,3.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,1.0,2.25


<IPython.core.display.Javascript object>

In [241]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [242]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,intermediate_model,2


<IPython.core.display.Javascript object>

In [243]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  intermediate_model    2
                 dominant_model        1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [244]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [245]:
summary_stats.round(2)

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,1.0,1.0,3,1,0.2
Neural Networks,4,2.25,2.5,7,9,1.8


<IPython.core.display.Javascript object>

In [246]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,4.242589,,4.242589,4.242589,4.242589,4.242589,4.242589,1.0,0.744986,...,3.0,3.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
Neural Networks,4.0,5.760928,5.324153,2.74879,2.874308,3.284775,6.171395,13.725374,4.0,2.099503,...,2.5,4.0,4.0,2.25,1.707825,0.0,1.5,2.5,3.25,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [247]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [248]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [249]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [250]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
3,MLP,2.75,0.56,2.01,0.3,4.45,0.57,-0.07,0.62,-2.67,0,4,dominant_model,at,True
0,Transformer_14.0,2.92,0.29,2.38,0.25,5.38,0.82,-0.19,0.64,-2.26,1,3,intermediate_model,at,True
4,LSTM_14.0,3.65,1.03,2.57,0.57,6.03,1.85,-0.59,0.67,-1.85,2,2,intermediate_model,at,True
2,Conv1D_14.0,4.24,0.74,3.2,0.46,7.18,1.35,-1.24,0.84,-1.14,3,1,intermediate_model,at,True
1,BidirectionalLSTM_14.0,13.73,6.51,8.0,2.78,17.92,5.97,-37.59,47.75,7.92,4,0,non_dominant_model,at,True


<IPython.core.display.Javascript object>

## Variable Grouping: CM-P

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P were from:**

*BTiSS:*
    Dominance analysis: Plant AT
    SCPM:Plant AT

*TiSS:*
    Dominance analysis: Plant AT
    SCPM:Plant AT

In [251]:
df_results["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [252]:
df_results_cm_p = (
    df_results[
        df_results["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [253]:
df_results_cm_p["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [254]:
df_results_cm_p.shape

(10, 17)

<IPython.core.display.Javascript object>

### Blocking time series

In [255]:
df_results_cm_p["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [256]:
df_results_cm_p_btss = (
    df_results_cm_p[
        df_results_cm_p["Cross Validation"].eq("Blocking Time Series Split")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [257]:
df_results_cm_p_btss["Cross Validation"].unique()

array(['Blocking Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [258]:
df_results_cm_p_btss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [259]:
df_results_cm_p_btss = compute_scpm(df_results_cm_p_btss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [260]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_btss)
dominance_matrix_cm_p_btss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_btss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [261]:
dominance_matrix_cm_p_btss.shape, len(dominance_graph_cm_p_btss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [262]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    2
non_dominant_model    2
dominant_model        1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [263]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/ecics_207_at_dominance_analysis_cm_p_btss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [264]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
4,207,at,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,Transformer,Transformer_1.0,1.0,2.09679,...,3.690571,0.293241,0.467582,0.380521,0.734037,0.442786,-5.05125,0,dominant_model,4
0,207,at,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,MLP,MLP,,2.246884,...,4.021817,0.116973,0.415372,0.371317,0.71595,0.70259,-2.523689,1,intermediate_model,2
3,207,at,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Conv1D,Conv1D,Conv1D_14.0,14.0,2.279616,...,4.006937,-0.125932,1.120279,0.914495,1.801502,0.838963,-1.472859,1,intermediate_model,2
1,207,at,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_1.0,1.0,2.810234,...,4.940736,-0.390278,0.660203,0.610094,1.318457,1.020089,5.155862,3,non_dominant_model,0
2,207,at,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,LSTM,LSTM_1.0,1.0,2.708926,...,4.571182,-0.647666,0.917708,0.673569,1.455977,1.892128,3.891936,3,non_dominant_model,0


<IPython.core.display.Javascript object>

In [265]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [266]:
# Top 4
df_sorted_topo_cols.sort_values(
    by=["Dominates_Count", "SCPM"], ascending=[False, True]
).iloc[0:5]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
4,Transformer_1.0,2.1,0.47,1.67,0.38,3.69,0.73,0.29,0.44,-5.05,0,dominant_model,4,at
0,MLP,2.25,0.42,1.81,0.37,4.02,0.72,0.12,0.7,-2.52,1,intermediate_model,2,at
3,Conv1D_14.0,2.28,1.12,1.86,0.91,4.01,1.8,-0.13,0.84,-1.47,1,intermediate_model,2,at
2,LSTM_1.0,2.71,0.92,2.06,0.67,4.57,1.46,-0.65,1.89,3.89,3,non_dominant_model,0,at
1,BidirectionalLSTM_1.0,2.81,0.66,2.22,0.61,4.94,1.32,-0.39,1.02,5.16,3,non_dominant_model,0,at


<IPython.core.display.Javascript object>

In [267]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [268]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,MLP,2.25,0.42,1.81,0.37,4.02,0.72,0.12,0.7,-2.52,1,intermediate_model,2,at
3,Conv1D_14.0,2.28,1.12,1.86,0.91,4.01,1.8,-0.13,0.84,-1.47,1,intermediate_model,2,at


<IPython.core.display.Javascript object>

In [269]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(2, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [270]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
2,LSTM_1.0,2.71,0.92,2.06,0.67,4.57,1.46,-0.65,1.89,3.89,3,non_dominant_model,0,at
1,BidirectionalLSTM_1.0,2.81,0.66,2.22,0.61,4.94,1.32,-0.39,1.02,5.16,3,non_dominant_model,0,at


<IPython.core.display.Javascript object>

In [271]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(2, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [272]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [273]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [274]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [275]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [276]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [277]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,non_dominant_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,intermediate_model,25.0


<IPython.core.display.Javascript object>

In [278]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,1.0,1.75
Dominated_Count,std,,1.5
Dominated_Count,min,1.0,0.0
Dominated_Count,25%,1.0,0.75
Dominated_Count,50%,1.0,2.0
Dominated_Count,75%,1.0,3.0
Dominated_Count,max,1.0,3.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,2.0,1.5


<IPython.core.display.Javascript object>

In [279]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [280]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,non_dominant_model,2


<IPython.core.display.Javascript object>

In [281]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  non_dominant_model    2
                 dominant_model        1
                 intermediate_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [282]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [283]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [284]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,2.0,2.0,1,2,0.4
Neural Networks,4,1.5,1.0,7,6,1.2


<IPython.core.display.Javascript object>

In [285]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,2.279616,,2.279616,2.279616,2.279616,2.279616,2.279616,1.0,1.120279,...,1.0,1.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
Neural Networks,4.0,2.465708,0.347293,2.09679,2.209361,2.477905,2.734253,2.810234,4.0,0.615216,...,3.0,3.0,4.0,1.5,1.914854,0.0,0.0,1.0,2.5,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [286]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [287]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [288]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [289]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
4,Transformer_1.0,2.1,0.47,1.67,0.38,3.69,0.73,0.29,0.44,-5.05,0,4,dominant_model,at,True
0,MLP,2.25,0.42,1.81,0.37,4.02,0.72,0.12,0.7,-2.52,1,2,intermediate_model,at,True
3,Conv1D_14.0,2.28,1.12,1.86,0.91,4.01,1.8,-0.13,0.84,-1.47,1,2,intermediate_model,at,True
2,LSTM_1.0,2.71,0.92,2.06,0.67,4.57,1.46,-0.65,1.89,3.89,3,0,non_dominant_model,at,False
1,BidirectionalLSTM_1.0,2.81,0.66,2.22,0.61,4.94,1.32,-0.39,1.02,5.16,3,0,non_dominant_model,at,False


<IPython.core.display.Javascript object>

### Time Series Split

In [290]:
df_results_cm_p = (
    df_results[
        df_results["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [291]:
df_results_cm_p["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [292]:
df_results_cm_p.shape

(10, 17)

<IPython.core.display.Javascript object>

In [293]:
df_results_cm["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [294]:
df_results_cm_p_tss = (
    df_results_cm_p[df_results_cm_p["Cross Validation"].eq("Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [295]:
df_results_cm_p_tss["Cross Validation"].unique()

array(['Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [296]:
df_results_cm_p_tss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [297]:
df_results_cm_p_tss = compute_scpm(df_results_cm_p_tss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [298]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_tss)
dominance_matrix_cm_p_tss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_tss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [299]:
dominance_matrix_cm_tss.shape, len(dominance_graph_cm_tss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [300]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    2
non_dominant_model    2
dominant_model        1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [301]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/ecics_207_at_dominance_analysis_cm_p_tss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [302]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,207,at,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,MLP,MLP,,2.669472,...,4.715338,0.083997,0.279328,0.206893,0.548872,0.384021,-4.49221,0,dominant_model,4
4,207,at,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,Transformer,Transformer_1.0,1.0,2.671705,...,4.736594,0.041793,0.560313,0.364428,0.961048,0.41398,-4.386315,1,intermediate_model,3
2,207,at,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,LSTM,LSTM_1.0,1.0,3.496979,...,5.775192,-0.772366,0.522908,0.325167,0.569056,0.98788,0.033728,2,intermediate_model,2
1,207,at,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_1.0,1.0,4.27694,...,6.951162,-1.250738,1.591414,0.94878,2.304203,1.117185,4.006643,3,non_dominant_model,0
3,207,at,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Conv1D,Conv1D,Conv1D_14.0,14.0,4.16768,...,7.550541,-1.26394,1.18191,1.021107,2.2372,1.299202,4.838154,3,non_dominant_model,0


<IPython.core.display.Javascript object>

In [303]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [304]:
# Top 4
df_sorted_topo_cols.sort_values(
    by=["Dominates_Count", "SCPM"], ascending=[False, True]
).iloc[:2].round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,MLP,2.67,0.28,2.07,0.21,4.72,0.55,0.08,0.38,-4.49,0,dominant_model,4,at
4,Transformer_1.0,2.67,0.56,2.08,0.36,4.74,0.96,0.04,0.41,-4.39,1,intermediate_model,3,at


<IPython.core.display.Javascript object>

In [305]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [306]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:4].round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
4,Transformer_1.0,2.67,0.56,2.08,0.36,4.74,0.96,0.04,0.41,-4.39,1,intermediate_model,3,at
2,LSTM_1.0,3.5,0.52,2.57,0.33,5.78,0.57,-0.77,0.99,0.03,2,intermediate_model,2,at


<IPython.core.display.Javascript object>

In [307]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(2, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [308]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
1,BidirectionalLSTM_1.0,4.27694,1.591414,3.100156,0.94878,6.951162,2.304203,-1.250738,1.117185,4.006643,3,non_dominant_model,0,at
3,Conv1D_14.0,4.16768,1.18191,3.329923,1.021107,7.550541,2.2372,-1.26394,1.299202,4.838154,3,non_dominant_model,0,at


<IPython.core.display.Javascript object>

In [309]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(2, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [310]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [311]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [312]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [313]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [314]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,non_dominant_model,100.0
Neural Networks,intermediate_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [315]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,3.0,1.5
Dominated_Count,std,,1.29
Dominated_Count,min,3.0,0.0
Dominated_Count,25%,3.0,0.75
Dominated_Count,50%,3.0,1.5
Dominated_Count,75%,3.0,2.25
Dominated_Count,max,3.0,3.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,0.0,2.25


<IPython.core.display.Javascript object>

In [316]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [317]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,non_dominant_model,1
Neural Networks,4,3,intermediate_model,2


<IPython.core.display.Javascript object>

In [318]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           non_dominant_model    1
Neural Networks  intermediate_model    2
                 dominant_model        1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [319]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [320]:
summary_stats.round(2)

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,0.0,0.0,3,0,0.0
Neural Networks,4,2.25,2.5,6,9,1.8


<IPython.core.display.Javascript object>

In [321]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,4.16768,,4.16768,4.16768,4.16768,4.16768,4.16768,1.0,1.18191,...,3.0,3.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
Neural Networks,4.0,3.278774,0.771088,2.669472,2.671147,3.084342,3.691969,4.27694,4.0,0.738491,...,2.25,3.0,4.0,2.25,1.707825,0.0,1.5,2.5,3.25,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [322]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [323]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [324]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [325]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,MLP,2.67,0.28,2.07,0.21,4.72,0.55,0.08,0.38,-4.49,0,4,dominant_model,at,True
4,Transformer_1.0,2.67,0.56,2.08,0.36,4.74,0.96,0.04,0.41,-4.39,1,3,intermediate_model,at,True
2,LSTM_1.0,3.5,0.52,2.57,0.33,5.78,0.57,-0.77,0.99,0.03,2,2,intermediate_model,at,True
1,BidirectionalLSTM_1.0,4.28,1.59,3.1,0.95,6.95,2.3,-1.25,1.12,4.01,3,0,non_dominant_model,at,True
3,Conv1D_14.0,4.17,1.18,3.33,1.02,7.55,2.24,-1.26,1.3,4.84,3,0,non_dominant_model,at,True


<IPython.core.display.Javascript object>

## Variable Grouping: CM-P-CS

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P-CS were from:**

*BTiSS:*
    Dominance analysis: Plant S and Plant AT
    SCPM: Plant S and Plant AT

*TiSS:*
    Dominance analysis: Plant AB
    SCPM: Plant AB

In [326]:
df_results["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

### Blocking time series

### Plant AT

In [327]:
df_results_cm_p_cs = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical + Physical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [328]:
df_results_cm_p_cs["Features"].unique()

array(['Chemical + Mineralogical + Physical'], dtype=object)

<IPython.core.display.Javascript object>

In [329]:
df_results_cm_p_cs.shape

(10, 17)

<IPython.core.display.Javascript object>

In [330]:
df_results_cm_p_cs = df_results_cm_p_cs[
    df_results_cm_p_cs["Plant"].eq("at")
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [331]:
df_results_cm_p_cs["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [332]:
df_results_cm_p_cs_btss = (
    df_results_cm_p_cs[
        df_results_cm_p_cs["Cross Validation"].eq("Blocking Time Series Split")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [333]:
df_results_cm_p_cs_btss["Cross Validation"].unique()

array(['Blocking Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [334]:
df_results_cm_p_cs_btss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [335]:
df_results_cm_p_cs_btss = compute_scpm(df_results_cm_p_cs_btss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [336]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_btss)
dominance_matrix_cm_p_cs_btss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_btss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [337]:
dominance_matrix_cm_p_cs_btss.shape, len(dominance_graph_cm_p_cs_btss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [338]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    3
dominant_model        1
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [339]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/ecics_207_at_dominance_analysis_cm_p_cs_btss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [340]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
2,207,at,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,Transformer,Transformer_1.0,1.0,1.945013,...,3.452376,0.391541,0.450001,0.370325,0.702625,0.392363,-3.681001,0,dominant_model,4
0,207,at,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,MLP,MLP,,2.053505,...,3.698528,0.321113,0.431668,0.353065,0.665893,0.441324,-2.131213,1,intermediate_model,2
1,207,at,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Conv1D,Conv1D,Conv1D_7.0,7.0,2.130671,...,3.661226,0.176096,0.520016,0.423585,0.800329,0.573285,-1.516732,1,intermediate_model,2
4,207,at,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_1.0,1.0,2.194938,...,3.945848,0.16461,0.433971,0.368883,0.659248,0.592958,-0.343696,3,intermediate_model,1
3,207,at,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,LSTM,LSTM_1.0,1.0,2.868904,...,4.824424,-0.832348,1.093266,0.774195,1.641083,2.27654,7.672642,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [341]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [342]:
# Top 4
df_sorted_topo_cols.sort_values(
    by=["Dominates_Count", "SCPM"], ascending=[False, True]
).iloc[:5]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
2,Transformer_1.0,1.95,0.45,1.55,0.37,3.45,0.7,0.39,0.39,-3.68,0,dominant_model,4,at
0,MLP,2.05,0.43,1.66,0.35,3.7,0.67,0.32,0.44,-2.13,1,intermediate_model,2,at
1,Conv1D_7.0,2.13,0.52,1.69,0.42,3.66,0.8,0.18,0.57,-1.52,1,intermediate_model,2,at
4,BidirectionalLSTM_1.0,2.19,0.43,1.76,0.37,3.95,0.66,0.16,0.59,-0.34,3,intermediate_model,1,at
3,LSTM_1.0,2.87,1.09,2.15,0.77,4.82,1.64,-0.83,2.28,7.67,4,non_dominant_model,0,at


<IPython.core.display.Javascript object>

In [343]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [344]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,MLP,2.05,0.43,1.66,0.35,3.7,0.67,0.32,0.44,-2.13,1,intermediate_model,2,at
1,Conv1D_7.0,2.13,0.52,1.69,0.42,3.66,0.8,0.18,0.57,-1.52,1,intermediate_model,2,at
4,BidirectionalLSTM_1.0,2.19,0.43,1.76,0.37,3.95,0.66,0.16,0.59,-0.34,3,intermediate_model,1,at


<IPython.core.display.Javascript object>

In [345]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(3, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [346]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
3,LSTM_1.0,2.87,1.09,2.15,0.77,4.82,1.64,-0.83,2.28,7.67,4,non_dominant_model,0,at


<IPython.core.display.Javascript object>

In [347]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [348]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [349]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [350]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [351]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [352]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [353]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,intermediate_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [354]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,1.0,2.0
Dominated_Count,std,,1.83
Dominated_Count,min,1.0,0.0
Dominated_Count,25%,1.0,0.75
Dominated_Count,50%,1.0,2.0
Dominated_Count,75%,1.0,3.25
Dominated_Count,max,1.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,2.0,1.75


<IPython.core.display.Javascript object>

In [355]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [356]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,intermediate_model,2


<IPython.core.display.Javascript object>

In [357]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  intermediate_model    2
                 dominant_model        1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [358]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [359]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [360]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,2.0,2.0,1,2,0.4
Neural Networks,4,1.75,1.5,8,7,1.4


<IPython.core.display.Javascript object>

In [361]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,2.130671,,2.130671,2.130671,2.130671,2.130671,2.130671,1.0,0.520016,...,1.0,1.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
Neural Networks,4.0,2.26559,0.415022,1.945013,2.026382,2.124222,2.36343,2.868904,4.0,0.602226,...,3.25,4.0,4.0,1.75,1.707825,0.0,0.75,1.5,2.5,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [362]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [363]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [364]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [365]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
2,Transformer_1.0,1.95,0.45,1.55,0.37,3.45,0.7,0.39,0.39,-3.68,0,4,dominant_model,at,True
0,MLP,2.05,0.43,1.66,0.35,3.7,0.67,0.32,0.44,-2.13,1,2,intermediate_model,at,True
1,Conv1D_7.0,2.13,0.52,1.69,0.42,3.66,0.8,0.18,0.57,-1.52,1,2,intermediate_model,at,True
4,BidirectionalLSTM_1.0,2.19,0.43,1.76,0.37,3.95,0.66,0.16,0.59,-0.34,3,1,intermediate_model,at,True
3,LSTM_1.0,2.87,1.09,2.15,0.77,4.82,1.64,-0.83,2.28,7.67,4,0,non_dominant_model,at,True


<IPython.core.display.Javascript object>

### Time Series Split

In [366]:
df_results_cm_p_cs = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical + Physical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [367]:
df_results_cm_p_cs["Features"].unique()

array(['Chemical + Mineralogical + Physical'], dtype=object)

<IPython.core.display.Javascript object>

In [368]:
df_results_cm_p_cs.shape

(10, 17)

<IPython.core.display.Javascript object>

### Plant AT

In [369]:
df_results_cm_p_cs = df_results_cm_p_cs[
    df_results_cm_p_cs["Plant"].eq("at")
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [370]:
df_results_cm_p_cs["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [371]:
df_results_cm_p_cs_tss = (
    df_results_cm_p_cs[df_results_cm_p_cs["Cross Validation"].eq("Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [372]:
df_results_cm_p_cs_tss["Cross Validation"].unique()

array(['Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [373]:
df_results_cm_p_cs_tss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [374]:
df_results_cm_p_cs_tss = compute_scpm(df_results_cm_p_cs_tss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [375]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_tss)
dominance_matrix_cm_p_cs_tss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_tss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [376]:
dominance_matrix_cm_p_cs_tss.shape, len(dominance_graph_cm_p_cs_tss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [377]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    2
non_dominant_model    2
dominant_model        1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [378]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/ecics_207_at_dominance_analysis_cm_p_cs_tss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [379]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,207,at,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,MLP,MLP,,2.035683,...,3.533589,0.357218,0.396688,0.287654,0.49217,0.402826,-6.45748,0,dominant_model,4
4,207,at,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_1.0,1.0,2.407082,...,4.330651,0.199196,0.307413,0.278104,0.685612,0.373008,-1.725337,1,intermediate_model,3
3,207,at,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,LSTM,LSTM_1.0,1.0,2.592365,...,4.34965,-0.006979,0.424778,0.239545,0.3044,0.598615,0.057879,2,intermediate_model,2
1,207,at,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Conv1D,Conv1D,Conv1D_7.0,7.0,2.80101,...,5.041565,-0.128617,0.378777,0.283773,0.418911,0.613507,3.76036,3,non_dominant_model,0
2,207,at,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,Transformer,Transformer_1.0,1.0,2.965125,...,5.186654,-0.099444,0.64683,0.393951,1.132405,0.395899,4.364577,3,non_dominant_model,0


<IPython.core.display.Javascript object>

In [380]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [381]:
# Top 4
df_sorted_topo_cols.iloc[0:2].sort_values(
    by=["Dominates_Count", "SCPM"], ascending=[False, True]
).round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,MLP,2.04,0.4,1.58,0.29,3.53,0.49,0.36,0.4,-6.46,0,dominant_model,4,at
4,BidirectionalLSTM_1.0,2.41,0.31,1.92,0.28,4.33,0.69,0.2,0.37,-1.73,1,intermediate_model,3,at


<IPython.core.display.Javascript object>

In [382]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [383]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:4].round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
4,BidirectionalLSTM_1.0,2.41,0.31,1.92,0.28,4.33,0.69,0.2,0.37,-1.73,1,intermediate_model,3,at
3,LSTM_1.0,2.59,0.42,1.94,0.24,4.35,0.3,-0.01,0.6,0.06,2,intermediate_model,2,at


<IPython.core.display.Javascript object>

In [384]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(2, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [385]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
1,Conv1D_7.0,2.80101,0.378777,2.242115,0.283773,5.041565,0.418911,-0.128617,0.613507,3.76036,3,non_dominant_model,0,at
2,Transformer_1.0,2.965125,0.64683,2.243967,0.393951,5.186654,1.132405,-0.099444,0.395899,4.364577,3,non_dominant_model,0,at


<IPython.core.display.Javascript object>

In [386]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(2, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [387]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [388]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [389]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [390]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [391]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,non_dominant_model,100.0
Neural Networks,intermediate_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [392]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,3.0,1.5
Dominated_Count,std,,1.29
Dominated_Count,min,3.0,0.0
Dominated_Count,25%,3.0,0.75
Dominated_Count,50%,3.0,1.5
Dominated_Count,75%,3.0,2.25
Dominated_Count,max,3.0,3.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,0.0,2.25


<IPython.core.display.Javascript object>

In [393]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [394]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,non_dominant_model,1
Neural Networks,4,3,intermediate_model,2


<IPython.core.display.Javascript object>

In [395]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           non_dominant_model    1
Neural Networks  intermediate_model    2
                 dominant_model        1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [396]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [397]:
summary_stats.round(2)

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,0.0,0.0,3,0,0.0
Neural Networks,4,2.25,2.5,6,9,1.8


<IPython.core.display.Javascript object>

In [398]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,2.80101,,2.80101,2.80101,2.80101,2.80101,2.80101,1.0,0.378777,...,3.0,3.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
Neural Networks,4.0,2.500064,0.38691,2.035683,2.314232,2.499724,2.685555,2.965125,4.0,0.443928,...,2.25,3.0,4.0,2.25,1.707825,0.0,1.5,2.5,3.25,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [399]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [400]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [401]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [402]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,MLP,2.04,0.4,1.58,0.29,3.53,0.49,0.36,0.4,-6.46,0,4,dominant_model,at,True
4,BidirectionalLSTM_1.0,2.41,0.31,1.92,0.28,4.33,0.69,0.2,0.37,-1.73,1,3,intermediate_model,at,True
3,LSTM_1.0,2.59,0.42,1.94,0.24,4.35,0.3,-0.01,0.6,0.06,2,2,intermediate_model,at,True
1,Conv1D_7.0,2.8,0.38,2.24,0.28,5.04,0.42,-0.13,0.61,3.76,3,0,non_dominant_model,at,True
2,Transformer_1.0,2.97,0.65,2.24,0.39,5.19,1.13,-0.1,0.4,4.36,3,0,non_dominant_model,at,True


<IPython.core.display.Javascript object>