In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import os
import glob
import numpy as np

<IPython.core.display.Javascript object>

In [3]:
import warnings

# Suppress specific FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn._oldcore")


<IPython.core.display.Javascript object>

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

<IPython.core.display.Javascript object>

In [5]:
from sklearn.preprocessing import StandardScaler

<IPython.core.display.Javascript object>

In [6]:
from collections import deque, defaultdict


<IPython.core.display.Javascript object>

In [7]:
def read_csv_files_grouped(csv_files, plant, header=[0, 1]):
    results = []
    for filepath in csv_files:
        df = pd.read_csv(filepath, header=header, low_memory=False)
        df["plant"] = plant  # Add the 'plant' column
        results.append(df)
    df = pd.concat(results, ignore_index=True)
    return df

<IPython.core.display.Javascript object>

In [8]:
def read_csv_files_full(csv_files, plant):
    results = []
    for filepath in csv_files:
        df = pd.read_csv(filepath, low_memory=False)
        df["plant"] = plant  # Add the 'plant' column
        results.append(df)
    df = pd.concat(results, ignore_index=True)
    return df

<IPython.core.display.Javascript object>

### Renaming the Columns

In [9]:
def preprocess_rename_columns(df):
    df.columns = pd.MultiIndex.from_tuples(
        list(
            {
                col: (col[0], "") if "Unnamed" in col[1] else col for col in df.columns
            }.values()
        )
    )
    return df

<IPython.core.display.Javascript object>

### Changing the order_of columns Columns

In [10]:
def preprocess_change_columns_order(df, column, pos):
    # Get the list of columns
    cols = df.columns.tolist()
    # Remove the specified column
    cols.remove(column)
    # Insert the column at the desired position
    cols.insert(pos, column)
    # Reorder the DataFrame columns
    return df[cols]

<IPython.core.display.Javascript object>

In [11]:
def read_csv_files_path(csv_files_path_dict, path, plant):
    csv_files = glob.glob(os.path.join(path, "*.csv"))
    csv_files_path_dict[plant] = csv_files
    return csv_files_path_dict

<IPython.core.display.Javascript object>

# Topological Analysis - NEW

## Definitions

In [12]:
# Function to compare two models strictly based on means only
def dominates(model1, model2):
    """Return True if model1 strictly dominates model2, False otherwise."""
    # Check if model1 is at least as good as model2 in all metrics
    at_least_as_good = (
        model1["RMSE_mean"] <= model2["RMSE_mean"]
        and model1["MAE_mean"] <= model2["MAE_mean"]
        and model1["MAPE_mean"] <= model2["MAPE_mean"]
        and model1["R2_mean"] >= model2["R2_mean"]
    )

    # Check if model1 is strictly better in at least one metric
    strictly_better_in_one = (
        model1["RMSE_mean"] < model2["RMSE_mean"]
        or model1["MAE_mean"] < model2["MAE_mean"]
        or model1["MAPE_mean"] < model2["MAPE_mean"]
        or model1["R2_mean"] > model2["R2_mean"]
    )

    # Model1 strictly dominates Model2 if it is at least as good in all metrics and
    # strictly better in at least one metric
    return at_least_as_good and strictly_better_in_one

<IPython.core.display.Javascript object>

In [13]:
def get_dominance_matrix_and_graph(df):
    n = len(df)
    dominance_matrix = np.zeros((n, n), dtype=bool)

    for i in range(n):
        for j in range(n):
            if i != j:
                dominance_matrix[i, j] = dominates(df.iloc[i], df.iloc[j])

    # Create the dominance graph
    dominance_graph = {i: [] for i in range(n)}
    for i in range(n):
        for j in range(n):
            if dominance_matrix[i, j]:
                dominance_graph[i].append(j)

    return dominance_matrix, dominance_graph

<IPython.core.display.Javascript object>

In [14]:
# Topological Sorting
def topological_sort(graph):
    """Perform topological sorting on the given graph."""
    in_degree = {u: 0 for u in graph}
    for u in graph:
        for v in graph[u]:
            in_degree[v] += 1

    queue = deque([u for u in graph if in_degree[u] == 0])
    topo_order = []

    while queue:
        u = queue.popleft()
        topo_order.append(u)

        for v in graph[u]:
            in_degree[v] -= 1
            if in_degree[v] == 0:
                queue.append(v)

    return topo_order

<IPython.core.display.Javascript object>

In [15]:
# Function to find the top models in each group
def find_top_models(group):
    max_net_dominance = group["Net_Dominance"].max()
    top_models = group[group["Net_Dominance"] == max_net_dominance]
    return top_models

<IPython.core.display.Javascript object>

### Dominance Analysis and SCPM

In [16]:
def compute_scpm(df):
    scaler = StandardScaler()
    standardized_metrics = scaler.fit_transform(
        df[["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"]]
    )

    # Creating a new dataframe with standardized values
    standardized_df = pd.DataFrame(
        standardized_metrics,
        columns=["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"],
    )

    # Summing all metrics and subtracting R2_mean
    standardized_df["Result"] = (
        standardized_df[["RMSE_mean", "MAE_mean", "MAPE_mean"]].sum(axis=1)
        - standardized_df["R2_mean"]
    )

    df["SCPM"] = standardized_df["Result"]
    return df

<IPython.core.display.Javascript object>

In [17]:
def make_dominance_analysis(df):
    # Compute dominance matrix and graph
    dominance_matrix, dominance_graph = get_dominance_matrix_and_graph(df)

    # Get the topological order
    topo_order = topological_sort(dominance_graph)
    df = df.copy()
    n = len(topo_order)

    # Identify Dominant Models
    dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if not np.any(dominance_matrix[:, i])
    ]

    # Identify Non-Dominant Models
    non_dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i])
    ]

    # Identify Models that are both dominated and dominate at least one
    intermediate_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i]) and np.any(dominance_matrix[i, :])
    ]

    # Add count of times each model is dominated by others
    df["Dominated_Count"] = np.sum(dominance_matrix, axis=0)

    # Classify models
    df["Classification"] = "intermediate_model"  # default to intermediate_model
    df.loc[
        ~np.any(dominance_matrix, axis=1) & np.any(dominance_matrix, axis=0),
        "Classification",
    ] = "non_dominant_model"
    df.loc[~np.any(dominance_matrix, axis=0), "Classification"] = "dominant_model"

    # Map the topological order to the model names and get dominance counts
    sorted_models = [df["Model"].iloc[i] for i in topo_order]

    df_sorted = df.iloc[topo_order].copy()
    df_sorted["Dominates_Count"] = dominance_matrix.sum(axis=1)[topo_order]
    df_sorted_topo = df_sorted.copy()

    df_sorted = df_sorted.sort_values(by="Dominates_Count", ascending=False)
    df_sorted_count = df_sorted.copy()

    return {
        "dominance_matrix": dominance_matrix,
        "dominance_graph": dominance_graph,
        "df_sorted_topo": df_sorted_topo,
        "df_sorted_count": df_sorted_count,
    }

<IPython.core.display.Javascript object>

# Pre Training Analysis

# Reading The files

In [18]:
csv_files_path_fine_tuning = dict()
csv_files_path_pre_train = dict()

<IPython.core.display.Javascript object>

## 204

### Plant AB

In [19]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/204/ab/pre_training/full/"
plant = "ab"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

## 204

### Plant F

In [20]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/204/f/pre_training/full/"
plant = "f"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

### Reading all data - pre training

#### Full

In [21]:
dfs_plant = dict()
column = ("plant", "")
pos = 2

for plant, csv_files in csv_files_path_pre_train.items():
    df = read_csv_files_full(csv_files, plant)
    df["plant"] = plant
    dfs_plant[plant] = df

<IPython.core.display.Javascript object>

In [22]:
df_pre_train = pd.concat(dfs_plant.values(), ignore_index=True)

<IPython.core.display.Javascript object>

In [23]:
df_pre_train.shape

(438, 23)

<IPython.core.display.Javascript object>

## Preprocessing

In [24]:
patterns = [
    "Chemical + Mineralogical + Feature Engineering",
    "Chemical + Mineralogical + CS7 + One-Hot",
    "Chemical + Mineralogical + Physical + One-Hot",
    "Chemical + Mineralogical + CS3 + One-Hot",
    "Chemical + Mineralogical + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + One-Hot",
    "Chemical + Mineralogical + CS1 + One-Hot",
    "Chemical + Mineralogical + CS2 + One-Hot",
    "Chemical + Feature Engineering",
]

FEATURES_TO_REPLACE_1 = {
    "Chemical + Mineralogical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS2": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS7": "Chemical + Mineralogical + Early CS",
}

FEATURES_TO_REPLACE_2 = {
    "Chemical": "Chemical + Mineralogical",
    "Chemical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Mineralogical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Physical": "Chemical + Mineralogical + Physical",
}

df_pre_train = df_pre_train[
    ~df_pre_train["Features"].apply(lambda x: x in patterns)
].reset_index(drop=True)

df_pre_train = df_pre_train[
    ~df_pre_train["Cross Validation"].eq("Out of time Split")
].reset_index(drop=True)

# removing afterwards
df_pre_train = df_pre_train[
    ~df_pre_train["Cross Validation"].eq("Repeated KFold")
].reset_index(drop=True)

df_pre_train["Features_bkp"] = df_pre_train["Features"].copy()
df_pre_train["Features"] = df_pre_train["Features"].replace(FEATURES_TO_REPLACE_1)

df_pre_train["Features"] = df_pre_train["Features"].replace(FEATURES_TO_REPLACE_2)


# removing afterwards
# df_pre_train = df_pre_train[
#     ~df_pre_train["Features"].eq("Chemical + Mineralogical + Early CS")
# ].reset_index(drop=True)

metrics_names = {
    "RMSE Test": "RMSE_mean",
    "MAE Test": "MAE_mean",
    "MAPE Test": "MAPE_mean",
    "R2 Test": "R2_mean",
}

df_pre_train = df_pre_train.rename(metrics_names, axis=1)

<IPython.core.display.Javascript object>

In [25]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [26]:
df_pre_train["MAPE Train"] = df_pre_train["MAPE Train"] * 100
df_pre_train["MAPE_mean"] = df_pre_train["MAPE_mean"] * 100

df_pre_train["Model_bkp"] = df_pre_train["Model"].copy()
df_pre_train["Model_bkp_2"] = df_pre_train["Model"].apply(
    lambda x: "" if pd.isna(x) else "_" + str(x)
)

<IPython.core.display.Javascript object>

In [27]:
df_pre_train["Model_bkp_2"] = df_pre_train["Model_bkp_2"].apply(
    lambda x: "MLP"
    if "MLP" in x
    else "Bi-LSTM"
    if "Bi-LSTM" in x
    else "LSTM"
    if "LSTM" in x
    else "Conv1D"
    if "Conv1D" in x
    else "Transformer"
)

<IPython.core.display.Javascript object>

In [28]:
df_pre_train["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [29]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [30]:
df_pre_train["Features_bkp"].unique()

array(['Chemical + Properties CS Less', 'Chemical + Physical', 'Chemical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [31]:
df_pre_train.shape

(438, 26)

<IPython.core.display.Javascript object>

In [32]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [33]:
df_results_cm = (
    df_pre_train[df_pre_train["Features"].eq("Chemical + Mineralogical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [34]:
df_results_cm["Features"].unique()

array(['Chemical + Mineralogical'], dtype=object)

<IPython.core.display.Javascript object>

In [35]:
df_results_cm.shape

(146, 26)

<IPython.core.display.Javascript object>

# Global Analysis (pre train results)

## ECICS - 204

### Variable Grouping: CM

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM were from:**

*BTiSS:*
    Dominance analysis: Plant AB
    SCPM:Plant AB

*TiSS:*
    Dominance analysis: Plant F
    SCPM:Plant F

### Plant AB

#### Hold Out

In [36]:
df_results_cm = (
    df_pre_train[df_pre_train["Features"].eq("Chemical + Mineralogical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [37]:
df_results_cm["Features"].unique()

array(['Chemical + Mineralogical'], dtype=object)

<IPython.core.display.Javascript object>

In [38]:
df_results_cm.shape

(146, 26)

<IPython.core.display.Javascript object>

In [39]:
df_results_cm = df_results_cm[df_results_cm["Plant"].eq("AB")].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [40]:
df_results_cm["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [41]:
df_results_cm_ho = (
    df_results_cm[df_results_cm["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [42]:
df_results_cm_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [43]:
df_results_cm_ho.shape

(73, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [44]:
df_results_cm_ho = compute_scpm(df_results_cm_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [45]:
dominance_dict = make_dominance_analysis(df_results_cm_ho)
dominance_matrix_cm_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [46]:
dominance_matrix_cm_ho.shape, len(dominance_graph_cm_ho)

((73, 73), 73)

<IPython.core.display.Javascript object>

In [47]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    71
dominant_model         1
non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [48]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/local_models/ecics/ecics_dominance_analysis_cm_kf.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [49]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
41,Global Model,204,AB,Chemical + Mineralogical,"(62581, 5)",7.0,Bi-LSTM14,,Standard Scaler,,...,4.312253,0.830659,ab,Chemical,Bi-LSTM14,Bi-LSTM,-2.763996,0,dominant_model,72
33,Global Model,204,AB,Chemical + Mineralogical,"(62581, 5)",14.0,Bi-LSTM6,,Standard Scaler,,...,4.523614,0.822068,ab,Chemical,Bi-LSTM6,Bi-LSTM,-2.473252,1,intermediate_model,69
38,Global Model,204,AB,Chemical + Mineralogical,"(62581, 5)",7.0,Bi-LSTM11,,Standard Scaler,,...,4.367373,0.817875,ab,Chemical,Bi-LSTM11,Bi-LSTM,-2.57677,1,intermediate_model,70
24,Global Model,204,AB,Chemical + Mineralogical,"(62581, 5)",14.0,LSTM12,,Standard Scaler,,...,4.464608,0.816831,ab,Chemical,LSTM12,LSTM,-2.46259,2,intermediate_model,69
15,Global Model,204,AB,Chemical + Mineralogical,"(62581, 5)",14.0,LSTM3,,Standard Scaler,,...,4.572102,0.815705,ab,Chemical,LSTM3,LSTM,-2.344712,4,intermediate_model,66


<IPython.core.display.Javascript object>

In [50]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [51]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
41,Bi-LSTM14,7.0,2.49,1.83,4.31,0.83,-2.76,0,dominant_model,72,AB
38,Bi-LSTM11,7.0,2.58,1.86,4.37,0.82,-2.58,1,intermediate_model,70,AB
24,LSTM12,14.0,2.59,1.91,4.46,0.82,-2.46,2,intermediate_model,69,AB


<IPython.core.display.Javascript object>

##### Top intermediate models

In [52]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
38,Bi-LSTM11,7.0,2.58,1.86,4.37,0.82,-2.58,1,intermediate_model,70,AB
33,Bi-LSTM6,14.0,2.55,1.91,4.52,0.82,-2.47,1,intermediate_model,69,AB


<IPython.core.display.Javascript object>

##### Top non dominant models

In [53]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
36,Bi-LSTM9,14.0,11.29,8.54,19.26,-2.49,30.74,72,non_dominant_model,0,AB


<IPython.core.display.Javascript object>

In [54]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [55]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
Bi-LSTM        15
LSTM           15
Conv1D         15
Transformer    15
MLP            13
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [56]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [57]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [58]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [59]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,15.0,31.13,28.85,0.0,4.5,20.0,56.5,72.0,15.0,38.87,29.07,0.0,11.5,47.0,66.5,72.0
Conv1D,15.0,31.4,17.22,9.0,18.0,24.0,47.5,57.0,15.0,34.47,18.1,10.0,17.5,37.0,50.0,61.0
LSTM,15.0,26.4,23.6,2.0,9.0,13.0,52.0,65.0,15.0,42.67,23.04,7.0,18.5,56.0,59.5,69.0
MLP,13.0,32.69,10.13,19.0,26.0,29.0,42.0,50.0,13.0,29.69,6.87,22.0,24.0,30.0,35.0,43.0
Transformer,15.0,43.87,19.25,19.0,28.5,36.0,66.0,70.0,15.0,19.4,13.24,1.0,5.0,24.0,31.0,39.0


<IPython.core.display.Javascript object>

In [60]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,15,3,intermediate_model,13
Conv1D,15,1,intermediate_model,15
LSTM,15,1,intermediate_model,15
MLP,13,1,intermediate_model,13
Transformer,15,1,intermediate_model,15


<IPython.core.display.Javascript object>

In [61]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    13
             dominant_model         1
             non_dominant_model     1
Conv1D       intermediate_model    15
LSTM         intermediate_model    15
MLP          intermediate_model    13
Transformer  intermediate_model    15
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [62]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,86.67
Bi-LSTM,dominant_model,6.67
Bi-LSTM,non_dominant_model,6.67
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,100.0
MLP,intermediate_model,100.0
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [63]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
41,Bi-LSTM,0,72,dominant_model
33,Bi-LSTM,1,69,intermediate_model
38,Bi-LSTM,1,70,intermediate_model
24,LSTM,2,69,intermediate_model
15,LSTM,4,66,intermediate_model
...,...,...,...,...
46,Transformer,68,4,intermediate_model
43,Transformer,69,3,intermediate_model
39,Bi-LSTM,70,1,intermediate_model
55,Transformer,70,1,intermediate_model


<IPython.core.display.Javascript object>

In [64]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [65]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,15,38.866667,47.0,467,583,7.986301
Conv1D,15,34.466667,37.0,471,517,7.082192
LSTM,15,42.666667,56.0,396,640,8.767123
MLP,13,29.692308,30.0,425,386,5.287671
Transformer,15,19.4,24.0,658,291,3.986301


<IPython.core.display.Javascript object>

In [66]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.643373,...,56.5,72.0,15.0,38.866667,29.066099,0.0,11.5,47.0,66.5,72.0
Conv1D,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.11956,...,47.5,57.0,15.0,34.466667,18.102354,10.0,17.5,37.0,50.0,61.0
LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.026647,...,52.0,65.0,15.0,42.666667,23.036203,7.0,18.5,56.0,59.5,69.0
MLP,0.0,,,,,,,,13.0,3.183583,...,42.0,50.0,13.0,29.692308,6.872465,22.0,24.0,30.0,35.0,43.0
Transformer,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.464101,...,66.0,70.0,15.0,19.4,13.243867,1.0,5.0,24.0,31.0,39.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [67]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [68]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [69]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [70]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [71]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
41,Bi-LSTM14,Bi-LSTM,7.0,2.49,1.83,4.31,0.83,-2.76,0,72,dominant_model,AB,True
38,Bi-LSTM11,Bi-LSTM,7.0,2.58,1.86,4.37,0.82,-2.58,1,70,intermediate_model,AB,False
33,Bi-LSTM6,Bi-LSTM,14.0,2.55,1.91,4.52,0.82,-2.47,1,69,intermediate_model,AB,False
24,LSTM12,LSTM,14.0,2.59,1.91,4.46,0.82,-2.46,2,69,intermediate_model,AB,True
29,Bi-LSTM2,Bi-LSTM,7.0,2.6,1.93,4.54,0.82,-2.38,4,67,intermediate_model,AB,False
15,LSTM3,LSTM,14.0,2.59,1.94,4.57,0.82,-2.34,4,66,intermediate_model,AB,False
42,Bi-LSTM15,Bi-LSTM,14.0,2.62,1.94,4.57,0.81,-2.31,5,66,intermediate_model,AB,True


<IPython.core.display.Javascript object>

In [72]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].head(1)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
41,Bi-LSTM14,Bi-LSTM,7.0,2.486856,1.834692,4.312253,0.830659,-2.763996,0,72,dominant_model,AB,True


<IPython.core.display.Javascript object>

### Plant F

#### Hold Out

In [73]:
df_results_cm = (
    df_pre_train[df_pre_train["Features"].eq("Chemical + Mineralogical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [74]:
df_results_cm["Features"].unique()

array(['Chemical + Mineralogical'], dtype=object)

<IPython.core.display.Javascript object>

In [75]:
df_results_cm.shape

(146, 26)

<IPython.core.display.Javascript object>

In [76]:
df_results_cm = df_results_cm[df_results_cm["Plant"].eq("F")].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [77]:
df_results_cm["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [78]:
df_results_cm_ho = (
    df_results_cm[df_results_cm["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [79]:
df_results_cm_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [80]:
df_results_cm_ho.shape

(73, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [81]:
df_results_cm_ho = compute_scpm(df_results_cm_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [82]:
dominance_dict = make_dominance_analysis(df_results_cm_ho)
dominance_matrix_cm_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [83]:
dominance_matrix_cm_ho.shape, len(dominance_graph_cm_ho)

((73, 73), 73)

<IPython.core.display.Javascript object>

In [84]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    69
dominant_model         3
non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [85]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/local_models/ecics/ecics_dominance_analysis_cm_kf.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [86]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
14,Global Model,204,F,Chemical + Mineralogical,"(62318, 9)",7.0,LSTM2,,Standard Scaler,,...,4.45131,0.827792,f,Chemical,LSTM2,LSTM,-4.408101,0,dominant_model,68
33,Global Model,204,F,Chemical + Mineralogical,"(62318, 9)",14.0,Bi-LSTM6,,Standard Scaler,,...,4.219215,0.827575,f,Chemical,Bi-LSTM6,Bi-LSTM,-4.952151,0,dominant_model,70
36,Global Model,204,F,Chemical + Mineralogical,"(62318, 9)",14.0,Bi-LSTM9,,Standard Scaler,,...,4.443989,0.832924,f,Chemical,Bi-LSTM9,Bi-LSTM,-4.580752,0,dominant_model,68
39,Global Model,204,F,Chemical + Mineralogical,"(62318, 9)",14.0,Bi-LSTM12,,Standard Scaler,,...,4.304836,0.824702,f,Chemical,Bi-LSTM12,Bi-LSTM,-4.541505,1,intermediate_model,69
10,Global Model,204,F,Chemical + Mineralogical,"(62318, 9)",,MLP_11,,Standard Scaler,,...,4.436817,0.802472,f,Chemical,MLP_11,MLP,-3.530093,2,intermediate_model,51


<IPython.core.display.Javascript object>

In [87]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [88]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
33,Bi-LSTM6,14.0,2.49,1.8,4.22,0.83,-4.95,0,dominant_model,70,F
39,Bi-LSTM12,14.0,2.51,1.86,4.3,0.82,-4.54,1,intermediate_model,69,F
14,LSTM2,7.0,2.49,1.88,4.45,0.83,-4.41,0,dominant_model,68,F


<IPython.core.display.Javascript object>

##### Top intermediate models

In [89]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
39,Bi-LSTM12,14.0,2.51,1.86,4.3,0.82,-4.54,1,intermediate_model,69,F
24,LSTM12,14.0,2.53,1.92,4.55,0.82,-3.91,4,intermediate_model,59,F


<IPython.core.display.Javascript object>

##### Top non dominant models

In [90]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
46,Transformer_4,1.0,4.16,3.11,7.59,0.52,13.11,72,non_dominant_model,0,F


<IPython.core.display.Javascript object>

In [91]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [92]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
LSTM           15
Bi-LSTM        15
Conv1D         15
Transformer    15
MLP            13
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [93]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [94]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [95]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [96]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,15.0,30.47,21.17,0.0,14.0,35.0,44.0,65.0,15.0,34.93,21.7,3.0,21.5,28.0,48.0,70.0
Conv1D,15.0,23.2,14.94,5.0,11.0,18.0,37.5,47.0,15.0,38.87,11.53,21.0,29.0,39.0,49.0,55.0
LSTM,15.0,27.07,26.04,0.0,4.0,18.0,51.0,70.0,15.0,36.33,23.3,1.0,17.0,41.0,56.5,68.0
MLP,13.0,17.62,12.49,2.0,8.0,15.0,27.0,38.0,13.0,39.85,7.01,26.0,38.0,39.0,43.0,51.0
Transformer,15.0,58.2,7.71,44.0,53.5,58.0,62.5,72.0,15.0,9.53,7.19,0.0,4.5,8.0,14.0,25.0


<IPython.core.display.Javascript object>

In [97]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,15,2,intermediate_model,13
Conv1D,15,1,intermediate_model,15
LSTM,15,2,intermediate_model,14
MLP,13,1,intermediate_model,13
Transformer,15,2,intermediate_model,14


<IPython.core.display.Javascript object>

In [98]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    13
             dominant_model         2
Conv1D       intermediate_model    15
LSTM         intermediate_model    14
             dominant_model         1
MLP          intermediate_model    13
Transformer  intermediate_model    14
             non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [99]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,86.67
Bi-LSTM,dominant_model,13.33
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,93.33
LSTM,dominant_model,6.67
MLP,intermediate_model,100.0
Transformer,intermediate_model,93.33
Transformer,non_dominant_model,6.67


<IPython.core.display.Javascript object>

In [100]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
14,LSTM,0,68,dominant_model
33,Bi-LSTM,0,70,dominant_model
36,Bi-LSTM,0,68,dominant_model
39,Bi-LSTM,1,69,intermediate_model
10,MLP,2,51,intermediate_model
...,...,...,...,...
48,Transformer,63,4,intermediate_model
57,Transformer,65,3,intermediate_model
22,LSTM,70,1,intermediate_model
43,Transformer,70,1,intermediate_model


<IPython.core.display.Javascript object>

In [101]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [102]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,15,34.933333,28.0,457,524,7.178082
Conv1D,15,38.866667,39.0,348,583,7.986301
LSTM,15,36.333333,41.0,406,545,7.465753
MLP,13,39.846154,39.0,229,518,7.09589
Transformer,15,9.533333,8.0,873,143,1.958904


<IPython.core.display.Javascript object>

In [103]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.880929,...,44.0,65.0,15.0,34.933333,21.70407,3.0,21.5,28.0,48.0,70.0
Conv1D,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.734598,...,37.5,47.0,15.0,38.866667,11.531737,21.0,29.0,39.0,49.0,55.0
LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.87748,...,51.0,70.0,15.0,36.333333,23.304404,1.0,17.0,41.0,56.5,68.0
MLP,0.0,,,,,,,,13.0,2.78035,...,27.0,38.0,13.0,39.846154,7.010066,26.0,38.0,39.0,43.0,51.0
Transformer,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.356702,...,62.5,72.0,15.0,9.533333,7.18994,0.0,4.5,8.0,14.0,25.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [104]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [105]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [106]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [107]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [108]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
33,Bi-LSTM6,Bi-LSTM,14.0,2.49,1.8,4.22,0.83,-4.95,0,70,dominant_model,F,False
36,Bi-LSTM9,Bi-LSTM,14.0,2.45,1.88,4.44,0.83,-4.58,0,68,dominant_model,F,False
39,Bi-LSTM12,Bi-LSTM,14.0,2.51,1.86,4.3,0.82,-4.54,1,69,intermediate_model,F,False
14,LSTM2,LSTM,7.0,2.49,1.88,4.45,0.83,-4.41,0,68,dominant_model,F,False
26,LSTM14,LSTM,7.0,2.56,1.9,4.49,0.82,-3.93,4,59,intermediate_model,F,False
24,LSTM12,LSTM,14.0,2.53,1.92,4.55,0.82,-3.91,4,59,intermediate_model,F,False
23,LSTM11,LSTM,7.0,2.52,1.93,4.62,0.82,-3.85,4,53,intermediate_model,F,False


<IPython.core.display.Javascript object>

In [109]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].head(1)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
33,Bi-LSTM6,Bi-LSTM,14.0,2.488382,1.80098,4.219215,0.827575,-4.952151,0,70,dominant_model,F,False


<IPython.core.display.Javascript object>

### Variable Grouping: CM-P

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P were from:**

*BTiSS:*
    Dominance analysis: Plant F
    SCPM:Plant F

*TiSS:*
    Dominance analysis: Plant F
    SCPM:Plant F

In [110]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [111]:
df_results_cm_p = (
    df_pre_train[
        df_pre_train["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

### Plant F

In [112]:
df_results_cm_p_am = df_results_cm_p[df_results_cm_p["Plant"].eq("F")].reset_index(
    drop=True
)

<IPython.core.display.Javascript object>

In [113]:
df_results_cm_p_am.shape

(73, 26)

<IPython.core.display.Javascript object>

#### Hold Out

In [114]:
df_results_cm_p_am["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [115]:
df_results_cm_p_am_ho = (
    df_results_cm_p_am[df_results_cm_p_am["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [116]:
df_results_cm_p_am_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [117]:
df_results_cm_p_am_ho.shape

(73, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [118]:
df_results_cm_p_am_ho = compute_scpm(df_results_cm_p_am_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [119]:
dominance_dict = make_dominance_analysis(df_results_cm_p_am_ho)
dominance_matrix_cm_p_am_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_am_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [120]:
dominance_matrix_cm_p_am_ho.shape, len(dominance_graph_cm_p_am_ho)

((73, 73), 73)

<IPython.core.display.Javascript object>

In [121]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    71
dominant_model         1
non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [122]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
66,Global Model,204,F,Chemical + Mineralogical + Physical - Early CS,"(62318, 14)",,MLP_7,,Standard Scaler,,...,3.687292,0.868935,f,Chemical + Properties CS Less,MLP_7,MLP,-1.50391,0,dominant_model,72
71,Global Model,204,F,Chemical + Mineralogical + Physical - Early CS,"(62318, 14)",,MLP_12,,Standard Scaler,,...,3.795167,0.864139,f,Chemical + Properties CS Less,MLP_12,MLP,-1.467262,1,intermediate_model,71
70,Global Model,204,F,Chemical + Mineralogical + Physical - Early CS,"(62318, 14)",,MLP_11,,Standard Scaler,,...,3.813354,0.862341,f,Chemical + Properties CS Less,MLP_11,MLP,-1.458,2,intermediate_model,70
65,Global Model,204,F,Chemical + Mineralogical + Physical - Early CS,"(62318, 14)",,MLP_6,,Standard Scaler,,...,3.864181,0.855133,f,Chemical + Properties CS Less,MLP_6,MLP,-1.428462,3,intermediate_model,69
63,Global Model,204,F,Chemical + Mineralogical + Physical - Early CS,"(62318, 14)",,MLP_4,,Standard Scaler,,...,3.971155,0.848673,f,Chemical + Properties CS Less,MLP_4,MLP,-1.386158,4,intermediate_model,66


<IPython.core.display.Javascript object>

In [123]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [124]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
66,MLP_7,,2.17,1.57,3.69,0.87,-1.5,0,dominant_model,72,F
71,MLP_12,,2.21,1.6,3.8,0.86,-1.47,1,intermediate_model,71,F
70,MLP_11,,2.22,1.61,3.81,0.86,-1.46,2,intermediate_model,70,F


<IPython.core.display.Javascript object>

##### Top intermediate models

In [125]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
71,MLP_12,,2.21,1.6,3.8,0.86,-1.47,1,intermediate_model,71,F
70,MLP_11,,2.22,1.61,3.81,0.86,-1.46,2,intermediate_model,70,F


<IPython.core.display.Javascript object>

##### Top non dominant models

In [126]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
38,Bi-LSTM9,14.0,31.79,31.22,71.97,-27.14,32.94,72,non_dominant_model,0,F


<IPython.core.display.Javascript object>

In [127]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [128]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
Conv1D         15
Transformer    15
Bi-LSTM        15
LSTM           15
MLP            13
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [129]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [130]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [131]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [132]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,15.0,48.47,15.31,29.0,33.0,53.0,61.0,72.0,15.0,21.2,14.73,0.0,8.5,17.0,36.0,43.0
Conv1D,15.0,19.4,5.08,12.0,15.5,17.0,23.5,28.0,15.0,50.93,4.56,44.0,47.0,52.0,54.0,59.0
LSTM,15.0,54.53,11.23,34.0,46.0,57.0,64.5,70.0,15.0,15.4,10.28,1.0,6.0,14.0,23.5,35.0
MLP,13.0,5.23,3.35,0.0,3.0,5.0,7.0,12.0,13.0,65.15,4.43,59.0,61.0,66.0,69.0,72.0
Transformer,15.0,43.0,12.58,22.0,36.0,40.0,50.5,70.0,15.0,25.93,11.53,1.0,21.0,27.0,33.0,46.0


<IPython.core.display.Javascript object>

In [133]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,15,2,intermediate_model,14
Conv1D,15,1,intermediate_model,15
LSTM,15,1,intermediate_model,15
MLP,13,2,intermediate_model,12
Transformer,15,1,intermediate_model,15


<IPython.core.display.Javascript object>

In [134]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    14
             non_dominant_model     1
Conv1D       intermediate_model    15
LSTM         intermediate_model    15
MLP          intermediate_model    12
             dominant_model         1
Transformer  intermediate_model    15
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [135]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,93.33
Bi-LSTM,non_dominant_model,6.67
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,100.0
MLP,intermediate_model,92.31
MLP,dominant_model,7.69
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [136]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
66,MLP,0,72,dominant_model
71,MLP,1,71,intermediate_model
70,MLP,2,70,intermediate_model
65,MLP,3,69,intermediate_model
63,MLP,4,66,intermediate_model
...,...,...,...,...
44,Bi-LSTM,67,4,intermediate_model
7,LSTM,69,3,intermediate_model
8,LSTM,70,1,intermediate_model
48,Transformer,70,1,intermediate_model


<IPython.core.display.Javascript object>

In [137]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [138]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,15,21.2,17.0,727,318,4.356164
Conv1D,15,50.933333,52.0,291,764,10.465753
LSTM,15,15.4,14.0,818,231,3.164384
MLP,13,65.153846,66.0,68,847,11.60274
Transformer,15,25.933333,27.0,645,389,5.328767


<IPython.core.display.Javascript object>

In [139]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,5.707708,...,61.0,72.0,15.0,21.2,14.73189,0.0,8.5,17.0,36.0,43.0
Conv1D,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.640814,...,23.5,28.0,15.0,50.933333,4.558613,44.0,47.0,52.0,54.0,59.0
LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,4.323992,...,64.5,70.0,15.0,15.4,10.280356,1.0,6.0,14.0,23.5,35.0
MLP,0.0,,,,,,,,13.0,2.335588,...,7.0,12.0,13.0,65.153846,4.43182,59.0,61.0,66.0,69.0,72.0
Transformer,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.601294,...,50.5,70.0,15.0,25.933333,11.529259,1.0,21.0,27.0,33.0,46.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [140]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [141]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [142]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [143]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [144]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
66,MLP_7,MLP,,2.17,1.57,3.69,0.87,-1.5,0,72,dominant_model,F,True
71,MLP_12,MLP,,2.21,1.6,3.8,0.86,-1.47,1,71,intermediate_model,F,True
70,MLP_11,MLP,,2.22,1.61,3.81,0.86,-1.46,2,70,intermediate_model,F,True
65,MLP_6,MLP,,2.28,1.62,3.86,0.86,-1.43,3,69,intermediate_model,F,True
69,MLP_10,MLP,,2.35,1.64,3.89,0.85,-1.4,4,67,intermediate_model,F,False
67,MLP_8,MLP,,2.35,1.65,3.91,0.85,-1.39,5,66,intermediate_model,F,False
63,MLP_4,MLP,,2.33,1.67,3.97,0.85,-1.39,4,66,intermediate_model,F,False


<IPython.core.display.Javascript object>

### Variable Grouping: CM-P-CS

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P-CS were from:**

*BTiSS:*
    Dominance analysis: Plant F
    SCPM:Plant F

*TiSS:*
    Dominance analysis: Plant AB
    SCPM:Plant AB

In [145]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

### Plant F

In [146]:
df_results_cm_p_cs = (
    df_pre_train[df_pre_train["Features"].eq("Chemical + Mineralogical + Physical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [147]:
df_results_cm_p_cs_s = df_results_cm_p_cs[
    df_results_cm_p_cs["Plant"].eq("F")
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [148]:
df_results_cm_p_cs_s.shape

(73, 26)

<IPython.core.display.Javascript object>

#### Hold Out

In [149]:
df_results_cm_p_cs_s["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [150]:
df_results_cm_p_cs_s_ho = (
    df_results_cm_p_cs_s[df_results_cm_p_cs_s["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [151]:
df_results_cm_p_cs_s_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [152]:
df_results_cm_p_cs_s_ho.shape

(73, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [153]:
df_results_cm_p_cs_s_ho = compute_scpm(df_results_cm_p_cs_s_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [154]:
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_s_ho)
dominance_matrix_cm_p_cs_s_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_s_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [155]:
dominance_matrix_cm_p_cs_s_ho.shape, len(dominance_graph_cm_p_cs_s_ho)

((73, 73), 73)

<IPython.core.display.Javascript object>

In [156]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    69
dominant_model         2
non_dominant_model     2
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [157]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
25,Global Model,204,F,Chemical + Mineralogical + Physical,"(62318, 17)",,MLP_11,,Standard Scaler,,...,2.611097,0.937708,f,Chemical + Physical,MLP_11,MLP,-1.424678,0,dominant_model,71
26,Global Model,204,F,Chemical + Mineralogical + Physical,"(62318, 17)",,MLP_12,,Standard Scaler,,...,2.614679,0.9386,f,Chemical + Physical,MLP_12,MLP,-1.425755,0,dominant_model,71
16,Global Model,204,F,Chemical + Mineralogical + Physical,"(62318, 17)",,MLP_2,,Standard Scaler,,...,2.691875,0.936247,f,Chemical + Physical,MLP_2,MLP,-1.398345,2,intermediate_model,66
21,Global Model,204,F,Chemical + Mineralogical + Physical,"(62318, 17)",,MLP_7,,Standard Scaler,,...,2.646504,0.934694,f,Chemical + Physical,MLP_7,MLP,-1.407392,2,intermediate_model,69
20,Global Model,204,F,Chemical + Mineralogical + Physical,"(62318, 17)",,MLP_6,,Standard Scaler,,...,2.698501,0.934161,f,Chemical + Physical,MLP_6,MLP,-1.391619,4,intermediate_model,65


<IPython.core.display.Javascript object>

In [158]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [159]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
25,MLP_11,,1.5,1.11,2.61,0.94,-1.42,0,dominant_model,71,F
26,MLP_12,,1.48,1.11,2.61,0.94,-1.43,0,dominant_model,71,F
21,MLP_7,,1.53,1.13,2.65,0.93,-1.41,2,intermediate_model,69,F


<IPython.core.display.Javascript object>

##### Top intermediate models

In [160]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
21,MLP_7,,1.53,1.13,2.65,0.93,-1.41,2,intermediate_model,69,F
22,MLP_8,,1.54,1.13,2.65,0.93,-1.4,3,intermediate_model,66,F


<IPython.core.display.Javascript object>

##### Top non dominant models

In [161]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
27,MLP_13,,49.65,1.84,4.51,-67.67,15.31,38,non_dominant_model,0,F
10,LSTM11,7.0,25.3,24.58,56.21,-16.82,21.27,71,non_dominant_model,0,F


<IPython.core.display.Javascript object>

In [162]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [163]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
Conv1D         15
LSTM           15
Bi-LSTM        15
Transformer    15
MLP            13
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [164]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [165]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [166]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [167]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,15.0,45.53,16.23,19.0,35.5,43.0,60.0,70.0,15.0,23.33,15.68,1.0,10.5,22.0,33.5,52.0
Conv1D,15.0,23.0,11.76,11.0,14.5,20.0,32.0,43.0,15.0,45.6,13.65,23.0,36.0,48.0,56.0,61.0
LSTM,15.0,45.4,19.4,15.0,31.0,40.0,64.5,71.0,15.0,24.2,18.43,0.0,6.5,27.0,38.0,54.0
MLP,13.0,8.31,11.05,0.0,2.0,4.0,9.0,38.0,13.0,59.08,19.27,0.0,62.0,65.0,66.0,71.0
Transformer,15.0,46.2,14.03,25.0,33.5,49.0,56.0,69.0,15.0,23.0,13.41,2.0,14.5,20.0,34.5,46.0


<IPython.core.display.Javascript object>

In [168]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,15,1,intermediate_model,15
Conv1D,15,1,intermediate_model,15
LSTM,15,2,intermediate_model,14
MLP,13,3,intermediate_model,10
Transformer,15,1,intermediate_model,15


<IPython.core.display.Javascript object>

In [169]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    15
Conv1D       intermediate_model    15
LSTM         intermediate_model    14
             non_dominant_model     1
MLP          intermediate_model    10
             dominant_model         2
             non_dominant_model     1
Transformer  intermediate_model    15
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [170]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,100.0
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,93.33
LSTM,non_dominant_model,6.67
MLP,intermediate_model,76.92
MLP,dominant_model,15.38
MLP,non_dominant_model,7.69
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [171]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
25,MLP,0,71,dominant_model
26,MLP,0,71,dominant_model
16,MLP,2,66,intermediate_model
21,MLP,2,69,intermediate_model
20,MLP,4,65,intermediate_model
...,...,...,...,...
11,LSTM,67,3,intermediate_model
54,Bi-LSTM,67,3,intermediate_model
65,Transformer,69,2,intermediate_model
51,Bi-LSTM,70,1,intermediate_model


<IPython.core.display.Javascript object>

In [172]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [173]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,15,23.333333,22.0,683,350,4.794521
Conv1D,15,45.6,48.0,345,684,9.369863
LSTM,15,24.2,27.0,681,363,4.972603
MLP,13,59.076923,65.0,108,768,10.520548
Transformer,15,23.0,20.0,693,345,4.726027


<IPython.core.display.Javascript object>

In [174]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.326481,...,60.0,70.0,15.0,23.333333,15.682869,1.0,10.5,22.0,33.5,52.0
Conv1D,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.054101,...,32.0,43.0,15.0,45.6,13.647606,23.0,36.0,48.0,56.0,61.0
LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,4.606979,...,64.5,71.0,15.0,24.2,18.428239,0.0,6.5,27.0,38.0,54.0
MLP,0.0,,,,,,,,13.0,5.285192,...,9.0,38.0,13.0,59.076923,19.267683,0.0,62.0,65.0,66.0,71.0
Transformer,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.911772,...,56.0,69.0,15.0,23.0,13.405756,2.0,14.5,20.0,34.5,46.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [175]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [176]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [177]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [178]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [179]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
26,MLP_12,MLP,,1.48,1.11,2.61,0.94,-1.43,0,71,dominant_model,F,False
25,MLP_11,MLP,,1.5,1.11,2.61,0.94,-1.42,0,71,dominant_model,F,False
21,MLP_7,MLP,,1.53,1.13,2.65,0.93,-1.41,2,69,intermediate_model,F,False
22,MLP_8,MLP,,1.54,1.13,2.65,0.93,-1.4,3,66,intermediate_model,F,False
24,MLP_10,MLP,,1.54,1.14,2.67,0.93,-1.4,3,66,intermediate_model,F,False
16,MLP_2,MLP,,1.51,1.14,2.69,0.94,-1.4,2,66,intermediate_model,F,False
19,MLP_5,MLP,,1.55,1.14,2.69,0.93,-1.39,5,65,intermediate_model,F,False


<IPython.core.display.Javascript object>

### Plant AB

In [180]:
df_results_cm_p_cs = (
    df_pre_train[df_pre_train["Features"].eq("Chemical + Mineralogical + Physical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [181]:
df_results_cm_p_cs_s = df_results_cm_p_cs[
    df_results_cm_p_cs["Plant"].eq("AB")
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [182]:
df_results_cm_p_cs_s.shape

(73, 26)

<IPython.core.display.Javascript object>

#### Hold Out

In [183]:
df_results_cm_p_cs_s["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [184]:
df_results_cm_p_cs_s_ho = (
    df_results_cm_p_cs_s[df_results_cm_p_cs_s["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [185]:
df_results_cm_p_cs_s_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [186]:
df_results_cm_p_cs_s_ho.shape

(73, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [187]:
df_results_cm_p_cs_s_ho = compute_scpm(df_results_cm_p_cs_s_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [188]:
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_s_ho)
dominance_matrix_cm_p_cs_s_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_s_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [189]:
dominance_matrix_cm_p_cs_s_ho.shape, len(dominance_graph_cm_p_cs_s_ho)

((73, 73), 73)

<IPython.core.display.Javascript object>

In [190]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    70
dominant_model         2
non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [191]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
20,Global Model,204,AB,Chemical + Mineralogical + Physical,"(62581, 13)",,MLP_6,,Standard Scaler,,...,2.702524,0.932954,ab,Chemical + Physical,MLP_6,MLP,-2.829402,0,dominant_model,71
25,Global Model,204,AB,Chemical + Mineralogical + Physical,"(62581, 13)",,MLP_11,,Standard Scaler,,...,2.726625,0.933053,ab,Chemical + Physical,MLP_11,MLP,-2.818502,0,dominant_model,71
21,Global Model,204,AB,Chemical + Mineralogical + Physical,"(62581, 13)",,MLP_7,,Standard Scaler,,...,2.793606,0.931116,ab,Chemical + Physical,MLP_7,MLP,-2.761625,2,intermediate_model,70
16,Global Model,204,AB,Chemical + Mineralogical + Physical,"(62581, 13)",,MLP_2,,Standard Scaler,,...,2.980284,0.923182,ab,Chemical + Physical,MLP_2,MLP,-2.59037,3,intermediate_model,68
26,Global Model,204,AB,Chemical + Mineralogical + Physical,"(62581, 13)",,MLP_12,,Standard Scaler,,...,2.993774,0.923704,ab,Chemical + Physical,MLP_12,MLP,-2.585847,3,intermediate_model,68


<IPython.core.display.Javascript object>

In [192]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [193]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
20,MLP_6,,1.56,1.15,2.7,0.93,-2.83,0,dominant_model,71,AB
25,MLP_11,,1.56,1.16,2.73,0.93,-2.82,0,dominant_model,71,AB
21,MLP_7,,1.59,1.19,2.79,0.93,-2.76,2,intermediate_model,70,AB


<IPython.core.display.Javascript object>

##### Top intermediate models

In [194]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
21,MLP_7,,1.59,1.19,2.79,0.93,-2.76,2,intermediate_model,70,AB
16,MLP_2,,1.67,1.26,2.98,0.92,-2.59,3,intermediate_model,68,AB


<IPython.core.display.Javascript object>

##### Top non dominant models

In [195]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
51,Bi-LSTM9,14.0,10.99,9.25,19.92,-2.31,19.68,72,non_dominant_model,0,AB


<IPython.core.display.Javascript object>

In [196]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [197]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
Conv1D         15
LSTM           15
Bi-LSTM        15
Transformer    15
MLP            13
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [198]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [199]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [200]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [201]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,15.0,44.8,19.25,19.0,27.5,44.0,61.5,72.0,15.0,24.73,18.52,0.0,8.5,25.0,41.0,50.0
Conv1D,15.0,21.07,9.04,10.0,13.0,20.0,27.0,37.0,15.0,47.87,8.98,32.0,41.5,49.0,56.5,58.0
LSTM,15.0,45.2,16.44,19.0,34.5,46.0,59.0,67.0,15.0,23.93,15.97,3.0,9.5,23.0,34.0,52.0
MLP,13.0,7.15,7.3,0.0,3.0,6.0,9.0,28.0,13.0,63.62,8.67,39.0,63.0,66.0,68.0,71.0
Transformer,15.0,52.6,8.66,41.0,45.0,52.0,58.0,70.0,15.0,18.2,8.86,2.0,11.5,19.0,25.0,31.0


<IPython.core.display.Javascript object>

In [202]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,15,2,intermediate_model,14
Conv1D,15,1,intermediate_model,15
LSTM,15,1,intermediate_model,15
MLP,13,2,intermediate_model,11
Transformer,15,1,intermediate_model,15


<IPython.core.display.Javascript object>

In [203]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    14
             non_dominant_model     1
Conv1D       intermediate_model    15
LSTM         intermediate_model    15
MLP          intermediate_model    11
             dominant_model         2
Transformer  intermediate_model    15
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [204]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,93.33
Bi-LSTM,non_dominant_model,6.67
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,100.0
MLP,intermediate_model,84.62
MLP,dominant_model,15.38
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [205]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
20,MLP,0,71,dominant_model
25,MLP,0,71,dominant_model
21,MLP,2,70,intermediate_model
16,MLP,3,68,intermediate_model
26,MLP,3,68,intermediate_model
...,...,...,...,...
8,LSTM,67,3,intermediate_model
48,Bi-LSTM,67,3,intermediate_model
67,Transformer,70,2,intermediate_model
54,Bi-LSTM,71,1,intermediate_model


<IPython.core.display.Javascript object>

In [206]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [207]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,15,24.733333,25.0,672,371,5.082192
Conv1D,15,47.866667,49.0,316,718,9.835616
LSTM,15,23.933333,23.0,678,359,4.917808
MLP,13,63.615385,66.0,93,827,11.328767
Transformer,15,18.2,19.0,789,273,3.739726


<IPython.core.display.Javascript object>

In [208]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.912101,...,61.5,72.0,15.0,24.733333,18.518202,0.0,8.5,25.0,41.0,50.0
Conv1D,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.009897,...,27.0,37.0,15.0,47.866667,8.975098,32.0,41.5,49.0,56.5,58.0
LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.183144,...,59.0,67.0,15.0,23.933333,15.966334,3.0,9.5,23.0,34.0,52.0
MLP,0.0,,,,,,,,13.0,1.707924,...,9.0,28.0,13.0,63.615385,8.665434,39.0,63.0,66.0,68.0,71.0
Transformer,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.399303,...,58.0,70.0,15.0,18.2,8.857604,2.0,11.5,19.0,25.0,31.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [209]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [210]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [211]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [212]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [213]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
20,MLP_6,MLP,,1.56,1.15,2.7,0.93,-2.83,0,71,dominant_model,AB,True
25,MLP_11,MLP,,1.56,1.16,2.73,0.93,-2.82,0,71,dominant_model,AB,True
21,MLP_7,MLP,,1.59,1.19,2.79,0.93,-2.76,2,70,intermediate_model,AB,True
16,MLP_2,MLP,,1.67,1.26,2.98,0.92,-2.59,3,68,intermediate_model,AB,True
26,MLP_12,MLP,,1.67,1.27,2.99,0.92,-2.59,3,68,intermediate_model,AB,True
24,MLP_10,MLP,,1.68,1.27,3.03,0.92,-2.56,5,67,intermediate_model,AB,True
15,MLP_1,MLP,,1.69,1.28,3.03,0.92,-2.55,6,66,intermediate_model,AB,True


<IPython.core.display.Javascript object>

# Global Analysis (fine tuning results)

# Definitions

# Topological Analysis - NEW

## Definitions

In [214]:
# Function to compare two models strictly based on means only
def dominates(model1, model2):
    """Return True if model1 strictly dominates model2, False otherwise."""
    # Check if model1 is at least as good as model2 in all metrics
    at_least_as_good = (
        model1["RMSE_mean"] <= model2["RMSE_mean"]
        and model1["MAE_mean"] <= model2["MAE_mean"]
        and model1["MAPE_mean"] <= model2["MAPE_mean"]
        and model1["R2_mean"] >= model2["R2_mean"]
    )

    # Check if model1 is strictly better in at least one metric
    strictly_better_in_one = (
        model1["RMSE_mean"] < model2["RMSE_mean"]
        or model1["MAE_mean"] < model2["MAE_mean"]
        or model1["MAPE_mean"] < model2["MAPE_mean"]
        or model1["R2_mean"] > model2["R2_mean"]
    )

    # Model1 strictly dominates Model2 if it is at least as good in all metrics and
    # strictly better in at least one metric
    return at_least_as_good and strictly_better_in_one

<IPython.core.display.Javascript object>

In [215]:
def get_dominance_matrix_and_graph(df):
    n = len(df)
    dominance_matrix = np.zeros((n, n), dtype=bool)

    for i in range(n):
        for j in range(n):
            if i != j:
                dominance_matrix[i, j] = dominates(df.iloc[i], df.iloc[j])

    # Create the dominance graph
    dominance_graph = {i: [] for i in range(n)}
    for i in range(n):
        for j in range(n):
            if dominance_matrix[i, j]:
                dominance_graph[i].append(j)

    return dominance_matrix, dominance_graph

<IPython.core.display.Javascript object>

In [216]:
# Topological Sorting
def topological_sort(graph):
    """Perform topological sorting on the given graph."""
    in_degree = {u: 0 for u in graph}
    for u in graph:
        for v in graph[u]:
            in_degree[v] += 1

    queue = deque([u for u in graph if in_degree[u] == 0])
    topo_order = []

    while queue:
        u = queue.popleft()
        topo_order.append(u)

        for v in graph[u]:
            in_degree[v] -= 1
            if in_degree[v] == 0:
                queue.append(v)

    return topo_order

<IPython.core.display.Javascript object>

In [217]:
# Function to find the top models in each group
def find_top_models(group):
    max_net_dominance = group["Net_Dominance"].max()
    top_models = group[group["Net_Dominance"] == max_net_dominance]
    return top_models

<IPython.core.display.Javascript object>

### Dominance Analysis and SCPM

In [218]:
def compute_scpm(df):
    scaler = StandardScaler()
    standardized_metrics = scaler.fit_transform(
        df[["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"]]
    )

    # Creating a new dataframe with standardized values
    standardized_df = pd.DataFrame(
        standardized_metrics,
        columns=["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"],
    )

    # Summing all metrics and subtracting R2_mean
    standardized_df["Result"] = (
        standardized_df[["RMSE_mean", "MAE_mean", "MAPE_mean"]].sum(axis=1)
        - standardized_df["R2_mean"]
    )

    df["SCPM"] = standardized_df["Result"]
    return df

<IPython.core.display.Javascript object>

In [219]:
def make_dominance_analysis(df):
    # Compute dominance matrix and graph
    dominance_matrix, dominance_graph = get_dominance_matrix_and_graph(df)

    # Get the topological order
    topo_order = topological_sort(dominance_graph)
    df = df.copy()
    n = len(topo_order)

    # Identify Dominant Models
    dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if not np.any(dominance_matrix[:, i])
    ]

    # Identify Non-Dominant Models
    non_dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i])
    ]

    # Identify Models that are both dominated and dominate at least one
    intermediate_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i]) and np.any(dominance_matrix[i, :])
    ]

    # Add count of times each model is dominated by others
    df["Dominated_Count"] = np.sum(dominance_matrix, axis=0)

    # Classify models
    df["Classification"] = "intermediate_model"  # default to intermediate_model
    df.loc[
        ~np.any(dominance_matrix, axis=1) & np.any(dominance_matrix, axis=0),
        "Classification",
    ] = "non_dominant_model"
    df.loc[~np.any(dominance_matrix, axis=0), "Classification"] = "dominant_model"

    # Map the topological order to the model names and get dominance counts
    sorted_models = [df["Model"].iloc[i] for i in topo_order]

    df_sorted = df.iloc[topo_order].copy()
    df_sorted["Dominates_Count"] = dominance_matrix.sum(axis=1)[topo_order]
    df_sorted_topo = df_sorted.copy()

    df_sorted = df_sorted.sort_values(by="Dominates_Count", ascending=False)
    df_sorted_count = df_sorted.copy()

    return {
        "dominance_matrix": dominance_matrix,
        "dominance_graph": dominance_graph,
        "df_sorted_topo": df_sorted_topo,
        "df_sorted_count": df_sorted_count,
    }

<IPython.core.display.Javascript object>

# Reading The files

In [220]:
csv_files_path_fine_tuning_full = dict()
csv_files_path_fine_tuning_grouped = dict()

<IPython.core.display.Javascript object>

## 204

### Plant F

In [221]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/204/f/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/204/f/fine_tuning/grouped/"
plant = "f"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

### Plant AB

In [222]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/204/ab/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/204/ab/fine_tuning/grouped/"
plant = "ab"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

### Reading all data - pre training

#### Full

In [223]:
dfs_plant = dict()
column = ("plant", "")
pos = 2

for plant, csv_files in csv_files_path_fine_tuning_full.items():
    df = read_csv_files_full(csv_files, plant)
    df["plant"] = plant
    dfs_plant[plant] = df

<IPython.core.display.Javascript object>

In [224]:
df_fine_tuning_full = pd.concat(dfs_plant.values(), ignore_index=True)

<IPython.core.display.Javascript object>

In [225]:
df_fine_tuning_full.shape

(1023, 23)

<IPython.core.display.Javascript object>

#### Grouped

In [226]:
dfs_plant = dict()
column = ("plant", "")
pos = 2

for plant, csv_files in csv_files_path_fine_tuning_grouped.items():
    df = read_csv_files_grouped(csv_files, plant)
    df["plant"] = plant
    df = preprocess_rename_columns(df)
    df = preprocess_change_columns_order(df, column, pos)
    dfs_plant[plant] = df

<IPython.core.display.Javascript object>

In [227]:
df_fine_tuning_grouped = pd.concat(dfs_plant.values(), ignore_index=True)

<IPython.core.display.Javascript object>

In [228]:
df_fine_tuning_grouped.shape

(81, 25)

<IPython.core.display.Javascript object>

In [229]:
df_copy = df_fine_tuning_grouped.copy()
df_copy = (
    df_copy.reset_index(level=0)
    .rename({"level_0": "Plant"}, axis=1)
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

## Preprocessing steps

In [230]:
patterns = [
    "Chemical + Mineralogical + Feature Engineering",
    "Chemical + Mineralogical + CS7 + One-Hot",
    "Chemical + Mineralogical + Physical + One-Hot",
    "Chemical + Mineralogical + CS3 + One-Hot",
    "Chemical + Mineralogical + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + One-Hot",
    "Chemical + Mineralogical + CS1 + One-Hot",
    "Chemical + Mineralogical + CS2 + One-Hot",
    "Chemical + Feature Engineering",
]

replace_dict = {
    "Chemical + Mineralogical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS2": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS7": "Chemical + Mineralogical + Early CS",
}

replace_dict_2 = {
    "Chemical": "Chemical + Mineralogical",
    "Chemical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Mineralogical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Physical": "Chemical + Mineralogical + Physical",
}

df_copy = df_copy[~df_copy["Cross Validation"].eq("Out of time Split")].reset_index(
    drop=True
)

# Removing afterwards
df_copy = df_copy[~df_copy["Cross Validation"].eq("Repeated KFold")].reset_index(
    drop=True
)

df_copy = df_copy[~df_copy["Features"].apply(lambda x: x in patterns)].reset_index(
    drop=True
)
df_copy["Features_bkp"] = df_copy["Features"].copy()
df_copy["Features"] = df_copy["Features"].replace(replace_dict)

df_copy["Features"] = df_copy["Features"].replace(replace_dict_2)

# Removing afterwards
# df_copy = df_copy[
#     ~df_copy["Features"].eq("Chemical + Mineralogical + Early CS")
# ].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [231]:
df_copy[("MAPE Train", "mean")] = df_copy[("MAPE Train", "mean")] * 100
df_copy[("MAPE Train", "std")] = df_copy[("MAPE Train", "std")] * 100
df_copy[("MAPE Test", "mean")] = df_copy[("MAPE Test", "mean")] * 100
df_copy[("MAPE Test", "std")] = df_copy[("MAPE Test", "std")] * 100

df_copy["Model_bkp"] = df_copy["Model"].copy()
df_copy["Model_bkp_2"] = df_copy["Model"] + df_copy["Timesteps"].apply(
    lambda x: "" if pd.isna(x) else "_" + str(x)
)
df_copy["Model"] = df_copy["Model"].replace(
    {
        "MLP": "Neural Networks",
        "LSTM": "Neural Networks",
        "GRU": "Neural Networks",
        "BidirectionalLSTM": "Neural Networks",
        "BidirectionalGRU": "Neural Networks",
        "Transformer": "Neural Networks",
        "Decision Tree": "Trees",
        "Random Forest": "Trees",
        "XGBoost": "Trees",
    }
)

<IPython.core.display.Javascript object>

In [232]:
df_copy["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [233]:
df_copy["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [234]:
df_copy["Model"].unique()

array(['Neural Networks', 'Conv1D'], dtype=object)

<IPython.core.display.Javascript object>

In [235]:
df_copy["Model_bkp"].unique()

array(['Transformer', 'MLP', 'BidirectionalLSTM', 'Conv1D', 'LSTM'],
      dtype=object)

<IPython.core.display.Javascript object>

In [236]:
df_copy["Model_bkp_2"].unique()

array(['Transformer_1.0', 'MLP', 'BidirectionalLSTM_7.0', 'Conv1D_7.0',
       'LSTM_7.0', 'BidirectionalLSTM_14.0', 'Transformer_14.0',
       'LSTM_14.0', 'Conv1D_1.0', 'BidirectionalLSTM_1.0'], dtype=object)

<IPython.core.display.Javascript object>

In [237]:
df_copy.shape

(50, 29)

<IPython.core.display.Javascript object>

In [238]:
df_copy_grouped = df_copy.copy()

<IPython.core.display.Javascript object>

In [239]:
df_copy[
    [
        ("RMSE Test", "mean"),
        ("MAE Test", "mean"),
        ("MAPE Test", "mean"),
        ("R2 Test", "mean"),
    ]
].describe().round(2).T

Unnamed: 0,Unnamed: 1,count,mean,std,min,25%,50%,75%,max
RMSE Test,mean,50.0,15.16,67.51,1.37,1.66,2.02,3.17,453.49
MAE Test,mean,50.0,8.41,36.68,1.08,1.26,1.57,2.33,250.83
MAPE Test,mean,50.0,18.4,79.71,2.52,2.94,3.64,5.26,547.28
R2 Test,mean,50.0,-280.41,1747.47,-12263.58,0.69,0.87,0.94,0.96


<IPython.core.display.Javascript object>

In [240]:
df_copy["plant"].unique()

array(['f', 'ab'], dtype=object)

<IPython.core.display.Javascript object>

# Global Analysis (fine tuning results)

## ECICS - 204

In [241]:
df_ecics_grouped = df_copy.copy()

<IPython.core.display.Javascript object>

In [242]:
df_results = df_ecics_grouped[
    [
        ("Company", ""),
        ("plant", ""),
        ("Cross Validation", ""),
        ("Features", ""),
        ('Features_bkp', ''),
        ("Model", ""),
        ("Model_bkp", ""),         
        ('Model_bkp_2',''),
        ("Timesteps", ""),
        ("RMSE Test", "mean"),
        ("MAE Test", "mean"),
        ("MAPE Test", "mean"),
        ("R2 Test", "mean"),
        ("RMSE Test", "std"),
        ("MAE Test", "std"),
        ("MAPE Test", "std"),
        ("R2 Test", "std"),
    ]
].copy()
df_results = df_results.reset_index(drop=True)

# Rename columns
new_column_names = [
    "Company",
    "Plant",
    "Cross Validation",
    "Features",
    "Features_bkp",
    "Model",
    "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "RMSE_std",
    "MAE_std",
    "MAPE_std",
    "R2_std",
]

df_results.columns = new_column_names


<IPython.core.display.Javascript object>

In [243]:
df_results.shape

(50, 17)

<IPython.core.display.Javascript object>

In [244]:
df_results["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

## Variable Grouping: CM

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM were from:**

*BTiSS:*
    Dominance analysis: Plant AB
    SCPM:Plant AB

*TiSS:*
    Dominance analysis: Plant F
    SCPM:Plant F

In [245]:
df_results["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

### Plant AB

In [246]:
df_results_cm = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [247]:
df_results_cm["Features"].unique()

array(['Chemical + Mineralogical'], dtype=object)

<IPython.core.display.Javascript object>

In [248]:
df_results_cm.shape

(20, 17)

<IPython.core.display.Javascript object>

In [249]:
df_results_cm = df_results_cm[df_results_cm["Plant"].eq("ab")].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [250]:
df_results_cm.shape

(10, 17)

<IPython.core.display.Javascript object>

### Blocking time series

In [251]:
df_results_cm["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [252]:
df_results_cm_btss = (
    df_results_cm[df_results_cm["Cross Validation"].eq("Blocking Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [253]:
df_results_cm_btss["Cross Validation"].unique()

array(['Blocking Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [254]:
df_results_cm_btss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [255]:
df_results_cm_btss = compute_scpm(df_results_cm_btss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [256]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_btss)
dominance_matrix_cm_btss = dominance_dict["dominance_matrix"]
dominance_graph_cm_btss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [257]:
dominance_matrix_cm_btss.shape, len(dominance_graph_cm_btss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [258]:
df_sorted_topo["Classification"].value_counts()

Classification
dominant_model        2
intermediate_model    2
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [259]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/ecics_204_ab_dominance_analysis_cm_btss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [260]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,204,ab,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,Transformer,Transformer_14.0,14.0,1.877239,...,3.364062,0.851889,0.717912,0.431818,1.058992,0.177338,-2.0217,0,dominant_model,3
3,204,ab,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,MLP,MLP,,1.896281,...,3.465779,0.873429,0.352691,0.254013,0.647049,0.109899,-2.020546,0,dominant_model,3
2,204,ab,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Conv1D,Conv1D,Conv1D_7.0,7.0,3.293655,...,5.815239,0.673772,0.655884,0.550276,1.095785,0.137525,-1.991323,2,intermediate_model,2
4,204,ab,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,LSTM,LSTM_14.0,14.0,4.337385,...,7.921215,0.420288,1.11021,1.124117,2.368821,0.339841,-1.966287,3,intermediate_model,1
1,204,ab,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_7.0,7.0,453.491348,...,547.284398,-12263.582957,308.52943,162.35494,354.304036,19055.295331,7.999856,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [261]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [262]:
# Top 4
df_sorted_topo_cols.sort_values(
    by=["Dominates_Count", "SCPM"], ascending=[False, True]
).iloc[0:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,Transformer_14.0,1.88,0.72,1.43,0.43,3.36,1.06,0.85,0.18,-2.02,0,dominant_model,3,ab
3,MLP,1.9,0.35,1.49,0.25,3.47,0.65,0.87,0.11,-2.02,0,dominant_model,3,ab


<IPython.core.display.Javascript object>

In [263]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(2, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [264]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
2,Conv1D_7.0,3.29,0.66,2.54,0.55,5.82,1.1,0.67,0.14,-1.99,2,intermediate_model,2,ab
4,LSTM_14.0,4.34,1.11,3.48,1.12,7.92,2.37,0.42,0.34,-1.97,3,intermediate_model,1,ab


<IPython.core.display.Javascript object>

In [265]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(2, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [266]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
1,BidirectionalLSTM_7.0,453.49,308.53,250.83,162.35,547.28,354.3,-12263.58,19055.3,8.0,4,non_dominant_model,0,ab


<IPython.core.display.Javascript object>

In [267]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [268]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [269]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [270]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [271]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [272]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [273]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,dominant_model,50.0
Neural Networks,intermediate_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [274]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,2.0,1.75
Dominated_Count,std,,2.06
Dominated_Count,min,2.0,0.0
Dominated_Count,25%,2.0,0.0
Dominated_Count,50%,2.0,1.5
Dominated_Count,75%,2.0,3.25
Dominated_Count,max,2.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,2.0,1.75


<IPython.core.display.Javascript object>

In [275]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [276]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,dominant_model,2


<IPython.core.display.Javascript object>

In [277]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  dominant_model        2
                 intermediate_model    1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [278]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [279]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [280]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,2.0,2.0,2,2,0.4
Neural Networks,4,1.75,2.0,7,7,1.4


<IPython.core.display.Javascript object>

In [281]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,3.293655,,3.293655,3.293655,3.293655,3.293655,3.293655,1.0,0.655884,...,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
Neural Networks,4.0,115.400563,225.396817,1.877239,1.891521,3.116833,116.625876,453.491348,4.0,77.677561,...,3.25,4.0,4.0,1.75,1.5,0.0,0.75,2.0,3.0,3.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [282]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [283]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [284]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [285]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,Transformer_14.0,1.88,0.72,1.43,0.43,3.36,1.06,0.85,0.18,-2.02,0,3,dominant_model,ab,True
3,MLP,1.9,0.35,1.49,0.25,3.47,0.65,0.87,0.11,-2.02,0,3,dominant_model,ab,True
2,Conv1D_7.0,3.29,0.66,2.54,0.55,5.82,1.1,0.67,0.14,-1.99,2,2,intermediate_model,ab,True
4,LSTM_14.0,4.34,1.11,3.48,1.12,7.92,2.37,0.42,0.34,-1.97,3,1,intermediate_model,ab,True
1,BidirectionalLSTM_7.0,453.49,308.53,250.83,162.35,547.28,354.3,-12263.58,19055.3,8.0,4,0,non_dominant_model,ab,True


<IPython.core.display.Javascript object>

### Plant F

In [286]:
df_results_cm = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [287]:
df_results_cm["Features"].unique()

array(['Chemical + Mineralogical'], dtype=object)

<IPython.core.display.Javascript object>

In [288]:
df_results_cm.shape

(20, 17)

<IPython.core.display.Javascript object>

In [289]:
df_results_cm = df_results_cm[df_results_cm["Plant"].eq("f")].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [290]:
df_results_cm.shape

(10, 17)

<IPython.core.display.Javascript object>

### Time Series Split

In [291]:
df_results_cm["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [292]:
df_results_cm_tss = (
    df_results_cm[df_results_cm["Cross Validation"].eq("Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [293]:
df_results_cm_tss["Cross Validation"].unique()

array(['Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [294]:
df_results_cm_tss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [295]:
df_results_cm_tss = compute_scpm(df_results_cm_tss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [296]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_tss)
dominance_matrix_cm_tss = dominance_dict["dominance_matrix"]
dominance_graph_cm_tss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [297]:
dominance_matrix_cm_tss.shape, len(dominance_graph_cm_tss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [298]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    3
dominant_model        1
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [299]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/ecics_204_f_dominance_analysis_cm_tss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [300]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,204,f,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,Transformer,Transformer_1.0,1.0,1.741329,...,2.935762,0.934552,0.236479,0.198124,0.448636,0.01669,-3.110627,0,dominant_model,4
3,204,f,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,MLP,MLP,,1.791195,...,3.22986,0.930404,0.26033,0.208885,0.50073,0.019883,-2.766405,1,intermediate_model,3
2,204,f,Time Series Split,Chemical + Mineralogical,Chemical,Conv1D,Conv1D,Conv1D_7.0,7.0,2.020378,...,3.646712,0.900267,0.671155,0.447168,1.099442,0.065173,-2.065939,2,intermediate_model,2
1,204,f,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_7.0,7.0,3.079405,...,5.103415,0.771033,0.988342,0.519538,1.253645,0.154638,0.432329,3,intermediate_model,1
4,204,f,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,LSTM,LSTM_7.0,7.0,6.84888,...,7.418795,-0.531873,4.69276,1.51415,3.876044,2.184704,7.510642,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [301]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [302]:
# Top 4
df_sorted_topo_cols.iloc[0:2].sort_values(by="Dominates_Count", ascending=False).round(
    2
)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,Transformer_1.0,1.74,0.24,1.27,0.2,2.94,0.45,0.93,0.02,-3.11,0,dominant_model,4,f
3,MLP,1.79,0.26,1.36,0.21,3.23,0.5,0.93,0.02,-2.77,1,intermediate_model,3,f


<IPython.core.display.Javascript object>

In [303]:
# Top 4
df_sorted_topo_cols.iloc[0:2].sort_values(
    by=["Dominates_Count", "SCPM"], ascending=[False, True]
).iloc[:4].round(2).round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,Transformer_1.0,1.74,0.24,1.27,0.2,2.94,0.45,0.93,0.02,-3.11,0,dominant_model,4,f
3,MLP,1.79,0.26,1.36,0.21,3.23,0.5,0.93,0.02,-2.77,1,intermediate_model,3,f


<IPython.core.display.Javascript object>

In [304]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [305]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:4].round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
3,MLP,1.79,0.26,1.36,0.21,3.23,0.5,0.93,0.02,-2.77,1,intermediate_model,3,f
2,Conv1D_7.0,2.02,0.67,1.56,0.45,3.65,1.1,0.9,0.07,-2.07,2,intermediate_model,2,f
1,BidirectionalLSTM_7.0,3.08,0.99,2.17,0.52,5.1,1.25,0.77,0.15,0.43,3,intermediate_model,1,f


<IPython.core.display.Javascript object>

In [306]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(3, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [307]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
4,LSTM_7.0,6.84888,4.69276,3.216369,1.51415,7.418795,3.876044,-0.531873,2.184704,7.510642,4,non_dominant_model,0,f


<IPython.core.display.Javascript object>

In [308]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [309]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [310]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [311]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [312]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [313]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,intermediate_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [314]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,2.0,2.0
Dominated_Count,std,,1.83
Dominated_Count,min,2.0,0.0
Dominated_Count,25%,2.0,0.75
Dominated_Count,50%,2.0,2.0
Dominated_Count,75%,2.0,3.25
Dominated_Count,max,2.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,2.0,2.0


<IPython.core.display.Javascript object>

In [315]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [316]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,intermediate_model,2


<IPython.core.display.Javascript object>

In [317]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  intermediate_model    2
                 dominant_model        1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [318]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [319]:
summary_stats.round(2)

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,2.0,2.0,2,2,0.4
Neural Networks,4,2.0,2.0,8,8,1.6


<IPython.core.display.Javascript object>

In [320]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,2.020378,,2.020378,2.020378,2.020378,2.020378,2.020378,1.0,0.671155,...,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
Neural Networks,4.0,3.365202,2.403619,1.741329,1.778729,2.4353,4.021774,6.84888,4.0,1.544478,...,3.25,4.0,4.0,2.0,1.825742,0.0,0.75,2.0,3.25,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [321]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [322]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [323]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [324]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,Transformer_1.0,1.74,0.24,1.27,0.2,2.94,0.45,0.93,0.02,-3.11,0,4,dominant_model,f,True
3,MLP,1.79,0.26,1.36,0.21,3.23,0.5,0.93,0.02,-2.77,1,3,intermediate_model,f,True
2,Conv1D_7.0,2.02,0.67,1.56,0.45,3.65,1.1,0.9,0.07,-2.07,2,2,intermediate_model,f,True
1,BidirectionalLSTM_7.0,3.08,0.99,2.17,0.52,5.1,1.25,0.77,0.15,0.43,3,1,intermediate_model,f,True
4,LSTM_7.0,6.85,4.69,3.22,1.51,7.42,3.88,-0.53,2.18,7.51,4,0,non_dominant_model,f,True


<IPython.core.display.Javascript object>

## Variable Grouping: CM-P

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P were from:**

*BTiSS:*
    Dominance analysis: Plant F
    SCPM:Plant F

*TiSS:*
    Dominance analysis: Plant F
    SCPM:Plant F

In [325]:
df_results["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [326]:
df_results_cm_p = (
    df_results[
        df_results["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [327]:
df_results_cm_p["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [328]:
df_results_cm_p.shape

(10, 17)

<IPython.core.display.Javascript object>

### Blocking time series

In [329]:
df_results_cm_p["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [330]:
df_results_cm_p_btss = (
    df_results_cm_p[
        df_results_cm_p["Cross Validation"].eq("Blocking Time Series Split")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [331]:
df_results_cm_p_btss["Cross Validation"].unique()

array(['Blocking Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [332]:
df_results_cm_p_btss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [333]:
df_results_cm_p_btss = compute_scpm(df_results_cm_p_btss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [334]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_btss)
dominance_matrix_cm_p_btss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_btss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [335]:
dominance_matrix_cm_p_btss.shape, len(dominance_graph_cm_p_btss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [336]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    2
non_dominant_model    2
dominant_model        1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [337]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/ecics_204_f_dominance_analysis_cm_p_btss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [338]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,204,f,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,MLP,MLP,,1.372538,...,2.516835,0.957256,0.295185,0.228292,0.564297,0.018385,-5.010293,0,dominant_model,4
4,204,f,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,Transformer,Transformer_1.0,1.0,1.632891,...,2.754794,0.940862,0.247506,0.207447,0.482094,0.014642,-4.045491,1,intermediate_model,3
3,204,f,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Conv1D,Conv1D,Conv1D_7.0,7.0,2.362857,...,4.055819,0.834888,0.973011,0.729558,1.365477,0.143282,0.639894,2,intermediate_model,2
1,204,f,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_7.0,7.0,2.960377,...,4.810744,0.751327,0.84081,0.499236,1.142482,0.149506,3.624921,3,non_dominant_model,0
2,204,f,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,LSTM,LSTM_7.0,7.0,3.177365,...,4.689686,0.637151,2.11564,1.185856,2.304105,0.545944,4.790969,3,non_dominant_model,0


<IPython.core.display.Javascript object>

In [339]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [340]:
# Top 4
df_sorted_topo_cols.iloc[0:5]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,MLP,1.37,0.3,1.08,0.23,2.52,0.56,0.96,0.02,-5.01,0,dominant_model,4,f
4,Transformer_1.0,1.63,0.25,1.18,0.21,2.75,0.48,0.94,0.01,-4.05,1,intermediate_model,3,f
3,Conv1D_7.0,2.36,0.97,1.84,0.73,4.06,1.37,0.83,0.14,0.64,2,intermediate_model,2,f
1,BidirectionalLSTM_7.0,2.96,0.84,2.14,0.5,4.81,1.14,0.75,0.15,3.62,3,non_dominant_model,0,f
2,LSTM_7.0,3.18,2.12,2.16,1.19,4.69,2.3,0.64,0.55,4.79,3,non_dominant_model,0,f


<IPython.core.display.Javascript object>

In [341]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [342]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
4,Transformer_1.0,1.63,0.25,1.18,0.21,2.75,0.48,0.94,0.01,-4.05,1,intermediate_model,3,f
3,Conv1D_7.0,2.36,0.97,1.84,0.73,4.06,1.37,0.83,0.14,0.64,2,intermediate_model,2,f


<IPython.core.display.Javascript object>

In [343]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(2, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [344]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
1,BidirectionalLSTM_7.0,2.96,0.84,2.14,0.5,4.81,1.14,0.75,0.15,3.62,3,non_dominant_model,0,f
2,LSTM_7.0,3.18,2.12,2.16,1.19,4.69,2.3,0.64,0.55,4.79,3,non_dominant_model,0,f


<IPython.core.display.Javascript object>

In [345]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(2, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [346]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [347]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [348]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [349]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [350]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [351]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,non_dominant_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,intermediate_model,25.0


<IPython.core.display.Javascript object>

In [352]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,2.0,1.75
Dominated_Count,std,,1.5
Dominated_Count,min,2.0,0.0
Dominated_Count,25%,2.0,0.75
Dominated_Count,50%,2.0,2.0
Dominated_Count,75%,2.0,3.0
Dominated_Count,max,2.0,3.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,2.0,1.75


<IPython.core.display.Javascript object>

In [353]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [354]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,non_dominant_model,2


<IPython.core.display.Javascript object>

In [355]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  non_dominant_model    2
                 dominant_model        1
                 intermediate_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [356]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [357]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [358]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,2.0,2.0,2,2,0.4
Neural Networks,4,1.75,1.5,7,7,1.4


<IPython.core.display.Javascript object>

In [359]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,2.362857,,2.362857,2.362857,2.362857,2.362857,2.362857,1.0,0.973011,...,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
Neural Networks,4.0,2.285793,0.914746,1.372538,1.567803,2.296634,3.014624,3.177365,4.0,0.874785,...,3.0,3.0,4.0,1.75,2.061553,0.0,0.0,1.5,3.25,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [360]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [361]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [362]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [363]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,MLP,1.37,0.3,1.08,0.23,2.52,0.56,0.96,0.02,-5.01,0,4,dominant_model,f,True
4,Transformer_1.0,1.63,0.25,1.18,0.21,2.75,0.48,0.94,0.01,-4.05,1,3,intermediate_model,f,True
3,Conv1D_7.0,2.36,0.97,1.84,0.73,4.06,1.37,0.83,0.14,0.64,2,2,intermediate_model,f,True
1,BidirectionalLSTM_7.0,2.96,0.84,2.14,0.5,4.81,1.14,0.75,0.15,3.62,3,0,non_dominant_model,f,True
2,LSTM_7.0,3.18,2.12,2.16,1.19,4.69,2.3,0.64,0.55,4.79,3,0,non_dominant_model,f,True


<IPython.core.display.Javascript object>

### Time Series Split

### Plant F

In [364]:
df_results_cm_p = (
    df_results[
        df_results["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [365]:
df_results_cm_p["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [366]:
df_results_cm_p.shape

(10, 17)

<IPython.core.display.Javascript object>

In [367]:
df_results_cm_p = df_results_cm_p[df_results_cm_p["Plant"].eq("f")].reset_index(
    drop=True
)

<IPython.core.display.Javascript object>

In [368]:
df_results_cm["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [369]:
df_results_cm_p_tss = (
    df_results_cm_p[df_results_cm_p["Cross Validation"].eq("Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [370]:
df_results_cm_p_tss["Cross Validation"].unique()

array(['Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [371]:
df_results_cm_p_tss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [372]:
df_results_cm_p_tss = compute_scpm(df_results_cm_p_tss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [373]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_tss)
dominance_matrix_cm_p_tss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_tss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [374]:
dominance_matrix_cm_tss.shape, len(dominance_graph_cm_tss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [375]:
df_sorted_topo["Classification"].value_counts()

Classification
dominant_model        2
intermediate_model    2
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [376]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/ecics_204_f_dominance_analysis_cm_p_tss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [377]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,204,f,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,MLP,MLP,,1.626909,...,3.036724,0.942175,0.273138,0.197504,0.477115,0.020827,-2.842768,0,dominant_model,3
4,204,f,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,Transformer,Transformer_1.0,1.0,1.63891,...,2.901188,0.940758,0.317922,0.223103,0.511231,0.022433,-3.092237,0,dominant_model,3
3,204,f,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Conv1D,Conv1D,Conv1D_7.0,7.0,1.753235,...,3.19543,0.930707,0.307386,0.236634,0.523346,0.022692,-1.687823,2,intermediate_model,2
2,204,f,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,LSTM,LSTM_7.0,7.0,1.996658,...,3.360978,0.906109,0.598745,0.312867,0.685873,0.058599,-0.072028,3,intermediate_model,1
1,204,f,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_7.0,7.0,2.744033,...,4.732717,0.805571,1.136449,0.745373,1.851245,0.167596,7.694857,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [378]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [379]:
# Top 4
df_sorted_topo_cols.sort_values(
    by=["Dominates_Count", "SCPM"], ascending=[False, True]
).iloc[:2].round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
4,Transformer_1.0,1.64,0.32,1.26,0.22,2.9,0.51,0.94,0.02,-3.09,0,dominant_model,3,f
0,MLP,1.63,0.27,1.29,0.2,3.04,0.48,0.94,0.02,-2.84,0,dominant_model,3,f


<IPython.core.display.Javascript object>

In [380]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(2, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [381]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:4].round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
3,Conv1D_7.0,1.75,0.31,1.4,0.24,3.2,0.52,0.93,0.02,-1.69,2,intermediate_model,2,f
2,LSTM_7.0,2.0,0.6,1.48,0.31,3.36,0.69,0.91,0.06,-0.07,3,intermediate_model,1,f


<IPython.core.display.Javascript object>

In [382]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(2, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [383]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
1,BidirectionalLSTM_7.0,2.744033,1.136449,2.03343,0.745373,4.732717,1.851245,0.805571,0.167596,7.694857,4,non_dominant_model,0,f


<IPython.core.display.Javascript object>

In [384]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [385]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [386]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [387]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [388]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [389]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,dominant_model,50.0
Neural Networks,intermediate_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [390]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,2.0,1.75
Dominated_Count,std,,2.06
Dominated_Count,min,2.0,0.0
Dominated_Count,25%,2.0,0.0
Dominated_Count,50%,2.0,1.5
Dominated_Count,75%,2.0,3.25
Dominated_Count,max,2.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,2.0,1.75


<IPython.core.display.Javascript object>

In [391]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [392]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,dominant_model,2


<IPython.core.display.Javascript object>

In [393]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  dominant_model        2
                 intermediate_model    1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [394]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [395]:
summary_stats.round(2)

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,2.0,2.0,2,2,0.4
Neural Networks,4,1.75,2.0,7,7,1.4


<IPython.core.display.Javascript object>

In [396]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,1.753235,,1.753235,1.753235,1.753235,1.753235,1.753235,1.0,0.307386,...,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
Neural Networks,4.0,2.001628,0.523822,1.626909,1.63591,1.817784,2.183502,2.744033,4.0,0.581564,...,3.25,4.0,4.0,1.75,1.5,0.0,0.75,2.0,3.0,3.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [397]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [398]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [399]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [400]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
4,Transformer_1.0,1.64,0.32,1.26,0.22,2.9,0.51,0.94,0.02,-3.09,0,3,dominant_model,f,False
0,MLP,1.63,0.27,1.29,0.2,3.04,0.48,0.94,0.02,-2.84,0,3,dominant_model,f,False
3,Conv1D_7.0,1.75,0.31,1.4,0.24,3.2,0.52,0.93,0.02,-1.69,2,2,intermediate_model,f,True
2,LSTM_7.0,2.0,0.6,1.48,0.31,3.36,0.69,0.91,0.06,-0.07,3,1,intermediate_model,f,True
1,BidirectionalLSTM_7.0,2.74,1.14,2.03,0.75,4.73,1.85,0.81,0.17,7.69,4,0,non_dominant_model,f,True


<IPython.core.display.Javascript object>

## Variable Grouping: CM-P-CS

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P-CS were from:**

*BTiSS:*
    Dominance analysis: Plant S and Plant F
    SCPM: Plant S and Plant F

*TiSS:*
    Dominance analysis: Plant AB
    SCPM: Plant AB

In [401]:
df_results["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

### Blocking time series

### Plant F

In [402]:
df_results_cm_p_cs = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical + Physical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [403]:
df_results_cm_p_cs["Features"].unique()

array(['Chemical + Mineralogical + Physical'], dtype=object)

<IPython.core.display.Javascript object>

In [404]:
df_results_cm_p_cs.shape

(20, 17)

<IPython.core.display.Javascript object>

In [405]:
df_results_cm_p_cs = df_results_cm_p_cs[
    df_results_cm_p_cs["Plant"].eq("f")
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [406]:
df_results_cm_p_cs["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [407]:
df_results_cm_p_cs_btss = (
    df_results_cm_p_cs[
        df_results_cm_p_cs["Cross Validation"].eq("Blocking Time Series Split")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [408]:
df_results_cm_p_cs_btss["Cross Validation"].unique()

array(['Blocking Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [409]:
df_results_cm_p_cs_btss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [410]:
df_results_cm_p_cs_btss = compute_scpm(df_results_cm_p_cs_btss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [411]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_btss)
dominance_matrix_cm_p_cs_btss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_btss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [412]:
dominance_matrix_cm_p_cs_btss.shape, len(dominance_graph_cm_p_cs_btss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [413]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    3
dominant_model        1
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [414]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/ecics_204_f_dominance_analysis_cm_p_cs_btss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [415]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,204,f,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,MLP,MLP,,1.502942,...,2.717972,0.949657,0.215928,0.182213,0.460453,0.014551,-3.274864,0,dominant_model,4
1,204,f,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Conv1D,Conv1D,Conv1D_7.0,7.0,1.714714,...,3.037118,0.92283,0.242213,0.192377,0.374853,0.019652,-2.404062,1,intermediate_model,2
2,204,f,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,Transformer,Transformer_1.0,1.0,1.766226,...,2.942542,0.931571,0.176847,0.206168,0.482162,0.012011,-2.617959,1,intermediate_model,2
4,204,f,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_14.0,14.0,2.439164,...,4.040735,0.60261,0.571468,0.43206,0.924047,0.109378,0.88962,3,intermediate_model,1
3,204,f,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,LSTM,LSTM_7.0,7.0,5.339302,...,6.174102,0.067397,2.754319,0.804991,1.922409,0.88247,7.407266,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [416]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [417]:
# Top 4
df_sorted_topo_cols.sort_values(
    by=["Dominates_Count", "SCPM"], ascending=[False, True]
).iloc[:5]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,MLP,1.5,0.22,1.13,0.18,2.72,0.46,0.95,0.01,-3.27,0,dominant_model,4,f
2,Transformer_1.0,1.77,0.18,1.26,0.21,2.94,0.48,0.93,0.01,-2.62,1,intermediate_model,2,f
1,Conv1D_7.0,1.71,0.24,1.35,0.19,3.04,0.37,0.92,0.02,-2.4,1,intermediate_model,2,f
4,BidirectionalLSTM_14.0,2.44,0.57,1.93,0.43,4.04,0.92,0.6,0.11,0.89,3,intermediate_model,1,f
3,LSTM_7.0,5.34,2.75,2.62,0.8,6.17,1.92,0.07,0.88,7.41,4,non_dominant_model,0,f


<IPython.core.display.Javascript object>

In [418]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [419]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
2,Transformer_1.0,1.77,0.18,1.26,0.21,2.94,0.48,0.93,0.01,-2.62,1,intermediate_model,2,f
1,Conv1D_7.0,1.71,0.24,1.35,0.19,3.04,0.37,0.92,0.02,-2.4,1,intermediate_model,2,f
4,BidirectionalLSTM_14.0,2.44,0.57,1.93,0.43,4.04,0.92,0.6,0.11,0.89,3,intermediate_model,1,f


<IPython.core.display.Javascript object>

In [420]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(3, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [421]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
3,LSTM_7.0,5.34,2.75,2.62,0.8,6.17,1.92,0.07,0.88,7.41,4,non_dominant_model,0,f


<IPython.core.display.Javascript object>

In [422]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [423]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [424]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [425]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [426]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [427]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [428]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,intermediate_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [429]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,1.0,2.0
Dominated_Count,std,,1.83
Dominated_Count,min,1.0,0.0
Dominated_Count,25%,1.0,0.75
Dominated_Count,50%,1.0,2.0
Dominated_Count,75%,1.0,3.25
Dominated_Count,max,1.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,2.0,1.75


<IPython.core.display.Javascript object>

In [430]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [431]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,intermediate_model,2


<IPython.core.display.Javascript object>

In [432]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  intermediate_model    2
                 dominant_model        1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [433]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [434]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [435]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,2.0,2.0,1,2,0.4
Neural Networks,4,1.75,1.5,8,7,1.4


<IPython.core.display.Javascript object>

In [436]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,1.714714,,1.714714,1.714714,1.714714,1.714714,1.714714,1.0,0.242213,...,1.0,1.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
Neural Networks,4.0,2.761909,1.762905,1.502942,1.700405,2.102695,3.164199,5.339302,4.0,0.92964,...,3.25,4.0,4.0,1.75,1.707825,0.0,0.75,1.5,2.5,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [437]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [438]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [439]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [440]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,MLP,1.5,0.22,1.13,0.18,2.72,0.46,0.95,0.01,-3.27,0,4,dominant_model,f,True
2,Transformer_1.0,1.77,0.18,1.26,0.21,2.94,0.48,0.93,0.01,-2.62,1,2,intermediate_model,f,False
1,Conv1D_7.0,1.71,0.24,1.35,0.19,3.04,0.37,0.92,0.02,-2.4,1,2,intermediate_model,f,False
4,BidirectionalLSTM_14.0,2.44,0.57,1.93,0.43,4.04,0.92,0.6,0.11,0.89,3,1,intermediate_model,f,True
3,LSTM_7.0,5.34,2.75,2.62,0.8,6.17,1.92,0.07,0.88,7.41,4,0,non_dominant_model,f,True


<IPython.core.display.Javascript object>

### Time Series Split

In [441]:
df_results_cm_p_cs = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical + Physical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [442]:
df_results_cm_p_cs["Features"].unique()

array(['Chemical + Mineralogical + Physical'], dtype=object)

<IPython.core.display.Javascript object>

In [443]:
df_results_cm_p_cs.shape

(20, 17)

<IPython.core.display.Javascript object>

### Plant AB

In [444]:
df_results_cm_p_cs = df_results_cm_p_cs[
    df_results_cm_p_cs["Plant"].eq("ab")
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [445]:
df_results_cm_p_cs["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [446]:
df_results_cm_p_cs_tss = (
    df_results_cm_p_cs[df_results_cm_p_cs["Cross Validation"].eq("Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [447]:
df_results_cm_p_cs_tss["Cross Validation"].unique()

array(['Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [448]:
df_results_cm_p_cs_tss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [449]:
df_results_cm_p_cs_tss = compute_scpm(df_results_cm_p_cs_tss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [450]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_tss)
dominance_matrix_cm_p_cs_tss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_tss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [451]:
dominance_matrix_cm_p_cs_tss.shape, len(dominance_graph_cm_p_cs_tss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [452]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    3
dominant_model        1
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [453]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/ecics_204_ab_dominance_analysis_cm_p_cs_tss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [454]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,204,ab,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,MLP,MLP,,1.497268,...,2.566292,0.944029,0.449835,0.310156,0.577409,0.024374,-3.221203,0,dominant_model,4
4,204,ab,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_1.0,1.0,1.561617,...,2.644962,0.939912,0.28739,0.183725,0.309271,0.011093,-2.910494,1,intermediate_model,3
1,204,ab,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Conv1D,Conv1D,Conv1D_1.0,1.0,1.579179,...,2.767337,0.938225,0.376308,0.24695,0.450952,0.018989,-2.676357,2,intermediate_model,2
2,204,ab,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,Transformer,Transformer_1.0,1.0,2.268476,...,4.096342,0.871134,0.402272,0.376864,0.880983,0.033584,1.698707,3,intermediate_model,1
3,204,ab,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,LSTM,LSTM_7.0,7.0,3.140499,...,5.315326,0.730408,1.62154,1.149827,2.237891,0.243026,7.109347,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [455]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [456]:
# Top 4
df_sorted_topo_cols.iloc[0:2].sort_values(by="Dominates_Count", ascending=False).round(
    2
)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,MLP,1.5,0.45,1.15,0.31,2.57,0.58,0.94,0.02,-3.22,0,dominant_model,4,ab
4,BidirectionalLSTM_1.0,1.56,0.29,1.2,0.18,2.64,0.31,0.94,0.01,-2.91,1,intermediate_model,3,ab


<IPython.core.display.Javascript object>

In [457]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [458]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:4].round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
4,BidirectionalLSTM_1.0,1.56,0.29,1.2,0.18,2.64,0.31,0.94,0.01,-2.91,1,intermediate_model,3,ab
1,Conv1D_1.0,1.58,0.38,1.23,0.25,2.77,0.45,0.94,0.02,-2.68,2,intermediate_model,2,ab
2,Transformer_1.0,2.27,0.4,1.82,0.38,4.1,0.88,0.87,0.03,1.7,3,intermediate_model,1,ab


<IPython.core.display.Javascript object>

In [459]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(3, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [460]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
3,LSTM_7.0,3.140499,1.62154,2.385823,1.149827,5.315326,2.237891,0.730408,0.243026,7.109347,4,non_dominant_model,0,ab


<IPython.core.display.Javascript object>

In [461]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [462]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [463]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [464]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [465]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [466]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,intermediate_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [467]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,2.0,2.0
Dominated_Count,std,,1.83
Dominated_Count,min,2.0,0.0
Dominated_Count,25%,2.0,0.75
Dominated_Count,50%,2.0,2.0
Dominated_Count,75%,2.0,3.25
Dominated_Count,max,2.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,2.0,2.0


<IPython.core.display.Javascript object>

In [468]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [469]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,intermediate_model,2


<IPython.core.display.Javascript object>

In [470]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  intermediate_model    2
                 dominant_model        1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [471]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [472]:
summary_stats.round(2)

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,2.0,2.0,2,2,0.4
Neural Networks,4,2.0,2.0,8,8,1.6


<IPython.core.display.Javascript object>

In [473]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,1.579179,,1.579179,1.579179,1.579179,1.579179,1.579179,1.0,0.376308,...,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
Neural Networks,4.0,2.116965,0.766597,1.497268,1.54553,1.915047,2.486482,3.140499,4.0,0.69026,...,3.25,4.0,4.0,2.0,1.825742,0.0,0.75,2.0,3.25,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [474]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [475]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [476]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [477]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,MLP,1.5,0.45,1.15,0.31,2.57,0.58,0.94,0.02,-3.22,0,4,dominant_model,ab,True
4,BidirectionalLSTM_1.0,1.56,0.29,1.2,0.18,2.64,0.31,0.94,0.01,-2.91,1,3,intermediate_model,ab,True
1,Conv1D_1.0,1.58,0.38,1.23,0.25,2.77,0.45,0.94,0.02,-2.68,2,2,intermediate_model,ab,True
2,Transformer_1.0,2.27,0.4,1.82,0.38,4.1,0.88,0.87,0.03,1.7,3,1,intermediate_model,ab,True
3,LSTM_7.0,3.14,1.62,2.39,1.15,5.32,2.24,0.73,0.24,7.11,4,0,non_dominant_model,ab,True


<IPython.core.display.Javascript object>