In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import os
import glob
import numpy as np

<IPython.core.display.Javascript object>

In [3]:
import warnings

# Suppress specific FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn._oldcore")


<IPython.core.display.Javascript object>

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

<IPython.core.display.Javascript object>

In [5]:
from sklearn.preprocessing import StandardScaler

<IPython.core.display.Javascript object>

In [6]:
from collections import deque, defaultdict


<IPython.core.display.Javascript object>

In [7]:
def read_csv_files_grouped(csv_files, plant, header=[0, 1]):
    results = []
    for filepath in csv_files:
        df = pd.read_csv(filepath, header=header, low_memory=False)
        df["plant"] = plant  # Add the 'plant' column
        results.append(df)
    df = pd.concat(results, ignore_index=True)
    return df

<IPython.core.display.Javascript object>

In [8]:
def read_csv_files_full(csv_files, plant):
    results = []
    for filepath in csv_files:
        df = pd.read_csv(filepath, low_memory=False)
        df["plant"] = plant  # Add the 'plant' column
        results.append(df)
    df = pd.concat(results, ignore_index=True)
    return df

<IPython.core.display.Javascript object>

### Renaming the Columns

In [9]:
def preprocess_rename_columns(df):
    df.columns = pd.MultiIndex.from_tuples(
        list(
            {
                col: (col[0], "") if "Unnamed" in col[1] else col for col in df.columns
            }.values()
        )
    )
    return df

<IPython.core.display.Javascript object>

### Changing the order_of columns Columns

In [10]:
def preprocess_change_columns_order(df, column, pos):
    # Get the list of columns
    cols = df.columns.tolist()
    # Remove the specified column
    cols.remove(column)
    # Insert the column at the desired position
    cols.insert(pos, column)
    # Reorder the DataFrame columns
    return df[cols]

<IPython.core.display.Javascript object>

In [11]:
def read_csv_files_path(csv_files_path_dict, path, plant):
    csv_files = glob.glob(os.path.join(path, "*.csv"))
    csv_files_path_dict[plant] = csv_files
    return csv_files_path_dict

<IPython.core.display.Javascript object>

# Topological Analysis - NEW

## Definitions

In [12]:
# Function to compare two models strictly based on means only
def dominates(model1, model2):
    """Return True if model1 strictly dominates model2, False otherwise."""
    # Check if model1 is at least as good as model2 in all metrics
    at_least_as_good = (
        model1["RMSE_mean"] <= model2["RMSE_mean"]
        and model1["MAE_mean"] <= model2["MAE_mean"]
        and model1["MAPE_mean"] <= model2["MAPE_mean"]
        and model1["R2_mean"] >= model2["R2_mean"]
    )

    # Check if model1 is strictly better in at least one metric
    strictly_better_in_one = (
        model1["RMSE_mean"] < model2["RMSE_mean"]
        or model1["MAE_mean"] < model2["MAE_mean"]
        or model1["MAPE_mean"] < model2["MAPE_mean"]
        or model1["R2_mean"] > model2["R2_mean"]
    )

    # Model1 strictly dominates Model2 if it is at least as good in all metrics and
    # strictly better in at least one metric
    return at_least_as_good and strictly_better_in_one

<IPython.core.display.Javascript object>

In [13]:
def get_dominance_matrix_and_graph(df):
    n = len(df)
    dominance_matrix = np.zeros((n, n), dtype=bool)

    for i in range(n):
        for j in range(n):
            if i != j:
                dominance_matrix[i, j] = dominates(df.iloc[i], df.iloc[j])

    # Create the dominance graph
    dominance_graph = {i: [] for i in range(n)}
    for i in range(n):
        for j in range(n):
            if dominance_matrix[i, j]:
                dominance_graph[i].append(j)

    return dominance_matrix, dominance_graph

<IPython.core.display.Javascript object>

In [14]:
# Topological Sorting
def topological_sort(graph):
    """Perform topological sorting on the given graph."""
    in_degree = {u: 0 for u in graph}
    for u in graph:
        for v in graph[u]:
            in_degree[v] += 1

    queue = deque([u for u in graph if in_degree[u] == 0])
    topo_order = []

    while queue:
        u = queue.popleft()
        topo_order.append(u)

        for v in graph[u]:
            in_degree[v] -= 1
            if in_degree[v] == 0:
                queue.append(v)

    return topo_order

<IPython.core.display.Javascript object>

In [15]:
# Function to find the top models in each group
def find_top_models(group):
    max_net_dominance = group["Net_Dominance"].max()
    top_models = group[group["Net_Dominance"] == max_net_dominance]
    return top_models

<IPython.core.display.Javascript object>

### Dominance Analysis and SCPM

In [16]:
def compute_scpm(df):
    scaler = StandardScaler()
    standardized_metrics = scaler.fit_transform(
        df[["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"]]
    )

    # Creating a new dataframe with standardized values
    standardized_df = pd.DataFrame(
        standardized_metrics,
        columns=["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"],
    )

    # Summing all metrics and subtracting R2_mean
    standardized_df["Result"] = (
        standardized_df[["RMSE_mean", "MAE_mean", "MAPE_mean"]].sum(axis=1)
        - standardized_df["R2_mean"]
    )

    df["SCPM"] = standardized_df["Result"]
    return df

<IPython.core.display.Javascript object>

In [17]:
def make_dominance_analysis(df):
    # Compute dominance matrix and graph
    dominance_matrix, dominance_graph = get_dominance_matrix_and_graph(df)

    # Get the topological order
    topo_order = topological_sort(dominance_graph)
    df = df.copy()
    n = len(topo_order)

    # Identify Dominant Models
    dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if not np.any(dominance_matrix[:, i])
    ]

    # Identify Non-Dominant Models
    non_dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i])
    ]

    # Identify Models that are both dominated and dominate at least one
    intermediate_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i]) and np.any(dominance_matrix[i, :])
    ]

    # Add count of times each model is dominated by others
    df["Dominated_Count"] = np.sum(dominance_matrix, axis=0)

    # Classify models
    df["Classification"] = "intermediate_model"  # default to intermediate_model
    df.loc[
        ~np.any(dominance_matrix, axis=1) & np.any(dominance_matrix, axis=0),
        "Classification",
    ] = "non_dominant_model"
    df.loc[~np.any(dominance_matrix, axis=0), "Classification"] = "dominant_model"

    # Map the topological order to the model names and get dominance counts
    sorted_models = [df["Model"].iloc[i] for i in topo_order]

    df_sorted = df.iloc[topo_order].copy()
    df_sorted["Dominates_Count"] = dominance_matrix.sum(axis=1)[topo_order]
    df_sorted_topo = df_sorted.copy()

    df_sorted = df_sorted.sort_values(by="Dominates_Count", ascending=False)
    df_sorted_count = df_sorted.copy()

    return {
        "dominance_matrix": dominance_matrix,
        "dominance_graph": dominance_graph,
        "df_sorted_topo": df_sorted_topo,
        "df_sorted_count": df_sorted_count,
    }

<IPython.core.display.Javascript object>

# Pre Training Analysis

# Reading The files

In [18]:
csv_files_path_fine_tuning = dict()
csv_files_path_pre_train = dict()

<IPython.core.display.Javascript object>

## 209

### Plant S

In [19]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/s/pre_training/full/"
plant = "s"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

### Plant AM

In [20]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/am/pre_training/full/"
plant = "am"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

### Plant W

In [21]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/w/pre_training/full/"
plant = "w"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

### Plant K

In [22]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/k/pre_training/full/"
plant = "k"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

### Reading all data - pre training

#### Full

In [23]:
dfs_plant = dict()
column = ("plant", "")
pos = 2

for plant, csv_files in csv_files_path_pre_train.items():
    df = read_csv_files_full(csv_files, plant)
    df["plant"] = plant
    dfs_plant[plant] = df

<IPython.core.display.Javascript object>

In [24]:
df_pre_train = pd.concat(dfs_plant.values(), ignore_index=True)

<IPython.core.display.Javascript object>

In [25]:
df_pre_train.shape

(511, 23)

<IPython.core.display.Javascript object>

## Preprocessing

In [26]:
patterns = [
    "Chemical + Mineralogical + Feature Engineering",
    "Chemical + Mineralogical + CS7 + One-Hot",
    "Chemical + Mineralogical + Physical + One-Hot",
    "Chemical + Mineralogical + CS3 + One-Hot",
    "Chemical + Mineralogical + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + One-Hot",
    "Chemical + Mineralogical + CS1 + One-Hot",
    "Chemical + Mineralogical + CS2 + One-Hot",
    "Chemical + Feature Engineering",
]

FEATURES_TO_REPLACE_1 = {
    "Chemical + Mineralogical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS2": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS7": "Chemical + Mineralogical + Early CS",
}

FEATURES_TO_REPLACE_2 = {
    "Chemical": "Chemical + Mineralogical",
    "Chemical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Mineralogical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Physical": "Chemical + Mineralogical + Physical",
}

df_pre_train = df_pre_train[
    ~df_pre_train["Features"].apply(lambda x: x in patterns)
].reset_index(drop=True)

df_pre_train = df_pre_train[
    ~df_pre_train["Cross Validation"].eq("Out of time Split")
].reset_index(drop=True)

# removing afterwards
df_pre_train = df_pre_train[
    ~df_pre_train["Cross Validation"].eq("Repeated KFold")
].reset_index(drop=True)

df_pre_train["Features_bkp"] = df_pre_train["Features"].copy()
df_pre_train["Features"] = df_pre_train["Features"].replace(FEATURES_TO_REPLACE_1)

df_pre_train["Features"] = df_pre_train["Features"].replace(FEATURES_TO_REPLACE_2)


# removing afterwards
# df_pre_train = df_pre_train[
#     ~df_pre_train["Features"].eq("Chemical + Mineralogical + Early CS")
# ].reset_index(drop=True)

metrics_names = {
    "RMSE Test": "RMSE_mean",
    "MAE Test": "MAE_mean",
    "MAPE Test": "MAPE_mean",
    "R2 Test": "R2_mean",
}

df_pre_train = df_pre_train.rename(metrics_names, axis=1)

<IPython.core.display.Javascript object>

In [27]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical', 'Chemical + Mineralogical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [28]:
df_pre_train["MAPE Train"] = df_pre_train["MAPE Train"] * 100
df_pre_train["MAPE_mean"] = df_pre_train["MAPE_mean"] * 100

df_pre_train["Model_bkp"] = df_pre_train["Model"].copy()
df_pre_train["Model_bkp_2"] = df_pre_train["Model"].apply(
    lambda x: "" if pd.isna(x) else "_" + str(x)
)

<IPython.core.display.Javascript object>

In [29]:
df_pre_train["Model_bkp_2"] = df_pre_train["Model_bkp_2"].apply(
    lambda x: "MLP"
    if "MLP" in x
    else "Bi-LSTM"
    if "Bi-LSTM" in x
    else "LSTM"
    if "LSTM" in x
    else "Conv1D"
    if "Conv1D" in x
    else "Transformer"
)

<IPython.core.display.Javascript object>

In [30]:
df_pre_train["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [31]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical', 'Chemical + Mineralogical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [32]:
df_pre_train["Features_bkp"].unique()

array(['Chemical + Physical', 'Chemical', 'Chemical + Properties CS Less'],
      dtype=object)

<IPython.core.display.Javascript object>

In [33]:
df_pre_train.shape

(511, 26)

<IPython.core.display.Javascript object>

In [34]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical', 'Chemical + Mineralogical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [35]:
df_results_cm = (
    df_pre_train[df_pre_train["Features"].eq("Chemical + Mineralogical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [36]:
df_results_cm["Features"].unique()

array(['Chemical + Mineralogical'], dtype=object)

<IPython.core.display.Javascript object>

In [37]:
df_results_cm.shape

(146, 26)

<IPython.core.display.Javascript object>

# Global Analysis (pre train results)

## ECICS

### Variable Grouping: CM

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM were from:**

*BTiSS:*
    Dominance analysis: Plant S
    SCPM:Plant S

*TiSS:*
    Dominance analysis: Plant S
    SCPM:Plant S

#### Hold Out

In [38]:
df_results_cm = df_results_cm[df_results_cm["Plant"].eq("S")].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [39]:
df_results_cm["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [40]:
df_results_cm_ho = (
    df_results_cm[df_results_cm["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [41]:
df_results_cm_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [42]:
df_results_cm_ho.shape

(73, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [43]:
df_results_cm_ho = compute_scpm(df_results_cm_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [44]:
dominance_dict = make_dominance_analysis(df_results_cm_ho)
dominance_matrix_cm_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [45]:
dominance_matrix_cm_ho.shape, len(dominance_graph_cm_ho)

((73, 73), 73)

<IPython.core.display.Javascript object>

In [46]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    69
dominant_model         3
non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [47]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/local_models/ecics/209_dominance_analysis_cm_kf.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [48]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
10,Global Model,209,S,Chemical + Mineralogical,"(61946, 11)",,MLP_11,,Standard Scaler,,...,4.429864,0.811351,s,Chemical,MLP_11,MLP,-3.903728,0,dominant_model,59
18,Global Model,209,S,Chemical + Mineralogical,"(61946, 11)",14.0,LSTM6,,Standard Scaler,,...,4.398563,0.828245,s,Chemical,LSTM6,LSTM,-4.529758,0,dominant_model,70
32,Global Model,209,S,Chemical + Mineralogical,"(61946, 11)",7.0,Bi-LSTM5,,Standard Scaler,,...,4.33989,0.8249,s,Chemical,Bi-LSTM5,Bi-LSTM,-4.507755,0,dominant_model,70
5,Global Model,209,S,Chemical + Mineralogical,"(61946, 11)",,MLP_6,,Standard Scaler,,...,4.495524,0.802212,s,Chemical,MLP_6,MLP,-3.34127,3,intermediate_model,54
20,Global Model,209,S,Chemical + Mineralogical,"(61946, 11)",7.0,LSTM8,,Standard Scaler,,...,4.467514,0.81466,s,Chemical,LSTM8,LSTM,-3.812872,2,intermediate_model,59


<IPython.core.display.Javascript object>

In [49]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [50]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
32,Bi-LSTM5,7.0,2.51,1.87,4.34,0.82,-4.51,0,dominant_model,70,S
18,LSTM6,14.0,2.49,1.88,4.4,0.83,-4.53,0,dominant_model,70,S
21,LSTM9,14.0,2.51,1.92,4.54,0.82,-4.03,2,intermediate_model,66,S


<IPython.core.display.Javascript object>

##### Top intermediate models

In [51]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
21,LSTM9,14.0,2.51,1.92,4.54,0.82,-4.03,2,intermediate_model,66,S
41,Bi-LSTM14,7.0,2.53,1.94,4.56,0.82,-3.83,3,intermediate_model,61,S


<IPython.core.display.Javascript object>

##### Top non dominant models

In [52]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
39,Bi-LSTM12,14.0,4.58,3.71,8.36,0.42,20.89,72,non_dominant_model,0,S


<IPython.core.display.Javascript object>

In [53]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [54]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
LSTM           15
Bi-LSTM        15
Conv1D         15
Transformer    15
MLP            13
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [55]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [56]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [57]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [58]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,15.0,29.2,26.38,0.0,4.5,17.0,53.0,72.0,15.0,35.6,23.92,0.0,16.0,44.0,56.5,70.0
Conv1D,15.0,28.67,14.9,6.0,18.0,27.0,41.5,47.0,15.0,34.6,10.95,21.0,26.0,30.0,44.5,52.0
LSTM,15.0,25.53,22.35,0.0,3.5,24.0,44.0,61.0,15.0,38.0,21.35,5.0,23.0,38.0,55.0,70.0
MLP,13.0,16.77,10.98,0.0,7.0,18.0,22.0,36.0,13.0,42.23,9.09,27.0,38.0,38.0,50.0,59.0
Transformer,15.0,56.2,9.38,38.0,51.5,57.0,62.5,70.0,15.0,9.33,7.76,1.0,3.5,8.0,12.0,27.0


<IPython.core.display.Javascript object>

In [59]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,15,3,intermediate_model,13
Conv1D,15,1,intermediate_model,15
LSTM,15,2,intermediate_model,14
MLP,13,2,intermediate_model,12
Transformer,15,1,intermediate_model,15


<IPython.core.display.Javascript object>

In [60]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    13
             dominant_model         1
             non_dominant_model     1
Conv1D       intermediate_model    15
LSTM         intermediate_model    14
             dominant_model         1
MLP          intermediate_model    12
             dominant_model         1
Transformer  intermediate_model    15
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [61]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,86.67
Bi-LSTM,dominant_model,6.67
Bi-LSTM,non_dominant_model,6.67
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,93.33
LSTM,dominant_model,6.67
MLP,intermediate_model,92.31
MLP,dominant_model,7.69
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [62]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
10,MLP,0,59,dominant_model
18,LSTM,0,70,dominant_model
32,Bi-LSTM,0,70,dominant_model
5,MLP,3,54,intermediate_model
20,LSTM,2,59,intermediate_model
...,...,...,...,...
40,Bi-LSTM,64,4,intermediate_model
52,Transformer,65,3,intermediate_model
43,Transformer,70,1,intermediate_model
47,Transformer,70,1,intermediate_model


<IPython.core.display.Javascript object>

In [63]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [64]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,15,35.6,44.0,438,534,7.315068
Conv1D,15,34.6,30.0,430,519,7.109589
LSTM,15,38.0,38.0,383,570,7.808219
MLP,13,42.230769,38.0,218,549,7.520548
Transformer,15,9.333333,8.0,843,140,1.917808


<IPython.core.display.Javascript object>

In [65]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.948066,...,53.0,72.0,15.0,35.6,23.921897,0.0,16.0,44.0,56.5,70.0
Conv1D,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.795011,...,41.5,47.0,15.0,34.6,10.946624,21.0,26.0,30.0,44.5,52.0
LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.808583,...,44.0,61.0,15.0,38.0,21.350811,5.0,23.0,38.0,55.0,70.0
MLP,0.0,,,,,,,,13.0,2.762758,...,22.0,36.0,13.0,42.230769,9.093531,27.0,38.0,38.0,50.0,59.0
Transformer,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.188003,...,62.5,70.0,15.0,9.333333,7.76132,1.0,3.5,8.0,12.0,27.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [66]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [67]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [68]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [69]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [70]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
18,LSTM6,LSTM,14.0,2.49,1.88,4.4,0.83,-4.53,0,70,dominant_model,S,False
32,Bi-LSTM5,Bi-LSTM,7.0,2.51,1.87,4.34,0.82,-4.51,0,70,dominant_model,S,False
21,LSTM9,LSTM,14.0,2.51,1.92,4.54,0.82,-4.03,2,66,intermediate_model,S,False
10,MLP_11,MLP,,2.61,1.86,4.43,0.81,-3.9,0,59,dominant_model,S,False
41,Bi-LSTM14,Bi-LSTM,7.0,2.53,1.94,4.56,0.82,-3.83,3,61,intermediate_model,S,False
20,LSTM8,LSTM,7.0,2.59,1.91,4.47,0.81,-3.81,2,59,intermediate_model,S,False
27,LSTM15,LSTM,14.0,2.53,1.96,4.61,0.82,-3.74,3,55,intermediate_model,S,False


<IPython.core.display.Javascript object>

In [71]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].head(1)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
18,LSTM6,LSTM,14.0,2.4899,1.879614,4.398563,0.828245,-4.529758,0,70,dominant_model,S,False


<IPython.core.display.Javascript object>

### Variable Grouping: CM-P

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P were from:**

*BTiSS:*
    Dominance analysis: Plant AM
    SCPM:Plant AM

*TiSS:*
    Dominance analysis: Plant W
    SCPM:Plant W

In [72]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical', 'Chemical + Mineralogical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [73]:
df_results_cm_p = (
    df_pre_train[
        df_pre_train["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

### Plant AM

In [74]:
df_results_cm_p_am = df_results_cm_p[df_results_cm_p["Plant"].eq("AM")].reset_index(
    drop=True
)

<IPython.core.display.Javascript object>

In [75]:
df_results_cm_p_am.shape

(73, 26)

<IPython.core.display.Javascript object>

#### Hold Out

In [76]:
df_results_cm_p_am["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [77]:
df_results_cm_p_am_ho = (
    df_results_cm_p_am[df_results_cm_p_am["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [78]:
df_results_cm_p_am_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [79]:
df_results_cm_p_am_ho.shape

(73, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [80]:
df_results_cm_p_am_ho = compute_scpm(df_results_cm_p_am_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [81]:
dominance_dict = make_dominance_analysis(df_results_cm_p_am_ho)
dominance_matrix_cm_p_am_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_am_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [82]:
dominance_matrix_cm_p_am_ho.shape, len(dominance_graph_cm_p_am_ho)

((73, 73), 73)

<IPython.core.display.Javascript object>

In [83]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    71
dominant_model         1
non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [84]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/local_models/ecics/209_dominance_analysis_cm_kf.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [85]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
72,Global Model,209,AM,Chemical + Mineralogical + Physical - Early CS,"(60402, 14)",,MLP_13,,Standard Scaler,,...,3.770625,0.867812,am,Chemical + Properties CS Less,MLP_13,MLP,-1.713538,0,dominant_model,72
70,Global Model,209,AM,Chemical + Mineralogical + Physical - Early CS,"(60402, 14)",,MLP_11,,Standard Scaler,,...,3.894766,0.858914,am,Chemical + Properties CS Less,MLP_11,MLP,-1.646028,1,intermediate_model,70
71,Global Model,209,AM,Chemical + Mineralogical + Physical - Early CS,"(60402, 14)",,MLP_12,,Standard Scaler,,...,3.901887,0.86359,am,Chemical + Properties CS Less,MLP_12,MLP,-1.655878,1,intermediate_model,70
65,Global Model,209,AM,Chemical + Mineralogical + Physical - Early CS,"(60402, 14)",,MLP_6,,Standard Scaler,,...,3.921627,0.855507,am,Chemical + Properties CS Less,MLP_6,MLP,-1.621879,3,intermediate_model,68
66,Global Model,209,AM,Chemical + Mineralogical + Physical - Early CS,"(60402, 14)",,MLP_7,,Standard Scaler,,...,3.914836,0.854839,am,Chemical + Properties CS Less,MLP_7,MLP,-1.620015,3,intermediate_model,67


<IPython.core.display.Javascript object>

In [86]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [87]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
72,MLP_13,,2.18,1.6,3.77,0.87,-1.71,0,dominant_model,72,AM
71,MLP_12,,2.21,1.66,3.9,0.86,-1.66,1,intermediate_model,70,AM
70,MLP_11,,2.25,1.64,3.89,0.86,-1.65,1,intermediate_model,70,AM


<IPython.core.display.Javascript object>

##### Top intermediate models

In [88]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
70,MLP_11,,2.25,1.64,3.89,0.86,-1.65,1,intermediate_model,70,AM
71,MLP_12,,2.21,1.66,3.9,0.86,-1.66,1,intermediate_model,70,AM


<IPython.core.display.Javascript object>

##### Top non dominant models

In [89]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
40,Bi-LSTM11,7.0,25.36,24.65,56.02,-16.94,32.4,72,non_dominant_model,0,AM


<IPython.core.display.Javascript object>

In [90]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [91]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
Conv1D         15
LSTM           15
Bi-LSTM        15
Transformer    15
MLP            13
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [92]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [93]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [94]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [95]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,15.0,51.73,15.81,23.0,41.5,52.0,67.0,72.0,15.0,18.47,15.12,0.0,4.5,18.0,28.5,48.0
Conv1D,15.0,24.13,15.51,13.0,15.5,16.0,26.0,67.0,15.0,44.67,14.72,5.0,43.0,50.0,50.5,59.0
LSTM,15.0,40.13,13.98,16.0,32.5,38.0,47.0,68.0,15.0,29.07,13.38,3.0,22.5,29.0,38.5,50.0
MLP,13.0,5.31,3.86,0.0,3.0,5.0,8.0,12.0,13.0,65.31,3.82,60.0,62.0,65.0,68.0,72.0
Transformer,15.0,49.07,12.49,28.0,38.5,51.0,58.5,65.0,15.0,20.87,12.24,7.0,11.0,18.0,29.0,43.0


<IPython.core.display.Javascript object>

In [96]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,15,2,intermediate_model,14
Conv1D,15,1,intermediate_model,15
LSTM,15,1,intermediate_model,15
MLP,13,2,intermediate_model,12
Transformer,15,1,intermediate_model,15


<IPython.core.display.Javascript object>

In [97]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    14
             non_dominant_model     1
Conv1D       intermediate_model    15
LSTM         intermediate_model    15
MLP          intermediate_model    12
             dominant_model         1
Transformer  intermediate_model    15
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [98]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,93.33
Bi-LSTM,non_dominant_model,6.67
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,100.0
MLP,intermediate_model,92.31
MLP,dominant_model,7.69
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [99]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
72,MLP,0,72,dominant_model
70,MLP,1,70,intermediate_model
71,MLP,1,70,intermediate_model
65,MLP,3,68,intermediate_model
66,MLP,3,67,intermediate_model
...,...,...,...,...
14,LSTM,68,3,intermediate_model
35,Bi-LSTM,68,3,intermediate_model
41,Bi-LSTM,70,1,intermediate_model
44,Bi-LSTM,70,1,intermediate_model


<IPython.core.display.Javascript object>

In [100]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [101]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,15,18.466667,18.0,776,277,3.794521
Conv1D,15,44.666667,50.0,362,670,9.178082
LSTM,15,29.066667,29.0,602,436,5.972603
MLP,13,65.307692,65.0,69,849,11.630137
Transformer,15,20.866667,18.0,736,313,4.287671


<IPython.core.display.Javascript object>

In [102]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,5.565634,...,67.0,72.0,15.0,18.466667,15.117949,0.0,4.5,18.0,28.5,48.0
Conv1D,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.963421,...,26.0,67.0,15.0,44.666667,14.724453,5.0,43.0,50.0,50.5,59.0
LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.412089,...,47.0,68.0,15.0,29.066667,13.37624,3.0,22.5,29.0,38.5,50.0
MLP,0.0,,,,,,,,13.0,2.288615,...,8.0,12.0,13.0,65.307692,3.816294,60.0,62.0,65.0,68.0,72.0
Transformer,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.65539,...,58.5,65.0,15.0,20.866667,12.235001,7.0,11.0,18.0,29.0,43.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [103]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [104]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [105]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [106]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [107]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
72,MLP_13,MLP,,2.18,1.6,3.77,0.87,-1.71,0,72,dominant_model,AM,True
71,MLP_12,MLP,,2.21,1.66,3.9,0.86,-1.66,1,70,intermediate_model,AM,False
70,MLP_11,MLP,,2.25,1.64,3.89,0.86,-1.65,1,70,intermediate_model,AM,False
65,MLP_6,MLP,,2.27,1.66,3.92,0.86,-1.62,3,68,intermediate_model,AM,True
66,MLP_7,MLP,,2.28,1.67,3.91,0.85,-1.62,3,67,intermediate_model,AM,True
63,MLP_4,MLP,,2.29,1.66,3.93,0.85,-1.61,4,65,intermediate_model,AM,True
64,MLP_5,MLP,,2.28,1.67,3.95,0.85,-1.61,5,65,intermediate_model,AM,True


<IPython.core.display.Javascript object>

### Plant W

In [108]:
df_results_cm_p_w = df_results_cm_p[df_results_cm_p["Plant"].eq("W")].reset_index(
    drop=True
)

<IPython.core.display.Javascript object>

In [109]:
df_results_cm_p_w.shape

(73, 26)

<IPython.core.display.Javascript object>

#### Hold Out

In [110]:
df_results_cm_p_w["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [111]:
df_results_cm_p_w_ho = (
    df_results_cm_p_w[df_results_cm_p_w["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [112]:
df_results_cm_p_w_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [113]:
df_results_cm_p_w_ho.shape

(73, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [114]:
df_results_cm_p_w_ho = compute_scpm(df_results_cm_p_w_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [115]:
dominance_dict = make_dominance_analysis(df_results_cm_p_w_ho)
dominance_matrix_cm_p_w_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_w_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [116]:
dominance_matrix_cm_p_w_ho.shape, len(dominance_graph_cm_p_w_ho)

((73, 73), 73)

<IPython.core.display.Javascript object>

In [117]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    70
non_dominant_model     2
dominant_model         1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [118]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/local_models/ecics/209_dominance_analysis_cm_kf.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [119]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
66,Global Model,209,W,Chemical + Mineralogical + Physical - Early CS,"(57588, 14)",,MLP_7,,Standard Scaler,,...,3.733926,0.864596,w,Chemical + Properties CS Less,MLP_7,MLP,-4.765685,0,dominant_model,72
65,Global Model,209,W,Chemical + Mineralogical + Physical - Early CS,"(57588, 14)",,MLP_6,,Standard Scaler,,...,3.814427,0.861923,w,Chemical + Properties CS Less,MLP_6,MLP,-4.639092,1,intermediate_model,70
71,Global Model,209,W,Chemical + Mineralogical + Physical - Early CS,"(57588, 14)",,MLP_12,,Standard Scaler,,...,3.832613,0.863924,w,Chemical + Properties CS Less,MLP_12,MLP,-4.646422,1,intermediate_model,69
69,Global Model,209,W,Chemical + Mineralogical + Physical - Early CS,"(57588, 14)",,MLP_10,,Standard Scaler,,...,3.819649,0.858855,w,Chemical + Properties CS Less,MLP_10,MLP,-4.582923,2,intermediate_model,68
62,Global Model,209,W,Chemical + Mineralogical + Physical - Early CS,"(57588, 14)",,MLP_3,,Standard Scaler,,...,3.878092,0.861012,w,Chemical + Properties CS Less,MLP_3,MLP,-4.57421,3,intermediate_model,67


<IPython.core.display.Javascript object>

In [120]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [121]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
66,MLP_7,,2.17,1.56,3.73,0.86,-4.77,0,dominant_model,72,W
65,MLP_6,,2.2,1.59,3.81,0.86,-4.64,1,intermediate_model,70,W
71,MLP_12,,2.18,1.6,3.83,0.86,-4.65,1,intermediate_model,69,W


<IPython.core.display.Javascript object>

##### Top intermediate models

In [122]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
65,MLP_6,,2.2,1.59,3.81,0.86,-4.64,1,intermediate_model,70,W
71,MLP_12,,2.18,1.6,3.83,0.86,-4.65,1,intermediate_model,69,W


<IPython.core.display.Javascript object>

##### Top non dominant models

In [123]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
38,Bi-LSTM9,14.0,6.07,5.1,11.61,-0.05,14.34,71,non_dominant_model,0,W
41,Bi-LSTM12,14.0,5.97,5.13,11.93,-0.02,14.26,71,non_dominant_model,0,W


<IPython.core.display.Javascript object>

In [124]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [125]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
Conv1D         15
LSTM           15
Bi-LSTM        15
Transformer    15
MLP            13
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [126]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [127]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [128]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [129]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,15.0,48.27,14.92,21.0,39.0,48.0,60.5,71.0,15.0,20.07,13.74,0.0,8.0,20.0,29.5,47.0
Conv1D,15.0,23.2,10.4,12.0,16.0,21.0,28.0,52.0,15.0,46.8,11.41,15.0,43.0,48.0,55.0,59.0
LSTM,15.0,41.53,16.81,18.0,28.0,43.0,57.5,69.0,15.0,27.53,17.13,2.0,11.5,26.0,43.5,51.0
MLP,13.0,5.54,4.07,0.0,2.0,6.0,9.0,12.0,13.0,65.38,3.93,58.0,63.0,65.0,68.0,72.0
Transformer,15.0,51.33,12.51,33.0,41.5,51.0,61.5,69.0,15.0,18.07,11.09,2.0,9.5,17.0,26.5,37.0


<IPython.core.display.Javascript object>

In [130]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,15,2,intermediate_model,13
Conv1D,15,1,intermediate_model,15
LSTM,15,1,intermediate_model,15
MLP,13,2,intermediate_model,12
Transformer,15,1,intermediate_model,15


<IPython.core.display.Javascript object>

In [131]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    13
             non_dominant_model     2
Conv1D       intermediate_model    15
LSTM         intermediate_model    15
MLP          intermediate_model    12
             dominant_model         1
Transformer  intermediate_model    15
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [132]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,86.67
Bi-LSTM,non_dominant_model,13.33
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,100.0
MLP,intermediate_model,92.31
MLP,dominant_model,7.69
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [133]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
66,MLP,0,72,dominant_model
65,MLP,1,70,intermediate_model
71,MLP,1,69,intermediate_model
69,MLP,2,68,intermediate_model
62,MLP,3,67,intermediate_model
...,...,...,...,...
48,Transformer,68,4,intermediate_model
14,LSTM,69,2,intermediate_model
57,Transformer,69,2,intermediate_model
38,Bi-LSTM,71,0,non_dominant_model


<IPython.core.display.Javascript object>

In [134]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [135]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,15,20.066667,20.0,724,301,4.123288
Conv1D,15,46.8,48.0,348,702,9.616438
LSTM,15,27.533333,26.0,623,413,5.657534
MLP,13,65.384615,65.0,72,850,11.643836
Transformer,15,18.066667,17.0,770,271,3.712329


<IPython.core.display.Javascript object>

In [136]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.761277,...,60.5,71.0,15.0,20.066667,13.744956,0.0,8.0,20.0,29.5,47.0
Conv1D,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.796076,...,28.0,52.0,15.0,46.8,11.409269,15.0,43.0,48.0,55.0,59.0
LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.381053,...,57.5,69.0,15.0,27.533333,17.13337,2.0,11.5,26.0,43.5,51.0
MLP,0.0,,,,,,,,13.0,2.295701,...,9.0,12.0,13.0,65.384615,3.927223,58.0,63.0,65.0,68.0,72.0
Transformer,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.682079,...,61.5,69.0,15.0,18.066667,11.093542,2.0,9.5,17.0,26.5,37.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [137]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [138]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [139]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [140]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [141]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
66,MLP_7,MLP,,2.17,1.56,3.73,0.86,-4.77,0,72,dominant_model,W,True
71,MLP_12,MLP,,2.18,1.6,3.83,0.86,-4.65,1,69,intermediate_model,W,False
65,MLP_6,MLP,,2.2,1.59,3.81,0.86,-4.64,1,70,intermediate_model,W,False
69,MLP_10,MLP,,2.22,1.6,3.82,0.86,-4.58,2,68,intermediate_model,W,True
62,MLP_3,MLP,,2.2,1.6,3.88,0.86,-4.57,3,67,intermediate_model,W,True
70,MLP_11,MLP,,2.26,1.61,3.88,0.85,-4.46,4,67,intermediate_model,W,True
72,MLP_13,MLP,,2.29,1.64,3.91,0.85,-4.34,6,65,intermediate_model,W,False


<IPython.core.display.Javascript object>

### Variable Grouping: CM-P-CS

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P-CS were from:**

*BTiSS:*
    Dominance analysis: Plant S
    SCPM:Plant S and K

*TiSS:*
    Dominance analysis: Plant S
    SCPM:Plant S

In [142]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical', 'Chemical + Mineralogical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [143]:
df_results_cm_p_cs = (
    df_pre_train[df_pre_train["Features"].eq("Chemical + Mineralogical + Physical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

### Plant S

In [144]:
df_results_cm_p_cs_s = df_results_cm_p_cs[
    df_results_cm_p_cs["Plant"].eq("S")
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [145]:
df_results_cm_p_cs_s.shape

(73, 26)

<IPython.core.display.Javascript object>

#### Hold Out

In [146]:
df_results_cm_p_cs_s["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [147]:
df_results_cm_p_cs_s_ho = (
    df_results_cm_p_cs_s[df_results_cm_p_cs_s["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [148]:
df_results_cm_p_cs_s_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [149]:
df_results_cm_p_cs_s_ho.shape

(73, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [150]:
df_results_cm_p_cs_s_ho = compute_scpm(df_results_cm_p_cs_s_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [151]:
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_s_ho)
dominance_matrix_cm_p_cs_s_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_s_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [152]:
dominance_matrix_cm_p_cs_s_ho.shape, len(dominance_graph_cm_p_cs_s_ho)

((73, 73), 73)

<IPython.core.display.Javascript object>

In [153]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    71
dominant_model         1
non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [154]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/local_models/ecics/209_dominance_analysis_cm_kf.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [155]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
26,Global Model,209,S,Chemical + Mineralogical + Physical,"(61946, 17)",,MLP_12,,Standard Scaler,,...,2.567879,0.940373,s,Chemical + Physical,MLP_12,MLP,-1.560389,0,dominant_model,72
20,Global Model,209,S,Chemical + Mineralogical + Physical,"(61946, 17)",,MLP_6,,Standard Scaler,,...,2.627928,0.938022,s,Chemical + Physical,MLP_6,MLP,-1.523449,1,intermediate_model,70
21,Global Model,209,S,Chemical + Mineralogical + Physical,"(61946, 17)",,MLP_7,,Standard Scaler,,...,2.638908,0.937571,s,Chemical + Physical,MLP_7,MLP,-1.519538,1,intermediate_model,69
25,Global Model,209,S,Chemical + Mineralogical + Physical,"(61946, 17)",,MLP_11,,Standard Scaler,,...,2.661951,0.937755,s,Chemical + Physical,MLP_11,MLP,-1.513479,2,intermediate_model,69
16,Global Model,209,S,Chemical + Mineralogical + Physical,"(61946, 17)",,MLP_2,,Standard Scaler,,...,2.679156,0.935337,s,Chemical + Physical,MLP_2,MLP,-1.491462,4,intermediate_model,68


<IPython.core.display.Javascript object>

In [156]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [157]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
26,MLP_12,,1.47,1.09,2.57,0.94,-1.56,0,dominant_model,72,S
20,MLP_6,,1.49,1.12,2.63,0.94,-1.52,1,intermediate_model,70,S
21,MLP_7,,1.5,1.12,2.64,0.94,-1.52,1,intermediate_model,69,S


<IPython.core.display.Javascript object>

##### Top intermediate models

In [158]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
20,MLP_6,,1.49,1.12,2.63,0.94,-1.52,1,intermediate_model,70,S
25,MLP_11,,1.5,1.12,2.66,0.94,-1.51,2,intermediate_model,69,S


<IPython.core.display.Javascript object>

##### Top non dominant models

In [159]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
48,Bi-LSTM6,14.0,21.67,20.82,47.13,-12.01,32.17,72,non_dominant_model,0,S


<IPython.core.display.Javascript object>

In [160]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [161]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
Conv1D         15
Bi-LSTM        15
LSTM           15
Transformer    15
MLP            13
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [162]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [163]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [164]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [165]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,15.0,46.67,18.15,11.0,33.5,46.0,62.5,72.0,15.0,23.4,17.46,0.0,6.5,25.0,34.5,59.0
Conv1D,15.0,18.33,6.18,11.0,14.0,15.0,23.5,29.0,15.0,48.47,6.07,38.0,46.0,49.0,51.0,59.0
LSTM,15.0,46.87,15.13,14.0,38.5,50.0,54.0,70.0,15.0,22.73,13.24,1.0,16.0,22.0,31.0,50.0
MLP,13.0,7.69,8.63,0.0,2.0,6.0,9.0,33.0,13.0,62.77,10.99,31.0,63.0,66.0,69.0,72.0
Transformer,15.0,50.33,13.43,23.0,39.0,57.0,60.0,65.0,15.0,19.87,12.81,6.0,10.0,15.0,31.5,45.0


<IPython.core.display.Javascript object>

In [166]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,15,2,intermediate_model,14
Conv1D,15,1,intermediate_model,15
LSTM,15,1,intermediate_model,15
MLP,13,2,intermediate_model,12
Transformer,15,1,intermediate_model,15


<IPython.core.display.Javascript object>

In [167]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    14
             non_dominant_model     1
Conv1D       intermediate_model    15
LSTM         intermediate_model    15
MLP          intermediate_model    12
             dominant_model         1
Transformer  intermediate_model    15
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [168]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,93.33
Bi-LSTM,non_dominant_model,6.67
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,100.0
MLP,intermediate_model,92.31
MLP,dominant_model,7.69
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [169]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
26,MLP,0,72,dominant_model
20,MLP,1,70,intermediate_model
21,MLP,1,69,intermediate_model
25,MLP,2,69,intermediate_model
16,MLP,4,68,intermediate_model
...,...,...,...,...
2,LSTM,68,4,intermediate_model
54,Bi-LSTM,69,3,intermediate_model
8,LSTM,70,1,intermediate_model
51,Bi-LSTM,70,1,intermediate_model


<IPython.core.display.Javascript object>

In [170]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [171]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,15,23.4,25.0,700,351,4.808219
Conv1D,15,48.466667,49.0,275,727,9.958904
LSTM,15,22.733333,22.0,703,341,4.671233
MLP,13,62.769231,66.0,100,816,11.178082
Transformer,15,19.866667,15.0,755,298,4.082192


<IPython.core.display.Javascript object>

In [172]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,4.083984,...,62.5,72.0,15.0,23.4,17.455249,0.0,6.5,25.0,34.5,59.0
Conv1D,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,1.848217,...,23.5,29.0,15.0,48.466667,6.069439,38.0,46.0,49.0,51.0,59.0
LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.79539,...,54.0,70.0,15.0,22.733333,13.236673,1.0,16.0,22.0,31.0,50.0
MLP,0.0,,,,,,,,13.0,1.592532,...,9.0,33.0,13.0,62.769231,10.993588,31.0,63.0,66.0,69.0,72.0
Transformer,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.627488,...,60.0,65.0,15.0,19.866667,12.805505,6.0,10.0,15.0,31.5,45.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [173]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [174]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [175]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [176]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [177]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
26,MLP_12,MLP,,1.47,1.09,2.57,0.94,-1.56,0,72,dominant_model,S,True
20,MLP_6,MLP,,1.49,1.12,2.63,0.94,-1.52,1,70,intermediate_model,S,True
21,MLP_7,MLP,,1.5,1.12,2.64,0.94,-1.52,1,69,intermediate_model,S,True
25,MLP_11,MLP,,1.5,1.12,2.66,0.94,-1.51,2,69,intermediate_model,S,True
16,MLP_2,MLP,,1.53,1.14,2.68,0.94,-1.49,4,68,intermediate_model,S,True
22,MLP_8,MLP,,1.53,1.15,2.73,0.94,-1.48,5,67,intermediate_model,S,True
15,MLP_1,MLP,,1.54,1.16,2.74,0.93,-1.46,6,66,intermediate_model,S,True


<IPython.core.display.Javascript object>

### Plant K

In [178]:
df_results_cm_p_cs_k = df_results_cm_p_cs[
    df_results_cm_p_cs["Plant"].eq("K")
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [179]:
df_results_cm_p_cs_k.shape

(73, 26)

<IPython.core.display.Javascript object>

#### Hold Out

In [180]:
df_results_cm_p_cs_k["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [181]:
df_results_cm_p_cs_k_ho = (
    df_results_cm_p_cs_k[df_results_cm_p_cs_k["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [182]:
df_results_cm_p_cs_k_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [183]:
df_results_cm_p_cs_k_ho.shape

(73, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [184]:
df_results_cm_p_cs_k_ho = compute_scpm(df_results_cm_p_cs_k_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [185]:
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_k_ho)
dominance_matrix_cm_p_cs_k_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_k_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [186]:
dominance_matrix_cm_p_cs_k_ho.shape, len(dominance_graph_cm_p_cs_k_ho)

((73, 73), 73)

<IPython.core.display.Javascript object>

In [187]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    71
dominant_model         1
non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [188]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/local_models/ecics/209_dominance_analysis_cm_kf.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [189]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
26,Global Model,209,K,Chemical + Mineralogical + Physical,"(60916, 16)",,MLP_12,,Standard Scaler,,...,2.624937,0.938098,k,Chemical + Physical,MLP_12,MLP,-3.128532,0,dominant_model,72
21,Global Model,209,K,Chemical + Mineralogical + Physical,"(60916, 16)",,MLP_7,,Standard Scaler,,...,2.656111,0.937947,k,Chemical + Physical,MLP_7,MLP,-3.107214,1,intermediate_model,70
25,Global Model,209,K,Chemical + Mineralogical + Physical,"(60916, 16)",,MLP_11,,Standard Scaler,,...,2.644121,0.937268,k,Chemical + Physical,MLP_11,MLP,-3.103618,1,intermediate_model,70
20,Global Model,209,K,Chemical + Mineralogical + Physical,"(60916, 16)",,MLP_6,,Standard Scaler,,...,2.660564,0.936892,k,Chemical + Physical,MLP_6,MLP,-3.088933,3,intermediate_model,69
22,Global Model,209,K,Chemical + Mineralogical + Physical,"(60916, 16)",,MLP_8,,Standard Scaler,,...,2.730669,0.935629,k,Chemical + Physical,MLP_8,MLP,-3.027017,4,intermediate_model,66


<IPython.core.display.Javascript object>

In [190]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [191]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
26,MLP_12,,1.5,1.12,2.62,0.94,-3.13,0,dominant_model,72,K
25,MLP_11,,1.51,1.13,2.64,0.94,-3.1,1,intermediate_model,70,K
21,MLP_7,,1.51,1.13,2.66,0.94,-3.11,1,intermediate_model,70,K


<IPython.core.display.Javascript object>

##### Top intermediate models

In [192]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
21,MLP_7,,1.51,1.13,2.66,0.94,-3.11,1,intermediate_model,70,K
25,MLP_11,,1.51,1.13,2.64,0.94,-3.1,1,intermediate_model,70,K


<IPython.core.display.Javascript object>

##### Top non dominant models

In [193]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
8,LSTM9,14.0,8.79,8.21,18.69,-1.11,20.7,72,non_dominant_model,0,K


<IPython.core.display.Javascript object>

In [194]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [195]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
Conv1D         15
LSTM           15
Bi-LSTM        15
Transformer    15
MLP            13
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [196]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [197]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [198]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [199]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,15.0,48.27,13.64,27.0,40.0,46.0,58.0,69.0,15.0,21.47,12.93,1.0,12.0,24.0,31.0,39.0
Conv1D,15.0,19.73,7.66,10.0,13.5,18.0,23.0,38.0,15.0,48.87,8.31,32.0,44.0,52.0,55.5,60.0
LSTM,15.0,44.13,19.25,20.0,27.0,43.0,63.5,72.0,15.0,24.47,17.39,0.0,7.5,29.0,37.5,48.0
MLP,13.0,6.15,5.05,0.0,3.0,4.0,9.0,18.0,13.0,64.54,6.2,48.0,63.0,66.0,69.0,72.0
Transformer,15.0,51.6,10.14,26.0,49.5,55.0,58.0,64.0,15.0,18.33,9.22,7.0,11.0,17.0,20.0,37.0


<IPython.core.display.Javascript object>

In [200]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,15,1,intermediate_model,15
Conv1D,15,1,intermediate_model,15
LSTM,15,2,intermediate_model,14
MLP,13,2,intermediate_model,12
Transformer,15,1,intermediate_model,15


<IPython.core.display.Javascript object>

In [201]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    15
Conv1D       intermediate_model    15
LSTM         intermediate_model    14
             non_dominant_model     1
MLP          intermediate_model    12
             dominant_model         1
Transformer  intermediate_model    15
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [202]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,100.0
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,93.33
LSTM,non_dominant_model,6.67
MLP,intermediate_model,92.31
MLP,dominant_model,7.69
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [203]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
26,MLP,0,72,dominant_model
21,MLP,1,70,intermediate_model
25,MLP,1,70,intermediate_model
20,MLP,3,69,intermediate_model
22,MLP,4,66,intermediate_model
...,...,...,...,...
51,Bi-LSTM,68,2,intermediate_model
54,Bi-LSTM,68,3,intermediate_model
5,LSTM,70,1,intermediate_model
45,Bi-LSTM,69,1,intermediate_model


<IPython.core.display.Javascript object>

In [204]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [205]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,15,21.466667,24.0,724,322,4.410959
Conv1D,15,48.866667,52.0,296,733,10.041096
LSTM,15,24.466667,29.0,662,367,5.027397
MLP,13,64.538462,66.0,80,839,11.493151
Transformer,15,18.333333,17.0,774,275,3.767123


<IPython.core.display.Javascript object>

In [206]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.283718,...,58.0,69.0,15.0,21.466667,12.927637,1.0,12.0,24.0,31.0,39.0
Conv1D,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,1.926335,...,23.0,38.0,15.0,48.866667,8.305477,32.0,44.0,52.0,55.5,60.0
LSTM,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,3.365673,...,63.5,72.0,15.0,24.466667,17.389926,0.0,7.5,29.0,37.5,48.0
MLP,0.0,,,,,,,,13.0,1.603941,...,9.0,18.0,13.0,64.538462,6.199669,48.0,63.0,66.0,69.0,72.0
Transformer,15.0,7.333333,5.498918,1.0,1.0,7.0,14.0,14.0,15.0,2.952267,...,58.0,64.0,15.0,18.333333,9.224708,7.0,11.0,17.0,20.0,37.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [207]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [208]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [209]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [210]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [211]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
26,MLP_12,MLP,,1.5,1.12,2.62,0.94,-3.13,0,72,dominant_model,K,True
21,MLP_7,MLP,,1.51,1.13,2.66,0.94,-3.11,1,70,intermediate_model,K,True
25,MLP_11,MLP,,1.51,1.13,2.64,0.94,-3.1,1,70,intermediate_model,K,True
20,MLP_6,MLP,,1.52,1.13,2.66,0.94,-3.09,3,69,intermediate_model,K,True
22,MLP_8,MLP,,1.53,1.16,2.73,0.94,-3.03,4,66,intermediate_model,K,True
24,MLP_10,MLP,,1.56,1.15,2.69,0.93,-3.02,4,66,intermediate_model,K,True
27,MLP_13,MLP,,1.56,1.16,2.75,0.93,-3.0,4,66,intermediate_model,K,True


<IPython.core.display.Javascript object>

# Global Analysis (fine tuning results)

# Definitions

# Topological Analysis - NEW

## Definitions

In [212]:
# Function to compare two models strictly based on means only
def dominates(model1, model2):
    """Return True if model1 strictly dominates model2, False otherwise."""
    # Check if model1 is at least as good as model2 in all metrics
    at_least_as_good = (
        model1["RMSE_mean"] <= model2["RMSE_mean"]
        and model1["MAE_mean"] <= model2["MAE_mean"]
        and model1["MAPE_mean"] <= model2["MAPE_mean"]
        and model1["R2_mean"] >= model2["R2_mean"]
    )

    # Check if model1 is strictly better in at least one metric
    strictly_better_in_one = (
        model1["RMSE_mean"] < model2["RMSE_mean"]
        or model1["MAE_mean"] < model2["MAE_mean"]
        or model1["MAPE_mean"] < model2["MAPE_mean"]
        or model1["R2_mean"] > model2["R2_mean"]
    )

    # Model1 strictly dominates Model2 if it is at least as good in all metrics and
    # strictly better in at least one metric
    return at_least_as_good and strictly_better_in_one

<IPython.core.display.Javascript object>

In [213]:
def get_dominance_matrix_and_graph(df):
    n = len(df)
    dominance_matrix = np.zeros((n, n), dtype=bool)

    for i in range(n):
        for j in range(n):
            if i != j:
                dominance_matrix[i, j] = dominates(df.iloc[i], df.iloc[j])

    # Create the dominance graph
    dominance_graph = {i: [] for i in range(n)}
    for i in range(n):
        for j in range(n):
            if dominance_matrix[i, j]:
                dominance_graph[i].append(j)

    return dominance_matrix, dominance_graph

<IPython.core.display.Javascript object>

In [214]:
# Topological Sorting
def topological_sort(graph):
    """Perform topological sorting on the given graph."""
    in_degree = {u: 0 for u in graph}
    for u in graph:
        for v in graph[u]:
            in_degree[v] += 1

    queue = deque([u for u in graph if in_degree[u] == 0])
    topo_order = []

    while queue:
        u = queue.popleft()
        topo_order.append(u)

        for v in graph[u]:
            in_degree[v] -= 1
            if in_degree[v] == 0:
                queue.append(v)

    return topo_order

<IPython.core.display.Javascript object>

In [215]:
# Function to find the top models in each group
def find_top_models(group):
    max_net_dominance = group["Net_Dominance"].max()
    top_models = group[group["Net_Dominance"] == max_net_dominance]
    return top_models

<IPython.core.display.Javascript object>

### Dominance Analysis and SCPM

In [216]:
def compute_scpm(df):
    scaler = StandardScaler()
    standardized_metrics = scaler.fit_transform(
        df[["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"]]
    )

    # Creating a new dataframe with standardized values
    standardized_df = pd.DataFrame(
        standardized_metrics,
        columns=["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"],
    )

    # Summing all metrics and subtracting R2_mean
    standardized_df["Result"] = (
        standardized_df[["RMSE_mean", "MAE_mean", "MAPE_mean"]].sum(axis=1)
        - standardized_df["R2_mean"]
    )

    df["SCPM"] = standardized_df["Result"]
    return df

<IPython.core.display.Javascript object>

In [217]:
def make_dominance_analysis(df):
    # Compute dominance matrix and graph
    dominance_matrix, dominance_graph = get_dominance_matrix_and_graph(df)

    # Get the topological order
    topo_order = topological_sort(dominance_graph)
    df = df.copy()
    n = len(topo_order)

    # Identify Dominant Models
    dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if not np.any(dominance_matrix[:, i])
    ]

    # Identify Non-Dominant Models
    non_dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i])
    ]

    # Identify Models that are both dominated and dominate at least one
    intermediate_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i]) and np.any(dominance_matrix[i, :])
    ]

    # Add count of times each model is dominated by others
    df["Dominated_Count"] = np.sum(dominance_matrix, axis=0)

    # Classify models
    df["Classification"] = "intermediate_model"  # default to intermediate_model
    df.loc[
        ~np.any(dominance_matrix, axis=1) & np.any(dominance_matrix, axis=0),
        "Classification",
    ] = "non_dominant_model"
    df.loc[~np.any(dominance_matrix, axis=0), "Classification"] = "dominant_model"

    # Map the topological order to the model names and get dominance counts
    sorted_models = [df["Model"].iloc[i] for i in topo_order]

    df_sorted = df.iloc[topo_order].copy()
    df_sorted["Dominates_Count"] = dominance_matrix.sum(axis=1)[topo_order]
    df_sorted_topo = df_sorted.copy()

    df_sorted = df_sorted.sort_values(by="Dominates_Count", ascending=False)
    df_sorted_count = df_sorted.copy()

    return {
        "dominance_matrix": dominance_matrix,
        "dominance_graph": dominance_graph,
        "df_sorted_topo": df_sorted_topo,
        "df_sorted_count": df_sorted_count,
    }

<IPython.core.display.Javascript object>

# Reading The files

In [218]:
csv_files_path_fine_tuning_full = dict()
csv_files_path_fine_tuning_grouped = dict()

<IPython.core.display.Javascript object>

## 209

### Plant S

In [219]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/s/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/s/fine_tuning/grouped/"
plant = "s"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

### Plant AM

In [220]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/am/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/am/fine_tuning/grouped/"
plant = "am"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

### Plant W

In [221]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/w/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/w/fine_tuning/grouped/"
plant = "w"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

### Plant K

In [222]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/k/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/k/fine_tuning/grouped/"
plant = "k"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

### Reading all data - pre training

#### Full

In [223]:
dfs_plant = dict()
column = ("plant", "")
pos = 2

for plant, csv_files in csv_files_path_fine_tuning_full.items():
    df = read_csv_files_full(csv_files, plant)
    df["plant"] = plant
    dfs_plant[plant] = df

<IPython.core.display.Javascript object>

In [224]:
df_fine_tuning_full = pd.concat(dfs_plant.values(), ignore_index=True)

<IPython.core.display.Javascript object>

In [225]:
df_fine_tuning_full.shape

(1272, 23)

<IPython.core.display.Javascript object>

#### Grouped

In [226]:
dfs_plant = dict()
column = ("plant", "")
pos = 2

for plant, csv_files in csv_files_path_fine_tuning_grouped.items():
    df = read_csv_files_grouped(csv_files, plant)
    df["plant"] = plant
    df = preprocess_rename_columns(df)
    df = preprocess_change_columns_order(df, column, pos)
    dfs_plant[plant] = df

<IPython.core.display.Javascript object>

In [227]:
df_fine_tuning_grouped = pd.concat(dfs_plant.values(), ignore_index=True)

<IPython.core.display.Javascript object>

In [228]:
df_fine_tuning_grouped.shape

(96, 25)

<IPython.core.display.Javascript object>

In [229]:
df_copy = df_fine_tuning_grouped.copy()
df_copy = (
    df_copy.reset_index(level=0)
    .rename({"level_0": "Plant"}, axis=1)
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

## Preprocessing steps

In [230]:
patterns = [
    "Chemical + Mineralogical + Feature Engineering",
    "Chemical + Mineralogical + CS7 + One-Hot",
    "Chemical + Mineralogical + Physical + One-Hot",
    "Chemical + Mineralogical + CS3 + One-Hot",
    "Chemical + Mineralogical + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + One-Hot",
    "Chemical + Mineralogical + CS1 + One-Hot",
    "Chemical + Mineralogical + CS2 + One-Hot",
    "Chemical + Feature Engineering",
]

replace_dict = {
    "Chemical + Mineralogical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS2": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS7": "Chemical + Mineralogical + Early CS",
}

replace_dict_2 = {
    "Chemical": "Chemical + Mineralogical",
    "Chemical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Mineralogical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Physical": "Chemical + Mineralogical + Physical",
}

df_copy = df_copy[~df_copy["Cross Validation"].eq("Out of time Split")].reset_index(
    drop=True
)

# Removing afterwards
df_copy = df_copy[~df_copy["Cross Validation"].eq("Repeated KFold")].reset_index(
    drop=True
)

df_copy = df_copy[~df_copy["Features"].apply(lambda x: x in patterns)].reset_index(
    drop=True
)
df_copy["Features_bkp"] = df_copy["Features"].copy()
df_copy["Features"] = df_copy["Features"].replace(replace_dict)

df_copy["Features"] = df_copy["Features"].replace(replace_dict_2)

# Removing afterwards
# df_copy = df_copy[
#     ~df_copy["Features"].eq("Chemical + Mineralogical + Early CS")
# ].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [231]:
df_copy[("MAPE Train", "mean")] = df_copy[("MAPE Train", "mean")] * 100
df_copy[("MAPE Train", "std")] = df_copy[("MAPE Train", "std")] * 100
df_copy[("MAPE Test", "mean")] = df_copy[("MAPE Test", "mean")] * 100
df_copy[("MAPE Test", "std")] = df_copy[("MAPE Test", "std")] * 100

df_copy["Model_bkp"] = df_copy["Model"].copy()
df_copy["Model_bkp_2"] = df_copy["Model"] + df_copy["Timesteps"].apply(
    lambda x: "" if pd.isna(x) else "_" + str(x)
)
df_copy["Model"] = df_copy["Model"].replace(
    {
        "MLP": "Neural Networks",
        "LSTM": "Neural Networks",
        "GRU": "Neural Networks",
        "BidirectionalLSTM": "Neural Networks",
        "BidirectionalGRU": "Neural Networks",
        "Transformer": "Neural Networks",
        "Decision Tree": "Trees",
        "Random Forest": "Trees",
        "XGBoost": "Trees",
    }
)

<IPython.core.display.Javascript object>

In [232]:
df_copy["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [233]:
df_copy["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [234]:
df_copy["Model"].unique()

array(['Neural Networks', 'Conv1D'], dtype=object)

<IPython.core.display.Javascript object>

In [235]:
df_copy["Model_bkp"].unique()

array(['Transformer', 'MLP', 'BidirectionalLSTM', 'Conv1D', 'LSTM'],
      dtype=object)

<IPython.core.display.Javascript object>

In [236]:
df_copy["Model_bkp_2"].unique()

array(['Transformer_1.0', 'MLP', 'BidirectionalLSTM_7.0', 'Conv1D_7.0',
       'LSTM_14.0', 'Conv1D_1.0', 'LSTM_1.0', 'BidirectionalLSTM_1.0',
       'Transformer_14.0', 'BidirectionalLSTM_14.0', 'LSTM_7.0'],
      dtype=object)

<IPython.core.display.Javascript object>

In [237]:
df_copy.shape

(68, 29)

<IPython.core.display.Javascript object>

In [238]:
df_copy_grouped = df_copy.copy()

<IPython.core.display.Javascript object>

In [239]:
df_copy[
    [
        ("RMSE Test", "mean"),
        ("MAE Test", "mean"),
        ("MAPE Test", "mean"),
        ("R2 Test", "mean"),
    ]
].describe().round(2).T

Unnamed: 0,Unnamed: 1,count,mean,std,min,25%,50%,75%,max
RMSE Test,mean,68.0,1.84,1.15,0.71,1.18,1.46,2.01,8.44
MAE Test,mean,68.0,1.3,0.62,0.54,0.87,1.11,1.44,3.86
MAPE Test,mean,68.0,3.1,1.45,1.34,2.15,2.62,3.47,8.99
R2 Test,mean,68.0,0.79,0.44,-1.78,0.85,0.93,0.95,0.98


<IPython.core.display.Javascript object>

In [240]:
df_copy["plant"].unique()

array(['s', 'am', 'w', 'k'], dtype=object)

<IPython.core.display.Javascript object>

# Global Analysis (fine tuning results)

## 209

In [241]:
df_209_grouped = df_copy.copy()

<IPython.core.display.Javascript object>

In [242]:
df_results = df_209_grouped[
    [
        ("Company", ""),
        ("plant", ""),
        ("Cross Validation", ""),
        ("Features", ""),
        ('Features_bkp', ''),
        ("Model", ""),
        ("Model_bkp", ""),         
        ('Model_bkp_2',''),
        ("Timesteps", ""),
        ("RMSE Test", "mean"),
        ("MAE Test", "mean"),
        ("MAPE Test", "mean"),
        ("R2 Test", "mean"),
        ("RMSE Test", "std"),
        ("MAE Test", "std"),
        ("MAPE Test", "std"),
        ("R2 Test", "std"),
    ]
].copy()
df_results = df_results.reset_index(drop=True)

# Rename columns
new_column_names = [
    "Company",
    "Plant",
    "Cross Validation",
    "Features",
    "Features_bkp",
    "Model",
    "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "RMSE_std",
    "MAE_std",
    "MAPE_std",
    "R2_std",
]

df_results.columns = new_column_names


<IPython.core.display.Javascript object>

In [243]:
df_results.shape

(68, 17)

<IPython.core.display.Javascript object>

In [244]:
df_results["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

## Variable Grouping: CM

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM were from:**

*BTiSS:*
    Dominance analysis: Plant S
    SCPM:Plant S

*TiSS:*
    Dominance analysis: Plant S
    SCPM:Plant S

In [245]:
df_results["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [246]:
df_results_cm = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [247]:
df_results_cm["Features"].unique()

array(['Chemical + Mineralogical'], dtype=object)

<IPython.core.display.Javascript object>

In [248]:
df_results_cm.shape

(18, 17)

<IPython.core.display.Javascript object>

### Plant S

In [249]:
df_results_cm = df_results_cm[df_results_cm["Plant"].eq("s")].reset_index(drop=True)

<IPython.core.display.Javascript object>

### Blocking time series

In [250]:
df_results_cm["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [251]:
df_results_cm_btss = (
    df_results_cm[df_results_cm["Cross Validation"].eq("Blocking Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [252]:
df_results_cm_btss["Cross Validation"].unique()

array(['Blocking Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [253]:
df_results_cm_btss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [254]:
df_results_cm_btss = compute_scpm(df_results_cm_btss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [255]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_btss)
dominance_matrix_cm_btss = dominance_dict["dominance_matrix"]
dominance_graph_cm_btss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [256]:
dominance_matrix_cm_btss.shape, len(dominance_graph_cm_btss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [257]:
df_sorted_topo["Classification"].value_counts()

Classification
dominant_model        2
intermediate_model    2
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [258]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/209_s_dominance_analysis_cm_btss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [259]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,209,s,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,Transformer,Transformer_1.0,1.0,1.16241,...,2.142578,0.94641,0.422754,0.235959,0.610615,0.04138,-3.271616,0,dominant_model,3
3,209,s,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,MLP,MLP,,1.213463,...,2.326368,0.949241,0.137581,0.084595,0.223857,0.01128,-3.013663,0,dominant_model,3
2,209,s,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Conv1D,Conv1D,Conv1D_7.0,7.0,1.437166,...,2.612071,0.929066,0.312455,0.202023,0.393569,0.024494,-2.3587,2,intermediate_model,2
4,209,s,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,LSTM,LSTM_14.0,14.0,2.393818,...,4.651054,0.760966,1.211175,0.86984,2.401948,0.296117,1.438587,3,intermediate_model,1
1,209,s,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_7.0,7.0,3.700777,...,6.794265,0.313188,2.280101,1.430805,3.345232,1.190779,7.205392,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [260]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [261]:
# Top 4
df_sorted_topo_cols.iloc[0:5]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,Transformer_1.0,1.16,0.42,0.86,0.24,2.14,0.61,0.95,0.04,-3.27,0,dominant_model,3,s
3,MLP,1.21,0.14,0.94,0.08,2.33,0.22,0.95,0.01,-3.01,0,dominant_model,3,s
2,Conv1D_7.0,1.44,0.31,1.08,0.2,2.61,0.39,0.93,0.02,-2.36,2,intermediate_model,2,s
4,LSTM_14.0,2.39,1.21,1.79,0.87,4.65,2.4,0.76,0.3,1.44,3,intermediate_model,1,s
1,BidirectionalLSTM_7.0,3.7,2.28,2.8,1.43,6.79,3.35,0.31,1.19,7.21,4,non_dominant_model,0,s


<IPython.core.display.Javascript object>

In [262]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(2, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [263]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
2,Conv1D_7.0,1.44,0.31,1.08,0.2,2.61,0.39,0.93,0.02,-2.36,2,intermediate_model,2,s
4,LSTM_14.0,2.39,1.21,1.79,0.87,4.65,2.4,0.76,0.3,1.44,3,intermediate_model,1,s


<IPython.core.display.Javascript object>

In [264]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(2, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [265]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
1,BidirectionalLSTM_7.0,3.7,2.28,2.8,1.43,6.79,3.35,0.31,1.19,7.21,4,non_dominant_model,0,s


<IPython.core.display.Javascript object>

In [266]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [267]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [268]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [269]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [270]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [271]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [272]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,dominant_model,50.0
Neural Networks,intermediate_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [273]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,2.0,1.75
Dominated_Count,std,,2.06
Dominated_Count,min,2.0,0.0
Dominated_Count,25%,2.0,0.0
Dominated_Count,50%,2.0,1.5
Dominated_Count,75%,2.0,3.25
Dominated_Count,max,2.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,2.0,1.75


<IPython.core.display.Javascript object>

In [274]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [275]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,dominant_model,2


<IPython.core.display.Javascript object>

In [276]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  dominant_model        2
                 intermediate_model    1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [277]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [278]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [279]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,2.0,2.0,2,2,0.4
Neural Networks,4,1.75,2.0,7,7,1.4


<IPython.core.display.Javascript object>

In [280]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,1.437166,,1.437166,1.437166,1.437166,1.437166,1.437166,1.0,0.312455,...,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
Neural Networks,4.0,2.117617,1.198971,1.16241,1.2007,1.803641,2.720558,3.700777,4.0,1.012903,...,3.25,4.0,4.0,1.75,1.5,0.0,0.75,2.0,3.0,3.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [281]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [282]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [283]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [284]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,Transformer_1.0,1.16,0.42,0.86,0.24,2.14,0.61,0.95,0.04,-3.27,0,3,dominant_model,s,True
3,MLP,1.21,0.14,0.94,0.08,2.33,0.22,0.95,0.01,-3.01,0,3,dominant_model,s,True
2,Conv1D_7.0,1.44,0.31,1.08,0.2,2.61,0.39,0.93,0.02,-2.36,2,2,intermediate_model,s,True
4,LSTM_14.0,2.39,1.21,1.79,0.87,4.65,2.4,0.76,0.3,1.44,3,1,intermediate_model,s,True
1,BidirectionalLSTM_7.0,3.7,2.28,2.8,1.43,6.79,3.35,0.31,1.19,7.21,4,0,non_dominant_model,s,True


<IPython.core.display.Javascript object>

### Time Series Split

In [285]:
df_results_cm["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [286]:
df_results_cm_tss = (
    df_results_cm[df_results_cm["Cross Validation"].eq("Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [287]:
df_results_cm_tss["Cross Validation"].unique()

array(['Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [288]:
df_results_cm_tss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [289]:
df_results_cm_tss = compute_scpm(df_results_cm_tss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [290]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_tss)
dominance_matrix_cm_tss = dominance_dict["dominance_matrix"]
dominance_graph_cm_tss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [291]:
dominance_matrix_cm_tss.shape, len(dominance_graph_cm_tss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [292]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    3
dominant_model        1
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [293]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/209_s_dominance_analysis_cm_tss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [294]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,209,s,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,Transformer,Transformer_1.0,1.0,1.311009,...,2.147842,0.935433,0.216608,0.09795,0.19273,0.019644,-3.554423,0,dominant_model,4
3,209,s,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,MLP,MLP,,1.387445,...,2.446752,0.92681,0.355405,0.238475,0.506784,0.033572,-2.837274,1,intermediate_model,3
2,209,s,Time Series Split,Chemical + Mineralogical,Chemical,Conv1D,Conv1D,Conv1D_7.0,7.0,1.557909,...,2.823972,0.910436,0.148739,0.112064,0.215531,0.011473,-1.729965,2,intermediate_model,2
4,209,s,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,LSTM,LSTM_14.0,14.0,1.951237,...,3.6009,0.856122,0.375631,0.313801,0.79674,0.055328,0.665817,3,intermediate_model,1
1,209,s,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_7.0,7.0,2.907399,...,5.452278,0.670463,0.727911,0.588969,1.413314,0.145463,7.455845,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [295]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [296]:
# Top 4
df_sorted_topo_cols.iloc[0:2].sort_values(by="Dominates_Count", ascending=False).round(
    2
)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,Transformer_1.0,1.31,0.22,0.88,0.1,2.15,0.19,0.94,0.02,-3.55,0,dominant_model,4,s
3,MLP,1.39,0.36,1.0,0.24,2.45,0.51,0.93,0.03,-2.84,1,intermediate_model,3,s


<IPython.core.display.Javascript object>

In [297]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [298]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:4].round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
3,MLP,1.39,0.36,1.0,0.24,2.45,0.51,0.93,0.03,-2.84,1,intermediate_model,3,s
2,Conv1D_7.0,1.56,0.15,1.16,0.11,2.82,0.22,0.91,0.01,-1.73,2,intermediate_model,2,s
4,LSTM_14.0,1.95,0.38,1.41,0.31,3.6,0.8,0.86,0.06,0.67,3,intermediate_model,1,s


<IPython.core.display.Javascript object>

In [299]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(3, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [300]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
1,BidirectionalLSTM_7.0,2.907399,0.727911,2.253783,0.588969,5.452278,1.413314,0.670463,0.145463,7.455845,4,non_dominant_model,0,s


<IPython.core.display.Javascript object>

In [301]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [302]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [303]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [304]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [305]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [306]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,intermediate_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [307]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,2.0,2.0
Dominated_Count,std,,1.83
Dominated_Count,min,2.0,0.0
Dominated_Count,25%,2.0,0.75
Dominated_Count,50%,2.0,2.0
Dominated_Count,75%,2.0,3.25
Dominated_Count,max,2.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,2.0,2.0


<IPython.core.display.Javascript object>

In [308]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [309]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,intermediate_model,2


<IPython.core.display.Javascript object>

In [310]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  intermediate_model    2
                 dominant_model        1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [311]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [312]:
summary_stats.round(2)

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,2.0,2.0,2,2,0.4
Neural Networks,4,2.0,2.0,8,8,1.6


<IPython.core.display.Javascript object>

In [313]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,1.557909,,1.557909,1.557909,1.557909,1.557909,1.557909,1.0,0.148739,...,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
Neural Networks,4.0,1.889272,0.736351,1.311009,1.368336,1.669341,2.190277,2.907399,4.0,0.418889,...,3.25,4.0,4.0,2.0,1.825742,0.0,0.75,2.0,3.25,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [314]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [315]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [316]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [317]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,Transformer_1.0,1.31,0.22,0.88,0.1,2.15,0.19,0.94,0.02,-3.55,0,4,dominant_model,s,True
3,MLP,1.39,0.36,1.0,0.24,2.45,0.51,0.93,0.03,-2.84,1,3,intermediate_model,s,True
2,Conv1D_7.0,1.56,0.15,1.16,0.11,2.82,0.22,0.91,0.01,-1.73,2,2,intermediate_model,s,True
4,LSTM_14.0,1.95,0.38,1.41,0.31,3.6,0.8,0.86,0.06,0.67,3,1,intermediate_model,s,True
1,BidirectionalLSTM_7.0,2.91,0.73,2.25,0.59,5.45,1.41,0.67,0.15,7.46,4,0,non_dominant_model,s,True


<IPython.core.display.Javascript object>

## Variable Grouping: CM-P

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P were from:**

*BTiSS:*
    Dominance analysis: Plant AM
    SCPM:Plant AM

*TiSS:*
    Dominance analysis: Plant W
    SCPM:Plant W

In [318]:
df_results["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [319]:
df_results_cm_p = (
    df_results[
        df_results["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [320]:
df_results_cm_p["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [321]:
df_results_cm_p.shape

(20, 17)

<IPython.core.display.Javascript object>

### Plant AM

In [322]:
df_results_cm_p = df_results_cm_p[df_results_cm_p["Plant"].eq("am")].reset_index(
    drop=True
)

<IPython.core.display.Javascript object>

### Blocking time series

In [323]:
df_results_cm_p["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [324]:
df_results_cm_p_btss = (
    df_results_cm_p[
        df_results_cm_p["Cross Validation"].eq("Blocking Time Series Split")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [325]:
df_results_cm_p_btss["Cross Validation"].unique()

array(['Blocking Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [326]:
df_results_cm_p_btss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [327]:
df_results_cm_p_btss = compute_scpm(df_results_cm_p_btss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [328]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_btss)
dominance_matrix_cm_p_btss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_btss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [329]:
dominance_matrix_cm_p_btss.shape, len(dominance_graph_cm_p_btss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [330]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    3
dominant_model        1
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [331]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/209_am_dominance_analysis_cm_p_btss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [332]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,209,am,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,MLP,MLP,,0.977688,...,1.817851,0.972777,0.07991,0.055835,0.1251,0.002697,-6.142056,0,dominant_model,4
3,209,am,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Conv1D,Conv1D,Conv1D_7.0,7.0,1.32179,...,2.43999,0.951914,0.151181,0.094707,0.219578,0.007694,-0.941569,1,intermediate_model,3
1,209,am,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_7.0,7.0,1.396904,...,2.591385,0.944418,0.214808,0.176594,0.422295,0.019869,0.468387,2,intermediate_model,1
4,209,am,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,Transformer,Transformer_1.0,1.0,1.370688,...,2.579751,0.942873,0.236311,0.21449,0.512886,0.026853,0.251325,2,intermediate_model,1
2,209,am,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,LSTM,LSTM_7.0,7.0,1.783077,...,3.162759,0.907087,0.344122,0.183404,0.423823,0.045363,6.363913,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [333]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [334]:
# Top 4
df_sorted_topo_cols.iloc[0:5]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,MLP,0.98,0.08,0.75,0.06,1.82,0.13,0.97,0.0,-6.14,0,dominant_model,4,am
3,Conv1D_7.0,1.32,0.15,1.01,0.09,2.44,0.22,0.95,0.01,-0.94,1,intermediate_model,3,am
1,BidirectionalLSTM_7.0,1.4,0.21,1.09,0.18,2.59,0.42,0.94,0.02,0.47,2,intermediate_model,1,am
4,Transformer_1.0,1.37,0.24,1.06,0.21,2.58,0.51,0.94,0.03,0.25,2,intermediate_model,1,am
2,LSTM_7.0,1.78,0.34,1.32,0.18,3.16,0.42,0.91,0.05,6.36,4,non_dominant_model,0,am


<IPython.core.display.Javascript object>

In [335]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [336]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
3,Conv1D_7.0,1.32,0.15,1.01,0.09,2.44,0.22,0.95,0.01,-0.94,1,intermediate_model,3,am
4,Transformer_1.0,1.37,0.24,1.06,0.21,2.58,0.51,0.94,0.03,0.25,2,intermediate_model,1,am
1,BidirectionalLSTM_7.0,1.4,0.21,1.09,0.18,2.59,0.42,0.94,0.02,0.47,2,intermediate_model,1,am


<IPython.core.display.Javascript object>

In [337]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(3, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [338]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
2,LSTM_7.0,1.78,0.34,1.32,0.18,3.16,0.42,0.91,0.05,6.36,4,non_dominant_model,0,am


<IPython.core.display.Javascript object>

In [339]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [340]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [341]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [342]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [343]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [344]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [345]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,intermediate_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [346]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,1.0,2.0
Dominated_Count,std,,1.63
Dominated_Count,min,1.0,0.0
Dominated_Count,25%,1.0,1.5
Dominated_Count,50%,1.0,2.0
Dominated_Count,75%,1.0,2.5
Dominated_Count,max,1.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,3.0,1.5


<IPython.core.display.Javascript object>

In [347]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [348]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,intermediate_model,2


<IPython.core.display.Javascript object>

In [349]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  intermediate_model    2
                 dominant_model        1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [350]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [351]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [352]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,3.0,3.0,1,3,0.6
Neural Networks,4,1.5,1.0,8,6,1.2


<IPython.core.display.Javascript object>

In [353]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,1.32179,,1.32179,1.32179,1.32179,1.32179,1.32179,1.0,0.151181,...,1.0,1.0,1.0,3.0,,3.0,3.0,3.0,3.0,3.0
Neural Networks,4.0,1.382089,0.328979,0.977688,1.272438,1.383796,1.493447,1.783077,4.0,0.218788,...,2.5,4.0,4.0,1.5,1.732051,0.0,0.75,1.0,1.75,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [354]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [355]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [356]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [357]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,MLP,0.98,0.08,0.75,0.06,1.82,0.13,0.97,0.0,-6.14,0,4,dominant_model,am,True
3,Conv1D_7.0,1.32,0.15,1.01,0.09,2.44,0.22,0.95,0.01,-0.94,1,3,intermediate_model,am,True
4,Transformer_1.0,1.37,0.24,1.06,0.21,2.58,0.51,0.94,0.03,0.25,2,1,intermediate_model,am,False
1,BidirectionalLSTM_7.0,1.4,0.21,1.09,0.18,2.59,0.42,0.94,0.02,0.47,2,1,intermediate_model,am,False
2,LSTM_7.0,1.78,0.34,1.32,0.18,3.16,0.42,0.91,0.05,6.36,4,0,non_dominant_model,am,True


<IPython.core.display.Javascript object>

### Time Series Split

In [358]:
df_results_cm_p = (
    df_results[
        df_results["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [359]:
df_results_cm_p["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [360]:
df_results_cm_p.shape

(20, 17)

<IPython.core.display.Javascript object>

### Plant W

In [361]:
df_results_cm_p = df_results_cm_p[df_results_cm_p["Plant"].eq("w")].reset_index(
    drop=True
)

<IPython.core.display.Javascript object>

In [362]:
df_results_cm["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [363]:
df_results_cm_p_tss = (
    df_results_cm_p[df_results_cm_p["Cross Validation"].eq("Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [364]:
df_results_cm_p_tss["Cross Validation"].unique()

array(['Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [365]:
df_results_cm_p_tss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [366]:
df_results_cm_p_tss = compute_scpm(df_results_cm_p_tss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [367]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_tss)
dominance_matrix_cm_p_tss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_tss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [368]:
dominance_matrix_cm_tss.shape, len(dominance_graph_cm_tss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [369]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    3
dominant_model        1
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [370]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/209_w_dominance_analysis_cm_p_tss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [371]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,209,w,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,MLP,MLP,,1.317312,...,2.463852,0.931815,0.244525,0.193443,0.375366,0.026493,-4.628705,0,dominant_model,4
2,209,w,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,LSTM,LSTM_7.0,7.0,1.660127,...,2.971102,0.887714,0.438231,0.291369,0.646294,0.058431,-2.062591,1,intermediate_model,3
1,209,w,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_1.0,1.0,1.86928,...,3.280013,0.852865,0.5838,0.411514,0.934949,0.086808,-0.370473,2,intermediate_model,1
4,209,w,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,Transformer,Transformer_1.0,1.0,1.868226,...,3.331895,0.850222,0.680115,0.475889,1.040908,0.107215,-0.277492,2,intermediate_model,1
3,209,w,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Conv1D,Conv1D,Conv1D_7.0,7.0,2.613889,...,4.963884,0.681428,1.245495,1.08779,2.35659,0.27816,7.339261,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [372]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [373]:
# Top 4
df_sorted_topo_cols.iloc[0:2].sort_values(by="Dominates_Count", ascending=False).round(
    2
)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,MLP,1.32,0.24,1.04,0.19,2.46,0.38,0.93,0.03,-4.63,0,dominant_model,4,w
2,LSTM_7.0,1.66,0.44,1.27,0.29,2.97,0.65,0.89,0.06,-2.06,1,intermediate_model,3,w


<IPython.core.display.Javascript object>

In [374]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [375]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:4].round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
2,LSTM_7.0,1.66,0.44,1.27,0.29,2.97,0.65,0.89,0.06,-2.06,1,intermediate_model,3,w
1,BidirectionalLSTM_1.0,1.87,0.58,1.42,0.41,3.28,0.93,0.85,0.09,-0.37,2,intermediate_model,1,w
4,Transformer_1.0,1.87,0.68,1.42,0.48,3.33,1.04,0.85,0.11,-0.28,2,intermediate_model,1,w


<IPython.core.display.Javascript object>

In [376]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(3, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [377]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
3,Conv1D_7.0,2.613889,1.245495,2.116994,1.08779,4.963884,2.35659,0.681428,0.27816,7.339261,4,non_dominant_model,0,w


<IPython.core.display.Javascript object>

In [378]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [379]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [380]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [381]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [382]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [383]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,non_dominant_model,100.0
Neural Networks,intermediate_model,75.0
Neural Networks,dominant_model,25.0


<IPython.core.display.Javascript object>

In [384]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,4.0,1.25
Dominated_Count,std,,0.96
Dominated_Count,min,4.0,0.0
Dominated_Count,25%,4.0,0.75
Dominated_Count,50%,4.0,1.5
Dominated_Count,75%,4.0,2.0
Dominated_Count,max,4.0,2.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,0.0,2.25


<IPython.core.display.Javascript object>

In [385]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [386]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,non_dominant_model,1
Neural Networks,4,2,intermediate_model,3


<IPython.core.display.Javascript object>

In [387]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           non_dominant_model    1
Neural Networks  intermediate_model    3
                 dominant_model        1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [388]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [389]:
summary_stats.round(2)

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,0.0,0.0,4,0,0.0
Neural Networks,4,2.25,2.0,5,9,1.8


<IPython.core.display.Javascript object>

In [390]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,2.613889,,2.613889,2.613889,2.613889,2.613889,2.613889,1.0,1.245495,...,4.0,4.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
Neural Networks,4.0,1.678736,0.260248,1.317312,1.574423,1.764176,1.86849,1.86928,4.0,0.486668,...,2.0,2.0,4.0,2.25,1.5,1.0,1.0,2.0,3.25,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [391]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [392]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [393]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [394]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,MLP,1.32,0.24,1.04,0.19,2.46,0.38,0.93,0.03,-4.63,0,4,dominant_model,w,True
2,LSTM_7.0,1.66,0.44,1.27,0.29,2.97,0.65,0.89,0.06,-2.06,1,3,intermediate_model,w,True
1,BidirectionalLSTM_1.0,1.87,0.58,1.42,0.41,3.28,0.93,0.85,0.09,-0.37,2,1,intermediate_model,w,True
4,Transformer_1.0,1.87,0.68,1.42,0.48,3.33,1.04,0.85,0.11,-0.28,2,1,intermediate_model,w,True
3,Conv1D_7.0,2.61,1.25,2.12,1.09,4.96,2.36,0.68,0.28,7.34,4,0,non_dominant_model,w,True


<IPython.core.display.Javascript object>

## Variable Grouping: CM-P-CS

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P-CS were from:**

*BTiSS:*
    Dominance analysis: Plant S and Plant K
    SCPM: Plant S and Plant K

*TiSS:*
    Dominance analysis: Plant S
    SCPM: Plant S

In [395]:
df_results["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [396]:
df_results_cm_p_cs = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical + Physical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [397]:
df_results_cm_p_cs["Features"].unique()

array(['Chemical + Mineralogical + Physical'], dtype=object)

<IPython.core.display.Javascript object>

In [398]:
df_results_cm_p_cs.shape

(30, 17)

<IPython.core.display.Javascript object>

### Plant S

In [399]:
df_results_cm_p_cs = df_results_cm_p_cs[
    df_results_cm_p_cs["Plant"].eq("s")
].reset_index(drop=True)

<IPython.core.display.Javascript object>

### Blocking time series

In [400]:
df_results_cm_p_cs["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [401]:
df_results_cm_p_cs_btss = (
    df_results_cm_p_cs[
        df_results_cm_p_cs["Cross Validation"].eq("Blocking Time Series Split")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [402]:
df_results_cm_p_cs_btss["Cross Validation"].unique()

array(['Blocking Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [403]:
df_results_cm_p_cs_btss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [404]:
df_results_cm_p_cs_btss = compute_scpm(df_results_cm_p_cs_btss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [405]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_btss)
dominance_matrix_cm_p_cs_btss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_btss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [406]:
dominance_matrix_cm_p_cs_btss.shape, len(dominance_graph_cm_p_cs_btss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [407]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    3
dominant_model        1
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [408]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/209_s_dominance_analysis_cm_p_cs_btss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [409]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,209,s,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,MLP,MLP,,0.706974,...,1.343428,0.982151,0.131605,0.091949,0.217774,0.006596,-5.297239,0,dominant_model,4
1,209,s,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Conv1D,Conv1D,Conv1D_1.0,1.0,0.909706,...,1.770868,0.971428,0.138244,0.104172,0.235113,0.008137,-2.180513,1,intermediate_model,3
4,209,s,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_1.0,1.0,1.090873,...,2.068817,0.958023,0.197852,0.182819,0.414481,0.015695,0.307526,2,intermediate_model,2
3,209,s,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,LSTM,LSTM_1.0,1.0,1.115614,...,2.069726,0.95711,0.100757,0.115338,0.305806,0.008406,0.493156,3,intermediate_model,1
2,209,s,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,Transformer,Transformer_1.0,1.0,1.670219,...,2.61505,0.900631,0.4944,0.187695,0.453612,0.04708,6.67707,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [410]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [411]:
# Top 4
df_sorted_topo_cols.iloc[0:5]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,MLP,0.71,0.13,0.54,0.09,1.34,0.22,0.98,0.01,-5.3,0,dominant_model,4,s
1,Conv1D_1.0,0.91,0.14,0.72,0.1,1.77,0.24,0.97,0.01,-2.18,1,intermediate_model,3,s
4,BidirectionalLSTM_1.0,1.09,0.2,0.83,0.18,2.07,0.41,0.96,0.02,0.31,2,intermediate_model,2,s
3,LSTM_1.0,1.12,0.1,0.84,0.12,2.07,0.31,0.96,0.01,0.49,3,intermediate_model,1,s
2,Transformer_1.0,1.67,0.49,1.03,0.19,2.62,0.45,0.9,0.05,6.68,4,non_dominant_model,0,s


<IPython.core.display.Javascript object>

In [412]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [413]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
1,Conv1D_1.0,0.91,0.14,0.72,0.1,1.77,0.24,0.97,0.01,-2.18,1,intermediate_model,3,s
4,BidirectionalLSTM_1.0,1.09,0.2,0.83,0.18,2.07,0.41,0.96,0.02,0.31,2,intermediate_model,2,s
3,LSTM_1.0,1.12,0.1,0.84,0.12,2.07,0.31,0.96,0.01,0.49,3,intermediate_model,1,s


<IPython.core.display.Javascript object>

In [414]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(3, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [415]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
2,Transformer_1.0,1.67,0.49,1.03,0.19,2.62,0.45,0.9,0.05,6.68,4,non_dominant_model,0,s


<IPython.core.display.Javascript object>

In [416]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [417]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [418]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [419]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [420]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [421]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [422]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,intermediate_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [423]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,1.0,2.25
Dominated_Count,std,,1.71
Dominated_Count,min,1.0,0.0
Dominated_Count,25%,1.0,1.5
Dominated_Count,50%,1.0,2.5
Dominated_Count,75%,1.0,3.25
Dominated_Count,max,1.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,3.0,1.75


<IPython.core.display.Javascript object>

In [424]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [425]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,intermediate_model,2


<IPython.core.display.Javascript object>

In [426]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  intermediate_model    2
                 dominant_model        1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [427]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [428]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [429]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,3.0,3.0,1,3,0.6
Neural Networks,4,1.75,1.5,9,7,1.4


<IPython.core.display.Javascript object>

In [430]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,0.909706,,0.909706,0.909706,0.909706,0.909706,0.909706,1.0,0.138244,...,1.0,1.0,1.0,3.0,,3.0,3.0,3.0,3.0,3.0
Neural Networks,4.0,1.14592,0.396448,0.706974,0.994898,1.103244,1.254266,1.670219,4.0,0.231153,...,3.25,4.0,4.0,1.75,1.707825,0.0,0.75,1.5,2.5,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [431]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [432]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [433]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [434]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,MLP,0.71,0.13,0.54,0.09,1.34,0.22,0.98,0.01,-5.3,0,4,dominant_model,s,True
1,Conv1D_1.0,0.91,0.14,0.72,0.1,1.77,0.24,0.97,0.01,-2.18,1,3,intermediate_model,s,True
4,BidirectionalLSTM_1.0,1.09,0.2,0.83,0.18,2.07,0.41,0.96,0.02,0.31,2,2,intermediate_model,s,True
3,LSTM_1.0,1.12,0.1,0.84,0.12,2.07,0.31,0.96,0.01,0.49,3,1,intermediate_model,s,True
2,Transformer_1.0,1.67,0.49,1.03,0.19,2.62,0.45,0.9,0.05,6.68,4,0,non_dominant_model,s,True


<IPython.core.display.Javascript object>

### Plant K

In [435]:
df_results_cm_p_cs = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical + Physical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [436]:
df_results_cm_p_cs["Features"].unique()

array(['Chemical + Mineralogical + Physical'], dtype=object)

<IPython.core.display.Javascript object>

In [437]:
df_results_cm_p_cs.shape

(30, 17)

<IPython.core.display.Javascript object>

In [438]:
df_results_cm_p_cs = df_results_cm_p_cs[
    df_results_cm_p_cs["Plant"].eq("k")
].reset_index(drop=True)

<IPython.core.display.Javascript object>

### Blocking time series

In [439]:
df_results_cm_p_cs["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [440]:
df_results_cm_p_cs_btss = (
    df_results_cm_p_cs[
        df_results_cm_p_cs["Cross Validation"].eq("Blocking Time Series Split")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [441]:
df_results_cm_p_cs_btss["Cross Validation"].unique()

array(['Blocking Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [442]:
df_results_cm_p_cs_btss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [443]:
df_results_cm_p_cs_btss = compute_scpm(df_results_cm_p_cs_btss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [444]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_btss)
dominance_matrix_cm_p_cs_btss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_btss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [445]:
dominance_matrix_cm_p_cs_btss.shape, len(dominance_graph_cm_p_cs_btss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [446]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    3
dominant_model        1
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [447]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/209_k_dominance_analysis_cm_p_cs_btss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [448]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,209,k,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,MLP,MLP,,0.856619,...,1.566781,0.973031,0.215978,0.157436,0.394729,0.019362,-5.707534,0,dominant_model,4
1,209,k,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Conv1D,Conv1D,Conv1D_7.0,7.0,1.354252,...,2.585411,0.940697,0.188048,0.163273,0.390112,0.021273,-1.677612,1,intermediate_model,3
2,209,k,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,Transformer,Transformer_1.0,1.0,1.593303,...,2.74163,0.917879,0.274851,0.117771,0.283808,0.042359,-0.573354,2,intermediate_model,2
3,209,k,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,LSTM,LSTM_7.0,7.0,1.903208,...,3.439258,0.875586,0.368892,0.271958,0.669841,0.066889,2.319616,3,intermediate_model,1
4,209,k,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_7.0,7.0,2.451529,...,3.550908,0.638752,1.831327,0.477803,1.016197,0.787869,5.638884,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [449]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [450]:
# Top 4
df_sorted_topo_cols.iloc[0:5]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,MLP,0.86,0.22,0.67,0.16,1.57,0.39,0.97,0.02,-5.71,0,dominant_model,4,k
1,Conv1D_7.0,1.35,0.19,1.1,0.16,2.59,0.39,0.94,0.02,-1.68,1,intermediate_model,3,k
2,Transformer_1.0,1.59,0.27,1.18,0.12,2.74,0.28,0.92,0.04,-0.57,2,intermediate_model,2,k
3,LSTM_7.0,1.9,0.37,1.48,0.27,3.44,0.67,0.88,0.07,2.32,3,intermediate_model,1,k
4,BidirectionalLSTM_7.0,2.45,1.83,1.52,0.48,3.55,1.02,0.64,0.79,5.64,4,non_dominant_model,0,k


<IPython.core.display.Javascript object>

In [451]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [452]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
1,Conv1D_7.0,1.35,0.19,1.1,0.16,2.59,0.39,0.94,0.02,-1.68,1,intermediate_model,3,k
2,Transformer_1.0,1.59,0.27,1.18,0.12,2.74,0.28,0.92,0.04,-0.57,2,intermediate_model,2,k
3,LSTM_7.0,1.9,0.37,1.48,0.27,3.44,0.67,0.88,0.07,2.32,3,intermediate_model,1,k


<IPython.core.display.Javascript object>

In [453]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(3, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [454]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
4,BidirectionalLSTM_7.0,2.45,1.83,1.52,0.48,3.55,1.02,0.64,0.79,5.64,4,non_dominant_model,0,k


<IPython.core.display.Javascript object>

In [455]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [456]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [457]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [458]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [459]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [460]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [461]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,intermediate_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [462]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,1.0,2.25
Dominated_Count,std,,1.71
Dominated_Count,min,1.0,0.0
Dominated_Count,25%,1.0,1.5
Dominated_Count,50%,1.0,2.5
Dominated_Count,75%,1.0,3.25
Dominated_Count,max,1.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,3.0,1.75


<IPython.core.display.Javascript object>

In [463]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [464]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,intermediate_model,2


<IPython.core.display.Javascript object>

In [465]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  intermediate_model    2
                 dominant_model        1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [466]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [467]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [468]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,3.0,3.0,1,3,0.6
Neural Networks,4,1.75,1.5,9,7,1.4


<IPython.core.display.Javascript object>

In [469]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,1.354252,,1.354252,1.354252,1.354252,1.354252,1.354252,1.0,0.188048,...,1.0,1.0,1.0,3.0,,3.0,3.0,3.0,3.0,3.0
Neural Networks,4.0,1.701165,0.665522,0.856619,1.409132,1.748256,2.040288,2.451529,4.0,0.672762,...,3.25,4.0,4.0,1.75,1.707825,0.0,0.75,1.5,2.5,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [470]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [471]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [472]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [473]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,MLP,0.86,0.22,0.67,0.16,1.57,0.39,0.97,0.02,-5.71,0,4,dominant_model,k,True
1,Conv1D_7.0,1.35,0.19,1.1,0.16,2.59,0.39,0.94,0.02,-1.68,1,3,intermediate_model,k,True
2,Transformer_1.0,1.59,0.27,1.18,0.12,2.74,0.28,0.92,0.04,-0.57,2,2,intermediate_model,k,True
3,LSTM_7.0,1.9,0.37,1.48,0.27,3.44,0.67,0.88,0.07,2.32,3,1,intermediate_model,k,True
4,BidirectionalLSTM_7.0,2.45,1.83,1.52,0.48,3.55,1.02,0.64,0.79,5.64,4,0,non_dominant_model,k,True


<IPython.core.display.Javascript object>

### Time Series Split

In [474]:
df_results_cm_p_cs = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical + Physical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [475]:
df_results_cm_p_cs["Features"].unique()

array(['Chemical + Mineralogical + Physical'], dtype=object)

<IPython.core.display.Javascript object>

In [476]:
df_results_cm_p_cs.shape

(30, 17)

<IPython.core.display.Javascript object>

### Plant S

In [477]:
df_results_cm_p_cs = df_results_cm_p_cs[
    df_results_cm_p_cs["Plant"].eq("s")
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [478]:
df_results_cm_p_cs["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [479]:
df_results_cm_p_cs_tss = (
    df_results_cm_p_cs[df_results_cm_p_cs["Cross Validation"].eq("Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [480]:
df_results_cm_p_cs_tss["Cross Validation"].unique()

array(['Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [481]:
df_results_cm_p_cs_tss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [482]:
df_results_cm_p_cs_tss = compute_scpm(df_results_cm_p_cs_tss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [483]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_tss)
dominance_matrix_cm_p_cs_tss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_tss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [484]:
dominance_matrix_cm_p_cs_tss.shape, len(dominance_graph_cm_p_cs_tss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [485]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    3
dominant_model        1
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [486]:
df_sorted_topo.to_csv(
    "../../../../reports/results/global_models/ecics/209_s_dominance_analysis_cm_p_cs_tss.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [487]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,209,s,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,MLP,MLP,,0.790668,...,1.452909,0.976708,0.101774,0.063122,0.146045,0.00529,-5.984823,0,dominant_model,4
1,209,s,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Conv1D,Conv1D,Conv1D_1.0,1.0,0.97057,...,1.735976,0.964481,0.203195,0.087436,0.188137,0.012758,-2.078415,1,intermediate_model,3
3,209,s,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,LSTM,LSTM_1.0,1.0,1.179825,...,1.759961,0.940172,0.527326,0.156331,0.385089,0.055051,0.501405,2,intermediate_model,1
4,209,s,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_1.0,1.0,1.185788,...,1.972467,0.946583,0.239995,0.148219,0.380745,0.021361,1.820974,2,intermediate_model,1
2,209,s,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,Transformer,Transformer_1.0,1.0,1.374921,...,2.18924,0.926756,0.360852,0.158361,0.318648,0.034609,5.74086,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [488]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [489]:
# Top 4
df_sorted_topo_cols.iloc[0:2].sort_values(by="Dominates_Count", ascending=False).round(
    2
)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,MLP,0.79,0.1,0.58,0.06,1.45,0.15,0.98,0.01,-5.98,0,dominant_model,4,s
1,Conv1D_1.0,0.97,0.2,0.71,0.09,1.74,0.19,0.96,0.01,-2.08,1,intermediate_model,3,s


<IPython.core.display.Javascript object>

In [490]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [491]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:4].round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
1,Conv1D_1.0,0.97,0.2,0.71,0.09,1.74,0.19,0.96,0.01,-2.08,1,intermediate_model,3,s
3,LSTM_1.0,1.18,0.53,0.71,0.16,1.76,0.39,0.94,0.06,0.5,2,intermediate_model,1,s
4,BidirectionalLSTM_1.0,1.19,0.24,0.79,0.15,1.97,0.38,0.95,0.02,1.82,2,intermediate_model,1,s


<IPython.core.display.Javascript object>

In [492]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(3, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [493]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
2,Transformer_1.0,1.374921,0.360852,0.895846,0.158361,2.18924,0.318648,0.926756,0.034609,5.74086,4,non_dominant_model,0,s


<IPython.core.display.Javascript object>

In [494]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [495]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [496]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [497]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [498]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [499]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,intermediate_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [500]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,1.0,2.0
Dominated_Count,std,,1.63
Dominated_Count,min,1.0,0.0
Dominated_Count,25%,1.0,1.5
Dominated_Count,50%,1.0,2.0
Dominated_Count,75%,1.0,2.5
Dominated_Count,max,1.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,3.0,1.5


<IPython.core.display.Javascript object>

In [501]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [502]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,intermediate_model,2


<IPython.core.display.Javascript object>

In [503]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  intermediate_model    2
                 dominant_model        1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [504]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [505]:
summary_stats.round(2)

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,3.0,3.0,1,3,0.6
Neural Networks,4,1.5,1.0,8,6,1.2


<IPython.core.display.Javascript object>

In [506]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,0.97057,,0.97057,0.97057,0.97057,0.97057,0.97057,1.0,0.203195,...,1.0,1.0,1.0,3.0,,3.0,3.0,3.0,3.0,3.0
Neural Networks,4.0,1.132801,0.245422,0.790668,1.082536,1.182807,1.233071,1.374921,4.0,0.307487,...,2.5,4.0,4.0,1.5,1.732051,0.0,0.75,1.0,1.75,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [507]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [508]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [509]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [510]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,MLP,0.79,0.1,0.58,0.06,1.45,0.15,0.98,0.01,-5.98,0,4,dominant_model,s,True
1,Conv1D_1.0,0.97,0.2,0.71,0.09,1.74,0.19,0.96,0.01,-2.08,1,3,intermediate_model,s,True
3,LSTM_1.0,1.18,0.53,0.71,0.16,1.76,0.39,0.94,0.06,0.5,2,1,intermediate_model,s,True
4,BidirectionalLSTM_1.0,1.19,0.24,0.79,0.15,1.97,0.38,0.95,0.02,1.82,2,1,intermediate_model,s,True
2,Transformer_1.0,1.37,0.36,0.9,0.16,2.19,0.32,0.93,0.03,5.74,4,0,non_dominant_model,s,True


<IPython.core.display.Javascript object>