In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import os
import glob
import numpy as np

<IPython.core.display.Javascript object>

In [3]:
import warnings

# Suppress specific FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn._oldcore")


<IPython.core.display.Javascript object>

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

<IPython.core.display.Javascript object>

In [5]:
from sklearn.preprocessing import StandardScaler

<IPython.core.display.Javascript object>

In [6]:
from collections import deque, defaultdict


<IPython.core.display.Javascript object>

In [7]:
def read_csv_files_grouped(csv_files, plant, header=[0, 1]):
    results = []
    for filepath in csv_files:
        df = pd.read_csv(filepath, header=header, low_memory=False)
        df["plant"] = plant  # Add the 'plant' column
        results.append(df)
    df = pd.concat(results, ignore_index=True)
    return df

<IPython.core.display.Javascript object>

In [8]:
def read_csv_files_full(csv_files, plant):
    results = []
    for filepath in csv_files:
        df = pd.read_csv(filepath, low_memory=False)
        df["plant"] = plant  # Add the 'plant' column
        results.append(df)
    df = pd.concat(results, ignore_index=True)
    return df

<IPython.core.display.Javascript object>

### Renaming the Columns

In [9]:
def preprocess_rename_columns(df):
    df.columns = pd.MultiIndex.from_tuples(
        list(
            {
                col: (col[0], "") if "Unnamed" in col[1] else col for col in df.columns
            }.values()
        )
    )
    return df

<IPython.core.display.Javascript object>

### Changing the order_of columns Columns

In [10]:
def preprocess_change_columns_order(df, column, pos):
    # Get the list of columns
    cols = df.columns.tolist()
    # Remove the specified column
    cols.remove(column)
    # Insert the column at the desired position
    cols.insert(pos, column)
    # Reorder the DataFrame columns
    return df[cols]

<IPython.core.display.Javascript object>

In [11]:
def read_csv_files_path(csv_files_path_dict, path, plant):
    csv_files = glob.glob(os.path.join(path, "*.csv"))
    csv_files_path_dict[plant] = csv_files
    return csv_files_path_dict

<IPython.core.display.Javascript object>

# Topological Analysis - NEW

## Definitions

In [12]:
# Function to compare two models strictly based on means only
def dominates(model1, model2):
    """Return True if model1 strictly dominates model2, False otherwise."""
    # Check if model1 is at least as good as model2 in all metrics
    at_least_as_good = (
        model1["RMSE_mean"] <= model2["RMSE_mean"]
        and model1["MAE_mean"] <= model2["MAE_mean"]
        and model1["MAPE_mean"] <= model2["MAPE_mean"]
        and model1["R2_mean"] >= model2["R2_mean"]
    )

    # Check if model1 is strictly better in at least one metric
    strictly_better_in_one = (
        model1["RMSE_mean"] < model2["RMSE_mean"]
        or model1["MAE_mean"] < model2["MAE_mean"]
        or model1["MAPE_mean"] < model2["MAPE_mean"]
        or model1["R2_mean"] > model2["R2_mean"]
    )

    # Model1 strictly dominates Model2 if it is at least as good in all metrics and
    # strictly better in at least one metric
    return at_least_as_good and strictly_better_in_one

<IPython.core.display.Javascript object>

In [13]:
def get_dominance_matrix_and_graph(df):
    n = len(df)
    dominance_matrix = np.zeros((n, n), dtype=bool)

    for i in range(n):
        for j in range(n):
            if i != j:
                dominance_matrix[i, j] = dominates(df.iloc[i], df.iloc[j])

    # Create the dominance graph
    dominance_graph = {i: [] for i in range(n)}
    for i in range(n):
        for j in range(n):
            if dominance_matrix[i, j]:
                dominance_graph[i].append(j)

    return dominance_matrix, dominance_graph

<IPython.core.display.Javascript object>

In [14]:
# Topological Sorting
def topological_sort(graph):
    """Perform topological sorting on the given graph."""
    in_degree = {u: 0 for u in graph}
    for u in graph:
        for v in graph[u]:
            in_degree[v] += 1

    queue = deque([u for u in graph if in_degree[u] == 0])
    topo_order = []

    while queue:
        u = queue.popleft()
        topo_order.append(u)

        for v in graph[u]:
            in_degree[v] -= 1
            if in_degree[v] == 0:
                queue.append(v)

    return topo_order

<IPython.core.display.Javascript object>

In [15]:
# Function to find the top models in each group
def find_top_models(group):
    max_net_dominance = group["Net_Dominance"].max()
    top_models = group[group["Net_Dominance"] == max_net_dominance]
    return top_models

<IPython.core.display.Javascript object>

### Dominance Analysis and SCPM

In [16]:
def compute_scpm(df):
    scaler = StandardScaler()
    standardized_metrics = scaler.fit_transform(
        df[["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"]]
    )

    # Creating a new dataframe with standardized values
    standardized_df = pd.DataFrame(
        standardized_metrics,
        columns=["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"],
    )

    # Summing all metrics and subtracting R2_mean
    standardized_df["Result"] = (
        standardized_df[["RMSE_mean", "MAE_mean", "MAPE_mean"]].sum(axis=1)
        - standardized_df["R2_mean"]
    )

    df["SCPM"] = standardized_df["Result"]
    return df

<IPython.core.display.Javascript object>

In [17]:
def make_dominance_analysis(df):
    # Compute dominance matrix and graph
    dominance_matrix, dominance_graph = get_dominance_matrix_and_graph(df)

    # Get the topological order
    topo_order = topological_sort(dominance_graph)
    df = df.copy()
    n = len(topo_order)

    # Identify Dominant Models
    dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if not np.any(dominance_matrix[:, i])
    ]

    # Identify Non-Dominant Models
    non_dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i])
    ]

    # Identify Models that are both dominated and dominate at least one
    intermediate_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i]) and np.any(dominance_matrix[i, :])
    ]

    # Add count of times each model is dominated by others
    df["Dominated_Count"] = np.sum(dominance_matrix, axis=0)

    # Classify models
    df["Classification"] = "intermediate_model"  # default to intermediate_model
    df.loc[
        ~np.any(dominance_matrix, axis=1) & np.any(dominance_matrix, axis=0),
        "Classification",
    ] = "non_dominant_model"
    df.loc[~np.any(dominance_matrix, axis=0), "Classification"] = "dominant_model"

    # Map the topological order to the model names and get dominance counts
    sorted_models = [df["Model"].iloc[i] for i in topo_order]

    df_sorted = df.iloc[topo_order].copy()
    df_sorted["Dominates_Count"] = dominance_matrix.sum(axis=1)[topo_order]
    df_sorted_topo = df_sorted.copy()

    df_sorted = df_sorted.sort_values(by="Dominates_Count", ascending=False)
    df_sorted_count = df_sorted.copy()

    return {
        "dominance_matrix": dominance_matrix,
        "dominance_graph": dominance_graph,
        "df_sorted_topo": df_sorted_topo,
        "df_sorted_count": df_sorted_count,
    }

<IPython.core.display.Javascript object>

# Pre Training Analysis

# Reading The files

In [18]:
csv_files_path_fine_tuning = dict()
csv_files_path_pre_train = dict()

<IPython.core.display.Javascript object>

## 203

### Plant C

In [19]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/203/c/pre_training/full/"
plant = "c"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

### Plant K

In [20]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/k/pre_training/full/"
plant = "k"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

## 204

### Plant AB

In [21]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/204/ab/pre_training/full/"
plant = "ab"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

### Plant F

In [22]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/204/f/pre_training/full/"
plant = "f"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

## 206

### Plant B

In [23]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/206/b/pre_training/full/"
plant = "b"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

## 207

### Plant AT

In [24]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/207/at/pre_training/full/"
plant = "at"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

## 209

### Plant S

In [25]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/s/pre_training/full/"
plant = "s"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

### Plant AM

In [26]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/am/pre_training/full/"
plant = "am"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

### Plant W

In [27]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/w/pre_training/full/"
plant = "w"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

### Plant K

In [28]:
path_pre_train = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/k/pre_training/full/"
plant = "k"

csv_files_path_pre_train = read_csv_files_path(
    csv_files_path_pre_train, path_pre_train, plant
)

<IPython.core.display.Javascript object>

### Reading all data - pre training

#### Full

In [29]:
dfs_plant = dict()
column = ("plant", "")
pos = 2

for plant, csv_files in csv_files_path_pre_train.items():
    df = read_csv_files_full(csv_files, plant)
    df["plant"] = plant
    dfs_plant[plant] = df

<IPython.core.display.Javascript object>

In [30]:
df_pre_train = pd.concat(dfs_plant.values(), ignore_index=True)

<IPython.core.display.Javascript object>

In [31]:
df_pre_train.shape

(1533, 23)

<IPython.core.display.Javascript object>

## Preprocessing

In [32]:
patterns = [
    "Chemical + Mineralogical + Feature Engineering",
    "Chemical + Mineralogical + CS7 + One-Hot",
    "Chemical + Mineralogical + Physical + One-Hot",
    "Chemical + Mineralogical + CS3 + One-Hot",
    "Chemical + Mineralogical + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + One-Hot",
    "Chemical + Mineralogical + CS1 + One-Hot",
    "Chemical + Mineralogical + CS2 + One-Hot",
    "Chemical + Feature Engineering",
]

FEATURES_TO_REPLACE_1 = {
    "Chemical + Mineralogical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS2": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS7": "Chemical + Mineralogical + Early CS",
}

FEATURES_TO_REPLACE_2 = {
    "Chemical": "Chemical + Mineralogical",
    "Chemical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Mineralogical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Physical": "Chemical + Mineralogical + Physical",
}

df_pre_train = df_pre_train[
    ~df_pre_train["Features"].apply(lambda x: x in patterns)
].reset_index(drop=True)

df_pre_train = df_pre_train[
    ~df_pre_train["Cross Validation"].eq("Out of time Split")
].reset_index(drop=True)

# removing afterwards
df_pre_train = df_pre_train[
    ~df_pre_train["Cross Validation"].eq("Repeated KFold")
].reset_index(drop=True)

df_pre_train["Features_bkp"] = df_pre_train["Features"].copy()
df_pre_train["Features"] = df_pre_train["Features"].replace(FEATURES_TO_REPLACE_1)

df_pre_train["Features"] = df_pre_train["Features"].replace(FEATURES_TO_REPLACE_2)


# removing afterwards
# df_pre_train = df_pre_train[
#     ~df_pre_train["Features"].eq("Chemical + Mineralogical + Early CS")
# ].reset_index(drop=True)

metrics_names = {
    "RMSE Test": "RMSE_mean",
    "MAE Test": "MAE_mean",
    "MAPE Test": "MAPE_mean",
    "R2 Test": "R2_mean",
}

df_pre_train = df_pre_train.rename(metrics_names, axis=1)

<IPython.core.display.Javascript object>

In [33]:
# Ensuring only the results of the best selected plants will be taken into account

cm = df_pre_train["Features"].eq("Chemical + Mineralogical")
cmp = df_pre_train["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
cmpcs = df_pre_train["Features"].eq("Chemical + Mineralogical + Physical")

fs = df_pre_train["plant"].eq("s")
fab = df_pre_train["plant"].eq("ab")
ff = df_pre_train["plant"].eq("f")
fam = df_pre_train["plant"].eq("am")
fw = df_pre_train["plant"].eq("w")
fk = df_pre_train["plant"].eq("k")

fc = df_pre_train["plant"].eq("c")
fb = df_pre_train["plant"].eq("b")
fat = df_pre_train["plant"].eq("at")

f_final = (
    (cm & (fs | fc | fab | fb | fat | ff))
    | (cmp & (fam | fc | ff | fb | fat | fw))
    | (cmpcs & (fs | fk | fc | ff | fb | fat | fab))
)
df_pre_train = df_pre_train[f_final].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [34]:
df_pre_train["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [35]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [36]:
df_pre_train["MAPE Train"] = df_pre_train["MAPE Train"] * 100
df_pre_train["MAPE_mean"] = df_pre_train["MAPE_mean"] * 100

df_pre_train["Model_bkp"] = df_pre_train["Model"].copy()
df_pre_train["Model_bkp_2"] = df_pre_train["Model"].apply(
    lambda x: "" if pd.isna(x) else "_" + str(x)
)

<IPython.core.display.Javascript object>

In [37]:
df_pre_train["Model_bkp_2"] = df_pre_train["Model_bkp_2"].apply(
    lambda x: "MLP"
    if "MLP" in x
    else "Bi-LSTM"
    if "Bi-LSTM" in x
    else "LSTM"
    if "LSTM" in x
    else "Conv1D"
    if "Conv1D" in x
    else "Transformer"
)

<IPython.core.display.Javascript object>

In [38]:
df_pre_train["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [39]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [40]:
df_pre_train["Features_bkp"].unique()

array(['Chemical + Properties CS Less', 'Chemical + Physical', 'Chemical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [41]:
df_pre_train.shape

(1387, 26)

<IPython.core.display.Javascript object>

In [42]:
df_pre_train[df_pre_train["Plant"].eq("F")]["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [43]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [44]:
df_pre_train["Plant"].value_counts()

Plant
C     219
F     219
B     219
AT    219
AB    146
S     146
K      73
AM     73
W      73
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [45]:
df_pre_train.shape

(1387, 26)

<IPython.core.display.Javascript object>

# Global Analysis (pre train results)

## ECICS

### Variable Grouping: CM

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM were from:**

*BTiSS:*
    Dominance analysis: Plant S
    SCPM:Plant S

*TiSS:*
    Dominance analysis: Plant S
    SCPM:Plant S

In [46]:
df_results_cm = (
    df_pre_train[df_pre_train["Features"].eq("Chemical + Mineralogical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [47]:
df_results_cm["Features"].unique()

array(['Chemical + Mineralogical'], dtype=object)

<IPython.core.display.Javascript object>

In [48]:
df_results_cm.shape

(438, 26)

<IPython.core.display.Javascript object>

#### Hold Out

In [49]:
df_results_cm["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [50]:
df_results_cm_ho = (
    df_results_cm[df_results_cm["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [51]:
df_results_cm_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [52]:
df_results_cm_ho.shape

(438, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [53]:
df_results_cm_ho = compute_scpm(df_results_cm_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [54]:
dominance_dict = make_dominance_analysis(df_results_cm_ho)
dominance_matrix_cm_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [55]:
dominance_matrix_cm_ho.shape, len(dominance_graph_cm_ho)

((438, 438), 438)

<IPython.core.display.Javascript object>

In [56]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    435
dominant_model          2
non_dominant_model      1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [57]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
330,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",7.0,Bi-LSTM11,,Standard Scaler,,...,4.101545,0.845872,at,Chemical,Bi-LSTM11,Bi-LSTM,-1.006033,0,dominant_model,431
334,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",14.0,Bi-LSTM15,,Standard Scaler,,...,4.142043,0.853383,at,Chemical,Bi-LSTM15,Bi-LSTM,-1.023931,0,dominant_model,435
309,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",7.0,LSTM5,,Standard Scaler,,...,4.139438,0.844656,at,Chemical,LSTM5,LSTM,-0.986891,1,intermediate_model,429
313,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",14.0,LSTM9,,Standard Scaler,,...,4.171853,0.848335,at,Chemical,LSTM9,LSTM,-0.991743,1,intermediate_model,432
315,Global Model,207,AT,Chemical + Mineralogical,"(62749, 12)",7.0,LSTM11,,Standard Scaler,,...,4.250362,0.849726,at,Chemical,LSTM11,LSTM,-0.976181,1,intermediate_model,428


<IPython.core.display.Javascript object>

In [58]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [59]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
334,Bi-LSTM15,14.0,2.33,1.77,4.14,0.85,-1.02,0,dominant_model,435,AT
313,LSTM9,14.0,2.37,1.79,4.17,0.85,-0.99,1,intermediate_model,432,AT
330,Bi-LSTM11,7.0,2.39,1.76,4.1,0.85,-1.01,0,dominant_model,431,AT


<IPython.core.display.Javascript object>

##### Top intermediate models

In [60]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
313,LSTM9,14.0,2.37,1.79,4.17,0.85,-0.99,1,intermediate_model,432,AT
309,LSTM5,7.0,2.4,1.78,4.14,0.84,-0.99,1,intermediate_model,429,AT


<IPython.core.display.Javascript object>

##### Top non dominant models

In [61]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
258,Bi-LSTM12,14.0,36.27,35.77,82.27,-35.29,56.36,437,non_dominant_model,0,B


<IPython.core.display.Javascript object>

In [62]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [63]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
Bi-LSTM        90
LSTM           90
Conv1D         90
Transformer    90
MLP            78
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [64]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [65]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [66]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [67]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,90.0,181.42,146.35,0.0,41.25,184.0,321.5,437.0,90.0,220.16,141.95,0.0,75.0,220.0,329.0,435.0
Conv1D,90.0,161.4,98.11,4.0,77.25,158.5,233.0,392.0,90.0,231.43,94.02,22.0,172.0,232.0,301.0,423.0
LSTM,90.0,157.47,127.32,1.0,42.0,133.5,263.0,419.0,90.0,236.16,121.7,17.0,127.75,254.0,335.5,432.0
MLP,78.0,158.83,109.95,7.0,61.5,137.0,257.5,360.0,78.0,219.21,100.03,43.0,119.5,252.0,293.5,390.0
Transformer,90.0,316.63,67.52,201.0,259.0,304.5,380.5,431.0,90.0,76.86,50.52,4.0,27.5,83.5,113.75,195.0


<IPython.core.display.Javascript object>

In [68]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,90,3,intermediate_model,87
Conv1D,90,1,intermediate_model,90
LSTM,90,1,intermediate_model,90
MLP,78,1,intermediate_model,78
Transformer,90,1,intermediate_model,90


<IPython.core.display.Javascript object>

In [69]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    87
             dominant_model         2
             non_dominant_model     1
Conv1D       intermediate_model    90
LSTM         intermediate_model    90
MLP          intermediate_model    78
Transformer  intermediate_model    90
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [70]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,96.67
Bi-LSTM,dominant_model,2.22
Bi-LSTM,non_dominant_model,1.11
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,100.0
MLP,intermediate_model,100.0
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [71]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
330,Bi-LSTM,0,431,dominant_model
334,Bi-LSTM,0,435,dominant_model
309,LSTM,1,429,intermediate_model
313,LSTM,1,432,intermediate_model
315,LSTM,1,428,intermediate_model
...,...,...,...,...
112,Bi-LSTM,432,4,intermediate_model
109,Bi-LSTM,434,3,intermediate_model
39,Bi-LSTM,435,2,intermediate_model
331,Bi-LSTM,436,1,intermediate_model


<IPython.core.display.Javascript object>

In [72]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [73]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,90,220.155556,220.0,16328,19814,45.237443
Conv1D,90,231.433333,232.0,14526,20829,47.554795
LSTM,90,236.155556,254.0,14172,21254,48.525114
MLP,78,219.205128,252.0,12389,17098,39.03653
Transformer,90,76.855556,83.5,28497,6917,15.792237


<IPython.core.display.Javascript object>

In [74]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,90.0,7.333333,5.342221,1.0,1.0,7.0,14.0,14.0,90.0,3.938663,...,321.5,437.0,90.0,220.155556,141.947364,0.0,75.0,220.0,329.0,435.0
Conv1D,90.0,7.333333,5.342221,1.0,1.0,7.0,14.0,14.0,90.0,2.822161,...,233.0,392.0,90.0,231.433333,94.024626,22.0,172.0,232.0,301.0,423.0
LSTM,90.0,7.333333,5.342221,1.0,1.0,7.0,14.0,14.0,90.0,2.847826,...,263.0,419.0,90.0,236.155556,121.698089,17.0,127.75,254.0,335.5,432.0
MLP,0.0,,,,,,,,78.0,2.907567,...,257.5,360.0,78.0,219.205128,100.029653,43.0,119.5,252.0,293.5,390.0
Transformer,90.0,7.333333,5.342221,1.0,1.0,7.0,14.0,14.0,90.0,3.300154,...,380.5,431.0,90.0,76.855556,50.522564,4.0,27.5,83.5,113.75,195.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [75]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [76]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [77]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [78]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [79]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
334,Bi-LSTM15,Bi-LSTM,14.0,2.33,1.77,4.14,0.85,-1.02,0,435,dominant_model,AT,False
330,Bi-LSTM11,Bi-LSTM,7.0,2.39,1.76,4.1,0.85,-1.01,0,431,dominant_model,AT,False
313,LSTM9,LSTM,14.0,2.37,1.79,4.17,0.85,-0.99,1,432,intermediate_model,AT,False
309,LSTM5,LSTM,7.0,2.4,1.78,4.14,0.84,-0.99,1,429,intermediate_model,AT,False
315,LSTM11,LSTM,7.0,2.36,1.8,4.25,0.85,-0.98,1,428,intermediate_model,AT,True
32,Bi-LSTM5,Bi-LSTM,7.0,2.39,1.79,4.2,0.84,-0.98,2,426,intermediate_model,C,False
321,Bi-LSTM2,Bi-LSTM,7.0,2.36,1.8,4.27,0.85,-0.98,1,427,intermediate_model,AT,False


<IPython.core.display.Javascript object>

In [80]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].head(1)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
334,Bi-LSTM15,Bi-LSTM,14.0,2.334487,1.76689,4.142043,0.853383,-1.023931,0,435,dominant_model,AT,False


<IPython.core.display.Javascript object>

### Variable Grouping: CM-P

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P were from:**

*BTiSS:*
    Dominance analysis: Plant AM
    SCPM:Plant AM

*TiSS:*
    Dominance analysis: Plant W
    SCPM:Plant W

In [81]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [82]:
df_results_cm_p = (
    df_pre_train[
        df_pre_train["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [83]:
df_results_cm_p.shape

(438, 26)

<IPython.core.display.Javascript object>

#### Hold Out

In [84]:
df_results_cm_p["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [85]:
df_results_cm_p_ho = (
    df_results_cm_p[df_results_cm_p["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [86]:
df_results_cm_p_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [87]:
df_results_cm_p_ho.shape

(438, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [88]:
df_results_cm_p_ho = compute_scpm(df_results_cm_p_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [89]:
dominance_dict = make_dominance_analysis(df_results_cm_p_ho)
dominance_matrix_cm_p_am_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_am_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [90]:
dominance_matrix_cm_p_am_ho.shape, len(dominance_graph_cm_p_am_ho)

((438, 438), 438)

<IPython.core.display.Javascript object>

In [91]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    435
dominant_model          2
non_dominant_model      1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [92]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/local_models/ecics/ecics_dominance_analysis_cm_kf.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [93]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
285,Global Model,207,AT,Chemical + Mineralogical + Physical - Early CS,"(62749, 16)",,MLP_7,,Standard Scaler,,...,3.514956,0.891546,at,Chemical + Properties CS Less,MLP_7,MLP,-1.7765,0,dominant_model,436
289,Global Model,207,AT,Chemical + Mineralogical + Physical - Early CS,"(62749, 16)",,MLP_11,,Standard Scaler,,...,3.517973,0.893385,at,Chemical + Properties CS Less,MLP_11,MLP,-1.784692,0,dominant_model,436
288,Global Model,207,AT,Chemical + Mineralogical + Physical - Early CS,"(62749, 16)",,MLP_10,,Standard Scaler,,...,3.535726,0.885287,at,Chemical + Properties CS Less,MLP_10,MLP,-1.74188,2,intermediate_model,434
290,Global Model,207,AT,Chemical + Mineralogical + Physical - Early CS,"(62749, 16)",,MLP_12,,Standard Scaler,,...,3.635425,0.887517,at,Chemical + Properties CS Less,MLP_12,MLP,-1.723327,2,intermediate_model,432
284,Global Model,207,AT,Chemical + Mineralogical + Physical - Early CS,"(62749, 16)",,MLP_6,,Standard Scaler,,...,3.616965,0.885106,at,Chemical + Properties CS Less,MLP_6,MLP,-1.719407,3,intermediate_model,433


<IPython.core.display.Javascript object>

In [94]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [95]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
285,MLP_7,,2.01,1.49,3.51,0.89,-1.78,0,dominant_model,436,AT
289,MLP_11,,1.99,1.49,3.52,0.89,-1.78,0,dominant_model,436,AT
288,MLP_10,,2.06,1.51,3.54,0.89,-1.74,2,intermediate_model,434,AT


<IPython.core.display.Javascript object>

##### Top intermediate models

In [96]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
288,MLP_10,,2.06,1.51,3.54,0.89,-1.74,2,intermediate_model,434,AT
284,MLP_6,,2.07,1.54,3.62,0.89,-1.72,3,intermediate_model,433,AT


<IPython.core.display.Javascript object>

##### Top non dominant models

In [97]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
178,Bi-LSTM3,14.0,33.06,32.51,74.59,-29.15,46.3,437,non_dominant_model,0,B


<IPython.core.display.Javascript object>

In [98]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [99]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
Conv1D         90
Bi-LSTM        90
LSTM           90
Transformer    90
MLP            78
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [100]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [101]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [102]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [103]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,90.0,302.58,93.97,76.0,227.5,295.0,388.5,437.0,90.0,118.02,86.63,0.0,40.0,113.0,188.5,340.0
Conv1D,90.0,127.69,60.01,39.0,94.0,119.5,148.0,402.0,90.0,291.39,59.35,28.0,269.25,304.0,326.5,363.0
LSTM,90.0,284.7,92.67,93.0,201.75,287.5,362.5,422.0,90.0,134.82,87.04,9.0,63.25,121.5,212.0,319.0
MLP,78.0,31.77,22.16,0.0,13.25,28.0,50.0,81.0,78.0,388.28,26.43,336.0,369.25,384.0,408.75,436.0
Transformer,90.0,277.1,69.02,145.0,226.75,275.0,323.75,430.0,90.0,138.86,63.41,6.0,99.0,137.0,180.5,281.0


<IPython.core.display.Javascript object>

In [104]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,90,2,intermediate_model,89
Conv1D,90,1,intermediate_model,90
LSTM,90,1,intermediate_model,90
MLP,78,2,intermediate_model,76
Transformer,90,1,intermediate_model,90


<IPython.core.display.Javascript object>

In [105]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    89
             non_dominant_model     1
Conv1D       intermediate_model    90
LSTM         intermediate_model    90
MLP          intermediate_model    76
             dominant_model         2
Transformer  intermediate_model    90
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [106]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,98.89
Bi-LSTM,non_dominant_model,1.11
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,100.0
MLP,intermediate_model,97.44
MLP,dominant_model,2.56
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [107]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
285,MLP,0,436,dominant_model
289,MLP,0,436,dominant_model
288,MLP,2,434,intermediate_model
290,MLP,2,432,intermediate_model
284,MLP,3,433,intermediate_model
...,...,...,...,...
257,Bi-LSTM,433,4,intermediate_model
41,Bi-LSTM,434,3,intermediate_model
332,Bi-LSTM,435,2,intermediate_model
111,Bi-LSTM,436,1,intermediate_model


<IPython.core.display.Javascript object>

In [108]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [109]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,90,118.022222,113.0,27232,10622,24.251142
Conv1D,90,291.388889,304.0,11492,26225,59.874429
LSTM,90,134.822222,121.5,25623,12134,27.703196
MLP,78,388.282051,384.0,2478,30286,69.146119
Transformer,90,138.855556,137.0,24939,12497,28.531963


<IPython.core.display.Javascript object>

In [110]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,90.0,7.333333,5.342221,1.0,1.0,7.0,14.0,14.0,90.0,5.181954,...,388.5,437.0,90.0,118.022222,86.628352,0.0,40.0,113.0,188.5,340.0
Conv1D,90.0,7.333333,5.342221,1.0,1.0,7.0,14.0,14.0,90.0,2.709816,...,148.0,402.0,90.0,291.388889,59.346383,28.0,269.25,304.0,326.5,363.0
LSTM,90.0,7.333333,5.342221,1.0,1.0,7.0,14.0,14.0,90.0,3.797851,...,362.5,422.0,90.0,134.822222,87.038617,9.0,63.25,121.5,212.0,319.0
MLP,0.0,,,,,,,,78.0,2.286617,...,50.0,81.0,78.0,388.282051,26.428977,336.0,369.25,384.0,408.75,436.0
Transformer,90.0,7.333333,5.342221,1.0,1.0,7.0,14.0,14.0,90.0,3.543584,...,323.75,430.0,90.0,138.855556,63.410835,6.0,99.0,137.0,180.5,281.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [111]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [112]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [113]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [114]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [115]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
289,MLP_11,MLP,,1.99,1.49,3.52,0.89,-1.78,0,436,dominant_model,AT,False
285,MLP_7,MLP,,2.01,1.49,3.51,0.89,-1.78,0,436,dominant_model,AT,False
288,MLP_10,MLP,,2.06,1.51,3.54,0.89,-1.74,2,434,intermediate_model,AT,True
290,MLP_12,MLP,,2.04,1.54,3.64,0.89,-1.72,2,432,intermediate_model,AT,True
284,MLP_6,MLP,,2.07,1.54,3.62,0.89,-1.72,3,433,intermediate_model,AT,True
283,MLP_5,MLP,,2.09,1.54,3.63,0.88,-1.7,4,432,intermediate_model,AT,True
291,MLP_13,MLP,,2.1,1.57,3.7,0.88,-1.68,6,429,intermediate_model,AT,False


<IPython.core.display.Javascript object>

### Variable Grouping: CM-P-CS

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P-CS were from:**

*BTiSS:*
    Dominance analysis: Plant S
    SCPM:Plant S and K

*TiSS:*
    Dominance analysis: Plant S
    SCPM:Plant S

In [116]:
df_pre_train["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS',
       'Chemical + Mineralogical + Physical', 'Chemical + Mineralogical'],
      dtype=object)

<IPython.core.display.Javascript object>

In [117]:
df_results_cm_p_cs = (
    df_pre_train[df_pre_train["Features"].eq("Chemical + Mineralogical + Physical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [118]:
df_results_cm_p_cs.shape

(511, 26)

<IPython.core.display.Javascript object>

In [119]:
# Here is the reason why the variable group cm-p-cs has more results
df_results_cm_p_cs["Plant"].nunique(), df_results_cm_p[
    "Plant"
].nunique(), df_results_cm["Plant"].nunique()

(7, 6, 6)

<IPython.core.display.Javascript object>

#### Hold Out

In [120]:
df_results_cm_p_cs["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [121]:
df_results_cm_p_cs_ho = (
    df_results_cm_p_cs[df_results_cm_p_cs["Cross Validation"].eq("Out of time")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [122]:
df_results_cm_p_cs_ho["Cross Validation"].unique()

array(['Out of time'], dtype=object)

<IPython.core.display.Javascript object>

In [123]:
df_results_cm_p_cs_ho.shape

(511, 26)

<IPython.core.display.Javascript object>

###### SCPM computation

In [124]:
df_results_cm_p_cs_ho = compute_scpm(df_results_cm_p_cs_ho)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [125]:
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_ho)
dominance_matrix_cm_p_cs_s_ho = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_s_ho = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [126]:
dominance_matrix_cm_p_cs_s_ho.shape, len(dominance_graph_cm_p_cs_s_ho)

((511, 511), 511)

<IPython.core.display.Javascript object>

In [127]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    508
non_dominant_model      2
dominant_model          1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [128]:
df_sorted_topo.head(5)

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,MAPE_mean,R2_mean,plant,Features_bkp,Model_bkp,Model_bkp_2,SCPM,Dominated_Count,Classification,Dominates_Count
385,Global Model,207,AT,Chemical + Mineralogical + Physical,"(62749, 18)",,MLP_6,,Standard Scaler,,...,2.505854,0.944895,at,Chemical + Physical,MLP_6,MLP,-1.593756,0,dominant_model,510
390,Global Model,207,AT,Chemical + Mineralogical + Physical,"(62749, 18)",,MLP_11,,Standard Scaler,,...,2.525874,0.944668,at,Chemical + Physical,MLP_11,MLP,-1.587341,1,intermediate_model,509
386,Global Model,207,AT,Chemical + Mineralogical + Physical,"(62749, 18)",,MLP_7,,Standard Scaler,,...,2.541619,0.942447,at,Chemical + Physical,MLP_7,MLP,-1.571952,2,intermediate_model,507
391,Global Model,207,AT,Chemical + Mineralogical + Physical,"(62749, 18)",,MLP_12,,Standard Scaler,,...,2.59103,0.94271,at,Chemical + Physical,MLP_12,MLP,-1.55888,2,intermediate_model,505
464,Global Model,209,S,Chemical + Mineralogical + Physical,"(61946, 17)",,MLP_12,,Standard Scaler,,...,2.567879,0.940373,s,Chemical + Physical,MLP_12,MLP,-1.562532,3,intermediate_model,504


<IPython.core.display.Javascript object>

In [129]:
df_sorted_topo_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

##### Top dominant models

In [130]:
# Top 4
df_sorted_topo_cols.sort_values(by="Dominates_Count", ascending=False).iloc[0:3]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
385,MLP_6,,1.43,1.07,2.51,0.94,-1.59,0,dominant_model,510,AT
390,MLP_11,,1.43,1.07,2.53,0.94,-1.59,1,intermediate_model,509,AT
386,MLP_7,,1.46,1.08,2.54,0.94,-1.57,2,intermediate_model,507,AT


<IPython.core.display.Javascript object>

##### Top intermediate models

In [131]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by="Dominates_Count", ascending=False).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
390,MLP_11,,1.43,1.07,2.53,0.94,-1.59,1,intermediate_model,509,AT
386,MLP_7,,1.46,1.08,2.54,0.94,-1.57,2,intermediate_model,507,AT


<IPython.core.display.Javascript object>

##### Top non dominant models

In [132]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by="Dominated_Count", ascending=True).iloc[:2]

Unnamed: 0,Model,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
246,MLP_13,,49.65,1.84,4.51,-67.67,32.74,294,non_dominant_model,0,F
346,Bi-LSTM12,14.0,28.06,27.4,62.55,-20.71,32.56,509,non_dominant_model,0,B


<IPython.core.display.Javascript object>

In [133]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "Timesteps",
        "RMSE_mean",
        "MAE_mean",
        "MAPE_mean",
        "R2_mean",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [134]:
df_sorted_topo["Model_bkp_2"].value_counts()

Model_bkp_2
Conv1D         105
Bi-LSTM        105
LSTM           105
Transformer    105
MLP             91
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [135]:
df_sorted_topo_models_grouped = (
    df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().T
)

<IPython.core.display.Javascript object>

In [136]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [137]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model_bkp_2",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [138]:
df_sorted_topo_models_cols.groupby("Model_bkp_2").describe().round(2)

Unnamed: 0_level_0,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Bi-LSTM,105.0,335.26,114.56,77.0,251.0,328.0,443.0,509.0,105.0,157.4,107.43,0.0,60.0,154.0,239.0,416.0
Conv1D,105.0,144.64,59.98,61.0,100.0,128.0,176.0,322.0,105.0,338.95,63.25,158.0,297.0,351.0,388.0,437.0
LSTM,105.0,314.93,120.53,88.0,219.0,312.0,442.0,508.0,105.0,177.8,113.5,1.0,60.0,195.0,266.0,412.0
MLP,91.0,52.08,55.31,0.0,17.0,36.0,61.5,294.0,91.0,442.05,73.68,0.0,438.0,461.0,478.5,510.0
Transformer,105.0,355.9,78.31,139.0,316.0,366.0,409.0,499.0,105.0,138.6,72.75,7.0,89.0,127.0,175.0,346.0


<IPython.core.display.Javascript object>

In [139]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bi-LSTM,105,2,intermediate_model,104
Conv1D,105,1,intermediate_model,105
LSTM,105,1,intermediate_model,105
MLP,91,3,intermediate_model,89
Transformer,105,1,intermediate_model,105


<IPython.core.display.Javascript object>

In [140]:
df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]].value_counts()

Model_bkp_2  Classification    
Bi-LSTM      intermediate_model    104
             non_dominant_model      1
Conv1D       intermediate_model    105
LSTM         intermediate_model    105
MLP          intermediate_model     89
             dominant_model          1
             non_dominant_model      1
Transformer  intermediate_model    105
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [141]:
(
    df_sorted_topo_models_cols.groupby("Model_bkp_2")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model_bkp_2,Classification,Unnamed: 2_level_1
Bi-LSTM,intermediate_model,99.05
Bi-LSTM,non_dominant_model,0.95
Conv1D,intermediate_model,100.0
LSTM,intermediate_model,100.0
MLP,intermediate_model,97.8
MLP,dominant_model,1.1
MLP,non_dominant_model,1.1
Transformer,intermediate_model,100.0


<IPython.core.display.Javascript object>

In [142]:
df_sorted_topo_models_cols

Unnamed: 0,Model_bkp_2,Dominated_Count,Dominates_Count,Classification
385,MLP,0,510,dominant_model
390,MLP,1,509,intermediate_model
386,MLP,2,507,intermediate_model
391,MLP,2,505,intermediate_model
464,MLP,3,504,intermediate_model
...,...,...,...,...
297,LSTM,505,4,intermediate_model
419,Bi-LSTM,506,3,intermediate_model
5,LSTM,507,2,intermediate_model
229,LSTM,508,1,intermediate_model


<IPython.core.display.Javascript object>

In [143]:
summary_stats = df_sorted_topo_models_cols.groupby("Model_bkp_2").agg(
    Total_Models=("Model_bkp_2", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [144]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model_bkp_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bi-LSTM,105,157.4,154.0,35202,16527,32.342466
Conv1D,105,338.952381,351.0,15187,35590,69.64775
LSTM,105,177.8,195.0,33068,18669,36.534247
MLP,91,442.054945,461.0,4739,40227,78.722114
Transformer,105,138.6,127.0,37370,14553,28.479452


<IPython.core.display.Javascript object>

In [145]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,Timesteps,RMSE_mean,RMSE_mean,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model_bkp_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bi-LSTM,105.0,7.333333,5.337939,1.0,1.0,7.0,14.0,14.0,105.0,3.986665,...,443.0,509.0,105.0,157.4,107.433038,0.0,60.0,154.0,239.0,416.0
Conv1D,105.0,7.333333,5.337939,1.0,1.0,7.0,14.0,14.0,105.0,1.91833,...,176.0,322.0,105.0,338.952381,63.253668,158.0,297.0,351.0,388.0,437.0
LSTM,105.0,7.333333,5.337939,1.0,1.0,7.0,14.0,14.0,105.0,3.649342,...,442.0,508.0,105.0,177.8,113.504863,1.0,60.0,195.0,266.0,412.0
MLP,0.0,,,,,,,,91.0,2.140752,...,61.5,294.0,91.0,442.054945,73.681652,0.0,438.0,461.0,478.5,510.0
Transformer,105.0,7.333333,5.337939,1.0,1.0,7.0,14.0,14.0,105.0,3.065799,...,409.0,499.0,105.0,138.6,72.747403,7.0,89.0,127.0,175.0,346.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [146]:
cols = [
    "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

In [147]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [148]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [149]:
# df_sorted_topo[cols].round(2).head(5)

<IPython.core.display.Javascript object>

In [150]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(7)

Unnamed: 0,Model,Model_bkp_2,Timesteps,RMSE_mean,MAE_mean,MAPE_mean,R2_mean,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
385,MLP_6,MLP,,1.43,1.07,2.51,0.94,-1.59,0,510,dominant_model,AT,True
390,MLP_11,MLP,,1.43,1.07,2.53,0.94,-1.59,1,509,intermediate_model,AT,True
386,MLP_7,MLP,,1.46,1.08,2.54,0.94,-1.57,2,507,intermediate_model,AT,True
464,MLP_12,MLP,,1.47,1.09,2.57,0.94,-1.56,3,504,intermediate_model,S,False
391,MLP_12,MLP,,1.46,1.1,2.59,0.94,-1.56,2,505,intermediate_model,AT,False
384,MLP_5,MLP,,1.49,1.1,2.57,0.94,-1.55,4,502,intermediate_model,AT,False
389,MLP_10,MLP,,1.49,1.11,2.61,0.94,-1.54,4,501,intermediate_model,AT,True


<IPython.core.display.Javascript object>

# Global Analysis (fine tuning results)

# Definitions

# Topological Analysis - NEW

## Definitions

In [151]:
# Function to compare two models strictly based on means only
def dominates(model1, model2):
    """Return True if model1 strictly dominates model2, False otherwise."""
    # Check if model1 is at least as good as model2 in all metrics
    at_least_as_good = (
        model1["RMSE_mean"] <= model2["RMSE_mean"]
        and model1["MAE_mean"] <= model2["MAE_mean"]
        and model1["MAPE_mean"] <= model2["MAPE_mean"]
        and model1["R2_mean"] >= model2["R2_mean"]
    )

    # Check if model1 is strictly better in at least one metric
    strictly_better_in_one = (
        model1["RMSE_mean"] < model2["RMSE_mean"]
        or model1["MAE_mean"] < model2["MAE_mean"]
        or model1["MAPE_mean"] < model2["MAPE_mean"]
        or model1["R2_mean"] > model2["R2_mean"]
    )

    # Model1 strictly dominates Model2 if it is at least as good in all metrics and
    # strictly better in at least one metric
    return at_least_as_good and strictly_better_in_one

<IPython.core.display.Javascript object>

In [152]:
def get_dominance_matrix_and_graph(df):
    n = len(df)
    dominance_matrix = np.zeros((n, n), dtype=bool)

    for i in range(n):
        for j in range(n):
            if i != j:
                dominance_matrix[i, j] = dominates(df.iloc[i], df.iloc[j])

    # Create the dominance graph
    dominance_graph = {i: [] for i in range(n)}
    for i in range(n):
        for j in range(n):
            if dominance_matrix[i, j]:
                dominance_graph[i].append(j)

    return dominance_matrix, dominance_graph

<IPython.core.display.Javascript object>

In [153]:
# Topological Sorting
def topological_sort(graph):
    """Perform topological sorting on the given graph."""
    in_degree = {u: 0 for u in graph}
    for u in graph:
        for v in graph[u]:
            in_degree[v] += 1

    queue = deque([u for u in graph if in_degree[u] == 0])
    topo_order = []

    while queue:
        u = queue.popleft()
        topo_order.append(u)

        for v in graph[u]:
            in_degree[v] -= 1
            if in_degree[v] == 0:
                queue.append(v)

    return topo_order

<IPython.core.display.Javascript object>

In [154]:
# Function to find the top models in each group
def find_top_models(group):
    max_net_dominance = group["Net_Dominance"].max()
    top_models = group[group["Net_Dominance"] == max_net_dominance]
    return top_models

<IPython.core.display.Javascript object>

### Dominance Analysis and SCPM

In [155]:
def compute_scpm(df):
    scaler = StandardScaler()
    standardized_metrics = scaler.fit_transform(
        df[["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"]]
    )

    # Creating a new dataframe with standardized values
    standardized_df = pd.DataFrame(
        standardized_metrics,
        columns=["RMSE_mean", "MAE_mean", "MAPE_mean", "R2_mean"],
    )

    # Summing all metrics and subtracting R2_mean
    standardized_df["Result"] = (
        standardized_df[["RMSE_mean", "MAE_mean", "MAPE_mean"]].sum(axis=1)
        - standardized_df["R2_mean"]
    )

    df["SCPM"] = standardized_df["Result"]
    return df

<IPython.core.display.Javascript object>

In [156]:
def make_dominance_analysis(df):
    # Compute dominance matrix and graph
    dominance_matrix, dominance_graph = get_dominance_matrix_and_graph(df)

    # Get the topological order
    topo_order = topological_sort(dominance_graph)
    df = df.copy()
    n = len(topo_order)

    # Identify Dominant Models
    dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if not np.any(dominance_matrix[:, i])
    ]

    # Identify Non-Dominant Models
    non_dominant_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i])
    ]

    # Identify Models that are both dominated and dominate at least one
    intermediate_models = [
        df["Model_bkp_2"].iloc[i]
        for i in range(len(df))
        if np.any(dominance_matrix[:, i]) and np.any(dominance_matrix[i, :])
    ]

    # Add count of times each model is dominated by others
    df["Dominated_Count"] = np.sum(dominance_matrix, axis=0)

    # Classify models
    df["Classification"] = "intermediate_model"  # default to intermediate_model
    df.loc[
        ~np.any(dominance_matrix, axis=1) & np.any(dominance_matrix, axis=0),
        "Classification",
    ] = "non_dominant_model"
    df.loc[~np.any(dominance_matrix, axis=0), "Classification"] = "dominant_model"

    # Map the topological order to the model names and get dominance counts
    sorted_models = [df["Model"].iloc[i] for i in topo_order]

    df_sorted = df.iloc[topo_order].copy()
    df_sorted["Dominates_Count"] = dominance_matrix.sum(axis=1)[topo_order]
    df_sorted_topo = df_sorted.copy()

    df_sorted = df_sorted.sort_values(by="Dominates_Count", ascending=False)
    df_sorted_count = df_sorted.copy()

    return {
        "dominance_matrix": dominance_matrix,
        "dominance_graph": dominance_graph,
        "df_sorted_topo": df_sorted_topo,
        "df_sorted_count": df_sorted_count,
    }

<IPython.core.display.Javascript object>

# Reading The files

In [157]:
csv_files_path_fine_tuning_full = dict()
csv_files_path_fine_tuning_grouped = dict()

<IPython.core.display.Javascript object>

## 203

### Plant C

In [158]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/203/c/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/203/c/fine_tuning/grouped/"
plant = "c"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

## 204

### Plant F

In [159]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/204/f/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/204/f/fine_tuning/grouped/"
plant = "f"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

### Plant AB

In [160]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/204/ab/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/204/ab/fine_tuning/grouped/"
plant = "ab"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

## 206

### Plant B

In [161]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/206/b/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/206/b/fine_tuning/grouped/"
plant = "b"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

## 207

### Plant AT

In [162]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/207/at/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/207/at/fine_tuning/grouped/"
plant = "at"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

## 209

### Plant S

In [163]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/s/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/s/fine_tuning/grouped/"
plant = "s"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

### Plant AM

In [164]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/am/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/am/fine_tuning/grouped/"
plant = "am"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

### Plant W

In [165]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/w/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/w/fine_tuning/grouped/"
plant = "w"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

### Plant K

In [166]:
path_fine_tuning_full = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/k/fine_tuning/full/"
path_fine_tuning_grouped = "/home/peressim/projects/ccs28-ml-modelling/reports/results/global_models/209/k/fine_tuning/grouped/"
plant = "k"

csv_files_path_fine_tuning_full = read_csv_files_path(
    csv_files_path_fine_tuning_full, path_fine_tuning_full, plant
)

csv_files_path_fine_tuning_grouped = read_csv_files_path(
    csv_files_path_fine_tuning_grouped, path_fine_tuning_grouped, plant
)

<IPython.core.display.Javascript object>

### Reading all data - pre training

#### Full

In [167]:
dfs_plant = dict()
column = ("plant", "")
pos = 2

for plant, csv_files in csv_files_path_fine_tuning_full.items():
    df = read_csv_files_full(csv_files, plant)
    df["plant"] = plant
    dfs_plant[plant] = df

<IPython.core.display.Javascript object>

In [168]:
df_fine_tuning_full = pd.concat(dfs_plant.values(), ignore_index=True)

<IPython.core.display.Javascript object>

In [169]:
df_fine_tuning_full.shape

(4185, 23)

<IPython.core.display.Javascript object>

In [170]:
df_fine_tuning_full_copy = (
    df_fine_tuning_full[
        df_fine_tuning_full["Cross Validation"].isin(
            ["Blocking Time Series Split", "Time Series Split"]
        )
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [171]:
df_fine_tuning_full_copy.shape

(3120, 23)

<IPython.core.display.Javascript object>

In [172]:
df_fine_tuning_full_copy["Cross Validation"].value_counts()

Cross Validation
Blocking Time Series Split    1560
Time Series Split             1560
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [173]:
df_fine_tuning_full_copy["Features"].value_counts()

Features
Chemical + Physical              1200
Chemical                         1020
Chemical + Properties CS Less     900
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [174]:
df_fine_tuning_full_copy[
    df_fine_tuning_full_copy["Features"].eq("Chemical")
    & df_fine_tuning_full_copy["Cross Validation"].eq("Blocking Time Series Split")
].shape

(510, 23)

<IPython.core.display.Javascript object>

In [175]:
df_fine_tuning_full_copy[
    df_fine_tuning_full_copy["Features"].eq("Chemical")
    & df_fine_tuning_full_copy["Cross Validation"].eq("Time Series Split")
].shape

(510, 23)

<IPython.core.display.Javascript object>

In [176]:
df_fine_tuning_full_copy[
    df_fine_tuning_full_copy["Features"].eq("Chemical + Properties CS Less")
    & df_fine_tuning_full_copy["Cross Validation"].eq("Blocking Time Series Split")
].shape

(450, 23)

<IPython.core.display.Javascript object>

In [177]:
df_fine_tuning_full_copy[
    df_fine_tuning_full_copy["Features"].eq("Chemical + Properties CS Less")
    & df_fine_tuning_full_copy["Cross Validation"].eq("Time Series Split")
].shape

(450, 23)

<IPython.core.display.Javascript object>

In [178]:
df_fine_tuning_full_copy[
    df_fine_tuning_full_copy["Features"].eq("Chemical + Physical")
    & df_fine_tuning_full_copy["Cross Validation"].eq("Blocking Time Series Split")
].shape

(600, 23)

<IPython.core.display.Javascript object>

In [179]:
df_fine_tuning_full_copy[
    df_fine_tuning_full_copy["Features"].eq("Chemical + Physical")
    & df_fine_tuning_full_copy["Cross Validation"].eq("Time Series Split")
    & ~df_fine_tuning_full_copy["Plant"].eq("K")
].shape

(525, 23)

<IPython.core.display.Javascript object>

#### Grouped

In [180]:
dfs_plant = dict()
column = ("plant", "")
pos = 2

for plant, csv_files in csv_files_path_fine_tuning_grouped.items():
    df = read_csv_files_grouped(csv_files, plant)
    df["plant"] = plant
    df = preprocess_rename_columns(df)
    df = preprocess_change_columns_order(df, column, pos)
    dfs_plant[plant] = df

<IPython.core.display.Javascript object>

In [181]:
df_fine_tuning_grouped = pd.concat(dfs_plant.values(), ignore_index=True)

<IPython.core.display.Javascript object>

In [182]:
df_fine_tuning_grouped.shape

(327, 25)

<IPython.core.display.Javascript object>

In [183]:
df_copy = df_fine_tuning_grouped.copy()
df_copy = (
    df_copy.reset_index(level=0)
    .rename({"level_0": "Plant"}, axis=1)
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

## Preprocessing steps

In [184]:
patterns = [
    "Chemical + Mineralogical + Feature Engineering",
    "Chemical + Mineralogical + CS7 + One-Hot",
    "Chemical + Mineralogical + Physical + One-Hot",
    "Chemical + Mineralogical + CS3 + One-Hot",
    "Chemical + Mineralogical + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + CS7 + One-Hot",
    "Chemical + Mineralogical + CS1 + CS3 + One-Hot",
    "Chemical + Mineralogical + CS1 + One-Hot",
    "Chemical + Mineralogical + CS2 + One-Hot",
    "Chemical + Feature Engineering",
]

replace_dict = {
    "Chemical + Mineralogical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + Mineralogical + CS2": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3 + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS7": "Chemical + Mineralogical + Early CS",
    "Chemical + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS3": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1": "Chemical + Mineralogical + Early CS",
    "Chemical + CS1 + CS7": "Chemical + Mineralogical + Early CS",
}

replace_dict_2 = {
    "Chemical": "Chemical + Mineralogical",
    "Chemical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Mineralogical + Properties CS Less": "Chemical + Mineralogical + Physical - Early CS",
    "Chemical + Physical": "Chemical + Mineralogical + Physical",
}

df_copy = df_copy[~df_copy["Cross Validation"].eq("Out of time Split")].reset_index(
    drop=True
)

# Removing afterwards
df_copy = df_copy[~df_copy["Cross Validation"].eq("Repeated KFold")].reset_index(
    drop=True
)

df_copy = df_copy[~df_copy["Features"].apply(lambda x: x in patterns)].reset_index(
    drop=True
)
df_copy["Features_bkp"] = df_copy["Features"].copy()
df_copy["Features"] = df_copy["Features"].replace(replace_dict)

df_copy["Features"] = df_copy["Features"].replace(replace_dict_2)

# This plant wasn't between the best models for this variable grouping configuration, hence it was removed
# df_copy = df_copy[
#     ~(
#         df_copy["plant"].eq("am")
#         & df_copy["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
#     )
# ]

<IPython.core.display.Javascript object>

In [185]:
df_copy["plant"].unique()

array(['c', 'f', 'ab', 'b', 'at', 's', 'am', 'w', 'k'], dtype=object)

<IPython.core.display.Javascript object>

In [186]:
# Ensuring only the results of the best selected plants will be taken into account

cm = df_copy["Features"].eq("Chemical + Mineralogical")
cmp = df_copy["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
cmpcs = df_copy["Features"].eq("Chemical + Mineralogical + Physical")

btss = df_copy["Cross Validation"].eq("Blocking Time Series Split")
tss = df_copy["Cross Validation"].eq("Time Series Split")

fs = df_copy["plant"].eq("s")
fab = df_copy["plant"].eq("ab")
ff = df_copy["plant"].eq("f")
fam = df_copy["plant"].eq("am")
fw = df_copy["plant"].eq("w")
fk = df_copy["plant"].eq("k")

fc = df_copy["plant"].eq("c")
fb = df_copy["plant"].eq("b")
fat = df_copy["plant"].eq("at")

f_final = (
    (cm & ((btss & (fs | fc | fab | fb | fat)) | (tss & (fs | fc | ff | fb | fat))))
    | (cmp & ((btss & (fam | fc | ff | fb | fat)) | (tss & (fw | fc | ff | fb | fat))))
    | (
        cmpcs
        & ((btss & (fs | fk | fc | ff | fb | fat)) | (tss & (fs | fc | fab | fb | fat)))
    )
)
df_copy = df_copy[f_final].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [187]:
# for feature in df_copy["Features"].unique():
#     print()
#     cvs = df_copy["Cross Validation"].unique()
#     print("================================")
#     print("Feature: ", feature)
#     print("================================")
#     for cv in cvs:
#         print("================================")
#         print(cv)
#         print("================================")
#         plants_df = df_copy[
#             df_copy["Cross Validation"].eq(cv) & df_copy["Features"].eq(feature)
#         ]
#         for plant in plants_df["plant"].unique():
#             print(plant)
#         print("================================")
#         print()

<IPython.core.display.Javascript object>

In [188]:
df_copy[("MAPE Train", "mean")] = df_copy[("MAPE Train", "mean")] * 100
df_copy[("MAPE Train", "std")] = df_copy[("MAPE Train", "std")] * 100
df_copy[("MAPE Test", "mean")] = df_copy[("MAPE Test", "mean")] * 100
df_copy[("MAPE Test", "std")] = df_copy[("MAPE Test", "std")] * 100

df_copy["Model_bkp"] = df_copy["Model"].copy()
df_copy["Model_bkp_2"] = df_copy["Model"] + df_copy["Timesteps"].apply(
    lambda x: "" if pd.isna(x) else "_" + str(x)
)
df_copy["Model"] = df_copy["Model"].replace(
    {
        "MLP": "Neural Networks",
        "LSTM": "Neural Networks",
        "GRU": "Neural Networks",
        "BidirectionalLSTM": "Neural Networks",
        "BidirectionalGRU": "Neural Networks",
        "Transformer": "Neural Networks",
        "Decision Tree": "Trees",
        "Random Forest": "Trees",
        "XGBoost": "Trees",
    }
)

<IPython.core.display.Javascript object>

In [189]:
df_copy["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [190]:
df_copy["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [191]:
df_copy["Model"].unique()

array(['Neural Networks', 'Conv1D'], dtype=object)

<IPython.core.display.Javascript object>

In [192]:
df_copy["Model_bkp"].unique()

array(['Transformer', 'MLP', 'BidirectionalLSTM', 'Conv1D', 'LSTM'],
      dtype=object)

<IPython.core.display.Javascript object>

In [193]:
df_copy["Model_bkp_2"].unique()

array(['Transformer_7.0', 'MLP', 'BidirectionalLSTM_7.0', 'Conv1D_7.0',
       'BidirectionalLSTM_1.0', 'LSTM_1.0', 'LSTM_7.0', 'Conv1D_14.0',
       'Transformer_1.0', 'BidirectionalLSTM_14.0', 'Transformer_14.0',
       'LSTM_14.0', 'Conv1D_1.0'], dtype=object)

<IPython.core.display.Javascript object>

In [194]:
df_copy.shape

(155, 29)

<IPython.core.display.Javascript object>

In [195]:
df_copy_grouped = df_copy.copy()

<IPython.core.display.Javascript object>

In [196]:
df_copy[
    [
        ("RMSE Test", "mean"),
        ("MAE Test", "mean"),
        ("MAPE Test", "mean"),
        ("R2 Test", "mean"),
    ]
].describe().round(2).T

Unnamed: 0,Unnamed: 1,count,mean,std,min,25%,50%,75%,max
RMSE Test,mean,155.0,5.46,36.26,0.71,1.65,2.05,2.84,453.49
MAE Test,mean,155.0,3.41,20.03,0.54,1.23,1.56,2.06,250.83
MAPE Test,mean,155.0,7.66,43.69,1.34,2.86,3.6,4.71,547.28
R2 Test,mean,155.0,-79.24,985.03,-12263.58,-0.04,0.67,0.93,0.98


<IPython.core.display.Javascript object>

In [197]:
df_copy["plant"].unique()

array(['c', 'f', 'ab', 'b', 'at', 's', 'am', 'w', 'k'], dtype=object)

<IPython.core.display.Javascript object>

In [198]:
df_copy["Features"].value_counts()

Features
Chemical + Mineralogical + Physical               55
Chemical + Mineralogical                          50
Chemical + Mineralogical + Physical - Early CS    50
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [199]:
df_copy[df_copy["plant"].eq("am")]["Features"].value_counts()

Features
Chemical + Mineralogical + Physical - Early CS    5
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [200]:
df_copy["plant"].value_counts()

plant
c     30
b     30
at    30
f     20
s     20
ab    10
am     5
w      5
k      5
Name: count, dtype: int64

<IPython.core.display.Javascript object>

# Global Analysis (fine tuning results)

## ECICS

In [201]:
df_ecics_grouped = df_copy.copy()

<IPython.core.display.Javascript object>

In [202]:
df_results = df_ecics_grouped[
    [
        ("Company", ""),
        ("plant", ""),
        ("Cross Validation", ""),
        ("Features", ""),
        ('Features_bkp', ''),
        ("Model", ""),
        ("Model_bkp", ""),         
        ('Model_bkp_2',''),
        ("Timesteps", ""),
        ("RMSE Test", "mean"),
        ("MAE Test", "mean"),
        ("MAPE Test", "mean"),
        ("R2 Test", "mean"),
        ("RMSE Test", "std"),
        ("MAE Test", "std"),
        ("MAPE Test", "std"),
        ("R2 Test", "std"),
    ]
].copy()
df_results = df_results.reset_index(drop=True)

# Rename columns
new_column_names = [
    "Company",
    "Plant",
    "Cross Validation",
    "Features",
    "Features_bkp",
    "Model",
    "Model_bkp",
    "Model_bkp_2",
    "Timesteps",
    "RMSE_mean",
    "MAE_mean",
    "MAPE_mean",
    "R2_mean",
    "RMSE_std",
    "MAE_std",
    "MAPE_std",
    "R2_std",
]

df_results.columns = new_column_names


<IPython.core.display.Javascript object>

In [203]:
df_results.shape

(155, 17)

<IPython.core.display.Javascript object>

In [204]:
df_results["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

## Variable Grouping: CM

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM were from:**

*BTiSS:*
    Dominance analysis: Plant S
    SCPM:Plant S

*TiSS:*
    Dominance analysis: Plant S
    SCPM:Plant S

In [205]:
df_results["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [206]:
df_results_cm = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [207]:
df_results_cm["Features"].unique()

array(['Chemical + Mineralogical'], dtype=object)

<IPython.core.display.Javascript object>

In [208]:
df_results_cm.shape

(50, 17)

<IPython.core.display.Javascript object>

### Blocking time series

In [209]:
df_results_cm["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [210]:
df_results_cm_btss = (
    df_results_cm[df_results_cm["Cross Validation"].eq("Blocking Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [211]:
df_results_cm_btss["Cross Validation"].unique()

array(['Blocking Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [212]:
df_results_cm_btss.shape

(25, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [213]:
df_results_cm_btss = compute_scpm(df_results_cm_btss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [214]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_btss)
dominance_matrix_cm_btss = dominance_dict["dominance_matrix"]
dominance_graph_cm_btss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [215]:
dominance_matrix_cm_btss.shape, len(dominance_graph_cm_btss)

((25, 25), 25)

<IPython.core.display.Javascript object>

In [216]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    22
dominant_model         2
non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [217]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/global_models/ecics/ecics_209_s_dominance_analysis_cm_btss.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [218]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
20,209,s,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,Transformer,Transformer_1.0,1.0,1.16241,...,2.142578,0.94641,0.422754,0.235959,0.610615,0.04138,-0.909737,0,dominant_model,23
23,209,s,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,MLP,MLP,,1.213463,...,2.326368,0.949241,0.137581,0.084595,0.223857,0.01128,-0.905715,0,dominant_model,23
22,209,s,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Conv1D,Conv1D,Conv1D_7.0,7.0,1.437166,...,2.612071,0.929066,0.312455,0.202023,0.393569,0.024494,-0.897757,2,intermediate_model,22
0,203,c,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,Transformer,Transformer_7.0,7.0,2.070881,...,3.262632,0.801263,1.701564,0.849215,2.633226,0.341221,-0.880159,3,intermediate_model,17
3,203,c,Blocking Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,MLP,MLP,,1.821639,...,3.070525,0.898877,0.535745,0.323934,0.746508,0.0493,-0.884689,3,intermediate_model,20


<IPython.core.display.Javascript object>

In [219]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [220]:
# Top 4
df_sorted_topo_cols.iloc[0:5]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
20,Transformer_1.0,1.16,0.42,0.86,0.24,2.14,0.61,0.95,0.04,-0.91,0,dominant_model,23,s
23,MLP,1.21,0.14,0.94,0.08,2.33,0.22,0.95,0.01,-0.91,0,dominant_model,23,s
22,Conv1D_7.0,1.44,0.31,1.08,0.2,2.61,0.39,0.93,0.02,-0.9,2,intermediate_model,22,s
0,Transformer_7.0,2.07,1.7,1.28,0.85,3.26,2.63,0.8,0.34,-0.88,3,intermediate_model,17,c
3,MLP,1.82,0.54,1.29,0.32,3.07,0.75,0.9,0.05,-0.88,3,intermediate_model,20,c


<IPython.core.display.Javascript object>

In [221]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(2, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [222]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
22,Conv1D_7.0,1.44,0.31,1.08,0.2,2.61,0.39,0.93,0.02,-0.9,2,intermediate_model,22,s
3,MLP,1.82,0.54,1.29,0.32,3.07,0.75,0.9,0.05,-0.88,3,intermediate_model,20,c
5,Transformer_14.0,1.88,0.72,1.43,0.43,3.36,1.06,0.85,0.18,-0.88,4,intermediate_model,18,ab


<IPython.core.display.Javascript object>

In [223]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(22, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [224]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
6,BidirectionalLSTM_7.0,453.49,308.53,250.83,162.35,547.28,354.3,-12263.58,19055.3,19.59,24,non_dominant_model,0,ab


<IPython.core.display.Javascript object>

In [225]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [226]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [227]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    20
Conv1D              5
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [228]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [229]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [230]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [231]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,intermediate_model,85.0
Neural Networks,dominant_model,10.0
Neural Networks,non_dominant_model,5.0


<IPython.core.display.Javascript object>

In [232]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,5.0,20.0
Dominated_Count,mean,10.8,9.75
Dominated_Count,std,6.91,7.21
Dominated_Count,min,2.0,0.0
Dominated_Count,25%,8.0,4.0
Dominated_Count,50%,9.0,8.5
Dominated_Count,75%,15.0,12.5
Dominated_Count,max,20.0,24.0
Dominates_Count,count,5.0,20.0
Dominates_Count,mean,9.4,10.1


<IPython.core.display.Javascript object>

In [233]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [234]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,5,1,intermediate_model,5
Neural Networks,20,3,intermediate_model,17


<IPython.core.display.Javascript object>

In [235]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model     5
Neural Networks  intermediate_model    17
                 dominant_model         2
                 non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [236]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [237]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [238]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,5,9.4,8.0,54,47,1.88
Neural Networks,20,10.1,8.0,195,202,8.08


<IPython.core.display.Javascript object>

In [239]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,5.0,3.154597,1.266354,1.437166,2.967269,3.075219,3.293655,4.999674,5.0,0.961512,...,15.0,20.0,5.0,9.4,7.668116,2.0,5.0,8.0,10.0,22.0
Neural Networks,20.0,25.91739,100.670551,1.16241,2.019224,2.423043,3.902089,453.491348,20.0,16.749742,...,12.5,24.0,20.0,10.1,7.411939,0.0,4.5,8.0,17.25,23.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [240]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [241]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [242]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [243]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
20,Transformer_1.0,1.16,0.42,0.86,0.24,2.14,0.61,0.95,0.04,-0.91,0,23,dominant_model,s,True
23,MLP,1.21,0.14,0.94,0.08,2.33,0.22,0.95,0.01,-0.91,0,23,dominant_model,s,True
22,Conv1D_7.0,1.44,0.31,1.08,0.2,2.61,0.39,0.93,0.02,-0.9,2,22,intermediate_model,s,True
3,MLP,1.82,0.54,1.29,0.32,3.07,0.75,0.9,0.05,-0.88,3,20,intermediate_model,c,False
0,Transformer_7.0,2.07,1.7,1.28,0.85,3.26,2.63,0.8,0.34,-0.88,3,17,intermediate_model,c,False


<IPython.core.display.Javascript object>

### Time Series Split

In [244]:
df_results_cm["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [245]:
df_results_cm_tss = (
    df_results_cm[df_results_cm["Cross Validation"].eq("Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [246]:
df_results_cm_tss["Cross Validation"].unique()

array(['Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [247]:
df_results_cm_tss.shape

(25, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [248]:
df_results_cm_tss = compute_scpm(df_results_cm_tss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [249]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_tss)
dominance_matrix_cm_tss = dominance_dict["dominance_matrix"]
dominance_graph_cm_tss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [250]:
dominance_matrix_cm_tss.shape, len(dominance_graph_cm_tss)

((25, 25), 25)

<IPython.core.display.Javascript object>

In [251]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    23
dominant_model         1
non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [252]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/global_models/ecics/ecics_209_s_dominance_analysis_cm_tss.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [253]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
20,209,s,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,Transformer,Transformer_1.0,1.0,1.311009,...,2.147842,0.935433,0.216608,0.09795,0.19273,0.019644,-3.197405,0,dominant_model,24
5,204,f,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,Transformer,Transformer_1.0,1.0,1.741329,...,2.935762,0.934552,0.236479,0.198124,0.448636,0.01669,-2.500113,1,intermediate_model,21
23,209,s,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,MLP,MLP,,1.387445,...,2.446752,0.92681,0.355405,0.238475,0.506784,0.033572,-2.984878,1,intermediate_model,21
8,204,f,Time Series Split,Chemical + Mineralogical,Chemical,Neural Networks,MLP,MLP,,1.791195,...,3.22986,0.930404,0.26033,0.208885,0.50073,0.019883,-2.31781,2,intermediate_model,20
22,209,s,Time Series Split,Chemical + Mineralogical,Chemical,Conv1D,Conv1D,Conv1D_7.0,7.0,1.557909,...,2.823972,0.910436,0.148739,0.112064,0.215531,0.011473,-2.67978,2,intermediate_model,20


<IPython.core.display.Javascript object>

In [254]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [255]:
# Top 4
df_sorted_topo_cols.iloc[0:2].sort_values(by="Dominates_Count", ascending=False).round(
    2
)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
20,Transformer_1.0,1.31,0.22,0.88,0.1,2.15,0.19,0.94,0.02,-3.2,0,dominant_model,24,s
5,Transformer_1.0,1.74,0.24,1.27,0.2,2.94,0.45,0.93,0.02,-2.5,1,intermediate_model,21,f


<IPython.core.display.Javascript object>

In [256]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [257]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:4].round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
23,MLP,1.39,0.36,1.0,0.24,2.45,0.51,0.93,0.03,-2.98,1,intermediate_model,21,s
5,Transformer_1.0,1.74,0.24,1.27,0.2,2.94,0.45,0.93,0.02,-2.5,1,intermediate_model,21,f
22,Conv1D_7.0,1.56,0.15,1.16,0.11,2.82,0.22,0.91,0.01,-2.68,2,intermediate_model,20,s
8,MLP,1.79,0.26,1.36,0.21,3.23,0.5,0.93,0.02,-2.32,2,intermediate_model,20,f


<IPython.core.display.Javascript object>

In [258]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(23, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [259]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
16,BidirectionalLSTM_14.0,13.725374,6.50983,7.997681,2.775789,17.917479,5.967878,-37.589314,47.748064,16.938688,24,non_dominant_model,0,at


<IPython.core.display.Javascript object>

In [260]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [261]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [262]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    20
Conv1D              5
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [263]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [264]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [265]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,intermediate_model,90.0
Neural Networks,dominant_model,5.0
Neural Networks,non_dominant_model,5.0


<IPython.core.display.Javascript object>

In [266]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,5.0,20.0
Dominated_Count,mean,9.4,9.7
Dominated_Count,std,6.5,7.05
Dominated_Count,min,2.0,0.0
Dominated_Count,25%,5.0,5.0
Dominated_Count,50%,10.0,7.5
Dominated_Count,75%,11.0,17.0
Dominated_Count,max,19.0,24.0
Dominates_Count,count,5.0,20.0
Dominates_Count,mean,10.2,9.5


<IPython.core.display.Javascript object>

In [267]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [268]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,5,1,intermediate_model,5
Neural Networks,20,3,intermediate_model,18


<IPython.core.display.Javascript object>

In [269]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model     5
Neural Networks  intermediate_model    18
                 dominant_model         1
                 non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [270]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [271]:
summary_stats.round(2)

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,5,10.2,9.0,47,51,2.04
Neural Networks,20,9.5,8.0,194,190,7.6


<IPython.core.display.Javascript object>

In [272]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,5.0,2.847972,1.194107,1.557909,2.020378,2.45185,3.967133,4.242589,5.0,0.659429,...,11.0,19.0,5.0,10.2,7.79102,1.0,5.0,9.0,16.0,20.0
Neural Networks,20.0,3.597318,2.845631,1.311009,1.986867,2.905258,3.730361,13.725374,20.0,1.214478,...,17.0,24.0,20.0,9.5,7.465711,0.0,2.75,8.0,12.25,24.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [273]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [274]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [275]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [276]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
20,Transformer_1.0,1.31,0.22,0.88,0.1,2.15,0.19,0.94,0.02,-3.2,0,24,dominant_model,s,True
23,MLP,1.39,0.36,1.0,0.24,2.45,0.51,0.93,0.03,-2.98,1,21,intermediate_model,s,False
22,Conv1D_7.0,1.56,0.15,1.16,0.11,2.82,0.22,0.91,0.01,-2.68,2,20,intermediate_model,s,False
5,Transformer_1.0,1.74,0.24,1.27,0.2,2.94,0.45,0.93,0.02,-2.5,1,21,intermediate_model,f,False
8,MLP,1.79,0.26,1.36,0.21,3.23,0.5,0.93,0.02,-2.32,2,20,intermediate_model,f,False


<IPython.core.display.Javascript object>

## Variable Grouping: CM-P

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P were from:**

*BTiSS:*
    Dominance analysis: Plant AM
    SCPM:Plant AM

*TiSS:*
    Dominance analysis: Plant W
    SCPM:Plant W

In [277]:
df_results["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [278]:
df_results_cm_p = (
    df_results[
        df_results["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [279]:
df_results_cm_p["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [280]:
df_results_cm_p.shape

(50, 17)

<IPython.core.display.Javascript object>

### Blocking time series

In [281]:
df_results_cm_p["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [282]:
df_results_cm_p_btss = (
    df_results_cm_p[
        df_results_cm_p["Cross Validation"].eq("Blocking Time Series Split")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [283]:
df_results_cm_p_btss["Cross Validation"].unique()

array(['Blocking Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [284]:
df_results_cm_p_btss.shape

(25, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [285]:
df_results_cm_p_btss = compute_scpm(df_results_cm_p_btss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [286]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_btss)
dominance_matrix_cm_p_btss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_btss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [287]:
dominance_matrix_cm_p_btss.shape, len(dominance_graph_cm_p_btss)

((25, 25), 25)

<IPython.core.display.Javascript object>

In [288]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    23
dominant_model         1
non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [289]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/global_models/ecics/ecics_209_am_dominance_analysis_cm_p_btss.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [290]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
20,209,am,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,MLP,MLP,,0.977688,...,1.817851,0.972777,0.07991,0.055835,0.1251,0.002697,-5.601157,0,dominant_model,24
4,203,c,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,Transformer,Transformer_1.0,1.0,1.37506,...,2.090569,0.943194,0.296677,0.146197,0.293774,0.021891,-4.441716,1,intermediate_model,19
5,204,f,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,MLP,MLP,,1.372538,...,2.516835,0.957256,0.295185,0.228292,0.564297,0.018385,-3.61339,1,intermediate_model,18
23,209,am,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Conv1D,Conv1D,Conv1D_7.0,7.0,1.32179,...,2.43999,0.951914,0.151181,0.094707,0.219578,0.007694,-3.905356,1,intermediate_model,21
2,203,c,Blocking Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,LSTM,LSTM_1.0,1.0,1.49991,...,2.54719,0.933078,0.29704,0.098587,0.237979,0.026938,-3.392459,3,intermediate_model,15


<IPython.core.display.Javascript object>

In [291]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [292]:
# Top 4
df_sorted_topo_cols.iloc[0:5]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
20,MLP,0.98,0.08,0.75,0.06,1.82,0.13,0.97,0.0,-5.6,0,dominant_model,24,am
4,Transformer_1.0,1.38,0.3,0.87,0.15,2.09,0.29,0.94,0.02,-4.44,1,intermediate_model,19,c
5,MLP,1.37,0.3,1.08,0.23,2.52,0.56,0.96,0.02,-3.61,1,intermediate_model,18,f
23,Conv1D_7.0,1.32,0.15,1.01,0.09,2.44,0.22,0.95,0.01,-3.91,1,intermediate_model,21,am
2,LSTM_1.0,1.5,0.3,1.06,0.1,2.55,0.24,0.93,0.03,-3.39,3,intermediate_model,15,c


<IPython.core.display.Javascript object>

In [293]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [294]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
23,Conv1D_7.0,1.32,0.15,1.01,0.09,2.44,0.22,0.95,0.01,-3.91,1,intermediate_model,21,am
4,Transformer_1.0,1.38,0.3,0.87,0.15,2.09,0.29,0.94,0.02,-4.44,1,intermediate_model,19,c
5,MLP,1.37,0.3,1.08,0.23,2.52,0.56,0.96,0.02,-3.61,1,intermediate_model,18,f


<IPython.core.display.Javascript object>

In [295]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(23, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [296]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
12,LSTM_1.0,3.41,0.94,2.68,0.95,5.98,1.88,-3.31,5.94,10.89,24,non_dominant_model,0,b


<IPython.core.display.Javascript object>

In [297]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [298]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [299]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    20
Conv1D              5
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [300]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [301]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [302]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [303]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,intermediate_model,90.0
Neural Networks,dominant_model,5.0
Neural Networks,non_dominant_model,5.0


<IPython.core.display.Javascript object>

In [304]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,5.0,20.0
Dominated_Count,mean,9.6,9.35
Dominated_Count,std,6.11,6.84
Dominated_Count,min,1.0,0.0
Dominated_Count,25%,6.0,3.0
Dominated_Count,50%,11.0,11.0
Dominated_Count,75%,14.0,12.0
Dominated_Count,max,16.0,24.0
Dominates_Count,count,5.0,20.0
Dominates_Count,mean,9.4,9.4


<IPython.core.display.Javascript object>

In [305]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [306]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,5,1,intermediate_model,5
Neural Networks,20,3,intermediate_model,18


<IPython.core.display.Javascript object>

In [307]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model     5
Neural Networks  intermediate_model    18
                 dominant_model         1
                 non_dominant_model     1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [308]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [309]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [310]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,5,9.4,5.0,48,47,1.88
Neural Networks,20,9.4,6.0,187,188,7.52


<IPython.core.display.Javascript object>

In [311]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,5.0,1.934973,0.516749,1.32179,1.42081,2.279616,2.289793,2.362857,5.0,0.579923,...,14.0,16.0,5.0,9.4,8.173127,3.0,3.0,5.0,15.0,21.0
Neural Networks,20.0,2.055055,0.693566,0.977688,1.474158,2.015422,2.656925,3.40671,20.0,0.63477,...,12.0,24.0,20.0,9.4,7.700991,0.0,3.25,6.0,15.5,24.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [312]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [313]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [314]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [315]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
20,MLP,0.98,0.08,0.75,0.06,1.82,0.13,0.97,0.0,-5.6,0,24,dominant_model,am,True
4,Transformer_1.0,1.38,0.3,0.87,0.15,2.09,0.29,0.94,0.02,-4.44,1,19,intermediate_model,c,True
23,Conv1D_7.0,1.32,0.15,1.01,0.09,2.44,0.22,0.95,0.01,-3.91,1,21,intermediate_model,am,False
5,MLP,1.37,0.3,1.08,0.23,2.52,0.56,0.96,0.02,-3.61,1,18,intermediate_model,f,False
24,Transformer_1.0,1.37,0.24,1.06,0.21,2.58,0.51,0.94,0.03,-3.59,2,18,intermediate_model,am,False


<IPython.core.display.Javascript object>

### Time Series Split

In [316]:
df_results_cm_p = (
    df_results[
        df_results["Features"].eq("Chemical + Mineralogical + Physical - Early CS")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [317]:
df_results_cm_p["Features"].unique()

array(['Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [318]:
df_results_cm_p.shape

(50, 17)

<IPython.core.display.Javascript object>

In [319]:
df_results_cm["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [320]:
df_results_cm_p_tss = (
    df_results_cm_p[df_results_cm_p["Cross Validation"].eq("Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [321]:
df_results_cm_p_tss["Cross Validation"].unique()

array(['Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [322]:
df_results_cm_p_tss.shape

(25, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [323]:
df_results_cm_p_tss = compute_scpm(df_results_cm_p_tss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [324]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_tss)
dominance_matrix_cm_p_tss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_tss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [325]:
dominance_matrix_cm_tss.shape, len(dominance_graph_cm_tss)

((25, 25), 25)

<IPython.core.display.Javascript object>

In [326]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    19
dominant_model         3
non_dominant_model     3
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [327]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/global_models/ecics/ecics_209_w_dominance_analysis_cm_p_tss.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [328]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
5,204,f,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,MLP,MLP,,1.626909,...,3.036724,0.942175,0.273138,0.197504,0.477115,0.020827,-3.297952,0,dominant_model,21
9,204,f,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,Transformer,Transformer_1.0,1.0,1.63891,...,2.901188,0.940758,0.317922,0.223103,0.511231,0.022433,-3.445487,0,dominant_model,22
20,209,w,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,MLP,MLP,,1.317312,...,2.463852,0.931815,0.244525,0.193443,0.375366,0.026493,-4.50688,0,dominant_model,22
8,204,f,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Conv1D,Conv1D,Conv1D_7.0,7.0,1.753235,...,3.19543,0.930707,0.307386,0.236634,0.523346,0.022692,-2.824788,3,intermediate_model,19
22,209,w,Time Series Split,Chemical + Mineralogical + Physical - Early CS,Chemical + Properties CS Less,Neural Networks,LSTM,LSTM_7.0,7.0,1.660127,...,2.971102,0.887714,0.438231,0.291369,0.646294,0.058431,-3.266957,2,intermediate_model,19


<IPython.core.display.Javascript object>

In [329]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [330]:
# Top 4
df_sorted_topo_cols.iloc[0:2].sort_values(by="Dominates_Count", ascending=False).round(
    2
)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
9,Transformer_1.0,1.64,0.32,1.26,0.22,2.9,0.51,0.94,0.02,-3.45,0,dominant_model,22,f
5,MLP,1.63,0.27,1.29,0.2,3.04,0.48,0.94,0.02,-3.3,0,dominant_model,21,f


<IPython.core.display.Javascript object>

In [331]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(3, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [332]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:4].round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
22,LSTM_7.0,1.66,0.44,1.27,0.29,2.97,0.65,0.89,0.06,-3.27,2,intermediate_model,19,w
8,Conv1D_7.0,1.75,0.31,1.4,0.24,3.2,0.52,0.93,0.02,-2.82,3,intermediate_model,19,f
21,BidirectionalLSTM_1.0,1.87,0.58,1.42,0.41,3.28,0.93,0.85,0.09,-2.48,5,intermediate_model,14,w
4,Transformer_1.0,2.0,0.4,1.37,0.23,3.15,0.52,0.88,0.04,-2.59,4,intermediate_model,13,c


<IPython.core.display.Javascript object>

In [333]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(19, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [334]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
1,BidirectionalLSTM_1.0,5.702331,3.53104,2.016202,0.26667,4.662165,0.697932,-0.333849,1.696471,5.40433,17,non_dominant_model,0,c
16,BidirectionalLSTM_1.0,4.27694,1.591414,3.100156,0.94878,6.951162,2.304203,-1.250738,1.117185,9.199412,22,non_dominant_model,0,at
18,Conv1D_14.0,4.16768,1.18191,3.329923,1.021107,7.550541,2.2372,-1.26394,1.299202,10.01321,22,non_dominant_model,0,at


<IPython.core.display.Javascript object>

In [335]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(3, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [336]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [337]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    20
Conv1D              5
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [338]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [339]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [340]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,80.0
Conv1D,non_dominant_model,20.0
Neural Networks,intermediate_model,75.0
Neural Networks,dominant_model,15.0
Neural Networks,non_dominant_model,10.0


<IPython.core.display.Javascript object>

In [341]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,5.0,20.0
Dominated_Count,mean,10.8,8.85
Dominated_Count,std,6.98,6.94
Dominated_Count,min,3.0,0.0
Dominated_Count,25%,8.0,4.0
Dominated_Count,50%,10.0,7.0
Dominated_Count,75%,11.0,13.75
Dominated_Count,max,22.0,22.0
Dominates_Count,count,5.0,20.0
Dominates_Count,mean,7.4,9.7


<IPython.core.display.Javascript object>

In [342]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [343]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,5,2,intermediate_model,4
Neural Networks,20,3,intermediate_model,15


<IPython.core.display.Javascript object>

In [344]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model     4
                 non_dominant_model     1
Neural Networks  intermediate_model    15
                 dominant_model         3
                 non_dominant_model     2
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [345]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [346]:
summary_stats.round(2)

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,5,7.4,7.0,54,37,1.48
Neural Networks,20,9.7,9.5,177,194,7.76


<IPython.core.display.Javascript object>

In [347]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,5.0,2.597912,0.957984,1.753235,1.896894,2.557865,2.613889,4.16768,5.0,0.810398,...,11.0,22.0,5.0,7.4,7.231874,0.0,3.0,7.0,8.0,19.0
Neural Networks,20.0,2.436073,1.04814,1.317312,1.865776,2.009159,2.689787,5.702331,20.0,0.737678,...,13.75,22.0,20.0,9.7,7.204531,0.0,3.75,9.5,13.25,22.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [348]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [349]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [350]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [351]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
20,MLP,1.32,0.24,1.04,0.19,2.46,0.38,0.93,0.03,-4.51,0,22,dominant_model,w,False
9,Transformer_1.0,1.64,0.32,1.26,0.22,2.9,0.51,0.94,0.02,-3.45,0,22,dominant_model,f,True
5,MLP,1.63,0.27,1.29,0.2,3.04,0.48,0.94,0.02,-3.3,0,21,dominant_model,f,False
22,LSTM_7.0,1.66,0.44,1.27,0.29,2.97,0.65,0.89,0.06,-3.27,2,19,intermediate_model,w,False
8,Conv1D_7.0,1.75,0.31,1.4,0.24,3.2,0.52,0.93,0.02,-2.82,3,19,intermediate_model,f,False


<IPython.core.display.Javascript object>

## Variable Grouping: CM-P-CS

#### Analysis of the best models identified in the Local Analysis

**Best models for variable groupping CM-P-CS were from:**

*BTiSS:*
    Dominance analysis: Plant S and Plant K
    SCPM: Plant S and Plant K

*TiSS:*
    Dominance analysis: Plant S
    SCPM: Plant S

In [352]:
df_results["Features"].unique()

array(['Chemical + Mineralogical', 'Chemical + Mineralogical + Physical',
       'Chemical + Mineralogical + Physical - Early CS'], dtype=object)

<IPython.core.display.Javascript object>

In [353]:
df_results_cm_p_cs = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical + Physical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [354]:
df_results_cm_p_cs["Features"].unique()

array(['Chemical + Mineralogical + Physical'], dtype=object)

<IPython.core.display.Javascript object>

In [355]:
df_results_cm_p_cs.shape

(55, 17)

<IPython.core.display.Javascript object>

### Plant S

In [356]:
df_results_cm_p_cs = df_results_cm_p_cs[
    df_results_cm_p_cs["Plant"].eq("s")
].reset_index(drop=True)

<IPython.core.display.Javascript object>

### Blocking time series

In [357]:
df_results_cm_p_cs["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [358]:
df_results_cm_p_cs_btss = (
    df_results_cm_p_cs[
        df_results_cm_p_cs["Cross Validation"].eq("Blocking Time Series Split")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [359]:
df_results_cm_p_cs_btss["Cross Validation"].unique()

array(['Blocking Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [360]:
df_results_cm_p_cs_btss.shape

(5, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [361]:
df_results_cm_p_cs_btss = compute_scpm(df_results_cm_p_cs_btss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [362]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_btss)
dominance_matrix_cm_p_cs_btss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_btss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [363]:
dominance_matrix_cm_p_cs_btss.shape, len(dominance_graph_cm_p_cs_btss)

((5, 5), 5)

<IPython.core.display.Javascript object>

In [364]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    3
dominant_model        1
non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [365]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/global_models/ecics/ecics_209_s_dominance_analysis_cm_p_cs_btss.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [366]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
0,209,s,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,MLP,MLP,,0.706974,...,1.343428,0.982151,0.131605,0.091949,0.217774,0.006596,-5.297239,0,dominant_model,4
1,209,s,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Conv1D,Conv1D,Conv1D_1.0,1.0,0.909706,...,1.770868,0.971428,0.138244,0.104172,0.235113,0.008137,-2.180513,1,intermediate_model,3
4,209,s,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_1.0,1.0,1.090873,...,2.068817,0.958023,0.197852,0.182819,0.414481,0.015695,0.307526,2,intermediate_model,2
3,209,s,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,LSTM,LSTM_1.0,1.0,1.115614,...,2.069726,0.95711,0.100757,0.115338,0.305806,0.008406,0.493156,3,intermediate_model,1
2,209,s,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,Transformer,Transformer_1.0,1.0,1.670219,...,2.61505,0.900631,0.4944,0.187695,0.453612,0.04708,6.67707,4,non_dominant_model,0


<IPython.core.display.Javascript object>

In [367]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [368]:
# Top 4
df_sorted_topo_cols.iloc[0:5]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
0,MLP,0.71,0.13,0.54,0.09,1.34,0.22,0.98,0.01,-5.3,0,dominant_model,4,s
1,Conv1D_1.0,0.91,0.14,0.72,0.1,1.77,0.24,0.97,0.01,-2.18,1,intermediate_model,3,s
4,BidirectionalLSTM_1.0,1.09,0.2,0.83,0.18,2.07,0.41,0.96,0.02,0.31,2,intermediate_model,2,s
3,LSTM_1.0,1.12,0.1,0.84,0.12,2.07,0.31,0.96,0.01,0.49,3,intermediate_model,1,s
2,Transformer_1.0,1.67,0.49,1.03,0.19,2.62,0.45,0.9,0.05,6.68,4,non_dominant_model,0,s


<IPython.core.display.Javascript object>

In [369]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [370]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
1,Conv1D_1.0,0.91,0.14,0.72,0.1,1.77,0.24,0.97,0.01,-2.18,1,intermediate_model,3,s
4,BidirectionalLSTM_1.0,1.09,0.2,0.83,0.18,2.07,0.41,0.96,0.02,0.31,2,intermediate_model,2,s
3,LSTM_1.0,1.12,0.1,0.84,0.12,2.07,0.31,0.96,0.01,0.49,3,intermediate_model,1,s


<IPython.core.display.Javascript object>

In [371]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(3, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [372]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
2,Transformer_1.0,1.67,0.49,1.03,0.19,2.62,0.45,0.9,0.05,6.68,4,non_dominant_model,0,s


<IPython.core.display.Javascript object>

In [373]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [374]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [375]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    4
Conv1D             1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [376]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [377]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [378]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [379]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,100.0
Neural Networks,intermediate_model,50.0
Neural Networks,dominant_model,25.0
Neural Networks,non_dominant_model,25.0


<IPython.core.display.Javascript object>

In [380]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,1.0,4.0
Dominated_Count,mean,1.0,2.25
Dominated_Count,std,,1.71
Dominated_Count,min,1.0,0.0
Dominated_Count,25%,1.0,1.5
Dominated_Count,50%,1.0,2.5
Dominated_Count,75%,1.0,3.25
Dominated_Count,max,1.0,4.0
Dominates_Count,count,1.0,4.0
Dominates_Count,mean,3.0,1.75


<IPython.core.display.Javascript object>

In [381]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [382]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,1,1,intermediate_model,1
Neural Networks,4,3,intermediate_model,2


<IPython.core.display.Javascript object>

In [383]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model    1
Neural Networks  intermediate_model    2
                 dominant_model        1
                 non_dominant_model    1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [384]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [385]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [386]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,1,3.0,3.0,1,3,0.6
Neural Networks,4,1.75,1.5,9,7,1.4


<IPython.core.display.Javascript object>

In [387]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,1.0,0.909706,,0.909706,0.909706,0.909706,0.909706,0.909706,1.0,0.138244,...,1.0,1.0,1.0,3.0,,3.0,3.0,3.0,3.0,3.0
Neural Networks,4.0,1.14592,0.396448,0.706974,0.994898,1.103244,1.254266,1.670219,4.0,0.231153,...,3.25,4.0,4.0,1.75,1.707825,0.0,0.75,1.5,2.5,4.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [388]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [389]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [390]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [391]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
0,MLP,0.71,0.13,0.54,0.09,1.34,0.22,0.98,0.01,-5.3,0,4,dominant_model,s,True
1,Conv1D_1.0,0.91,0.14,0.72,0.1,1.77,0.24,0.97,0.01,-2.18,1,3,intermediate_model,s,True
4,BidirectionalLSTM_1.0,1.09,0.2,0.83,0.18,2.07,0.41,0.96,0.02,0.31,2,2,intermediate_model,s,True
3,LSTM_1.0,1.12,0.1,0.84,0.12,2.07,0.31,0.96,0.01,0.49,3,1,intermediate_model,s,True
2,Transformer_1.0,1.67,0.49,1.03,0.19,2.62,0.45,0.9,0.05,6.68,4,0,non_dominant_model,s,True


<IPython.core.display.Javascript object>

In [392]:
df_results_cm_p_cs = (
    df_results[df_results["Features"].eq("Chemical + Mineralogical + Physical")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [393]:
df_results_cm_p_cs["Features"].unique()

array(['Chemical + Mineralogical + Physical'], dtype=object)

<IPython.core.display.Javascript object>

In [394]:
df_results_cm_p_cs.shape

(55, 17)

<IPython.core.display.Javascript object>

### Blocking time series

In [395]:
df_results_cm_p_cs["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [396]:
df_results_cm_p_cs_btss = (
    df_results_cm_p_cs[
        df_results_cm_p_cs["Cross Validation"].eq("Blocking Time Series Split")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [397]:
df_results_cm_p_cs_btss["Cross Validation"].unique()

array(['Blocking Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [398]:
df_results_cm_p_cs_btss.shape

(30, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [399]:
df_results_cm_p_cs_btss = compute_scpm(df_results_cm_p_cs_btss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [400]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_btss)
dominance_matrix_cm_p_cs_btss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_btss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [401]:
dominance_matrix_cm_p_cs_btss.shape, len(dominance_graph_cm_p_cs_btss)

((30, 30), 30)

<IPython.core.display.Javascript object>

In [402]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    25
non_dominant_model     4
dominant_model         1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [403]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/global_models/ecics/ecics_209_k_dominance_analysis_cm_p_cs_btss.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [404]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
20,209,s,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,MLP,MLP,,0.706974,...,1.343428,0.982151,0.131605,0.091949,0.217774,0.006596,-5.207017,0,dominant_model,29
25,209,k,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,MLP,MLP,,0.856619,...,1.566781,0.973031,0.215978,0.157436,0.394729,0.019362,-4.615279,1,intermediate_model,28
21,209,s,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Conv1D,Conv1D,Conv1D_1.0,1.0,0.909706,...,1.770868,0.971428,0.138244,0.104172,0.235113,0.008137,-4.318579,2,intermediate_model,27
0,203,c,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,MLP,MLP,,1.083815,...,2.040679,0.966125,0.127806,0.089256,0.240282,0.005697,-3.681784,3,intermediate_model,25
24,209,s,Blocking Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_1.0,1.0,1.090873,...,2.068817,0.958023,0.197852,0.182819,0.414481,0.015695,-3.649509,3,intermediate_model,25


<IPython.core.display.Javascript object>

In [405]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
].round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [406]:
# Top 4
df_sorted_topo_cols.iloc[0:5]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
20,MLP,0.71,0.13,0.54,0.09,1.34,0.22,0.98,0.01,-5.21,0,dominant_model,29,s
25,MLP,0.86,0.22,0.67,0.16,1.57,0.39,0.97,0.02,-4.62,1,intermediate_model,28,k
21,Conv1D_1.0,0.91,0.14,0.72,0.1,1.77,0.24,0.97,0.01,-4.32,2,intermediate_model,27,s
0,MLP,1.08,0.13,0.84,0.09,2.04,0.24,0.97,0.01,-3.68,3,intermediate_model,25,c
24,BidirectionalLSTM_1.0,1.09,0.2,0.83,0.18,2.07,0.41,0.96,0.02,-3.65,3,intermediate_model,25,s


<IPython.core.display.Javascript object>

In [407]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [408]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
25,MLP,0.86,0.22,0.67,0.16,1.57,0.39,0.97,0.02,-4.62,1,intermediate_model,28,k
21,Conv1D_1.0,0.91,0.14,0.72,0.1,1.77,0.24,0.97,0.01,-4.32,2,intermediate_model,27,s
0,MLP,1.08,0.13,0.84,0.09,2.04,0.24,0.97,0.01,-3.68,3,intermediate_model,25,c


<IPython.core.display.Javascript object>

In [409]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(25, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [410]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:2]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
11,Conv1D_1.0,2.33,0.84,1.88,0.73,4.28,1.6,-1.31,3.39,5.17,24,non_dominant_model,0,b
18,LSTM_1.0,2.87,1.09,2.15,0.77,4.82,1.64,-0.83,2.28,5.88,26,non_dominant_model,0,at


<IPython.core.display.Javascript object>

In [411]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(4, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [412]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [413]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    24
Conv1D              6
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [414]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [415]:
# pd.set_option("display.max_rows", None)
# df_sorted_topo_models_grouped

<IPython.core.display.Javascript object>

In [416]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [417]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,66.67
Conv1D,non_dominant_model,33.33
Neural Networks,intermediate_model,87.5
Neural Networks,non_dominant_model,8.33
Neural Networks,dominant_model,4.17


<IPython.core.display.Javascript object>

In [418]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,6.0,24.0
Dominated_Count,mean,13.17,11.46
Dominated_Count,std,8.47,6.74
Dominated_Count,min,2.0,0.0
Dominated_Count,25%,7.5,6.75
Dominated_Count,50%,13.0,13.0
Dominated_Count,75%,19.25,16.0
Dominated_Count,max,24.0,26.0
Dominates_Count,count,6.0,24.0
Dominates_Count,mean,11.0,12.0


<IPython.core.display.Javascript object>

In [419]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [420]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,6,2,intermediate_model,4
Neural Networks,24,3,intermediate_model,21


<IPython.core.display.Javascript object>

In [421]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model     4
                 non_dominant_model     2
Neural Networks  intermediate_model    21
                 non_dominant_model     2
                 dominant_model         1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [422]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [423]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [424]:
summary_stats

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,6,11.0,9.5,79,66,2.2
Neural Networks,24,12.0,10.0,275,288,9.6


<IPython.core.display.Javascript object>

In [425]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,6.0,2.013751,0.950187,0.909706,1.444368,1.922693,2.279499,3.644053,6.0,0.95934,...,19.25,24.0,6.0,11.0,11.349009,0.0,1.0,9.5,18.75,27.0
Neural Networks,24.0,1.882511,0.902998,0.706974,1.440724,1.849106,2.034806,5.339302,24.0,0.611644,...,16.0,26.0,24.0,12.0,9.903886,0.0,3.0,10.0,20.75,29.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [426]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [427]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [428]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [429]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
20,MLP,0.71,0.13,0.54,0.09,1.34,0.22,0.98,0.01,-5.21,0,29,dominant_model,s,True
25,MLP,0.86,0.22,0.67,0.16,1.57,0.39,0.97,0.02,-4.62,1,28,intermediate_model,k,True
21,Conv1D_1.0,0.91,0.14,0.72,0.1,1.77,0.24,0.97,0.01,-4.32,2,27,intermediate_model,s,True
0,MLP,1.08,0.13,0.84,0.09,2.04,0.24,0.97,0.01,-3.68,3,25,intermediate_model,c,True
24,BidirectionalLSTM_1.0,1.09,0.2,0.83,0.18,2.07,0.41,0.96,0.02,-3.65,3,25,intermediate_model,s,True


<IPython.core.display.Javascript object>

### Time Series Split

In [430]:
df_results_cm_p_cs = (
    df_results[
        df_results["Features"].eq("Chemical + Mineralogical + Physical")
        & ~df_results["Plant"].eq("k")
    ]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [431]:
df_results_cm_p_cs["Features"].unique()

array(['Chemical + Mineralogical + Physical'], dtype=object)

<IPython.core.display.Javascript object>

In [432]:
df_results_cm_p_cs.shape

(50, 17)

<IPython.core.display.Javascript object>

In [433]:
df_results_cm_p_cs["Cross Validation"].unique()

array(['Blocking Time Series Split', 'Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [434]:
df_results_cm_p_cs_tss = (
    df_results_cm_p_cs[df_results_cm_p_cs["Cross Validation"].eq("Time Series Split")]
    .copy()
    .reset_index(drop=True)
)

<IPython.core.display.Javascript object>

In [435]:
df_results_cm_p_cs_tss["Cross Validation"].unique()

array(['Time Series Split'], dtype=object)

<IPython.core.display.Javascript object>

In [436]:
df_results_cm_p_cs_tss.shape

(25, 17)

<IPython.core.display.Javascript object>

###### SCPM computation

In [437]:
df_results_cm_p_cs_tss = compute_scpm(df_results_cm_p_cs_tss)

<IPython.core.display.Javascript object>

##### Dominance Analysis

In [438]:
# Per project
dominance_dict = make_dominance_analysis(df_results_cm_p_cs_tss)
dominance_matrix_cm_p_cs_tss = dominance_dict["dominance_matrix"]
dominance_graph_cm_p_cs_tss = dominance_dict["dominance_graph"]
df_sorted_count = dominance_dict["df_sorted_count"]
df_sorted_topo = dominance_dict["df_sorted_topo"]

<IPython.core.display.Javascript object>

In [439]:
dominance_matrix_cm_p_cs_tss.shape, len(dominance_graph_cm_p_cs_tss)

((25, 25), 25)

<IPython.core.display.Javascript object>

In [440]:
df_sorted_topo["Classification"].value_counts()

Classification
intermediate_model    20
non_dominant_model     4
dominant_model         1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [441]:
# df_sorted_topo.to_csv(
#     "../../../../reports/results/global_models/ecics/ecics_209_s_dominance_analysis_cm_p_cs_tss.csv",
#     index=False,
# )

<IPython.core.display.Javascript object>

In [442]:
df_sorted_topo.head(5)

Unnamed: 0,Company,Plant,Cross Validation,Features,Features_bkp,Model,Model_bkp,Model_bkp_2,Timesteps,RMSE_mean,...,MAPE_mean,R2_mean,RMSE_std,MAE_std,MAPE_std,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count
20,209,s,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,MLP,MLP,,0.790668,...,1.452909,0.976708,0.101774,0.063122,0.146045,0.00529,-4.738469,0,dominant_model,24
21,209,s,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Conv1D,Conv1D,Conv1D_1.0,1.0,0.97057,...,1.735976,0.964481,0.203195,0.087436,0.188137,0.012758,-4.112339,1,intermediate_model,23
23,209,s,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,LSTM,LSTM_1.0,1.0,1.179825,...,1.759961,0.940172,0.527326,0.156331,0.385089,0.055051,-3.932824,2,intermediate_model,20
24,209,s,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,BidirectionalLSTM,BidirectionalLSTM_1.0,1.0,1.185788,...,1.972467,0.946583,0.239995,0.148219,0.380745,0.021361,-3.570323,2,intermediate_model,21
0,203,c,Time Series Split,Chemical + Mineralogical + Physical,Chemical + Physical,Neural Networks,MLP,MLP,,1.555784,...,2.545077,0.929168,0.217635,0.243941,0.472629,0.01732,-2.186171,4,intermediate_model,15


<IPython.core.display.Javascript object>

In [443]:
df_sorted_topo_cols = df_sorted_topo[
    [
        # "Model",
        # "Model_bkp",
        "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Classification",
        "Dominates_Count",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

###### Dominant Models

In [444]:
# Top 4
df_sorted_topo_cols.iloc[0:2].sort_values(by="Dominates_Count", ascending=False).round(
    2
)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
20,MLP,0.79,0.1,0.58,0.06,1.45,0.15,0.98,0.01,-4.74,0,dominant_model,24,s
21,Conv1D_1.0,0.97,0.2,0.71,0.09,1.74,0.19,0.96,0.01,-4.11,1,intermediate_model,23,s


<IPython.core.display.Javascript object>

In [445]:
df_sorted_topo_cols[df_sorted_topo_cols["Classification"].eq("dominant_model")].shape

(1, 14)

<IPython.core.display.Javascript object>

###### Intermediate models

In [446]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:4].round(2)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
21,Conv1D_1.0,0.97,0.2,0.71,0.09,1.74,0.19,0.96,0.01,-4.11,1,intermediate_model,23,s
24,BidirectionalLSTM_1.0,1.19,0.24,0.79,0.15,1.97,0.38,0.95,0.02,-3.57,2,intermediate_model,21,s
23,LSTM_1.0,1.18,0.53,0.71,0.16,1.76,0.39,0.94,0.06,-3.93,2,intermediate_model,20,s
5,MLP,1.5,0.45,1.15,0.31,2.57,0.58,0.94,0.02,-2.1,3,intermediate_model,18,ab


<IPython.core.display.Javascript object>

In [447]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("intermediate_model")
].shape

(20, 14)

<IPython.core.display.Javascript object>

###### non-dominant models

In [448]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].sort_values(by=["Dominates_Count", "SCPM"], ascending=[False, True]).iloc[:3]

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Classification,Dominates_Count,Plant
8,LSTM_7.0,3.140499,1.62154,2.385823,1.149827,5.315326,2.237891,0.730408,0.243026,4.11616,13,non_dominant_model,0,ab
16,Conv1D_7.0,2.80101,0.378777,2.242115,0.283773,5.041565,0.418911,-0.128617,0.613507,4.174323,21,non_dominant_model,0,at
17,Transformer_1.0,2.965125,0.64683,2.243967,0.393951,5.186654,1.132405,-0.099444,0.395899,4.380655,21,non_dominant_model,0,at


<IPython.core.display.Javascript object>

In [449]:
df_sorted_topo_cols[
    df_sorted_topo_cols["Classification"].eq("non_dominant_model")
].shape

(4, 14)

<IPython.core.display.Javascript object>

###### Statistics per model type

In [450]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        # "Model_bkp",
        # "Model_bkp_2",
        "RMSE_mean",
        "RMSE_std",
        "MAE_mean",
        "MAE_std",
        "MAPE_mean",
        "MAPE_std",
        "R2_mean",
        "R2_std",
        "SCPM",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
        "Plant",
    ]
]  # .round(2)

<IPython.core.display.Javascript object>

In [451]:
df_sorted_topo["Model"].value_counts()

Model
Neural Networks    20
Conv1D              5
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [452]:
df_sorted_topo_models_grouped = df_sorted_topo_models_cols.groupby("Model").describe().T

<IPython.core.display.Javascript object>

In [453]:
df_sorted_topo_models_cols = df_sorted_topo[
    [
        "Model",
        "Dominated_Count",
        "Dominates_Count",
        "Classification",
    ]
]

<IPython.core.display.Javascript object>

In [454]:
(
    df_sorted_topo_models_cols.groupby("Model")[["Classification"]]
    .value_counts(normalize=True)
    .to_frame()
    * 100
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Model,Classification,Unnamed: 2_level_1
Conv1D,intermediate_model,80.0
Conv1D,non_dominant_model,20.0
Neural Networks,intermediate_model,80.0
Neural Networks,non_dominant_model,15.0
Neural Networks,dominant_model,5.0


<IPython.core.display.Javascript object>

In [455]:
df_sorted_topo_models_cols.groupby("Model").describe().round(2).T

Unnamed: 0,Model,Conv1D,Neural Networks
Dominated_Count,count,5.0,20.0
Dominated_Count,mean,9.4,9.7
Dominated_Count,std,7.37,6.33
Dominated_Count,min,1.0,0.0
Dominated_Count,25%,6.0,4.0
Dominated_Count,50%,9.0,10.0
Dominated_Count,75%,10.0,13.25
Dominated_Count,max,21.0,21.0
Dominates_Count,count,5.0,20.0
Dominates_Count,mean,11.2,9.25


<IPython.core.display.Javascript object>

In [456]:
# df_sorted_topo_models_cols

<IPython.core.display.Javascript object>

In [457]:
df_sorted_topo_models_cols.groupby("Model")["Classification"].describe()

Unnamed: 0_level_0,count,unique,top,freq
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Conv1D,5,2,intermediate_model,4
Neural Networks,20,3,intermediate_model,16


<IPython.core.display.Javascript object>

In [458]:
df_sorted_topo_models_cols.groupby("Model")[["Classification"]].value_counts()

Model            Classification    
Conv1D           intermediate_model     4
                 non_dominant_model     1
Neural Networks  intermediate_model    16
                 non_dominant_model     3
                 dominant_model         1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [459]:
summary_stats = df_sorted_topo_models_cols.groupby("Model").agg(
    Total_Models=("Model", "count"),
    Mean_Dominance_Count=("Dominates_Count", "mean"),
    Median_Dominance_Count=("Dominates_Count", "median"),
    Total_Dominated=("Dominated_Count", "sum"),
    Total_Dominating=("Dominates_Count", "sum"),
)

total_models = summary_stats[
    "Total_Models"
].sum()  # Get the total number of models across all categories
summary_stats["Dominance Proportion"] = summary_stats["Total_Dominating"] / total_models

<IPython.core.display.Javascript object>

In [460]:
summary_stats.round(2)

Unnamed: 0_level_0,Total_Models,Mean_Dominance_Count,Median_Dominance_Count,Total_Dominated,Total_Dominating,Dominance Proportion
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Conv1D,5,11.2,10.0,47,56,2.24
Neural Networks,20,9.25,6.0,194,185,7.4


<IPython.core.display.Javascript object>

In [461]:
df_sorted_topo_models_grouped.T

Unnamed: 0_level_0,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_mean,RMSE_std,RMSE_std,...,Dominated_Count,Dominated_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count,Dominates_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Conv1D,5.0,1.790462,0.662044,0.97057,1.579179,1.714384,1.887169,2.80101,5.0,0.3546,...,10.0,21.0,5.0,11.2,8.526429,0.0,8.0,10.0,15.0,23.0
Neural Networks,20.0,2.360903,1.957232,0.790668,1.525315,1.987519,2.509695,10.232263,20.0,0.95165,...,13.25,21.0,20.0,9.25,7.670003,0.0,3.75,6.0,16.0,24.0


<IPython.core.display.Javascript object>

### SCPM Analysis

In [462]:
cols = [
    # "Model",
    # "Model_bkp",
    "Model_bkp_2",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std",
    "MAPE_mean",
    "MAPE_std",
    "R2_mean",
    "R2_std",
    "SCPM",
    "Dominated_Count",
    "Dominates_Count",
    "Classification",
    "Plant",
]

<IPython.core.display.Javascript object>

#### SCPM Analysis by Project

In [463]:
df_sorted_scpm = df_sorted_topo.sort_values(by="SCPM").copy()

<IPython.core.display.Javascript object>

In [464]:
# (df_sorted_topo[cols].reset_index() == df_sorted_scpm[cols].reset_index()).all(axis=1)
df_sorted_scpm["topo_scmp_order_eq"] = df_sorted_topo.index == df_sorted_scpm.index

<IPython.core.display.Javascript object>

In [465]:
df_sorted_scpm[cols + ["topo_scmp_order_eq"]].round(2).head(5)

Unnamed: 0,Model_bkp_2,RMSE_mean,RMSE_std,MAE_mean,MAE_std,MAPE_mean,MAPE_std,R2_mean,R2_std,SCPM,Dominated_Count,Dominates_Count,Classification,Plant,topo_scmp_order_eq
20,MLP,0.79,0.1,0.58,0.06,1.45,0.15,0.98,0.01,-4.74,0,24,dominant_model,s,True
21,Conv1D_1.0,0.97,0.2,0.71,0.09,1.74,0.19,0.96,0.01,-4.11,1,23,intermediate_model,s,True
23,LSTM_1.0,1.18,0.53,0.71,0.16,1.76,0.39,0.94,0.06,-3.93,2,20,intermediate_model,s,True
24,BidirectionalLSTM_1.0,1.19,0.24,0.79,0.15,1.97,0.38,0.95,0.02,-3.57,2,21,intermediate_model,s,True
22,Transformer_1.0,1.37,0.36,0.9,0.16,2.19,0.32,0.93,0.03,-3.03,4,16,intermediate_model,s,False


<IPython.core.display.Javascript object>