In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import mlflow
import pandas as pd
import numpy as np
import xgboost as xgb
import warnings
import json

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

from lib.full_flow_dataloader import load_full_flow_data
from lib.reproduction import major_oxides

from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA, KernelPCA
from sklearn.linear_model import ElasticNet

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras import optimizers
from tensorflow.keras.losses import MeanSquaredError

from scikeras.wrappers import KerasRegressor

In [None]:
def split_data(train: pd.DataFrame, test: pd.DataFrame) -> tuple:
    """Separate feature columns from the oxide target columns for both data sets.

    The targets are the columns named in ``major_oxides``. The features are
    everything that remains after removing those columns together with the
    ``"ID"`` and ``"Sample Name"`` columns.

    Args:
        train: Training frame containing features, targets and the two id columns.
        test: Test frame with the same column layout.

    Returns:
        The tuple ``(X_train, y_train, X_test, y_test)``.
    """
    non_feature_columns = major_oxides + ["ID", "Sample Name"]

    # Apply the identical feature/target split to each frame in turn.
    (X_train, y_train), (X_test, y_test) = (
        (frame.drop(columns=non_feature_columns), frame[major_oxides])
        for frame in (train, test)
    )

    return X_train, y_train, X_test, y_test


def preprocess_data(X_train: pd.DataFrame, X_test: pd.DataFrame, preprocesser_pipeline: Pipeline) -> tuple:
    """Fit the preprocessing pipeline on the training features and apply it to both sets.

    The pipeline is fitted in place on ``X_train`` only; ``X_test`` is merely
    transformed with the already-fitted pipeline.

    Args:
        X_train: Training features used to fit the pipeline.
        X_test: Test features to transform with the fitted pipeline.
        preprocesser_pipeline: Pipeline exposing ``fit_transform`` and ``transform``.

    Returns:
        The tuple ``(X_train_transformed, X_test_transformed)``.
    """
    # Tuple elements are evaluated left to right, so the fit happens before the test transform.
    return (
        preprocesser_pipeline.fit_transform(X_train),
        preprocesser_pipeline.transform(X_test),
    )

In [None]:
def build_model(input_dim, output_dim):
    """Build and compile the 2-D convolutional regression network.

    Each flat input vector is reshaped into a 48 x 128 single-channel grid and
    passed through four Conv2D -> BatchNormalization -> MaxPooling2D blocks
    (32, 32, 64 and 128 filters). The result is flattened and fed through a
    256-unit dense layer with dropout, followed by the linear output layers.

    Args:
        input_dim: Number of features per sample. Must equal 48 * 128 = 6144
            because the reshape layer uses a fixed grid.
        output_dim: Number of regression outputs.

    Returns:
        A compiled Keras ``Sequential`` model (Adam with learning rate 0.001,
        mean-squared-error loss).

    Raises:
        ValueError: If ``input_dim`` does not match the fixed 48 x 128 grid.
    """
    grid_height, grid_width = 48, 128
    expected_dim = grid_height * grid_width
    # Fail early with a clear message instead of an opaque Keras reshape error.
    if input_dim != expected_dim:
        raise ValueError(
            f"input_dim must be {expected_dim} ({grid_height} x {grid_width}) "
            f"to match the reshape layer, got {input_dim}"
        )

    model = Sequential()
    model.add(layers.Input(shape=(input_dim,)))
    model.add(layers.Reshape((grid_height, grid_width, 1)))

    # Four identical convolutional blocks that differ only in their filter count.
    for n_filters in (32, 32, 64, 128):
        model.add(layers.Conv2D(n_filters, (3, 3), activation='relu', padding='same'))
        model.add(layers.BatchNormalization())
        model.add(layers.MaxPooling2D((2, 2)))

    model.add(layers.Flatten())
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(output_dim))

    # NOTE(review): this is a second linear output layer stacked on the one above;
    # only this layer carries the L2 penalty. It looks like the regularized layer was
    # meant to replace the previous one -- confirm before changing, since removing a
    # layer alters the trained model. Kept as-is to preserve the architecture.
    model.add(layers.Dense(output_dim, kernel_regularizer=regularizers.l2(0.01)))

    # Optimizer with a custom learning rate
    optimizer = optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss=MeanSquaredError())
    return model

INPUT_DIM = 6144  # Number of features per sample (48 * 128, the grid build_model reshapes to)
OUTPUT_DIM = 1    # Number of continuous values as output

# scikit-learn compatible wrapper around the CNN so it can be used like any other regressor.
# `model=` is the current SciKeras keyword for the model-building callable; the
# previously used `build_fn=` is the deprecated spelling of the same argument.
cnn = KerasRegressor(
    model=lambda: build_model(INPUT_DIM, OUTPUT_DIM),
    loss=MeanSquaredError(),
    epochs=100,
    batch_size=32,
    verbose=0,
)

In [None]:
# One preprocessing pipeline per target oxide. Every pipeline scales the features and
# applies a PowerTransformer; some additionally reduce dimensionality with PCA / KernelPCA.

sio2_preprocessor_pipeline = Pipeline([
    ("scaler", MaxAbsScaler()),
    ("power_transformer", PowerTransformer())
])

tio2_preprocessor_pipeline = Pipeline([
    ("scaler", MaxAbsScaler()),
    ("power_transformer", PowerTransformer())
])

al203_preprocessor_pipeline = Pipeline([
    ("scaler", MaxAbsScaler()),
    ("power_transformer", PowerTransformer()),
    # Seeded so the projection is reproducible: with the default svd_solver="auto",
    # scikit-learn may select the randomized SVD solver (depends on the data shape),
    # which otherwise draws from the global NumPy random state.
    ("pca", PCA(n_components=34, random_state=42))
])

feot_preprocessor_pipeline = Pipeline([
    ("scaler", MaxAbsScaler()),
    ("power_transformer", PowerTransformer()),
])

mgo_preprocessor_pipeline = Pipeline([
    ("scaler", MaxAbsScaler()),
    ("power_transformer", PowerTransformer()),
    ("kernel_pca", KernelPCA(n_components=60, kernel="poly"))
])

caot_preprocessor_pipeline = Pipeline([
    ("scaler", RobustScaler(quantile_range=(10, 90))),
    ("power_transformer", PowerTransformer()),
    ("kernel_pca", KernelPCA(n_components=60, kernel="poly"))
])

nao2_preprocessor_pipeline = Pipeline([
    ("scaler", MaxAbsScaler()),
    ("power_transformer", PowerTransformer()),
])

k2o_preprocessor_pipeline = Pipeline([
    ("scaler", MinMaxScaler()),
    ("power_transformer", PowerTransformer()),
])

# Lookup table: target oxide name -> its preprocessing pipeline.
preprocessors = {
    "SiO2": sio2_preprocessor_pipeline,
    "TiO2": tio2_preprocessor_pipeline,
    "Al2O3": al203_preprocessor_pipeline,
    "FeOT": feot_preprocessor_pipeline,
    "MgO": mgo_preprocessor_pipeline,
    "CaO": caot_preprocessor_pipeline,
    "Na2O": nao2_preprocessor_pipeline,
    "K2O": k2o_preprocessor_pipeline
}

In [None]:
def _poly_svr():
    """Return a new degree-2 polynomial-kernel SVR with the settings shared by all base SVRs."""
    return SVR(kernel="poly", C=100, epsilon=0.1, gamma="scale", degree=2, coef0=1.0)


def _xgb_regressor():
    """Return a new XGBRegressor with the configuration shared by all XGBoost base learners."""
    return xgb.XGBRegressor(
        n_estimators=100,
        max_depth=4,
        learning_rate=0.05,
        objective="reg:squarederror",
        min_child_weight=5,
        gamma=0.1,
        subsample=0.7,
        colsample_bytree=0.5,
        colsample_bylevel=0.5,
        colsample_bynode=0.5,
        reg_lambda=1,
        reg_alpha=0.5,
        eval_metric="rmse",
    )


def _gbr_regressor():
    """Return a new GradientBoostingRegressor with the configuration shared by all GBR base learners."""
    return GradientBoostingRegressor(
        n_estimators=100,
        max_depth=3,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=None,
        loss='squared_error',
        learning_rate=0.1,
        subsample=1.0,
        criterion='friedman_mse',
        random_state=42,
        verbose=0,
        validation_fraction=0.1,
        n_iter_no_change=None,
        tol=1e-4,
        ccp_alpha=0.0,
    )


# Base estimators for the stacking ensemble, keyed by target oxide.
# Each list holds (name, estimator) pairs; every entry is its own estimator instance.
base_estimators = {
    "SiO2": [
        ('svr', _poly_svr()),
        # random_state is fixed so the extra-trees learner is reproducible across runs,
        # matching the seeded gradient-boosting learners.
        ('etr', ExtraTreesRegressor(
                n_estimators=100,
                max_depth=None,
                min_samples_split=13,
                min_samples_leaf=14,
                max_features=0.5,
                random_state=42
            )
        ),
    ],
    "TiO2": [
        ('xgb', _xgb_regressor()),
        ('pls', PLSRegression(n_components=5)),
    ],
    "Al2O3": [
        ('svr', _poly_svr()),
        ('xgb', _xgb_regressor()),
        ('pls', PLSRegression(n_components=6)),
    ],
    "FeOT": [
        ('gbr', _gbr_regressor()),
        ('svr', _poly_svr()),
    ],
    "MgO": [
        ('gbr', _gbr_regressor()),
        ('pls', PLSRegression(n_components=15)),
        ('eln', ElasticNet(alpha=0.01, l1_ratio=0.3))
    ],
    "CaO": [
        ('pls', PLSRegression(n_components=15)),
        ('xgb', _xgb_regressor()),
        ('svr', _poly_svr()),
    ],
    "Na2O": [
        ('svr', _poly_svr()),
        ('gbr', _gbr_regressor()),
    ],
    "K2O": [
        ('svr', _poly_svr()),
        ('pls', PLSRegression(n_components=15)),
        ('gbr', _gbr_regressor()),
    ],
}

In [None]:
# Final estimator handed to each StackingRegressor as `final_estimator`.
# Same polynomial SVR settings as the base SVRs except for coef0 (0.1 here, 1.0 there).
meta_learner = SVR(kernel="poly", C=100, epsilon=0.1, gamma="scale", degree=2, coef0=0.1)

In [None]:
# Load the train/test frames, then split each into feature columns and major-oxide targets.
# The `original_*` frames are kept untouched; the training loop works on copies of them.
train, test = load_full_flow_data()
original_X_train, original_y_train, original_X_test, original_y_test = split_data(train, test)

In [None]:
from datetime import datetime

# Every execution of this cell logs to its own timestamped MLflow experiment.
mlflow.set_experiment(f'Stacking_{datetime.now().strftime("%Y%m%d-%H%M%S")}')

y_preds = {}  # target name -> list of test-set predictions
models = []   # fitted StackingRegressor per target, in column order of original_y_train

# Train, evaluate and log one stacking ensemble per target column (one MLflow run each).
for target in original_y_train.columns:
    with mlflow.start_run(run_name=f"Stacking_{target}"):
        X_train, X_test = original_X_train.copy(), original_X_test.copy()

        # Target-specific preprocessing: fitted on the training features, applied to both sets.
        current_preprocessor = preprocessors[target]
        X_train, X_test = preprocess_data(X_train, X_test, current_preprocessor)

        current_base_estimators = base_estimators[target]
        stacking_regresor = StackingRegressor(estimators=current_base_estimators, final_estimator=meta_learner, cv=5)
        stacking_regresor.fit(X_train, original_y_train[target])

        y_pred = stacking_regresor.predict(X_test)
        y_preds[target] = y_pred.tolist()

        models.append(stacking_regresor)

        actual_vs_predicted = {"actual": original_y_test[target], "predicted": y_pred}

        # RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error was
        # deprecated in scikit-learn 1.4 and removed in 1.6, so this form works on every version.
        rmse = np.sqrt(mean_squared_error(original_y_test[target], y_pred))
        # Standard deviation of the residuals (actual - predicted).
        std_dev = np.std(original_y_test[target] - y_pred)

        mlflow.log_metric("rmse", float(rmse))
        mlflow.log_metric("std_dev", float(std_dev))
        mlflow.log_table(actual_vs_predicted, f"actual_vs_predicted_{target}.json")
        # NOTE(review): only the stacking model is logged; the fitted preprocessing
        # pipeline for this target is not stored alongside it.
        mlflow.sklearn.log_model(stacking_regresor, f"stacking_{target}")

        print(f"RMSE for {target}: {rmse}")
        print(f"Standard deviation for {target}: {std_dev}")