In [None]:
# Enable IPython's autoreload extension so edits to imported modules
# (such as the local `lib` package) are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload modules automatically before executing each cell.
%autoreload 2

In [None]:
import warnings

import mlflow
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

from lib.full_flow_dataloader import load_full_flow_data
from lib.reproduction import major_oxides

from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer, StandardScaler, RobustScaler
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA, KernelPCA
from sklearn.linear_model import ElasticNet

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras import optimizers
from tensorflow.keras.losses import MeanSquaredError

from scikeras.wrappers import KerasRegressor

In [None]:
def split_and_preprocess_data(train: pd.DataFrame, test: pd.DataFrame, preprocesser_pipeline: Pipeline) -> tuple:
    """Separate features from oxide targets and preprocess the features.

    The columns in ``major_oxides`` plus ``"ID"`` and ``"Sample Name"`` are
    removed from both frames to form the feature matrices; the
    ``major_oxides`` columns are kept as the targets.

    Args:
        train: Training frame containing feature, target and identifier columns.
        test: Test frame with the same columns as ``train``.
        preprocesser_pipeline: Pipeline that is fitted on the training
            features only and then applied to the test features.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` where the ``X_*`` entries are
        the pipeline outputs and the ``y_*`` entries are the ``major_oxides``
        columns of the corresponding input frame.
    """
    non_feature_cols = major_oxides + ["ID", "Sample Name"]

    # Pull features and targets apart for both frames before any fitting happens.
    raw_features_train, raw_features_test = (
        frame.drop(columns=non_feature_cols) for frame in (train, test)
    )
    targets_train, targets_test = (frame[major_oxides] for frame in (train, test))

    # Fit on the training features only; the test features are merely transformed.
    features_train = preprocesser_pipeline.fit_transform(raw_features_train)
    features_test = preprocesser_pipeline.transform(raw_features_test)

    return features_train, targets_train, features_test, targets_test

In [None]:
# Possible idea: store each preprocessing pipeline and its related model pipeline in a dict keyed by target,
# so that the corresponding pipelines can be looked up and applied for a given target.

# Feature preprocessing shared by all models: scaling, power transform,
# then a polynomial-kernel PCA projection down to 60 components.
# Commented-out entries are alternative choices for the same step.
preprocessor_pipeline = Pipeline(
    steps=[
        ("scaler", MaxAbsScaler()),
        # ("scaler", StandardScaler()),
        # ("scaler", RobustScaler(quantile_range=(10, 90))),
        ("transformer", PowerTransformer()),
        ("pca", KernelPCA(kernel="poly", n_components=60)),
        # ("pca", PCA(n_components=34)),
    ]
)

In [None]:
# Load the train/test frames, then derive preprocessed features and oxide targets.
train, test = load_full_flow_data()

X_train, y_train, X_test, y_test = split_and_preprocess_data(
    train=train,
    test=test,
    preprocesser_pipeline=preprocessor_pipeline,
)

In [None]:
# Instantiating models
#
# Each section declares its hyperparameters and then builds one unfitted
# estimator (svr, etr, gbr, xgb, pls, eln) that later cells combine by name.

# ---- SVR ----
kernel = "poly"
C = 100
eps = 0.1
gamma = "scale"
degree = 2
coef0 = 1.0

svr = SVR(kernel=kernel, C=C, epsilon=eps, gamma=gamma, degree=degree, coef0=coef0)

# ---- ExtraTreesRegressor ----
n_estimators = 5
max_depth = None
min_samples_split = 2
min_samples_leaf = 1
max_features = 0.3

# min_samples_split / min_samples_leaf used to be declared but never forwarded,
# so editing them had no effect. They are passed through now; the values equal
# scikit-learn's defaults, so the resulting estimator is unchanged.
etr = ExtraTreesRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_features=max_features,
    random_state=42,
)

# ---- Gradient boosting regressor ----

gbr_params = {
    'loss': 'squared_error',
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample': 1.0,
    'criterion': 'friedman_mse',
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_depth': 3,
    'max_features': None,
    'random_state': 42,
    'verbose': 0,
    'validation_fraction': 0.1,
    'n_iter_no_change': None,
    'tol': 1e-4,
    'ccp_alpha': 0.0
}

gbr = GradientBoostingRegressor(**gbr_params)

# ---- XGBoost ----

xgb_params = {
    "max_depth": 4,  # Slightly deeper trees since data is high-dimensional
    "min_child_weight": 5,  # Higher to control over-fitting
    "gamma": 0.1,  # Minimum loss reduction required to make further partition
    "subsample": 0.7,  # Subsample ratio of the training instances
    "colsample_bytree": 0.5,  # Subsample ratio of columns when constructing each tree
    "colsample_bylevel": 0.5,  # Subsample ratio of columns for each level
    "colsample_bynode": 0.5,  # Subsample ratio of columns for each split
    # The scikit-learn wrapper names for the L2/L1 penalties are reg_lambda and
    # reg_alpha; the native aliases "lambda"/"alpha" only work via **kwargs.
    "reg_lambda": 1,  # L2 regularization term on weights (lambda)
    "reg_alpha": 0.5,  # L1 regularization term on weights (alpha)
    "learning_rate": 0.05,  # Step size shrinkage used in update to prevent overfitting
    "n_estimators": 100,  # Number of boosting rounds
    "objective": "reg:squarederror",  # Regression with squared loss
    "eval_metric": "rmse",  # Evaluation metric for validation data
}

# The estimator is deliberately bound to the name `xgb` because later cells refer
# to it that way, which replaces the `xgboost` module alias of the same name.
# Constructing it through the directly imported class (instead of
# `xgb.XGBRegressor`) keeps this cell re-runnable: going through the alias raises
# AttributeError on the second execution, when `xgb` is already an estimator.
xgb = XGBRegressor(**xgb_params)


# ---- PLS ----

n_components = 15

pls = PLSRegression(n_components=n_components)


# ---- ElasticNet ----

alpha = 0.01
l1_ratio = 0.3

eln = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)

In [None]:
def build_model(input_dim, output_dim, reshape_dims=(48, 128, 1)):
    """Build and compile a 2-D convolutional regression network.

    The flat input vector is reshaped to ``reshape_dims`` and passed through
    four Conv2D -> BatchNormalization -> MaxPooling2D blocks (32, 32, 64 and
    128 filters), followed by a 256-unit dense layer with dropout and a
    linear, L2-regularised output layer.

    Args:
        input_dim: Number of features in each flat input sample.
        output_dim: Number of regression outputs.
        reshape_dims: ``(height, width, channels)`` the flat input is reshaped
            to before the convolutions. The product of its entries must equal
            ``input_dim``. Defaults to ``(48, 128, 1)``, i.e. 6144 features.

    Returns:
        A Keras ``Sequential`` model compiled with Adam (learning rate 0.001)
        and mean squared error loss.

    Raises:
        ValueError: If ``input_dim`` does not match the number of elements
            implied by ``reshape_dims``.
    """
    reshape_dims = tuple(reshape_dims)
    expected_dim = int(np.prod(reshape_dims))
    # Fail early with a clear message instead of a shape error from inside Keras.
    if input_dim != expected_dim:
        raise ValueError(
            f"input_dim={input_dim} cannot be reshaped to {reshape_dims} "
            f"(which requires {expected_dim} features)"
        )

    model = Sequential()
    model.add(layers.Input(shape=(input_dim,)))
    model.add(layers.Reshape(reshape_dims))

    # Four identical feature-extraction blocks that differ only in filter count.
    for n_filters in (32, 32, 64, 128):
        model.add(layers.Conv2D(n_filters, (3, 3), activation='relu', padding='same'))
        model.add(layers.BatchNormalization())
        model.add(layers.MaxPooling2D((2, 2)))

    model.add(layers.Flatten())
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dropout(0.5))

    # Single linear output layer with L2 regularization. The previous version
    # stacked a second Dense(output_dim) on top of an unregularised one, which
    # adds no expressive power and only chains two linear maps.
    model.add(layers.Dense(output_dim, kernel_regularizer=regularizers.l2(0.01)))

    # Optimizer with a custom learning rate
    optimizer = optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss=MeanSquaredError())
    return model

INPUT_DIM = 6144  # Number of features per sample
OUTPUT_DIM = 1    # Number of continuous values as output

# NOTE(review): INPUT_DIM is 6144, but the preprocessing pipeline in this notebook
# ends in KernelPCA(n_components=60) -- confirm which feature matrix this CNN is
# meant to be trained on before adding it to a stack.
#
# `model=` replaces the deprecated `build_fn=`; the `model__` prefix routes the
# two values to build_model's parameters, so no lambda wrapper is needed.
cnn = KerasRegressor(
    model=build_model,
    model__input_dim=INPUT_DIM,
    model__output_dim=OUTPUT_DIM,
    loss=MeanSquaredError(),
    epochs=100,
    batch_size=32,
    verbose=0,
)

In [None]:
# Base learners per oxide: each list holds (name, estimator) pairs that are
# handed to a StackingRegressor for that target. Commented-out pairs are
# candidates that are currently switched off.
sio2_base_estimators = [("svr", svr), ("etr", etr)]

tio2_base_estimators = [
    # ("gbr", gbr),
    ("pls", pls),
    ("xgb", xgb),
]

al203_base_estimators = [
    # ("gbr", gbr),
    ("svr", svr),
    ("xgb", xgb),
    ("pls", pls),
]

feot_base_estimators = [
    ("gbr", gbr),
    ("svr", svr),
    # ("xgb", xgb),
]

mgo_base_estimators = [("gbr", gbr), ("pls", pls), ("eln", eln)]

cao_base_estimators = [("svr", svr), ("pls", pls), ("xgb", xgb)]

nao_base_estimators = [("svr", svr), ("gbr", gbr)]

# Lookup from target column name to its list of base learners.
estimators = dict(
    SiO2=sio2_base_estimators,
    TiO2=tio2_base_estimators,
    Al2O3=al203_base_estimators,
    FeOT=feot_base_estimators,
    MgO=mgo_base_estimators,
    CaO=cao_base_estimators,
    Na2O=nao_base_estimators,
)

# Hyperparameters of the final (meta) estimator used on top of the base learners.
meta_kernel = "poly"
meta_C = 100
meta_eps = 0.1
meta_gamma = "scale"
meta_degree = 2
meta_coef0 = 0.1

meta_learner = SVR(
    kernel=meta_kernel,
    C=meta_C,
    epsilon=meta_eps,
    gamma=meta_gamma,
    degree=meta_degree,
    coef0=meta_coef0,
)

In [None]:
# Available targets: 'SiO2', 'TiO2', 'Al2O3', 'FeOT', 'MgO', 'CaO', 'Na2O', 'K2O'
selected_targets = ['Na2O']

# For each selected oxide: stack its configured base learners under the shared
# meta learner, fit on the training split and report the test RMSE.
for target in selected_targets:
    print(target)
    if target not in y_train.columns:
        print(f"Target {target} not found in dataset")
        continue
    if target not in estimators:
        # 'K2O', for example, is listed above but has no entry in `estimators`;
        # report it instead of failing with a KeyError.
        print(f"No base estimators configured for target {target}")
        continue

    current_base_estimators = estimators[target]
    stacking_regresor = StackingRegressor(estimators=current_base_estimators, final_estimator=meta_learner, cv=5)
    stacking_regresor.fit(X_train, y_train[target])
    y_pred = stacking_regresor.predict(X_test)
    # Take the square root explicitly: the `squared=False` flag of
    # mean_squared_error is deprecated and removed in newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
    print(f"RMSE for {target}: {rmse}")