In [None]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# re-imported before each cell executes and local edits are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
import mlflow
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from lib.full_flow_dataloader import load_full_flow_data
from lib.reproduction import major_oxides
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler, MaxAbsScaler, KernelCenterer
from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.metrics.pairwise import polynomial_kernel, rbf_kernel, sigmoid_kernel

In [None]:
train, test = load_full_flow_data()

# Preprocessing chain: max-abs scaling -> power transform -> cosine kernel PCA.
# Alternatives tried earlier and left disabled:
#   RobustScaler(quantile_range=(40, 60.0)), StandardScaler(),
#   PCA(n_components=34, whiten=True)
max_abs_scaler = MaxAbsScaler()
power_scaler = PowerTransformer()
pca = KernelPCA(n_components=60, kernel="cosine")

# Columns removed from the feature matrix: the regression targets plus the
# "ID" and "Sample Name" columns.
drop_cols = major_oxides + ["ID", "Sample Name"]

# Split both partitions into features (X) and per-oxide targets (y).
X_train, y_train = train.drop(columns=drop_cols), train[major_oxides]
X_test, y_test = test.drop(columns=drop_cols), test[major_oxides]

# Each step is fitted on the training features only and then applied, already
# fitted, to the test features, so no test information reaches the fit.
for transformer in (max_abs_scaler, power_scaler, pca):
    X_train = transformer.fit_transform(X_train)
    X_test = transformer.transform(X_test)

for label, features in (("X_train shape:", X_train), ("X_test shape:", X_test)):
    print(label, features.shape)

In [None]:
# Cumulative share of variance captured by the kernel principal components.
# NOTE: the ratio is normalised over the eigenvalues stored on the fitted
# KernelPCA. When `n_components` is set, only the retained components are
# stored, so the curve always ends at 1.0 and shows the share *among retained
# components*, not a fraction of the total feature-space variance.
eigenvalues = pca.eigenvalues_
explained_variance = eigenvalues / eigenvalues.sum()
cumulative_explained_variance = np.cumsum(explained_variance)

# 1-based counts so the x-axis reads "number of components kept".
component_counts = np.arange(1, len(eigenvalues) + 1)

plt.figure(figsize=(8, 5))
plt.plot(component_counts, cumulative_explained_variance)
plt.xlabel('# PC')
# Size the axis from the components that actually exist; the previous
# hard-coded upper limit of 400 squeezed the curve into the left edge.
plt.xlim(0, len(eigenvalues))
plt.ylabel('Cumulative explained variance (retained components)')
plt.title('Explained Variance by Number of Principal Components')
plt.show()

# --- Useful for calculating PC's for PCA
#explained_variance = pca.explained_variance_ratio_
#print("Explained variance:", explained_variance)
#cumulative_variance = np.cumsum(explained_variance)
#print("Cumulative explained variance:", cumulative_variance)
#n_components = np.where(cumulative_variance >= 0.95)[0][0] + 1
#print("Number of components to keep:", n_components)

In [None]:
import datetime

# Suffix the experiment name with the current wall-clock time so every
# execution of this cell selects a distinct MLflow experiment.
experiment_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
mlflow.set_experiment(f'Stacking_Scaler_SVR_{experiment_stamp}')

In [None]:

# Suppress FutureWarning and UserWarning output for the rest of the session.
for ignored_category in (FutureWarning, UserWarning):
    warnings.simplefilter(action='ignore', category=ignored_category)

# Fitted estimators, appended in the order of `y_train.columns`.
models = []

# SVR hyper-parameters shared by every per-target model.
kernel = "poly"
C = 100
eps = 0.1
gamma = "scale"
degree = 2
coef0 = 1.0

# XGBoost settings used only by the disabled RFECV feature-selection step
# inside the loop below.
xgb_params = {
    # Tree shape / complexity control
    "max_depth": 4,
    "min_child_weight": 5,
    "gamma": 0.1,
    # Row and column subsampling
    "subsample": 0.7,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "colsample_bynode": 0.5,
    # L2 / L1 regularisation on leaf weights
    "lambda": 1,
    "alpha": 0.5,
    # Boosting schedule
    "learning_rate": 0.05,
    "n_estimators": 100,
    # Loss and evaluation metric
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
}

# One dict drives both the estimator construction and the MLflow parameter
# logging, so the logged values cannot drift from the ones actually used.
svr_settings = {
    "kernel": kernel,
    "degree": degree,
    "coef0": coef0,
    "C": C,
    "epsilon": eps,
    "gamma": gamma,
}

# Train, evaluate and log one independent SVR per target column, each in its
# own MLflow run.
for target in y_train.columns:
    with mlflow.start_run(run_name=f"SVR_{target}"):
        # Disabled: recursive feature elimination with an XGBoost estimator.
        #estimator = XGBRegressor(**xgb_params)
        #selector = RFECV(estimator, step=20, cv=KFold(5), scoring="neg_mean_squared_error", verbose=1, n_jobs=-1)
        #selector.fit(X_train, y_train[target])

        #X_train_selected = selector.transform(X_train)
        #X_test_selected = selector.transform(X_test)

        svr_reg = SVR(**svr_settings)
        svr_reg.fit(X_train, y_train[target])

        # Held-out error for this target.
        y_pred = svr_reg.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        mlflow.log_metric("rmse", float(rmse))

        # Record the target name followed by the hyper-parameters.
        for param_name, param_value in {"target": target, **svr_settings}.items():
            mlflow.log_param(param_name, param_value)

        models.append(svr_reg)
        mlflow.sklearn.log_model(svr_reg, f"model_{target}")
