In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (such as the project's `lib` package used below) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

from lib.reproduction import major_oxides

In [4]:
from sklearn.model_selection import train_test_split
from lib.full_flow_dataloader import load_and_scale_data, load_full_flow_data
from lib.utils import custom_train_test_split
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer

# Normalisation id. In the cells shown here it is only consumed by the
# commented-out loader below and by the MLflow experiment name / logged params.
norm = 3

# Alternative loader, kept for reference:
# train, test = load_and_scale_data(norm)
train, test = load_full_flow_data()

# Columns that must not be fed to the model as features: the regression
# targets themselves plus the two identifier columns.
drop_cols = major_oxides + ["ID", "Sample Name"]

## - VALIDATION -
# Optional hold-out split; enable these lines (and disable the full-train
# conversion below) to work with a validation set instead.
# split_train, split_val = custom_train_test_split(train, "Sample Name", test_size=0.2, random_state=42)

# X_train = split_train.drop(columns=drop_cols)
# y_train = split_train[major_oxides]
# X_val = split_val.drop(columns=drop_cols)
# y_val = split_val[major_oxides]


# Targets: one column per major oxide.
# (train part - comment out if using validation)
y_train = train[major_oxides]
y_test = test[major_oxides]

# Feature scalers, applied in sequence: MaxAbsScaler first, PowerTransformer second.
max_abs_scaler = MaxAbsScaler()
power_transformer = PowerTransformer()

# Train features: both scalers are fitted here, on the training set only.
# (comment out if using validation)
X_train = power_transformer.fit_transform(
    max_abs_scaler.fit_transform(train.drop(columns=drop_cols))
)

# Test features: reuse the already-fitted scalers, no refitting.
X_test = power_transformer.transform(
    max_abs_scaler.transform(test.drop(columns=drop_cols))
)

In [5]:
from sklearn.svm import SVR
import warnings
import datetime

# Silence FutureWarning and UserWarning output from here on.
for _ignored_category in (FutureWarning, UserWarning):
    warnings.simplefilter(action='ignore', category=_ignored_category)

# SVR hyperparameters shared by every per-oxide model.
kernel = "poly"
C = 100
eps = 0.1
gamma = "scale"
degree = 2
coef0 = 1.0

# Each execution of this cell gets its own timestamped MLflow experiment.
experiment_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
mlflow.set_experiment(f'SVM_{kernel}_Norm{norm}_{experiment_stamp}')

# Fitted regressors, appended in the order of y_train's columns.
models = []

for target in y_train.columns:
    with mlflow.start_run(run_name=f"SVM_{target}"):
        # One independent regressor per target column.
        svm_reg = SVR(
            kernel=kernel,
            degree=degree,
            C=C,
            epsilon=eps,
            coef0=coef0,
            gamma=gamma,
        )
        svm_reg.fit(X_train, y_train[target])

        # Score on the test set.
        y_pred = svm_reg.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        mlflow.log_metric("rmse", float(rmse))

        # Record the configuration that produced this run.
        run_params = {
            "target": target,
            "norm": norm,
            "kernel": kernel,
            "degree": degree,
            "coef0": coef0,
            "C": C,
            "epsilon": eps,
            "gamma": gamma,
        }
        for param_name, param_value in run_params.items():
            mlflow.log_param(param_name, param_value)

        models.append(svm_reg)
        mlflow.sklearn.log_model(svm_reg, f"model_{target}")


2024/04/22 12:17:33 INFO mlflow.tracking.fluent: Experiment with name 'SVM_poly_Norm3_20240422-121728' does not exist. Creating a new experiment.


In [None]:
# Randomised hyperparameter search over a PCA -> SVR pipeline, run once per
# oxide. The search setup used to be commented out while the loop below still
# referenced `rscv`, so the cell raised NameError on a fresh kernel; the setup
# is re-enabled so the cell runs top-to-bottom.
# NOTE: this is expensive (n_iter=100 candidates x cv=2 folds per target).
from scipy.stats import expon
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

# Keys use the step names that make_pipeline derives from the class names
# ("pca", "svr").
param_distributions = {
    "pca__n_components": [0.85, 0.90, 0.95, 0.99, 2, 5, 10, 15, 20],
    "svr__kernel": ["poly", "rbf", "sigmoid"],
    "svr__C": expon(scale=100),  # Continuous distribution for C
    "svr__epsilon": [0.1, 0.01, 0.001],
    "svr__degree": [2, 3, 4, 5],
    "svr__coef0": [0.0, 0.1, 0.5, 1.0],
    "svr__gamma": ["scale", "auto", 0.1, 0.01, 0.001],
}

# Setup RandomizedSearchCV (fixed random_state for reproducible sampling)
rscv = RandomizedSearchCV(
    estimator=make_pipeline(PCA(), SVR()),
    param_distributions=param_distributions,
    n_iter=100,
    cv=2,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# Best pipeline per target, appended in the order of y_train's columns.
pca_models = []

for target in y_train.columns:
    with mlflow.start_run(run_name=f"SVM_PCA_{target}"):
        # The same search object is refitted for each target.
        rscv.fit(X_train, y_train[target])
        best_estimator = rscv.best_estimator_

        # Score the winning pipeline on the test set.
        y_pred = best_estimator.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        mlflow.log_metric("rmse", float(rmse))

        # log best params from the randomized search
        mlflow.log_params(rscv.best_params_)
        mlflow.log_param("target", target)
        mlflow.log_param("norm", norm)

        pca_models.append(best_estimator)
        mlflow.sklearn.log_model(best_estimator, f"model_{target}")

In [None]:
# Find the most recent SVM experiment and load its runs into `runs`.
#
# `datetime` is imported as a module (not `from datetime import datetime`) so
# this cell does not rebind the name the training cell uses as
# `datetime.datetime.now()`.
import datetime
import re

# The training cell names experiments "SVM_<kernel>_Norm<norm>_<YYYYmmdd-HHMMSS>".
# The kernel segment is optional in the pattern so names of the form
# "SVM_Norm<norm>_<timestamp>" are matched as well. Group 1 is the timestamp.
_SVM_EXPERIMENT_NAME = re.compile(r"SVM_(?:[^_]+_)?Norm[^_]*_(\d{8}-\d{6})")


def _experiment_timestamp(exp):
    """Parse the timestamp suffix of an experiment whose name matches the pattern."""
    stamp = _SVM_EXPERIMENT_NAME.fullmatch(exp.name).group(1)
    return datetime.datetime.strptime(stamp, "%Y%m%d-%H%M%S")


experiments = mlflow.search_experiments()

# Keep only experiments whose full name follows the naming scheme; this also
# guarantees the timestamp suffix is present before it is parsed.
svm_norm_experiments = [
    exp for exp in experiments if _SVM_EXPERIMENT_NAME.fullmatch(exp.name)
]

if not svm_norm_experiments:
    # Fail with an actionable message instead of an IndexError on an empty list.
    raise RuntimeError(
        "No MLflow experiment named 'SVM_<kernel>_Norm<norm>_<timestamp>' was found; "
        "run the training cell first."
    )

latest_experiment = max(svm_norm_experiments, key=_experiment_timestamp)

runs = mlflow.search_runs([latest_experiment.experiment_id])

In [None]:
# Flatten the MLflow runs into one record per run: oxide, RMSE, PCA flag.
data = []
for _, run in runs.iterrows():
    name = run['tags.mlflow.runName']

    # Every "SVM_PCA..." name also starts with "SVM", so a single prefix
    # check selects both kinds of run.
    if not name.startswith("SVM"):
        continue

    rmse = run['metrics.rmse']
    uses_pca = name.startswith("SVM_PCA")

    # Run names are "SVM_<oxide>" or "SVM_PCA_<oxide>": the oxide is the 2nd
    # or 3rd underscore-separated token respectively.
    name_parts = name.split('_')
    oxide = name_parts[2] if uses_pca else name_parts[1]

    data.append({'Oxide': oxide, 'RMSE': rmse, 'WithPCA': uses_pca})

df = pd.DataFrame(data)

In [None]:
# Display the per-run table built above (columns: Oxide, RMSE, WithPCA).
df

In [None]:
# Reshape to one row per oxide with one RMSE column per variant; when an
# oxide/variant pair has several runs, the first RMSE encountered is kept.
rmse_by_variant = df.pivot_table(
    values='RMSE',
    index='Oxide',
    columns='WithPCA',
    aggfunc='first',
)

# Replace the boolean column labels with readable headings.
pivoted_df = rmse_by_variant.rename(
    columns={False: 'Without PCA', True: 'With PCA'}
)

In [None]:
# Display the RMSE comparison per oxide ('With PCA' vs 'Without PCA' columns).
pivoted_df