In [49]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from lib import full_flow_dataloader

from lib.norms import Norm1Scaler, Norm3Scaler
from lib.reproduction import major_oxides

train_processed, test_processed = full_flow_dataloader.load_full_flow_data()

In [51]:
from sklearn.model_selection import train_test_split
from lib.utils import custom_train_test_split

train_cols = train_processed.columns
test_cols = test_processed.columns
norm = 3

scaler = Norm1Scaler() if norm == 1 else Norm3Scaler()

train = scaler.fit_transform(train_processed)
test = scaler.fit_transform(test_processed)

# turn back into dataframe
train = pd.DataFrame(train, columns=train_cols)
test = pd.DataFrame(test, columns=test_cols)

drop_cols = major_oxides + ["ID", "Sample Name"]

## - VALIDATION -
# split_train, split_val = custom_train_test_split(train, "Sample Name", test_size=0.2, random_state=42)

# X_train = split_train.drop(columns=drop_cols)
# y_train = split_train[major_oxides]
# X_val = split_val.drop(columns=drop_cols)
# y_val = split_val[major_oxides]

# Converting train set - comment out if using validation
X_train = train.drop(columns=drop_cols)
y_train = train[major_oxides]

# Converting test set
X_test = test.drop(columns=drop_cols)
y_test = test[major_oxides]

In [52]:
import datetime

mlflow.set_experiment(f'SVM_Norm{norm}_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}')

2024/03/07 13:53:14 INFO mlflow.tracking.fluent: Experiment with name 'SVM_Norm3_20240307-135313' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/441031611857712967', creation_time=1709815995653, experiment_id='441031611857712967', last_update_time=1709815995653, lifecycle_stage='active', name='SVM_Norm3_20240307-135313', tags={}>

In [53]:
from sklearn.svm import SVR
import warnings

# disable warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

models = []

kernel="poly"
C=100
eps=0.1
gamma="scale"
degree=2
coef0=1.0


for target in y_train.columns:
    with mlflow.start_run(run_name=f"SVM_{target}"):
        svm_reg = SVR(kernel=kernel, degree=degree, C=C, epsilon=eps, coef0=coef0, gamma=gamma)
        svm_reg.fit(X_train, y_train[target])
        
        y_pred = svm_reg.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        mlflow.log_metric("rmse", float(rmse))
        mlflow.log_param("target", target)
        mlflow.log_param("norm", norm)
        mlflow.log_param("kernel", kernel)
        mlflow.log_param("degree", degree)
        mlflow.log_param("coef0", coef0)
        mlflow.log_param("C", C)
        mlflow.log_param("epsilon", eps)
        mlflow.log_param("gamma", gamma)

        models.append(svm_reg)
        mlflow.sklearn.log_model(svm_reg, f"model_{target}")
        

In [54]:
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from scipy.stats import expon

# grid search pca hyperparameters
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter distributions (or specific lists) you want to sample from
param_distributions = {
    "pca__n_components": [0.85, 0.90, 0.95, 0.99, 2, 5, 10, 15, 20],
    "svr__kernel": ["poly", "rbf", "sigmoid"],
    "svr__C": expon(scale=100),  # Continuous distribution for C
    "svr__epsilon": [0.1, 0.01, 0.001],
    "svr__degree": [2, 3, 4, 5],
    "svr__coef0": [0.0, 0.1, 0.5, 1.0],
    "svr__gamma": ["scale", "auto", 0.1, 0.01, 0.001],
}

# Setup RandomizedSearchCV
rscv = RandomizedSearchCV(
    estimator=make_pipeline(PCA(), SVR()),
    param_distributions=param_distributions,
    n_iter=100,  # Number of parameter settings that are sampled. Adjust based on your computational budget
    cv=2,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

pca_models = []

for target in y_train.columns:
    with mlflow.start_run(run_name=f"SVM_PCA_{target}"):
        svm_reg = SVR(kernel=kernel, degree=degree, C=C, epsilon=eps, coef0=coef0, gamma=gamma)
        rscv.fit(X_train, y_train[target])
        best_estimator = rscv.best_estimator_
        
        y_pred = best_estimator.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        mlflow.log_metric("rmse", float(rmse))
        
        # log best params from grid search
        mlflow.log_params(rscv.best_params_)
        mlflow.log_param("target", target)
        mlflow.log_param("norm", norm)

        pca_models.append(best_estimator)
        mlflow.sklearn.log_model(best_estimator, f"model_{target}")

Fitting 2 folds for each of 100 candidates, totalling 200 fits
[CV] END pca__n_components=10, svr__C=159.23005340557225, svr__coef0=0.5, svr__degree=4, svr__epsilon=0.1, svr__gamma=0.001, svr__kernel=sigmoid; total time=   1.0s
[CV] END pca__n_components=10, svr__C=159.23005340557225, svr__coef0=0.5, svr__degree=4, svr__epsilon=0.1, svr__gamma=0.001, svr__kernel=sigmoid; total time=   1.1s
[CV] END pca__n_components=5, svr__C=5.806520264207262, svr__coef0=1.0, svr__degree=5, svr__epsilon=0.01, svr__gamma=auto, svr__kernel=poly; total time=   1.3s
[CV] END pca__n_components=5, svr__C=5.806520264207262, svr__coef0=1.0, svr__degree=5, svr__epsilon=0.01, svr__gamma=auto, svr__kernel=poly; total time=   2.1s
[CV] END pca__n_components=10, svr__C=94.15999701070139, svr__coef0=1.0, svr__degree=4, svr__epsilon=0.001, svr__gamma=scale, svr__kernel=rbf; total time=   1.6s
[CV] END pca__n_components=15, svr__C=140.79733016874118, svr__coef0=0.1, svr__degree=3, svr__epsilon=0.01, svr__gamma=auto, 

In [55]:
from datetime import datetime

# Assuming MLflow tracking URI is set elsewhere or using the default
# List all experiments
experiments = mlflow.search_experiments()

# Filter experiments based on the naming pattern 'SVM_Norm'
svm_norm_experiments = [exp for exp in experiments if exp.name.startswith("SVM_Norm")]

# Sort these experiments by their timestamp in the name, assuming the timestamp follows 'SVM_Norm{norm}_{timestamp}'
latest_experiment = sorted(
    svm_norm_experiments,
    key=lambda x: datetime.strptime(x.name.split('_')[-1], "%Y%m%d-%H%M%S"),
    reverse=True
)[0]

# Now, list all runs in the latest experiment using its experiment_id
runs = mlflow.search_runs([latest_experiment.experiment_id])

In [56]:
data = []
for _, row in runs.iterrows():
    run_name = row['tags.mlflow.runName']  # Adjust based on actual tag used for naming runs
    if run_name.startswith("SVM") or run_name.startswith("SVM_PCA"):
        # Assuming RMSE is stored in metrics with a key such as 'RMSE'
        rmse = row['metrics.rmse']  # Adjust the key based on your actual data structure
        # Extract oxide name from the run name, assuming it follows a recognizable pattern
        # This might need adjustment based on your naming convention
        with_pca = run_name.startswith("SVM_PCA")
        oxide_name = run_name.split('_')[1] if not with_pca else run_name.split('_')[2]  # Example: "SVM_CuOxide_..." -> "CuOxide"
        data.append({'Oxide': oxide_name, 'RMSE': rmse, 'WithPCA': with_pca})

# Convert data to a pandas DataFrame
df = pd.DataFrame(data)

In [57]:
df

Unnamed: 0,Oxide,RMSE,WithPCA
0,K2O,1.159734,True
1,Na2O,0.632748,True
2,CaO,1.650255,True
3,MgO,1.840583,True
4,FeOT,1.829822,True
5,Al2O3,2.254029,True
6,TiO2,0.558295,True
7,SiO2,4.057205,True
8,K2O,0.82161,False
9,Na2O,0.471633,False


In [58]:
pivoted_df = df.pivot_table(index='Oxide', columns='WithPCA', values='RMSE', aggfunc='first').rename(columns={True: 'With PCA', False: 'Without PCA'})

In [59]:
pivoted_df

WithPCA,Without PCA,With PCA
Oxide,Unnamed: 1_level_1,Unnamed: 2_level_1
Al2O3,1.801759,2.254029
CaO,1.551286,1.650255
FeOT,1.828757,1.829822
K2O,0.82161,1.159734
MgO,1.268776,1.840583
Na2O,0.471633,0.632748
SiO2,3.390499,4.057205
TiO2,0.494588,0.558295
