In [64]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [65]:
import mlflow
import pandas as pd
import numpy as np
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

from lib.reproduction import major_oxides



In [66]:
from lib import full_flow_dataloader
from lib.cross_validation import custom_kfold_cross_validation_new


# Load the data and create folds
def get_data_indices(target: str):
    # Simulate loading the data
    train_full, test_full = full_flow_dataloader.load_full_flow_data(load_cache_if_exits=True, average_shots=True)
    full_data = pd.concat([train_full, test_full], axis=0)

    # Generate custom folds
    folds, train, test = custom_kfold_cross_validation_new(
        data=full_data, k=5, group_by="Sample Name", target=target, random_state=42
    )

    train_indices = train.index.values
    index_mapping = {idx: pos for pos, idx in enumerate(train_indices)}

    # Convert DataFrame-based folds to index-based folds
    fold_indices = []
    for train_fold, test_fold in folds:
        train_fold_idx = [index_mapping[idx] for idx in train_fold.index.values if idx in index_mapping]
        test_fold_idx = [index_mapping[idx] for idx in test_fold.index.values if idx in index_mapping]
        fold_indices.append((train_fold_idx, test_fold_idx))

    return fold_indices, train, test, full_data, full_data[target]

In [67]:
target = "SiO2"
assert target in major_oxides, f"Target {target} not in major_oxides"
drop_cols = major_oxides + ["ID", "Sample Name"]

fold_indices, train_data, test_data, full_data, y_full = get_data_indices(target=target)
X_full = full_data.drop(columns=drop_cols)

In [69]:
# Manually perform cross-validation to generate predictions from base estimators
def perform_manual_cv(fold_indices, X_full, y_full, base_estimators):
    meta_features = np.zeros((X_full.shape[0], len(base_estimators)))
    for i, (train_idx, test_idx) in enumerate(fold_indices):
        X_train, X_test = X_full.iloc[train_idx], X_full.iloc[test_idx]
        y_train, y_test = y_full.iloc[train_idx], y_full.iloc[test_idx]

        for j, (name, estimator) in enumerate(base_estimators):
            estimator.fit(X_train, y_train)
            meta_features[test_idx, j] = estimator.predict(X_test)

    return meta_features


meta_features = perform_manual_cv(
    fold_indices, X_full, y_full, sio2_pipeline
)

In [70]:
# Perform cross-validation for the final estimator using the meta-features
def perform_final_cv(fold_indices, meta_features, y_full, final_estimator, metric_fns):
    cv_metrics = []
    for train_idx, test_idx in fold_indices:
        X_train, X_test = meta_features[train_idx], meta_features[test_idx]
        y_train, y_test = y_full.iloc[train_idx], y_full.iloc[test_idx]

        final_estimator.fit(X_train, y_train)
        y_pred = final_estimator.predict(X_test)

        fold_metrics = [metric_fn(y_test, y_pred) for metric_fn in metric_fns]
        cv_metrics.append(fold_metrics)

    return cv_metrics

In [90]:
from sklearn.linear_model import ElasticNet

final_estimator = ElasticNet()

In [91]:
from lib.metrics import rmse_metric, std_dev_metric

metric_fns = [rmse_metric, std_dev_metric]

# Calculate cross-validation metrics for the final estimator
cv_metrics = perform_final_cv(
    fold_indices, meta_features, y_full, final_estimator, metric_fns
)

# Aggregate cross-validation metrics
rmse_cv_scores = [metrics[0] for metrics in cv_metrics]
rmse_cv_mean = np.mean(rmse_cv_scores)
rmse_cv_std = np.std(rmse_cv_scores)
print(f"RMSECV: {rmse_cv_mean} ± {rmse_cv_std}")

RMSECV: 4.017608952189917 ± 0.3183024994904487


In [85]:
rmse_cv_scores

[3.931479950176423, 4.477666619878011, 4.0860331961049745, 3.59433994486635]

In [86]:
# Train the final model on the full training data and evaluate on the test data
X_train_final = train_data.drop(columns=drop_cols)
y_train_final = train_data[target]
X_test_final = test_data.drop(columns=drop_cols)
y_test_final = test_data[target]

In [81]:
# Train base estimators on the entire training set
for name, estimator in sio2_pipeline:
    estimator.fit(X_train_final, y_train_final)

In [82]:
# Generate meta-features for the entire training set
meta_features_train = np.column_stack(
    [estimator.predict(X_train_final) for name, estimator in sio2_pipeline]
)
meta_features_test = np.column_stack(
    [estimator.predict(X_test_final) for name, estimator in sio2_pipeline]
)

# Train the final estimator on the entire training set meta-features
final_estimator.fit(meta_features_train, y_train_final)
y_pred_final = final_estimator.predict(meta_features_test)

# Evaluate the final model
rmsep_final = rmse_metric(y_test_final, y_pred_final)
print(f"RMSEP: {rmsep_final}")

RMSEP: 3.801576940271841
