In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
from lib.reproduction import major_oxides
from sklearn.metrics import mean_squared_error
from lib import full_flow_dataloader
import mlflow
import numpy as np
import datetime
import os
os.environ["KERAS_BACKEND"] = "torch"

import torch
import keras


torch.manual_seed(42)
np.random.seed(42)


In [14]:
print(keras.__version__)

3.2.1


In [15]:
import torch.nn as nn
import torch.optim as optim

# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [16]:
train_processed, test_processed = full_flow_dataloader.load_full_flow_data(load_cache_if_exits=True, average_shots=True)

In [17]:
# cnn_regression_optimized.py
from keras import layers, optimizers, regularizers

def build_model(input_dim, output_dim):
    model = keras.models.Sequential()
    model.add(layers.Input(shape=(input_dim,)))
    model.add(layers.Reshape((48, 128, 1)))
    model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
    model.add(layers.BatchNormalization())
    model.add(layers.MaxPooling2D((2, 2)))
    
    # Additional convolutional block for better feature extraction
    model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
    model.add(layers.BatchNormalization())
    model.add(layers.MaxPooling2D((2, 2)))
    
    model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(layers.BatchNormalization())
    model.add(layers.MaxPooling2D((2, 2)))
    
    model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(layers.BatchNormalization())
    model.add(layers.MaxPooling2D((2, 2)))
    
    model.add(layers.Flatten())
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(output_dim))
    
    # Using L2 regularization
    model.add(layers.Dense(output_dim, kernel_regularizer=regularizers.l2(0.01)))
    
    # Optimizer with a custom learning rate
    optimizer = optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='mse', metrics=['root_mean_squared_error', 'mae'])
    return model

# Constants
INPUT_DIM = 6144  # Number of features per sample
OUTPUT_DIM = 8    # Number of continuous values as output

# Model Creation
model = build_model(INPUT_DIM, OUTPUT_DIM)
model.summary()


In [18]:
drop_cols = major_oxides + ["ID", "Sample Name"]

X_train = train_processed.drop(columns=drop_cols)
y_train = train_processed[target]

X_test = test_processed.drop(columns=drop_cols)
y_test = test_processed[target]


In [19]:
X_train_reshaped = X_train.to_numpy().reshape(-1, 6144, 1)
X_test_reshaped = X_test.to_numpy().reshape(-1, 6144, 1)

In [20]:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

In [21]:
def run_cnn_experiment(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    model: keras.Model,
    epochs: int,
    batch_size: int,
    callbacks: list = [],
    major_oxides: list = [],
):
    with mlflow.start_run(run_name="CNN"):
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1, callbacks=callbacks)
        y_pred = model.predict(X_test)
        for i, oxide in enumerate(major_oxides):
            y_test_oxide = y_test[:, i]
            y_pred_oxide = y_pred[:, i]
            rmse = mean_squared_error(y_test_oxide, y_pred_oxide, squared=False)
            mlflow.log_metric(f"rmse_{oxide}", float(rmse))

In [22]:
callback = keras.callbacks.EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True)

class MLFlowCallback(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if logs is not None:
            for key, value in logs.items():
                mlflow.log_metric(f"{key}", value, step=epoch)


mlflow.set_experiment(f'CNN_Residual_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}')
run_cnn_experiment(
    X_train_reshaped,
    y_train.to_numpy(),
    X_test_reshaped,
    y_test.to_numpy(),
    model,
    epochs=1000,
    batch_size=32,
    callbacks=[MLFlowCallback(), callback],
    major_oxides=major_oxides,
)



2024/04/16 10:54:49 INFO mlflow.tracking.fluent: Experiment with name 'CNN_Residual_20240416-105445' does not exist. Creating a new experiment.


Epoch 1/1000
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step - loss: 647.0948 - mae: 18.3376 - root_mean_squared_error: 24.0567 - val_loss: 1578.1470 - val_mae: 34.7313 - val_root_mean_squared_error: 39.7257
Epoch 2/1000
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - loss: 79.3242 - mae: 7.0195 - root_mean_squared_error: 8.9005 - val_loss: 1198.9543 - val_mae: 30.2340 - val_root_mean_squared_error: 34.6258
Epoch 3/1000
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - loss: 65.9020 - mae: 6.4426 - root_mean_squared_error: 8.1100 - val_loss: 859.8171 - val_mae: 24.9313 - val_root_mean_squared_error: 29.3224
Epoch 4/1000
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - loss: 72.0433 - mae: 6.7637 - root_mean_squared_error: 8.4767 - val_loss: 619.5441 - val_mae: 20.8796 - val_root_mean_squared_error: 24.8904
Epoch 5/1000
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [153]:
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

from lib.norms import Norm3Scaler

# Define constants
N_FEATURES = 6144  # Number of features per sample
N_OUTPUTS = 8      # Number of continuous values as output

# Set parameters for SVR
svr_params = {
    "kernel": "poly",
    "C": 100,
    "epsilon": 0.1,
    "gamma": "scale",
    "degree": 2,
    "coef0": 1.0
}

# Set parameters for GBR
gbr_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 3
}

# Fit models for each target
models = {'gbr': [], 'svr': []}
scaler = Norm3Scaler()

for target in y_train.columns:
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit SVR
    with mlflow.start_run(run_name=f"SVR_{target}"):
        svr = SVR(**svr_params)
        svr.fit(X_train_scaled, y_train[target])
        
        y_pred_svr = svr.predict(X_test_scaled)
        rmse_svr = np.sqrt(mean_squared_error(y_test[target], y_pred_svr))
        
        mlflow.log_metrics({"rmse": float(rmse_svr)})
        mlflow.log_params(svr_params)
        mlflow.sklearn.log_model(svr, f"SVR_model_{target}")
        
        models['svr'].append(svr)

    # Fit GBR
    with mlflow.start_run(run_name=f"GBR_{target}"):
        gbr = GradientBoostingRegressor(**gbr_params)
        gbr.fit(X_train, y_train[target])  # No need to scale for GBR
        
        y_pred_gbr = gbr.predict(X_test)
        rmse_gbr = np.sqrt(mean_squared_error(y_test[target], y_pred_gbr))
        
        mlflow.log_metrics({"rmse": float(rmse_gbr)})
        mlflow.log_params(gbr_params)
        mlflow.sklearn.log_model(gbr, f"GBR_model_{target}")
        
        models['gbr'].append(gbr)

In [154]:
models['cnn'] = model

In [159]:
def predict_ensemble(models, X_cnn, X_svr_gbr):
    # X is the input features, assumed preprocessed for the CNN
    cnn_prediction = models['cnn'].predict(X_cnn)  # Prediction from CNN
    
    # Assuming X_standard is the standard-scaled version of X for GBR and SVR
    X_standard = scaler.transform(X_svr_gbr)
    
    gbr_predictions = np.array([models['gbr'][i].predict(X_standard) for i in range(N_OUTPUTS)]).T
    svr_predictions = np.array([models['svr'][i].predict(X_standard) for i in range(N_OUTPUTS)]).T
    
    # Averaging predictions from each model
    final_prediction = (cnn_prediction + gbr_predictions + svr_predictions) / 3
    return final_prediction

def calculate_rmse_per_oxide(y_true, y_pred):
    """
    Calculate the Root Mean Squared Error (RMSE) between the true and predicted values for each major oxide.
    
    Parameters:
    -----------
    y_true : DataFrame of shape (n_samples, n_outputs)
        True values for each major oxide.
    y_pred : DataFrame of shape (n_samples, n_outputs)
        Predicted values for each major oxide.
    
    Returns:
    --------
    rmse_per_oxide : dict
        A dictionary with major oxides as keys and their corresponding RMSE as values.
    """
    rmse_per_oxide = {}
    for i, oxide in enumerate(major_oxides):
        rmse_per_oxide[oxide] = np.sqrt(mean_squared_error(y_true[oxide], y_pred[:, i]))
    return rmse_per_oxide

# Assuming y_test and X_test_reshaped are defined and major_oxides is a list of the major oxides
y_pred_ensemble = predict_ensemble(models, X_test_reshaped, X_test)
rmse_results = calculate_rmse_per_oxide(y_test, y_pred_ensemble)
for oxide, rmse in rmse_results.items():
    print(f"{oxide}: {rmse}")



[1m 1/13[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 23ms/step

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
SiO2: 9.682961366479164
TiO2: 0.8176471150383866
Al2O3: 3.7492426302332027
FeOT: 3.1567254383507732
MgO: 2.322423060560155
CaO: 3.4183938683962407
Na2O: 0.6845957736001052
K2O: 1.1025774365887657
