In [None]:
import mlflow
from lib.reproduction import training_info
from pathlib import Path
import pickle

# Point MLflow at the tracking server on localhost:5000.
# NOTE(review): the URI is hard-coded; this presumably expects a locally running
# MLflow server — confirm before running elsewhere.
mlflow.set_tracking_uri("http://localhost:5000")

In [None]:
# MLflow client. In the code visible here it is only referenced by the
# commented-out registry loader below.
client = mlflow.MlflowClient()

# Nested lookup of fitted models: kv[oxide][composition range name] -> model.
kv = {}

# Alternative loader that fetches the models from the MLflow model registry
# instead of from disk. Currently disabled; kept for reference.
# for oxide, v in training_info.items():
#     sub = {}
#     for comp_range_name, vv in v.items():
#         model_name = f"{oxide}_{comp_range_name}_outlier_removal"
#         models = client.search_model_versions(f"name = '{model_name}'")
#         model = mlflow.pyfunc.load_model(models[0].source)
#         sub[comp_range_name] = model
#
#     kv[oxide] = sub


# Load one pickled model per (oxide, composition range) pair listed in
# training_info from "<models_path>/<oxide>_<range>/model.pkl".
models_path = Path("../models/PLS_Models_12-11-23_174732/")
for oxide, v in training_info.items():
    sub = {}
    for comp_range_name in v:
        model_name = f"{oxide}_{comp_range_name}/model.pkl"
        # SECURITY: pickle.load can execute arbitrary code; only load model
        # files that come from a trusted source.
        # The context manager closes the file handle once the model is read.
        with open(models_path / model_name, "rb") as model_file:
            model = pickle.load(model_file)
        sub[comp_range_name] = model

    kv[oxide] = sub


kv

In [None]:
import dotenv
from lib.data_handling import load_data
# Read the key/value pairs of the nearest .env file into a plain dict.
env = dotenv.dotenv_values(dotenv.find_dotenv())

# DATA_PATH from the .env file, or "" when it is missing/empty.
# NOTE(review): dataset_loc is only printed; load_data below is called with a
# hard-coded relative path instead — confirm which location is intended.
dataset_loc = env.get('DATA_PATH') or ""
print(dataset_loc)

# Load a small sample of the calibration data.
# NOTE(review): the second argument (10) is presumably a cap on how many
# samples are loaded — TODO confirm against lib.data_handling.load_data.
smol_data = load_data(
    "../data/data/calib/calib_2015/1600mm/pls/",
    10
)

In [None]:
from lib.data_handling import CustomSpectralPipeline
from lib.reproduction import masks, major_oxides

# COMPOSITION_DATA_PATH from the .env file, or "" when it is missing/empty.
# NOTE(review): this value is only printed; the pipeline below is given a
# hard-coded CSV path instead. The variable name is also misspelled
# ("compositon"); it is left as-is in case other cells reference it.
compositon_data_path = env.get('COMPOSITION_DATA_PATH') or ""
print(compositon_data_path)

# Build the preprocessing pipeline from the wavelength masks, the composition
# CSV and the list of major oxides.
pipeline = CustomSpectralPipeline(
    masks,
    "../data/data/calib/ccam_calibration_compositions.csv",
    major_oxides
)

# Run the pipeline on the loaded sample and preview the first rows.
processed_data = pipeline.fit_transform(smol_data)
processed_data.head()

In [None]:
import importlib

import lib.norms

# Reload *before* binding the scaler names. importlib.reload() re-executes the
# module, but names previously pulled in with `from lib.norms import ...` keep
# pointing at the pre-reload classes, so edits to lib/norms.py would otherwise
# be silently ignored by the code below.
importlib.reload(lib.norms)

from lib.norms import Norm1Scaler, Norm3Scaler  # noqa: E402
from lib.reproduction import spectrometer_wavelength_ranges  # noqa: E402

# Columns removed before scaling: the oxide columns plus the sample identifier.
drop_cols = major_oxides + ['Sample Name']

# Two normalized variants of the same feature frame. Each scaler gets its own
# freshly dropped frame so that neither can be affected by the other.
scaler = Norm1Scaler()
X_1 = scaler.fit_transform(processed_data.drop(drop_cols, axis=1))
X_3 = Norm3Scaler(spectrometer_wavelength_ranges).fit_transform(processed_data.drop(drop_cols, axis=1))

# Oxide evaluated in the cells below.
OXIDE = 'FeOT'

# Target values for the selected oxide.
y = processed_data[OXIDE]

In [None]:
from matplotlib.pylab import f
from lib.norms import Norm1Scaler, Norm3Scaler
from lib.reproduction import spectrometer_wavelength_ranges, training_info
import numpy as np


def get_weights(y_full, blend_range_min, blend_range_max):
    """
    Compute the linear blending weights for a value inside a blend range.

    The upper weight is the relative position of ``y_full`` between
    ``blend_range_min`` (weight 0) and ``blend_range_max`` (weight 1); the
    lower weight is its complement.

    Returns:
        A ``(w_lower, w_upper)`` tuple whose elements sum to 1.
    """
    offset = y_full - blend_range_min
    span = blend_range_max - blend_range_min
    upper_weight = offset / span
    return 1 - upper_weight, upper_weight


def norm_data(x, oxide: str, model: str):
    """
    Scale ``x`` with the normalization recorded for ``oxide``/``model``.

    The normalization id is read from
    ``training_info[oxide][model]["normalization"]``:

    * ``1`` -> ``Norm1Scaler``; every row of the result must sum to 1.
    * ``3`` -> ``Norm3Scaler`` built from ``spectrometer_wavelength_ranges``;
      every row of the result must sum to 3 within an absolute tolerance
      of 0.1.

    The scaler is fitted on a deep copy of ``x``.

    Returns:
        The scaled data as returned by the scaler's ``fit_transform``.

    Raises:
        ValueError: if the normalization id is neither 1 nor 3.
        AssertionError: if the row sums of the scaled data are not as expected.
    """
    norm = training_info[oxide][model]["normalization"]

    if norm == 1:
        normalizer = Norm1Scaler()
        print(f"Using Norm1Scaler for {oxide} {model}")
        normalized = normalizer.fit_transform(x.copy(deep=True))
        assert np.isclose(normalized.sum(axis=1), 1).all()
        return normalized

    if norm == 3:
        normalizer = Norm3Scaler(spectrometer_wavelength_ranges)
        print(f"Using Norm3Scaler for {oxide} {model}")
        normalized = normalizer.fit_transform(x.copy(deep=True))
        assert np.isclose(normalized.sum(axis=1), 3, atol=1e-1).all(), f"Norm3: {normalized.sum(axis=1)}"
        return normalized

    raise ValueError(f"Normalization value {norm} not recognized.")


def predict_composition_with_blending(oxide: str, X1, X3, models, ranges):
    """
    Predict ``oxide`` for every row, routing each row through sub-models.

    For each row the "Full" model produces a first estimate ``y_full``:

    1. If ``y_full`` lies inside a non-blend range of ``ranges[oxide]``
       (any range other than "Low-Mid" / "Mid-High"), the model of the first
       such range makes the prediction.
    2. Otherwise, if ``y_full`` lies inside "Low-Mid" or "Mid-High", the two
       models named by the range are combined with the linear weights from
       ``get_weights``. When the lower model would be "Mid" but the oxide has
       no "Mid" model, "Low" is used instead.
    3. Otherwise a ``ValueError`` is raised.

    Each model is fed the row from ``X1`` when its recorded normalization in
    ``training_info`` is 1, and the row from ``X3`` otherwise.

    Args:
        oxide: key into ``training_info``, ``models`` and ``ranges``.
        X1: row-indexable (``.iloc``) data for normalization 1.
        X3: row-indexable (``.iloc``) data for the other normalization.
        models: ``models[oxide][name]`` -> object with ``predict``.
        ranges: ``ranges[oxide][name]`` -> ``(min, max)`` pair.

    Returns:
        A list with one prediction per row.

    Raises:
        AssertionError: if ``X1`` and ``X3`` differ in length, or a model
            needed for blending is missing.
        ValueError: if ``y_full`` falls outside every range.
    """
    assert len(X1) == len(X3), "X1 and X3 must be the same length"

    blend_ranges = ["Low-Mid", "Mid-High"]

    def row_for(model_name, idx):
        # Normalization 1 reads from X1; any other value reads from X3.
        if training_info[oxide][model_name]["normalization"] == 1:
            return X1.iloc[idx]
        return X3.iloc[idx]

    predictions = []

    for i in range(len(X1)):
        full_row = row_for("Full", i)
        y_full = models[oxide]["Full"].predict([full_row])[0]

        # First non-blend range that contains y_full, if any.
        single_range = next(
            (
                name
                for name, (low_bound, high_bound) in ranges[oxide].items()
                if low_bound <= y_full <= high_bound and name not in blend_ranges
            ),
            None,
        )

        if single_range is not None:
            range_row = row_for(single_range, i)
            predictions.append(models[oxide][single_range].predict([range_row])[0])
            print(f"y_full: {y_full}, range: {single_range}_{oxide}")
            continue

        # No single range matched: try the blend ranges in order.
        for blend_range in blend_ranges:
            if blend_range not in ranges[oxide]:
                continue

            blend_range_min, blend_range_max = ranges[oxide][blend_range]
            if not (blend_range_min <= y_full <= blend_range_max):
                continue

            w_lower, w_upper = get_weights(y_full, blend_range_min, blend_range_max)

            lower, upper = blend_range.split("-")

            # if the model has Mid-High but no mid, inference would fail otherwise  (K2O and Na2O)
            if lower not in models[oxide] and lower == "Mid":
                lower = "Low"

            assert lower in models[oxide] and upper in models[oxide], f"{lower} or {upper} not in models for {oxide}"

            lower_row = row_for(lower, i)
            upper_row = row_for(upper, i)

            y_lower = models[oxide][lower].predict([lower_row])[0]
            y_upper = models[oxide][upper].predict([upper_row])[0]

            predictions.append(w_lower * y_lower + w_upper * y_upper)
            break
        else:
            # Neither a single range nor a blend range contained y_full.
            raise ValueError(f"{i}: y_full value {y_full} for oxide {oxide} is outside defined blending ranges.")

    return predictions

In [None]:
from sklearn.metrics import mean_squared_error
from lib.reproduction import optimized_blending_ranges  # noqa: E402

# Feature frame without the oxide / sample-name columns (not normalized).
X = processed_data.drop(drop_cols, axis=1)

# Quick look at the first target value.
print(y[0:1])

# Predict every sample so that `pred` lines up one-to-one with `actual`.
# Scoring a single-row prediction against the full target column makes
# mean_squared_error fail on mismatched sample counts.
pred = predict_composition_with_blending(OXIDE, X_1, X_3, kv, optimized_blending_ranges)
actual = y

assert len(pred) == len(actual), "pred and actual must contain the same number of samples"

# RMSE of the blended sub-model predictions. sqrt(MSE) is used instead of
# `squared=False`, which newer scikit-learn releases deprecate and then remove.
sm_rmse = mean_squared_error(actual, pred) ** 0.5

In [None]:
# Baseline: the "Full" model on its own, without range-specific sub-models.
# Feed it the normalization it was trained with (X_1 for normalization 1,
# X_3 otherwise), mirroring predict_composition_with_blending, instead of
# always assuming X_3.
full_norm = training_info[OXIDE]['Full']['normalization']
# NOTE: the name `sio2_full` is historical; the predictions are for OXIDE.
sio2_full = kv[OXIDE]['Full'].predict(X_1 if full_norm == 1 else X_3)

# RMSE via sqrt(MSE); avoids the `squared=False` argument that newer
# scikit-learn releases deprecate and then remove.
f_rmse = mean_squared_error(actual, sio2_full) ** 0.5

In [None]:
# Display both RMSEs side by side: (Full model only, blended sub-models).
f_rmse, sm_rmse