In [3]:
import mlflow
from lib.reproduction import training_info
from pathlib import Path
import pickle

mlflow.set_tracking_uri("http://localhost:5000")


* 'schema_extra' has been renamed to 'json_schema_extra'


In [4]:
client = mlflow.MlflowClient()

kv = {}
# for oxide, v in training_info.items():
#     sub = {}
#     for comp_range_name, vv in v.items():
#         model_name = f"{oxide}_{comp_range_name}_outlier_removal"
#         models = client.search_model_versions(f"name = '{model_name}'")
#         model = mlflow.pyfunc.load_model(models[0].source)
#         sub[comp_range_name] = model
        
#     kv[oxide] = sub


models_path = Path("../models/PLS_Models_12-11-23_174732/")
for oxide, v in training_info.items():
    sub = {}
    for comp_range_name, vv in v.items():
        model_name = f"{oxide}_{comp_range_name}/model.pkl"
        model = pickle.load(open(models_path / model_name, "rb"))
        sub[comp_range_name] = model

    kv[oxide] = sub


kv

{'SiO2': {'Full': PLSRegression(n_components=6),
  'Low': PLSRegression(n_components=9),
  'Mid': PLSRegression(n_components=6),
  'High': PLSRegression(n_components=5)},
 'TiO2': {'Full': PLSRegression(n_components=5),
  'Low': PLSRegression(n_components=7),
  'Mid': PLSRegression(n_components=5),
  'High': PLSRegression(n_components=3)},
 'Al2O3': {'Full': PLSRegression(n_components=6),
  'Low': PLSRegression(n_components=6),
  'Mid': PLSRegression(n_components=8),
  'High': PLSRegression(n_components=6)},
 'FeOT': {'Full': PLSRegression(n_components=8),
  'Low': PLSRegression(n_components=3),
  'Mid': PLSRegression(n_components=8),
  'High': PLSRegression(n_components=3)},
 'MgO': {'Full': PLSRegression(n_components=7),
  'Low': PLSRegression(n_components=6),
  'Mid': PLSRegression(n_components=9),
  'High': PLSRegression(n_components=8)},
 'CaO': {'Full': PLSRegression(n_components=8),
  'Low': PLSRegression(n_components=9),
  'Mid': PLSRegression(n_components=9),
  'High': PLSRegr

In [5]:
import dotenv
from lib.data_handling import load_data
env = dotenv.dotenv_values(dotenv.find_dotenv())

dataset_loc = env.get('DATA_PATH') or ""
print(dataset_loc)

smol_data = load_data(
    "../data/data/calib/calib_2015/1600mm/pls/",
    10
)

data/data/calib/calib_2015/1600mm/pls


Loading data:   0%|          | 0/10 [00:00<?, ?it/s]

Loading data: 100%|██████████| 10/10 [00:02<00:00,  4.72it/s]


In [6]:
from lib.data_handling import CustomSpectralPipeline
from lib.reproduction import masks, major_oxides

compositon_data_path = env.get('COMPOSITION_DATA_PATH') or ""
print(compositon_data_path)

pipeline = CustomSpectralPipeline(
    masks,
    "../data/data/calib/ccam_calibration_compositions.csv",
    major_oxides
)

processed_data = pipeline.fit_transform(smol_data)
processed_data.head()

data/data/calib/ccam_calibration_compositions.csv


Transforming samples: 100%|██████████| 10/10 [00:00<00:00, 15.99it/s]


Unnamed: 0,246.688,246.741,246.79401,246.847,246.89999,246.953,247.007,247.06,247.11301,247.166,...,848.89642,SiO2,TiO2,Al2O3,FeOT,MgO,CaO,Na2O,K2O,Sample Name
0,399858900000.0,362310200000.0,312583400000.0,254814300000.0,238410900000.0,253572700000.0,286705500000.0,318151900000.0,322830800000.0,310822000000.0,...,22935410000.0,51.03,1,13.83,9.93,6.33,10.7,2.01,0.6,jsc1399
1,310111500000.0,276084000000.0,229415300000.0,178971400000.0,160254600000.0,173736500000.0,203343300000.0,230790300000.0,238090000000.0,228648400000.0,...,22246580000.0,51.03,1,13.83,9.93,6.33,10.7,2.01,0.6,jsc1399
2,478439500000.0,436485900000.0,378726100000.0,300735100000.0,286992900000.0,306050200000.0,343736600000.0,386255000000.0,389610300000.0,375093400000.0,...,22395920000.0,51.03,1,13.83,9.93,6.33,10.7,2.01,0.6,jsc1399
3,540226600000.0,497416500000.0,426643200000.0,335467000000.0,323240200000.0,343319800000.0,384717500000.0,429803100000.0,431385900000.0,417308900000.0,...,1634240000.0,51.03,1,13.83,9.93,6.33,10.7,2.01,0.6,jsc1399
4,492592700000.0,445791700000.0,386244100000.0,306331800000.0,298727300000.0,319164800000.0,356717200000.0,390994000000.0,397830100000.0,383662200000.0,...,22341640000.0,51.03,1,13.83,9.93,6.33,10.7,2.01,0.6,jsc1399


In [7]:
from lib.norms import Norm1Scaler, Norm3Scaler
from lib.reproduction import spectrometer_wavelength_ranges
import lib.norms
import importlib
importlib.reload(lib.norms)

drop_cols = major_oxides + ['Sample Name']

scaler = Norm1Scaler(reshaped=True)
X_1 = scaler.fit_transform(processed_data.drop(drop_cols, axis=1))
X_3 = Norm3Scaler(spectrometer_wavelength_ranges, reshaped=True).fit_transform(processed_data.drop(drop_cols, axis=1))

y = processed_data['SiO2']

In [51]:
from lib.norms import Norm1Scaler, Norm3Scaler
from lib.reproduction import spectrometer_wavelength_ranges, training_info
import numpy as np


def get_weights(y_full, blend_range_min, blend_range_max):
    """
    Helper function to calculate weights for blending predictions.
    """
    w_upper = (y_full - blend_range_min) / (blend_range_max - blend_range_min)
    w_lower = 1 - w_upper
    return w_lower, w_upper


def norm_data(x, oxide: str, model: str):
    """
    Normalizes the data for the given oxide and model.
    """
    norm = training_info[oxide][model]["normalization"]

    if norm == 1:
        scaler = Norm1Scaler(reshaped=True)
        print(f"Using Norm1Scaler for {oxide} {model}")
        scaled_df = scaler.fit_transform(x.copy(deep=True))
        assert np.isclose(scaled_df.sum(axis=1), 1).all()
        return scaled_df
    elif norm == 3:
        scaler = Norm3Scaler(spectrometer_wavelength_ranges, reshaped=True)
        print(f"Using Norm3Scaler for {oxide} {model}")
        
        scaled_df = scaler.fit_transform(x.copy(deep=True))
        assert np.isclose(scaled_df.sum(axis=1), 3, atol=1e-1).all(), f"Norm3: {scaled_df.sum(axis=1)}"
        return scaled_df
    else:
        raise ValueError(f"Normalization value {norm} not recognized.")


def predict_composition_with_blending(oxide: str, X1, X3, models, ranges):
    """
    Predicts the composition of the given oxide based on the
    full model prediction (y_full) and the optimized blending ranges,
    including blending between Mid-High models as well as Low-Mid models.
    """
    assert len(X1) == len(X3), "X1 and X3 must be the same length"
    
    # X_full_norm = norm_data(X, oxide, "Full")
    full_norm = training_info[oxide]["Full"]["normalization"]
    X_full_norm = X1 if full_norm == 1 else X3
    y_full = models[oxide]["Full"].predict(X_full_norm)

    print(f"y_full: {y_full}")
    print(X_full_norm.sum(axis=1))

    # Check for non-blending range predictions first
    blend_ranges = ["Low-Mid", "Mid-High"]

    for range_name, (range_min, range_max) in ranges[oxide].items():
        if range_min <= y_full <= range_max and range_name not in blend_ranges:
            # X_range_norm = norm_data(X, oxide, range_name)
            range_norm = training_info[oxide][range_name]["normalization"]
            X_range_norm = X1 if range_norm == 1 else X3
            return models[oxide][range_name].predict(X_range_norm)

    for blend_range in blend_ranges:
        # Check if blend_range is defined for the given oxide
        if blend_range in ranges[oxide]:
            blend_range_min, blend_range_max = ranges[oxide][blend_range]
            print(
                f"Range: {blend_range}, min: {blend_range_min}, max: {blend_range_max}"
            )

            # Check if y_full is within the defined blending range
            if blend_range_min <= y_full <= blend_range_max:
                w_lower, w_upper = get_weights(y_full, blend_range_min, blend_range_max)

                lower, upper = blend_range.split("-")
                X_lower_norm = X1 if training_info[oxide][lower]["normalization"] == 1 else X3
                X_upper_norm = X1 if training_info[oxide][upper]["normalization"] == 1 else X3
                # X_lower_norm = norm_data(X, oxide, lower)
                # X_upper_norm = norm_data(X, oxide, upper)

                y_lower = models[oxide][lower].predict(X_lower_norm)
                y_upper = models[oxide][upper].predict(X_upper_norm)

                y_final = w_lower * y_lower + w_upper * y_upper
                print(f"y_lower: {y_lower}, y_upper: {y_upper}, y_final: {y_final}")
                print(f"w_lower: {w_lower}, w_upper: {w_upper}")

                return y_final

    # Error if y_full is outside any defined range
    raise ValueError(
        f"y_full value {y_full} for oxide {oxide} is outside defined blending ranges."
    )


In [66]:
from sklearn.metrics import mean_squared_error
from lib.reproduction import optimized_blending_ranges  # noqa: E402

# full_norm = training_info['SiO2']['Full']['normalization']

X = processed_data.drop(drop_cols, axis=1)

# y_full = kv['SiO2']['Full'].predict(X_1[:1] if full_norm == 1 else X_3[:1])
# print(y_full)
# kv['SiO2']['Low'].predict(X[:1])
pred = predict_composition_with_blending('SiO2', X_1, X_3, kv, optimized_blending_ranges)
actual = y[1:2]
print(actual)

mean_squared_error(actual, pred, squared=False)

y_full: [54.13805783 51.50492972 51.45122949 52.63165814 53.91332172 51.44537396
 50.76262913 51.45923849 48.75820086 49.05743391 39.48781027 37.6435152
 36.59012852 40.32425388 37.93645726 56.2516696  59.98056639 64.52630012
 61.3595477  58.1947017  54.58858826 56.01936076 56.78367471 57.09494591
 52.02235479 70.1872745  72.79338652 66.51134967 63.37180116 65.58874789
 63.94620115 64.25018123 66.59577167 63.46601577 63.93002768 55.75983017
 56.12985591 56.03496974 56.24362615 56.31263857 59.02870599 59.79357037
 59.68093867 60.17422006 58.74761535 76.57397477 78.47147015 78.96569598
 79.69986502 77.25377282]
0     1.0
1     1.0
2     1.0
3     1.0
4     1.0
5     1.0
6     1.0
7     1.0
8     1.0
9     1.0
10    1.0
11    1.0
12    1.0
13    1.0
14    1.0
15    1.0
16    1.0
17    1.0
18    1.0
19    1.0
20    1.0
21    1.0
22    1.0
23    1.0
24    1.0
25    1.0
26    1.0
27    1.0
28    1.0
29    1.0
30    1.0
31    1.0
32    1.0
33    1.0
34    1.0
35    1.0
36    1.0
37    1.0
38 



ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()