In [1]:
from dotenv import dotenv_values

env = dotenv_values()
comp_data_loc = env.get("COMPOSITION_DATA_PATH")
dataset_loc = env.get("DATA_PATH")
print(dataset_loc)

data/data/calib/calib_2015/1600mm/pls


# ICA Predictions
- Preprocessing
    - Take first location of each shot
    - Wavelength mask transform
    - Use Norm1 and Norm3 to normalize the data
    - Transpose
- Run ICA
- Postprocess
- Add to aggregate DataFrame
- Fetch Linear Regression models from mlflow
- Run Linear Regression models
- Get RMSE

In [2]:
from lib.norms import Norm

ica_training_info = {
    "SiO2": {"law": "Log-square", "norm": Norm.NORM_1},
    "TiO2": {"law": "Geometric", "norm": Norm.NORM_3},
    "Al2O3": {"law": "Geometric", "norm": Norm.NORM_3},
    "FeOT": {"law": "Geometric", "norm": Norm.NORM_1},
    "MgO": {"law": "Exponential", "norm": Norm.NORM_1},
    "CaO": {"law": "Parabolic", "norm": Norm.NORM_1},
    "Na2O": {"law": "Parabolic", "norm": Norm.NORM_3},
    "K2O": {"law": "Geometric", "norm": Norm.NORM_3},
}

In [3]:
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")

experiment_name = "ICA Evaluation"
mlflow.set_experiment(experiment_name)
mlflow.autolog()


* 'schema_extra' has been renamed to 'json_schema_extra'
2024/01/14 08:41:52 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [4]:
from lib.reproduction import major_oxides

oxide_models = {}

experiment_id = '549682258983743113'
experiment = mlflow.get_experiment(experiment_id)
runs = mlflow.search_runs(experiment_ids=[experiment_id])

for _, run in runs.iterrows():
    run_id = run['run_id']
    oxide_value = run['params.oxide']  # Assuming 'oxide' is stored as a parameter

    # Fetch the model artifact if it's a scikit-learn model
    client = mlflow.tracking.MlflowClient()
    artifacts = client.list_artifacts(run_id)
    for artifact in artifacts:
        if 'model' in artifact.path.lower():
            model_uri = f"runs:/{run_id}/{artifact.path}"
            model = mlflow.sklearn.load_model(model_uri)
            oxide_models[oxide_value] = model

oxide_models.keys()

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

dict_keys(['K2O', 'Na2O', 'CaO', 'MgO', 'FeOT', 'Al2O3', 'TiO2', 'SiO2'])

In [5]:
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

ica_oxide_predictions = {}
rmses = {}

ica_df_norm1 = pd.read_csv("./data/data/jade/ica/norm1-test/ica_data.csv")
compositions_df_norm1 = pd.read_csv("./data/data/jade/ica/norm1-test/composition_data.csv")

ica_df_norm3 = pd.read_csv("./data/data/jade/ica/norm3-test/ica_data.csv")
compositions_df_norm3 = pd.read_csv("./data/data/jade/ica/norm3-test/composition_data.csv")

In [6]:
na1 = ica_df_norm1[ica_df_norm1.isna().any(axis=1)]
na3 = ica_df_norm3[ica_df_norm3.isna().any(axis=1)]

na1.count().sum(), na3.count().sum()

(0, 0)

In [7]:
ca1 = compositions_df_norm1[compositions_df_norm1.isna().any(axis=1)]
ca3 = compositions_df_norm3[compositions_df_norm3.isna().any(axis=1)]

ca1.count().sum(), ca3.count().sum()

(0, 0)

In [8]:
targets_1 = ica_df_norm1["target"]
ica_df_norm1.drop(columns=["target"], inplace=True)

targets_3 = ica_df_norm3["target"]
ica_df_norm3.drop(columns=["target"], inplace=True)

In [9]:
import mlflow

In [10]:
for oxide, info in ica_training_info.items():
    print(f"Predicting {oxide}")
    model_name = info["law"]
    norm = info["norm"]
    
    print(f"Model: {model_name} | Norm: {norm.value}")
    
    with mlflow.start_run(run_name=f"ICA_EVAL_{oxide}"):
        X_test = ( ica_df_norm1 if norm == Norm.NORM_1 else ica_df_norm3 )
        y_test = (compositions_df_norm1[oxide] if norm == Norm.NORM_1 else compositions_df_norm3[oxide])

        assert X_test[X_test.isna().any(axis=1)].count().sum() == 0, "NaNs in X_test"

        negative_value_indices = np.where(X_test < 0)
        negative_value_locations = list(zip(negative_value_indices[0], negative_value_indices[1]))
        print (f"Negative values in X_test: {len(negative_value_locations)}")

        if model_name == "Log-square":
            X_test = np.log(X_test**2)
        elif model_name == "Exponential":
            X_test = np.log(X_test)
        elif model_name == "Geometric":
            X_test = np.sqrt(X_test)
        elif model_name == "Parabolic":
            X_test = X_test**2

        if X_test.isna().any().any():
            print("NaNs introduced after transformation")
            mlflow.end_run()
            break
        
        model = oxide_models[oxide]
        y_pred = model.predict(X_test)
        
        mlflow.log_param("oxide", oxide)
        mlflow.log_param("norm", norm.value)
        mlflow.log_param("model", model_name)
        
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mlflow.log_metric("rmse", float(rmse))
        
        ica_oxide_predictions[oxide] = y_pred
        rmses[oxide] = rmse

print(ica_oxide_predictions.keys())
rmses

Predicting SiO2
Model: Log-square | Norm: 1
Negative values in X_test: 9580
Predicting TiO2
Model: Geometric | Norm: 3
Negative values in X_test: 9580
NaNs introduced after transformation
dict_keys(['SiO2'])


  result = func(self.values, **kwargs)


{'SiO2': 10.678834376114056}