In [1]:
from dotenv import dotenv_values
from lib.data_handling import load_split_data

env = dotenv_values()
comp_data_loc = env.get("COMPOSITION_DATA_PATH")
dataset_loc = env.get("DATA_PATH")
print(dataset_loc)

if not comp_data_loc:
    print("Please set COMPOSITION_DATA_PATH in .env file")
    exit(1)

if not dataset_loc:
    print("Please set DATA_PATH in .env file")
    exit(1)


_, ICA_test_data = load_split_data(
    dataset_loc=dataset_loc,
    split_loc="./train_test_split.csv",
    average_shots=False,
)

_, PLS_test_Data = load_split_data(
    dataset_loc=dataset_loc,
    split_loc="./train_test_split.csv",
    average_shots=True,
)

data/data/calib/calib_2015/1600mm/pls


Loading data: 100%|██████████| 414/414 [01:46<00:00,  3.89it/s]
Loading data: 100%|██████████| 414/414 [01:33<00:00,  4.41it/s]


In [2]:
from lib.data_handling import CompositionData


compositions = CompositionData(comp_data_loc)
compositions.composition_data.head()

Unnamed: 0,Target,Spectrum Name,Sample Name,SiO2,TiO2,Al2O3,FeOT,MnO,MgO,CaO,Na2O,K2O,MOC total,Used for 2015 calibration,Used for 2021 Mn calibration,Used for 2022 Li calibration
0,AGV2,AGV2,AGV2,59.3,1.05,16.91,6.02,0.099,1.79,5.2,4.19,2.88,97.44,1.0,1.0,1.0
1,BCR-2,BCR2,BCR2,54.1,2.26,13.5,12.42,0.2,3.59,7.12,3.16,1.79,98.14,1.0,1.0,1.0
2,BEN,BEN,BEN,38.2,2.61,10.07,11.61,0.2,13.15,13.87,3.18,1.39,94.28,1.0,1.0,1.0
3,BHVO2,BHVO2,BHVO2,49.9,2.73,13.5,11.07,0.167,7.23,11.4,2.22,0.52,98.74,1.0,1.0,1.0
4,BIR-1a,BIR1,BIR1,47.7,0.97,15.4,10.19,0.176,9.7,13.4,1.81,0.03,99.38,1.0,1.0,1.0


# ICA Predictions
- Preprocessing
    - Take first location of each shot
    - Wavelength mask transform
    - Use Norm1 and Norm3 to normalize the data
    - Transpose
- Run ICA
- Postprocess
- Add to aggregate DataFrame
- Fetch Linear Regression models from mlflow
- Run Linear Regression models
- Get RMSE

In [3]:
from lib.norms import Norm

ica_training_info = {
    "SiO2": {"law": "Log-square", "norm": Norm.NORM_1},
    "TiO2": {"law": "Geometric", "norm": Norm.NORM_3},
    "Al2O3": {"law": "Geometric", "norm": Norm.NORM_3},
    "FeOT": {"law": "Geometric", "norm": Norm.NORM_1},
    "MgO": {"law": "Exponential", "norm": Norm.NORM_1},
    "CaO": {"law": "Parabolic", "norm": Norm.NORM_1},
    "Na2O": {"law": "Parabolic", "norm": Norm.NORM_3},
    "K2O": {"law": "Geometric", "norm": Norm.NORM_3},
}

In [78]:
import pandas as pd
import numpy as np
from lib.data_handling import WavelengthMaskTransformer
from lib.norms import Norm1Scaler, Norm3Scaler
from lib.reproduction import masks, spectrometer_wavelength_ranges

def preprocess(df: pd.DataFrame, norm: Norm = Norm.NORM_1):
    # Apply masking
    wmt = WavelengthMaskTransformer(masks)
    df = wmt.fit_transform(df)

    # set the wave column as the index
    df.set_index("wave", inplace=True)

    # Normalize the data
    scaler = (
        Norm1Scaler()
        if norm.value == 1
        else Norm3Scaler(spectrometer_wavelength_ranges, reshaped=True)
    )
    df = pd.DataFrame(scaler.fit_transform(df))

    return df.transpose()

def postprocess(
    df: pd.DataFrame, ica_estimated_sources: np.ndarray,
    num_components: int, sample_name: str, composition_df: pd.DataFrame
    ):
    columns = df.columns

    corrcols = [f"IC{i+1}" for i in range(num_components)]
    df_ics = pd.DataFrame(
        ica_estimated_sources,
        index=[f"shot{i+6}" for i in range(45)],
        columns=corrcols,
    )

    df = pd.concat([df, df_ics], axis=1)

    # Correlate the loadings
    corrdf, ids = __correlate_loadings__(df, corrcols, columns)

    # Create the wavelengths matrix for each component
    ic_wavelengths = pd.DataFrame(index=[sample_name], columns=columns)

    for i in range(len(ids)):
        ic = ids[i].split(" ")[0]
        component_idx = int(ic[2]) - 1
        wavelength = corrdf.index[i]
        corr = corrdf.iloc[i].iloc[component_idx]

        ic_wavelengths.loc[sample_name, wavelength] = corr

    # Filter the composition data to only include the oxides and their compositions
    composition_df = composition_df.iloc[:, 3:12]
    composition_df.index = [sample_name]

    return composition_df, ic_wavelengths

# This is a function that finds the correlation between loadings and a set of columns
# The idea is to somewhat automate identifying which element the loading corresponds to.
def __correlate_loadings__(
    df: pd.DataFrame, corrcols: list, icacols: list
) -> (pd.DataFrame, list):
    corrdf = (
        df.corr().drop(labels=icacols, axis=1).drop(labels=corrcols, axis=0)
    )
    ids = []

    for ic_label in icacols:
        tmp = corrdf.loc[ic_label]
        match = tmp.values == np.max(tmp)
        col = corrcols[np.where(match)[0][-1]]

        ids.append(col + " (r=" + str(np.max(tmp)) + ")")

    return corrdf, ids


In [79]:
from ica.ica import run_ica


ica_df_norm1 = pd.DataFrame()
compositions_df_norm1 = pd.DataFrame()

ica_df_norm3 = pd.DataFrame()
compositions_df_norm3 = pd.DataFrame()

for sample_name, location_data_dfs in ICA_test_data.items():
    # Preprocess
    df = location_data_dfs[0]
    
    df_n1 = preprocess(df.copy(deep=True), Norm.NORM_1)
    df_n3 = preprocess(df.copy(deep=True), Norm.NORM_3)
    
    # Run ICA
    ica_estimated_sources_n1 = run_ica(
        df_n1,
        model="jade",
        num_components=8,
    )

    ica_estimated_sources_n3 = run_ica(
        df_n3,
        model="jade",
        num_components=8,
    )
    
    # Postprocess
    n1_composition_df, n1_ic_wavelengths = postprocess(
        df_n1,
        ica_estimated_sources_n1,
        8,
        sample_name,
        compositions.get_composition_for_sample(sample_name).copy(deep=True),
    )
    
    n3_composition_df, n3_ic_wavelengths = postprocess(
        df_n3,
        ica_estimated_sources_n3,
        8,
        sample_name,
        compositions.get_composition_for_sample(sample_name).copy(deep=True),
    )
    
    # Append to the dataframes
    ica_df_norm1 = pd.concat([ica_df_norm1, n1_ic_wavelengths])
    compositions_df_norm1 = pd.concat([compositions_df_norm1, n1_composition_df])
    
    ica_df_norm3 = pd.concat([ica_df_norm3, n3_ic_wavelengths])
    compositions_df_norm3 = pd.concat([compositions_df_norm3, n3_composition_df])

ica_df_norm1.index.name = "target"
ica_df_norm1.columns.name = "wavelengths"
compositions_df_norm1.index.name = "target"
compositions_df_norm1.columns.name = "oxide"

ica_df_norm3.index.name = "target"
ica_df_norm3.columns.name = "wavelengths"
compositions_df_norm3.index.name = "target"
compositions_df_norm3.columns.name = "oxide"

ica_df_norm1.head()

wavelengths,246.68800,246.74100,246.79401,246.84700,246.89999,246.95300,247.00700,247.06000,247.11301,247.16600,...,847.10272,847.30212,847.50153,847.70081,847.90009,848.09943,848.29871,848.49799,848.69720,848.89642
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
r63,0.982586,0.973195,0.958026,0.887797,0.833213,0.890333,0.935169,0.964031,0.970329,0.964332,...,0.612952,0.359783,0.112243,0.23747,0.278105,0.288769,0.332783,0.323123,0.144831,0.065522
jsc1453,0.981191,0.97039,0.981476,0.821567,0.807284,0.923864,0.967362,0.939479,0.981792,0.975443,...,0.410556,0.431336,0.448321,0.464452,0.480765,0.490075,0.490376,0.490453,0.49938,0.517439
mix6o,0.987368,0.988709,0.991112,0.955499,0.878214,0.939299,0.980353,0.988655,0.986261,0.987776,...,0.590164,0.581288,0.572445,0.602448,0.620751,0.622775,0.613344,0.599891,0.581984,0.564231
mix1b,0.877023,0.839778,0.72133,0.653273,0.529306,0.675429,0.793508,0.854571,0.86568,0.823834,...,0.302307,0.292676,0.289561,0.288234,0.286779,0.285346,0.283205,0.280898,0.279653,0.28101
ja3,0.493222,0.527023,0.495026,0.369879,0.380202,0.50354,0.530778,0.478032,0.544762,0.484922,...,0.152876,0.227074,0.201066,0.126643,0.120278,0.125942,0.256865,0.385181,0.315068,0.15559


In [4]:
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")

experiment_name = "ICA Evaluation"
mlflow.set_experiment(experiment_name)
mlflow.autolog()


* 'schema_extra' has been renamed to 'json_schema_extra'
2024/01/11 10:36:28 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [5]:
from lib.reproduction import major_oxides

oxide_models = {}

experiment_id = '549682258983743113'
experiment = mlflow.get_experiment(experiment_id)
runs = mlflow.search_runs(experiment_ids=[experiment_id])

for _, run in runs.iterrows():
    run_id = run['run_id']
    oxide_value = run['params.oxide']  # Assuming 'oxide' is stored as a parameter

    # Fetch the model artifact if it's a scikit-learn model
    client = mlflow.tracking.MlflowClient()
    artifacts = client.list_artifacts(run_id)
    for artifact in artifacts:
        if 'model' in artifact.path.lower():
            model_uri = f"runs:/{run_id}/{artifact.path}"
            model = mlflow.sklearn.load_model(model_uri)
            oxide_models[oxide_value] = model

oxide_models.keys()

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

dict_keys(['K2O', 'Na2O', 'CaO', 'MgO', 'FeOT', 'Al2O3', 'TiO2', 'SiO2'])

In [82]:
ica_df_norm1_2 = ica_df_norm1.apply(pd.to_numeric, errors='coerce')
ica_df_norm3_2 = ica_df_norm3.apply(pd.to_numeric, errors='coerce')
compositions_df_norm1_2 = compositions_df_norm1.apply(pd.to_numeric, errors='coerce')
compositions_df_norm3_2 = compositions_df_norm3.apply(pd.to_numeric, errors='coerce')

In [31]:
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

ica_oxide_predictions = {}
rmses = {}

ica_df_norm1 = pd.read_csv("./data/data/jade/ica/norm1-test/ica_data.csv")
compositions_df_norm1 = pd.read_csv("./data/data/jade/ica/norm1-test/composition_data.csv")

ica_df_norm3 = pd.read_csv("./data/data/jade/ica/norm3-test/ica_data.csv")
compositions_df_norm3 = pd.read_csv("./data/data/jade/ica/norm3-test/composition_data.csv")

In [33]:
na1 = ica_df_norm1[ica_df_norm1.isna().any(axis=1)]
na3 = ica_df_norm3[ica_df_norm3.isna().any(axis=1)]

na1.count().sum(), na3.count().sum()

(0, 0)

In [37]:
ca1 = compositions_df_norm1[compositions_df_norm1.isna().any(axis=1)]
ca3 = compositions_df_norm3[compositions_df_norm3.isna().any(axis=1)]

ca1.count().sum(), ca3.count().sum()

(0, 0)

In [35]:
targets_1 = ica_df_norm1["target"]
ica_df_norm1.drop(columns=["target"], inplace=True)

targets_3 = ica_df_norm3["target"]
ica_df_norm3.drop(columns=["target"], inplace=True)

In [16]:
import mlflow

In [40]:
for oxide, info in ica_training_info.items():
    print(f"Predicting {oxide}")
    model_name = info["law"]
    norm = info["norm"]
    
    print(f"Model: {model_name} | Norm: {norm.value}")
    
    with mlflow.start_run(run_name=f"ICA_EVAL_{oxide}"):
        X_test = ( ica_df_norm1 if norm == Norm.NORM_1 else ica_df_norm3 )
        y_test = (compositions_df_norm1[oxide] if norm == Norm.NORM_1 else compositions_df_norm3[oxide])

        assert X_test[X_test.isna().any(axis=1)].count().sum() == 0, "NaNs in X_test"

        negative_value_indices = np.where(X_test < 0)
        negative_value_locations = list(zip(negative_value_indices[0], negative_value_indices[1]))
        print (f"Negative values in X_test: {len(negative_value_locations)}")

        if model_name == "Log-square":
            X_test = np.log(X_test**2)
        elif model_name == "Exponential":
            X_test = np.log(X_test)
        elif model_name == "Geometric":
            X_test = np.sqrt(X_test)
        elif model_name == "Parabolic":
            X_test = X_test**2

        if X_test.isna().any().any():
            print("NaNs introduced after transformation")
            mlflow.end_run()
            break
        
        model = oxide_models[oxide]
        y_pred = model.predict(X_test)
        
        mlflow.log_param("oxide", oxide)
        mlflow.log_param("norm", norm.value)
        mlflow.log_param("model", model_name)
        
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mlflow.log_metric("rmse", float(rmse))
        
        ica_oxide_predictions[oxide] = y_pred
        rmses[oxide] = rmse

print(ica_oxide_predictions.keys())
rmses

Predicting SiO2
Model: Log-square | Norm: 1
Negative values in X_test: 9580
Predicting TiO2
Model: Geometric | Norm: 3
Negative values in X_test: 9580
NaNs introduced after transformation
dict_keys(['SiO2'])


  result = func(self.values, **kwargs)


{'SiO2': 10.678834376114056}