In [1]:
import logging
from pathlib import Path
from typing import Dict

import mlflow
import numpy as np
import pandas as pd
from dotenv import dotenv_values
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# from config import logger
from lib.data_handling import CustomSpectralPipeline, load_split_data  # type: ignore
from lib.norms import Norm1Scaler, Norm3Scaler
from lib.outlier_removal import (
    calculate_leverage_residuals,
    identify_outliers,
    plot_leverage_residuals,
)
from lib.reproduction import (
    major_oxides,
    masks,
    oxide_ranges,
    paper_individual_sm_rmses,
    spectrometer_wavelength_ranges,
    training_info,
    optimized_blending_ranges,
)
from lib.utils import (
    custom_kfold_cross_validation,
    filter_data_by_compositional_range,
)
from PLS_SM.inference import predict_composition_with_blending


* 'schema_extra' has been renamed to 'json_schema_extra'


In [None]:
# Read dataset locations from the project's .env file.
env = dotenv_values()
comp_data_loc = env.get("COMPOSITION_DATA_PATH")
dataset_loc = env.get("DATA_PATH")

# Fail fast on missing/empty configuration. Without this check a missing
# DATA_PATH would be converted to the literal string "None" by str() below
# and only fail later with a misleading path error.
missing_keys = [
    key for key in ("COMPOSITION_DATA_PATH", "DATA_PATH") if not env.get(key)
]
if missing_keys:
    raise ValueError(
        "Missing required .env entries: " + ", ".join(missing_keys)
    )

# Not used in this cell; presumably a sub-sampling knob consumed further
# down the notebook -- TODO confirm.
take_samples = None

# Load the train/test partitions defined by the split file.
train_data, test_data = load_split_data(
    str(dataset_loc), split_loc="./train_test_split.csv", average_shots=True
)

# Same preprocessing pipeline instance is applied to both partitions.
pipeline = CustomSpectralPipeline(
    masks=masks,
    composition_data_loc=comp_data_loc,
    major_oxides=major_oxides,
)
train_processed = pipeline.fit_transform(train_data)
# NOTE(review): the test split also goes through fit_transform. If the
# pipeline learns any state during fitting, this re-fits on test data --
# confirm whether a transform-only call exists and should be used here.
test_processed = pipeline.fit_transform(test_data)