In [1]:
from lib.data_handling import *
from lib.reproduction import major_oxides, masks
from lib.config import AppConfig
from lib.norms import Norm1Scaler, Norm3Scaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

import numpy as np
import pandas as pd
import logging
import datetime
import mlflow

In [2]:
class OutlierRemoverWithMAD(BaseEstimator, TransformerMixin):
    def __init__(self, k=3.0, max_iterations=10):
        self.k = k
        self.max_iterations = max_iterations

    def fit(self, X, y=None):
        # This transformer does not need to learn anything from the data,
        # so the fit method doesn't do anything besides returning self.
        return self

    def transform(self, X):
        # Ensure X is a DataFrame (if it's not already)
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # Apply the outlier removal logic
        non_outlier_indices, _ = self.identify_outliers_with_mad_iterative_multidim(
            X, self.k, self.max_iterations
        )

        # Initialize a boolean array for all indices, marking True for non-outliers
        outlier_mask = np.zeros(len(X), dtype=bool)
        outlier_mask[non_outlier_indices] = True   # True for non-outliers
        outlier_mask = ~outlier_mask              # Invert to mark outliers

        # Exclude wavelength column from being zeroed out
        columns_to_zero = X.columns[X.columns != "wave"]
        X.loc[outlier_mask, columns_to_zero] = 0

        return X


    def identify_outliers_with_mad_iterative_multidim(self, X, k=3.0, max_iterations=10):
        if not isinstance(X, pd.DataFrame):
            raise ValueError("X must be a pandas DataFrame.")

        # print larget x value
        n_samples, n_features = X.shape
        keep_mask = np.ones(n_samples, dtype=bool)

        for feature in range(n_features):
            data = X.iloc[:, feature].to_numpy()
            for iteration in range(max_iterations):
                if not np.any(keep_mask):
                    break

                median = np.median(data[keep_mask])
                absolute_deviation = np.abs(data[keep_mask] - median)
                mad = np.median(absolute_deviation)
                if mad == 0:
                    break

                modified_z_scores = 0.6745 * absolute_deviation / mad
                outliers = modified_z_scores > k

                keep_mask[keep_mask] = ~outliers

                if not np.any(outliers):
                    break

        return np.where(keep_mask)[0], iteration + 1

In [3]:

class SVRCustomSpectralPipeline(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        masks,
        composition_data_loc,
        major_oxides,
    ):
        self.pipeline = Pipeline(
            [
                ("mask_transformer", WavelengthMaskTransformer(masks)),
                ("non_negative_transformer", NonNegativeTransformer()),
                ("outlier_remover", OutlierRemoverWithMAD(k=3.0, max_iterations=10)),
                ("data_reshaper", SpectralDataReshaper(wavelength_feature_name="wave")),
            ]
        )

        self.composition_data = CompositionData(composition_data_loc)
        self.major_oxides = major_oxides

    def _attach_major_oxides(
        self,
        transformed_df: pd.DataFrame,
        sample_name: str,
        location_name: str,
    ):
        sample_composition = self.composition_data.get_composition_for_sample(sample_name)

        if sample_composition.empty:
            raise ValueError("sample_composition is empty, cannot attach major oxides")

        oxides = sample_composition[self.major_oxides].iloc[0]
        transformed_df = transformed_df.assign(**oxides)

        transformed_df["Sample Name"] = sample_name
        transformed_df["ID"] = f"{sample_name}_{location_name}"

        return transformed_df

    def fit_transform(self, sample_data: dict[str, Dict[str, pd.DataFrame]]):
        transformed_samples = []

        for sample_name, sample_location_dfs in tqdm(sample_data.items(), desc="Transforming samples"):
            for _, (location_name, sample_df) in enumerate(sample_location_dfs.items()):
                if self.composition_data.get_composition_for_sample(sample_name=sample_name).empty:
                    continue

                transformed_df = self.pipeline.fit_transform(sample_df)

                transformed_df = self._attach_major_oxides(pd.DataFrame(transformed_df), sample_name, location_name)
                transformed_samples.append(transformed_df)

        df_out = pd.concat(transformed_samples, ignore_index=True).rename(columns=str)

        return df_out

In [4]:
def load_data(load_cache_if_exits: bool = True, average_shots: bool = True):
    """
    Loads the data for the full flow.
    """
    logger = logging.getLogger("train")

    config = AppConfig()
    composition_data_loc = config.composition_data_path
    dataset_loc = config.data_path

    preformatted_data_path = Path(f"{config.data_cache_dir}/_preformatted_mad/")

    data_hash = config.data_hash
    train_path = preformatted_data_path / f"train_{data_hash}.csv"
    test_path = preformatted_data_path / f"test_{data_hash}.csv"

    if (
        load_cache_if_exits
        and preformatted_data_path.exists()
        and train_path.exists()
        and test_path.exists()
    ):
        logger.info(
            "Loading preformatted data from location: %s", preformatted_data_path
        )
        train_processed = pd.read_csv(train_path)
        test_processed = pd.read_csv(test_path)
    else:
        logger.info("Loading data from location: %s", dataset_loc)
        train_data, test_data = load_split_data(
            str(dataset_loc), average_shots=average_shots
        )
        logger.info("Data loaded successfully.")

        logger.info("Initializing CustomSpectralPipeline.")
        pipeline = SVRCustomSpectralPipeline(
            masks=masks,
            composition_data_loc=composition_data_loc,
            major_oxides=major_oxides,
        )
        logger.info("Pipeline initialized. Fitting and transforming data.")
        train_processed = pipeline.fit_transform(train_data)
        test_processed = pipeline.fit_transform(test_data)
        logger.info("Data processing complete.")

        preformatted_data_path.mkdir(parents=True, exist_ok=True)

        train_processed.to_csv(train_path, index=False)
        test_processed.to_csv(test_path, index=False)

    return train_processed, test_processed

def load_and_scale_data(norm: int):
    """
    Loads the data and scales it using the specified normalization method.
    """
    train_processed, test_processed = load_data()

    train_cols = train_processed.columns
    test_cols = test_processed.columns

    scaler = Norm1Scaler() if norm == 1 else Norm3Scaler()
    train = scaler.fit_transform(train_processed)
    test = scaler.fit_transform(test_processed)

    # turn back into dataframe
    train = pd.DataFrame(train, columns=train_cols)
    test = pd.DataFrame(test, columns=test_cols)

    return train, test

def load_train_test_data(norm: int, drop_cols: list = ["ID", "Sample Name"]):
    """
    Loads the train and test data and returns the X and y values.
    """
    train, test = load_and_scale_data(norm)

    # Converting train set
    X_train = train.drop(columns=drop_cols)
    y_train = train[major_oxides]

    # Converting test set
    X_test = test.drop(columns=drop_cols)
    y_test = test[major_oxides]

    return X_train, y_train, X_test, y_test

In [5]:
norm = 3
X_train, y_train, X_test, y_test = load_train_test_data(norm)

Loading data: 100%|██████████| 414/414 [00:23<00:00, 17.58it/s]
Transforming samples: 100%|██████████| 308/308 [00:14<00:00, 20.56it/s]
Transforming samples: 100%|██████████| 78/78 [00:04<00:00, 19.02it/s]


(1538, 6152) (1538, 8) (390, 6152) (390, 8)
       SiO2  TiO2  Al2O3  FeOT    MgO    CaO  Na2O   K2O
0     56.13  0.69  17.69  5.86   3.85   7.07  3.32  1.44
1     56.13  0.69  17.69  5.86   3.85   7.07  3.32  1.44
2     56.13  0.69  17.69  5.86   3.85   7.07  3.32  1.44
3     56.13  0.69  17.69  5.86   3.85   7.07  3.32  1.44
4     56.13  0.69  17.69  5.86   3.85   7.07  3.32  1.44
...     ...   ...    ...   ...    ...    ...   ...   ...
1533  49.31  0.02   0.80  4.89  25.96  15.58  0.19     0
1534  49.31  0.02   0.80  4.89  25.96  15.58  0.19     0
1535  49.31  0.02   0.80  4.89  25.96  15.58  0.19     0
1536  49.31  0.02   0.80  4.89  25.96  15.58  0.19     0
1537  49.31  0.02   0.80  4.89  25.96  15.58  0.19     0

[1538 rows x 8 columns]


In [6]:
mlflow.set_experiment(f'MAD_SVM_Norm{norm}_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}')

KeyboardInterrupt: 

In [None]:
models = []

kernel="poly"
C=100
eps=0.1
gamma="scale"
degree=2
coef0=1.0


for target in y_train.columns:
    print(target)
    with mlflow.start_run(run_name=f"MAD_SVM_{target}"):
        svm_reg = SVR(kernel=kernel, degree=degree, C=C, epsilon=eps, coef0=coef0, gamma=gamma)
        print(f"Training for {target}")
        print(f"X_train shape: {X_train.shape}, y_train shape: {y_train[target].shape}")
        svm_reg.fit(X_train, y_train[target])
        print(f"Predicting for {target}")
        y_pred = svm_reg.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        mlflow.log_metric("rmse", float(rmse))
        mlflow.log_param("target", target)
        mlflow.log_param("norm", norm)
        mlflow.log_param("kernel", kernel)
        mlflow.log_param("degree", degree)
        mlflow.log_param("coef0", coef0)
        mlflow.log_param("C", C)
        mlflow.log_param("epsilon", eps)
        mlflow.log_param("gamma", gamma)

        models.append(svm_reg)
        mlflow.sklearn.log_model(svm_reg, f"model_{target}")