In [1]:
import logging
from pathlib import Path
from typing import Dict

# import warnings filter
from warnings import simplefilter

import mlflow
import numpy as np
import pandas as pd
from dotenv import dotenv_values
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# from config import logger
from lib.data_handling import CustomSpectralPipeline, load_split_data, load_data  # type: ignore
from lib.outlier_removal import (
    calculate_leverage_residuals,
    identify_outliers,
    plot_leverage_residuals,
)
from lib.reproduction import (
    major_oxides,
    masks,
    optimized_blending_ranges,
    oxide_ranges,
    paper_individual_sm_rmses,
    spectrometer_wavelength_ranges,
    training_info,
)
from lib.utils import custom_kfold_cross_validation, filter_data_by_compositional_range
from PLS_SM.inference import predict_composition_with_blending

# ignore all future warnings
simplefilter(action="ignore", category=FutureWarning)

env = dotenv_values()
comp_data_loc = env.get("COMPOSITION_DATA_PATH")
dataset_loc = env.get("DATA_PATH")

if not comp_data_loc:
    print("Please set COMPOSITION_DATA_PATH in .env file")
    exit(1)

if not dataset_loc:
    print("Please set DATA_PATH in .env file")
    exit(1)

logger = logging.getLogger("train")

mlflow.set_tracking_uri("http://localhost:5000")

preformatted_data_path = Path("./data/_preformatted_sm/")
train_path = preformatted_data_path / "train.csv"
test_path = preformatted_data_path / "test.csv"

train_n1_path = preformatted_data_path / "train_n1.csv"
train_n3_path = preformatted_data_path / "train_n3.csv"
test_n1_path = preformatted_data_path / "test_n1.csv"
test_n3_path = preformatted_data_path / "test_n3.csv"

if (
    not preformatted_data_path.exists()
    or not train_path.exists()
    or not test_path.exists()
):
    take_samples = None

    logger.info("Loading data from location: %s", dataset_loc)
    data = load_data(str(dataset_loc), 10)
    # train_data, test_data = load_split_data(
    #     str(dataset_loc), split_loc="./train_test_split.csv", average_shots=True
    # )
    train_data, test_data = data, data.copy()
    logger.info("Data loaded successfully.")

    logger.info("Initializing CustomSpectralPipeline.")
    pipeline = CustomSpectralPipeline(
        masks=masks,
        composition_data_loc=comp_data_loc,
        major_oxides=major_oxides,
    )
    logger.info("Pipeline initialized. Fitting and transforming data.")
    train_processed = pipeline.fit_transform(train_data)
    test_processed = pipeline.fit_transform(test_data)
    logger.info("Data processing complete.")

    preformatted_data_path.mkdir(parents=True, exist_ok=True)

    train_processed.to_csv(train_path, index=False)
    test_processed.to_csv(test_path, index=False)
else:
    logger.info("Loading preformatted data from location: %s", preformatted_data_path)
    train_processed = pd.read_csv(train_path)
    test_processed = pd.read_csv(test_path)

Loading data: 100%|██████████| 10/10 [00:02<00:00,  4.97it/s]
Transforming samples: 100%|██████████| 10/10 [00:00<00:00, 13.78it/s]
Transforming samples: 100%|██████████| 10/10 [00:00<00:00, 15.24it/s]


In [302]:
from sklearn.base import BaseEstimator, TransformerMixin
import enum
from typing import Dict, Tuple

class Norm3Scaler(BaseEstimator, TransformerMixin):
    def __init__(
        self, wavelength_ranges: Dict[str, Tuple[float, float]], reshaped=False
    ):
        self.scaler = (
            Norm3ScalerReshapedData(wavelength_ranges)
            if reshaped
            else Norm3ScalerOriginalData(wavelength_ranges)
        )

    def fit(self, df):
        return self.scaler.fit(df)

    def transform(self, df):
        return self.scaler.transform(df)


class Norm3ScalerOriginalData(BaseEstimator, TransformerMixin):
    def __init__(self, wavelength_ranges: Dict[str, Tuple[float, float]]):
        self.wavelength_ranges = wavelength_ranges
        self.totals = None

    def fit(self, df):
        """
        Compute the total intensity for each spectrometer range.
        """
        self.totals = {}
        shot_columns = df.columns[df.columns.str.startswith("shot")]
        for key, (start, end) in self.wavelength_ranges.items():
            mask = (df["wave"] >= start) & (df["wave"] <= end)
            self.totals[key] = df.loc[mask, shot_columns].sum().sum()
        return self

    def transform(self, df):
        """
        Apply norm3 normalization to the DataFrame.
        """
        if self.totals is None:
            raise ValueError("The fit method must be called before transform.")

        shot_columns = df.columns[df.columns.str.startswith("shot")]
        for key, (start, end) in self.wavelength_ranges.items():
            mask = (df["wave"] >= start) & (df["wave"] <= end)
            df.loc[mask, shot_columns] = df.loc[mask, shot_columns].div(
                self.totals[key], axis=1
            )
        return df


class Norm3ScalerReshapedData(BaseEstimator, TransformerMixin):
    """
    This class is used to normalize the data in the same way as the
    Norm3Scaler class, but it is used for the reshaped data. This is
    necessary because the reshaped data has a different format than
    the original data.

    The reshaped data has the following format:
    - Each row represents a single shot
    - Each column represents a single wavelength
    - The column names are the wavelengths
    """

    def __init__(self, wavelength_ranges: Dict[str, Tuple[float, float]]):
        self.wavelength_ranges = wavelength_ranges
        self.totals = None
        self.out_of_range_columns = None

    def fit(self, df):
        """
        Compute the total intensity for each spectrometer range.
        """
        self.totals = {}

        # Convert column names to floats. If conversion fails, assign NaN
        float_cols = pd.to_numeric(df.columns, errors='coerce')

        # Initialize an empty set to keep track of all selected columns
        selected_columns_set = set()

        for key, (start, end) in self.wavelength_ranges.items():
            # Use boolean indexing to select columns in the specified range
            selected_columns = df.columns[(float_cols >= start) & (float_cols <= end)]
            selected_columns_set.update(selected_columns)

            # Compute the sum of intensities in these columns
            self.totals[key] = df[selected_columns].sum().sum()

        # Identify columns that are not in any range
        all_columns_set = set(df.columns)
        self.out_of_range_columns = all_columns_set - selected_columns_set

        # only keep out of range columns that are floats
        self.out_of_range_columns = pd.to_numeric(list(self.out_of_range_columns), errors="coerce")
        # remove nans
        self.out_of_range_columns = self.out_of_range_columns[~np.isnan(self.out_of_range_columns)]
        # convert back to string
        self.out_of_range_columns = self.out_of_range_columns.astype(str)

        # Handle or report out-of-range columns
        # if self.out_of_range_columns:
        #     print("Warning: There are columns outside of the specified ranges:", self.out_of_range_columns)
        # else:
        #     print("All columns fall within the specified ranges.")
        
        assert len(self.totals) == 3, "Expected 3 spectrometer ranges"
        print(self.totals)
        sum_of_totals = sum(self.totals.values())
        print(sum_of_totals)
        return self

    def transform(self, df):
        """
        Apply norm3 normalization to the DataFrame.
        """
        if self.totals is None:
            raise ValueError("The fit method must be called before transform.")

        for key, (start, end) in self.wavelength_ranges.items():
            # Select columns in the specified range and ignore non-float columns
            selected_columns = []
            for col in df.columns:
                try:
                    if start <= float(col) <= end:
                        selected_columns.append(col)
                except ValueError:
                    # Ignore columns that cannot be converted to float
                    continue

            # Normalize intensities in these columns
            df[selected_columns] = df[selected_columns].div(self.totals[key], axis=0)

        # drop columns that are not in any range
        df.drop(columns=self.out_of_range_columns, inplace=True)
        print(self.out_of_range_columns)
        print(df.head())
        return df


drop_cols = major_oxides + ["Sample Name", "ID"]

In [303]:
train_processed.drop(columns=drop_cols).T.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1528,1529,1530,1531,1532,1533,1534,1535,1536,1537
count,5485.0,5485.0,5485.0,5485.0,5485.0,5485.0,5485.0,5485.0,5485.0,5485.0,...,5485.0,5485.0,5485.0,5485.0,5485.0,5485.0,5485.0,5485.0,5485.0,5485.0
mean,91277520000.0,91844840000.0,101847100000.0,104285900000.0,103772600000.0,82290760000.0,84370580000.0,80619010000.0,79861650000.0,84488460000.0,...,72386010000.0,84968030000.0,79351800000.0,74604560000.0,73490040000.0,111477400000.0,119116500000.0,120181200000.0,117146500000.0,107845100000.0
std,224676100000.0,229504700000.0,256740100000.0,249957900000.0,253480700000.0,191430200000.0,205488600000.0,195422900000.0,192445700000.0,205145700000.0,...,156950000000.0,187041700000.0,170207300000.0,161900900000.0,161647200000.0,253393700000.0,245284400000.0,264735200000.0,253777400000.0,241301300000.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7512402000.0,7645420000.0,8049826000.0,8411631000.0,8322361000.0,8571419000.0,8231139000.0,7810937000.0,7749104000.0,8166487000.0,...,8322698000.0,9212789000.0,8629386000.0,8210420000.0,8484050000.0,11251100000.0,11936160000.0,11004460000.0,11089860000.0,10910290000.0
50%,28409750000.0,29281020000.0,31391230000.0,33182970000.0,32606930000.0,29883500000.0,28664480000.0,27611840000.0,26618560000.0,28956920000.0,...,27561520000.0,30935310000.0,29478620000.0,27460750000.0,28120130000.0,38147360000.0,40728480000.0,37410680000.0,37999300000.0,35877060000.0
75%,85653430000.0,86693890000.0,93546470000.0,97938590000.0,98698730000.0,80802990000.0,80447000000.0,78202020000.0,78096040000.0,81286860000.0,...,73919030000.0,82366540000.0,78878920000.0,74654250000.0,74411980000.0,109849000000.0,113445900000.0,110004700000.0,109361500000.0,106624800000.0
max,4265688000000.0,4592867000000.0,5009349000000.0,4616491000000.0,4809581000000.0,2832522000000.0,3141137000000.0,2822526000000.0,2953743000000.0,2980790000000.0,...,2513769000000.0,2527600000000.0,2305864000000.0,2346277000000.0,2505947000000.0,4375193000000.0,3105788000000.0,3561074000000.0,3592492000000.0,4133483000000.0


In [304]:
train_processed.drop(columns=drop_cols).T.sum().sum()

9.881857093203355e+17

In [305]:
train_cols = train_processed.columns

In [306]:
scaler = Norm3Scaler(wavelength_ranges=spectrometer_wavelength_ranges, reshaped=True)
train_processed = scaler.fit_transform(train_processed)

{'UV': 6.176395627352914e+17, 'VIO': 2.6972361404241165e+17, 'VNIR': 5.452579200577558e+16}
9.418889687834787e+17
['329.151' '337.39499' '328.245' '327.88199' '337.616' '330.23499'
 '332.84399' '326.97299' '337.08401' '330.05399' '331.40601' '329.513'
 '337.43799' '331.22601' '333.92001' '335.39301' '327.836' '333.15799'
 '337.79401' '330.55099' '494.00601' '330.14401' '336.14999' '331.901'
 '332.93399' '332.79901' '336.46201' '337.883' '335.70499' '327.47299'
 '334.54599' '336.81699' '326.88199' '335.66' '326.746' '329.96399'
 '493.32919' '336.90601' '328.97' '338.01599' '331.18201' '336.10599'
 '328.01801' '338.32599' '326.92801' '336.68399' '329.603' '330.95599'
 '333.60599' '335.21399' '332.26099' '335.74899' '330.45999' '332.48499'
 '333.06799' '328.742' '494.68271' '329.28601' '338.147' '335.03699'
 '329.91901' '327.064' '335.30499' '329.466' '328.56201' '332.17099'
 '326.06299' '332.53' '327.927' '330.009' '334.94699' '334.858' '334.41'
 '327.01901' '336.37299' '331.94601' '336.

In [307]:
out_of_range_cols = scaler.scaler.out_of_range_columns # type: ignore

# remove out of range columns from train_cols and test_cols
train_cols = [col for col in train_cols if col not in out_of_range_cols] # type: ignore

In [308]:
train_processed = pd.DataFrame(train_processed, columns=train_cols)

In [309]:
train_processed.head()

Unnamed: 0,246.688,246.741,246.79401,246.847,246.89999,246.953,247.007,247.06,247.11301,247.166,...,SiO2,TiO2,Al2O3,FeOT,MgO,CaO,Na2O,K2O,Sample Name,ID
0,2.940677e-07,2.628665e-07,2.218354e-07,1.800553e-07,1.617129e-07,1.721856e-07,1.985312e-07,2.240468e-07,2.363474e-07,2.364594e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161336_ccs
1,3.245969e-07,2.903669e-07,2.464502e-07,2.033082e-07,1.835096e-07,1.933978e-07,2.212464e-07,2.489649e-07,2.61788e-07,2.600798e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161134_ccs
2,3.407285e-07,3.059225e-07,2.603263e-07,2.124742e-07,1.869284e-07,1.941817e-07,2.236271e-07,2.5417e-07,2.677841e-07,2.649219e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_162544_ccs
3,4.0889e-07,3.678309e-07,3.126661e-07,2.532956e-07,2.223067e-07,2.316652e-07,2.680076e-07,3.05278e-07,3.215675e-07,3.166276e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161514_ccs
4,3.518082e-07,3.182807e-07,2.736818e-07,2.256416e-07,2.033465e-07,2.133621e-07,2.437775e-07,2.755768e-07,2.89062e-07,2.842021e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_160941_ccs


In [310]:
totals = {}

# Convert column names to floats. If conversion fails, assign NaN
float_cols = pd.to_numeric(train_processed.columns, errors='coerce')

# Initialize an empty set to keep track of all selected columns
selected_columns_set = set()

for key, (start, end) in spectrometer_wavelength_ranges.items():
    # Use boolean indexing to select columns in the specified range
    selected_columns = train_processed.columns[(float_cols >= start) & (float_cols <= end)]
    selected_columns_set.update(selected_columns)

    # Compute the sum of intensities in these columns
    totals[key] = train_processed[selected_columns].sum().sum()

totals

{'UV': 1.0, 'VIO': 1.0, 'VNIR': 0.9999999999999999}

In [311]:
scaler.scaler.out_of_range_columns

array(['329.151', '337.39499', '328.245', '327.88199', '337.616',
       '330.23499', '332.84399', '326.97299', '337.08401', '330.05399',
       '331.40601', '329.513', '337.43799', '331.22601', '333.92001',
       '335.39301', '327.836', '333.15799', '337.79401', '330.55099',
       '494.00601', '330.14401', '336.14999', '331.901', '332.93399',
       '332.79901', '336.46201', '337.883', '335.70499', '327.47299',
       '334.54599', '336.81699', '326.88199', '335.66', '326.746',
       '329.96399', '493.32919', '336.90601', '328.97', '338.01599',
       '331.18201', '336.10599', '328.01801', '338.32599', '326.92801',
       '336.68399', '329.603', '330.95599', '333.60599', '335.21399',
       '332.26099', '335.74899', '330.45999', '332.48499', '333.06799',
       '328.742', '494.68271', '329.28601', '338.147', '335.03699',
       '329.91901', '327.064', '335.30499', '329.466', '328.56201',
       '332.17099', '326.06299', '332.53', '327.927', '330.009',
       '334.94699', '334.858', 

In [312]:
train_processed.head()

Unnamed: 0,246.688,246.741,246.79401,246.847,246.89999,246.953,247.007,247.06,247.11301,247.166,...,SiO2,TiO2,Al2O3,FeOT,MgO,CaO,Na2O,K2O,Sample Name,ID
0,2.940677e-07,2.628665e-07,2.218354e-07,1.800553e-07,1.617129e-07,1.721856e-07,1.985312e-07,2.240468e-07,2.363474e-07,2.364594e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161336_ccs
1,3.245969e-07,2.903669e-07,2.464502e-07,2.033082e-07,1.835096e-07,1.933978e-07,2.212464e-07,2.489649e-07,2.61788e-07,2.600798e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161134_ccs
2,3.407285e-07,3.059225e-07,2.603263e-07,2.124742e-07,1.869284e-07,1.941817e-07,2.236271e-07,2.5417e-07,2.677841e-07,2.649219e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_162544_ccs
3,4.0889e-07,3.678309e-07,3.126661e-07,2.532956e-07,2.223067e-07,2.316652e-07,2.680076e-07,3.05278e-07,3.215675e-07,3.166276e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161514_ccs
4,3.518082e-07,3.182807e-07,2.736818e-07,2.256416e-07,2.033465e-07,2.133621e-07,2.437775e-07,2.755768e-07,2.89062e-07,2.842021e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_160941_ccs


In [313]:
train_processed_T = train_processed.drop(columns=drop_cols).T

In [314]:
train_processed.head()

Unnamed: 0,246.688,246.741,246.79401,246.847,246.89999,246.953,247.007,247.06,247.11301,247.166,...,SiO2,TiO2,Al2O3,FeOT,MgO,CaO,Na2O,K2O,Sample Name,ID
0,2.940677e-07,2.628665e-07,2.218354e-07,1.800553e-07,1.617129e-07,1.721856e-07,1.985312e-07,2.240468e-07,2.363474e-07,2.364594e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161336_ccs
1,3.245969e-07,2.903669e-07,2.464502e-07,2.033082e-07,1.835096e-07,1.933978e-07,2.212464e-07,2.489649e-07,2.61788e-07,2.600798e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161134_ccs
2,3.407285e-07,3.059225e-07,2.603263e-07,2.124742e-07,1.869284e-07,1.941817e-07,2.236271e-07,2.5417e-07,2.677841e-07,2.649219e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_162544_ccs
3,4.0889e-07,3.678309e-07,3.126661e-07,2.532956e-07,2.223067e-07,2.316652e-07,2.680076e-07,3.05278e-07,3.215675e-07,3.166276e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161514_ccs
4,3.518082e-07,3.182807e-07,2.736818e-07,2.256416e-07,2.033465e-07,2.133621e-07,2.437775e-07,2.755768e-07,2.89062e-07,2.842021e-07,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_160941_ccs


In [315]:
train_processed.describe()

Unnamed: 0,246.688,246.741,246.79401,246.847,246.89999,246.953,247.007,247.06,247.11301,247.166,...,848.6972,848.89642,SiO2,TiO2,Al2O3,FeOT,MgO,CaO,Na2O,K2O
count,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0,...,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0
mean,5.96918e-07,5.478494e-07,4.72691e-07,3.784585e-07,3.505087e-07,3.728394e-07,4.212672e-07,4.718655e-07,4.829326e-07,4.7205e-07,...,4.475571e-07,4.664242e-07,55.87685,0.912611,14.877605,7.354719,4.27296,5.257851,2.247211,2.313615
std,5.029685e-07,4.647466e-07,3.897875e-07,2.884878e-07,2.676583e-07,2.900287e-07,3.372568e-07,3.866733e-07,3.825322e-07,3.616607e-07,...,4.583912e-07,4.766075e-07,14.669544,0.817497,6.471634,5.925113,5.729803,7.313715,2.129128,1.946189
min,3.255042e-08,3.371595e-08,3.454026e-08,2.811763e-08,2.046611e-08,1.836804e-08,2.418026e-08,3.305267e-08,3.804021e-08,3.972834e-08,...,0.0,0.0,0.22,0.0,0.01,0.06,0.01,0.0,0.0,0.0
25%,2.74169e-07,2.495727e-07,2.085089e-07,1.744901e-07,1.574792e-07,1.684305e-07,1.933283e-07,2.204106e-07,2.329351e-07,2.321426e-07,...,9.597016e-09,1.828662e-08,49.31,0.44,12.82,3.955,1.6925,0.37,0.625,0.6
50%,4.596886e-07,4.167135e-07,3.576766e-07,2.975416e-07,2.715845e-07,2.854463e-07,3.247427e-07,3.619567e-07,3.765334e-07,3.719208e-07,...,3.873529e-07,4.048342e-07,57.05,0.72,15.84,6.07,2.87,1.64,2.035,2.0
75%,8.173827e-07,7.495711e-07,6.584779e-07,5.453936e-07,5.103813e-07,5.346986e-07,5.959522e-07,6.508928e-07,6.645227e-07,6.516213e-07,...,7.926182e-07,8.119093e-07,64.8,1.06,17.915,9.61,4.66,8.38,3.47,3.685
max,4.281211e-06,3.886751e-06,3.185317e-06,2.220394e-06,1.995496e-06,2.311295e-06,2.87958e-06,3.377259e-06,3.358326e-06,3.096733e-06,...,2.600732e-06,2.788083e-06,97.71,5.81,38.79,50.071536,56.14,37.22,25.96,12.05


In [316]:
train_processed.sum().sum()

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('float64'), dtype('<U8318')) -> None

In [317]:
train_processed = pd.read_csv(train_path)

In [318]:
train_processed.head()

Unnamed: 0,246.688,246.741,246.79401,246.847,246.89999,246.953,247.007,247.06,247.11301,247.166,...,SiO2,TiO2,Al2O3,FeOT,MgO,CaO,Na2O,K2O,Sample Name,ID
0,181627800000.0,162356700000.0,137014300000.0,111209300000.0,99880300000.0,106348600000.0,122620700000.0,138380200000.0,145977500000.0,146046700000.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161336_ccs
1,200483900000.0,179342100000.0,152217400000.0,125571200000.0,113342800000.0,119450100000.0,136650600000.0,153770600000.0,161690600000.0,160635600000.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161134_ccs
2,210447400000.0,188949800000.0,160787800000.0,131232400000.0,115454300000.0,119934300000.0,138120900000.0,156985500000.0,165394000000.0,163626200000.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_162544_ccs
3,252546700000.0,227186900000.0,193114900000.0,156445400000.0,137305400000.0,143085600000.0,165532100000.0,188551800000.0,198612800000.0,195561700000.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161514_ccs
4,217290700000.0,196582800000.0,169036700000.0,139365200000.0,125594800000.0,131780900000.0,150566600000.0,170207100000.0,178536100000.0,175534400000.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_160941_ccs


In [320]:
(start, end) = spectrometer_wavelength_ranges["UV"]

float_cols = pd.to_numeric(train_processed.columns, errors="coerce")
selected_columns = train_processed.columns[(float_cols >= start) & (float_cols <= end)]

In [321]:
selected_columns

Index(['246.688', '246.741', '246.79401', '246.847', '246.89999', '246.953',
       '247.007', '247.06', '247.11301', '247.166',
       ...
       '325.517', '325.561', '325.60699', '325.65302', '325.698', '325.74399',
       '325.789', '325.83499', '325.88199', '325.92599'],
      dtype='object', length=1606)

In [326]:
# Compute the sum of intensities in these columns
total = train_processed[selected_columns].sum()
total

246.688      5.670301e+14
246.741      5.204184e+14
246.79401    4.490232e+14
246.847      3.595090e+14
246.89999    3.329586e+14
                 ...     
325.74399    1.546129e+14
325.789      1.546162e+14
325.83499    1.582299e+14
325.88199    1.834164e+14
325.92599    1.856122e+14
Length: 1606, dtype: float64

In [329]:
train_processed[selected_columns].div(total, axis=1).sum().sum()

1606.0