In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from lib import full_flow_dataloader
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import VarianceThreshold
from lib.config import AppConfig
from lib.data_handling import load_split_data, CustomSpectralPipeline, WavelengthMaskTransformer, NonNegativeTransformer, SpectralDataReshaper
from lib.reproduction import major_oxides, masks

from lib.norms import Norm1Scaler, Norm3Scaler


In [3]:
class VarianceThresholdTransformer(BaseEstimator, TransformerMixin):
    """Zero out the rows of a 2-D array whose variance is below a threshold.

    Unlike ``sklearn.feature_selection.VarianceThreshold`` (which removes
    low-variance *columns*), this transformer works row-wise: every row whose
    variance across its columns is strictly below ``threshold`` is replaced
    by zeros. The output keeps the input shape.

    Parameters:
        threshold: Rows with variance strictly below this value are zeroed.
        verbose: When True, print the diagnostic information (variances,
            mask, number of zeroed rows). Off by default so that pipelines
            do not dump full arrays to the notebook output.

    Attributes:
        variances_: Row variances of the array passed to ``fit``. Kept for
            inspection only; ``transform`` does not rely on it.
    """

    def __init__(self, threshold=0.0, verbose=False):
        self.threshold = threshold
        self.verbose = verbose

    @staticmethod
    def _as_array(X):
        """Return the underlying numpy array for DataFrame input, else X itself."""
        if isinstance(X, pd.DataFrame):
            return X.values
        return X

    def fit(self, X, y=None):
        """Record the row-wise variances of ``X`` and return ``self``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        """
        self.variances_ = np.var(self._as_array(X), axis=1)
        if self.verbose:
            print("Calculated variances:", self.variances_)
        return self

    def transform(self, X):
        """Return a copy of ``X`` (as a numpy array) with low-variance rows zeroed.

        The row variances are computed from ``X`` itself. Using the variances
        stored by ``fit`` would index the rows of ``X`` with a mask built from
        a different array, which fails as soon as the row counts differ
        (for example when transforming a test split after fitting on train).
        """
        values = self._as_array(X)
        row_variances = np.var(values, axis=1)
        rows_to_zero_out = row_variances < self.threshold

        # Work on a copy so the caller's data is never modified in place.
        X_transformed = np.copy(values)
        X_transformed[rows_to_zero_out, :] = 0

        if self.verbose:
            print("Variances on transform:", row_variances)
            print("Threshold:", self.threshold)
            print("Rows to zero out (mask):", rows_to_zero_out)
            print(f"Number of rows zeroed out: {np.sum(rows_to_zero_out)}")

        return X_transformed

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the transformed copy of ``X``."""
        return self.fit(X, y).transform(X)


In [92]:
# Resolve the data locations from the project configuration object.
config = AppConfig()

composition_data_loc = config.composition_data_path
dataset_loc = config.data_path

# Load the train/test split. A guard that skipped the reload when the data was
# already in memory used to sit here and is currently disabled, so this cell
# reloads on every run.
# NOTE(review): average_shots=True presumably averages the individual shots of
# each sample - confirm against lib.data_handling.load_split_data.
train_data, test_data = load_split_data(
	str(dataset_loc), average_shots=True
)

Loading data: 100%|██████████| 414/414 [00:22<00:00, 18.22it/s]


In [93]:
# Build the project preprocessing pipeline from the masks and oxide list
# imported from lib.reproduction.
pipeline = CustomSpectralPipeline(
	masks=masks,
	composition_data_loc=composition_data_loc,
	major_oxides=major_oxides,
)

# Disabled experiments: plugging VarianceThresholdTransformer into the pipeline.
# pipeline.pipeline.steps.insert(-2, ('variance_threshold', VarianceThresholdTransformer(threshold=0)))
# pipeline.pipeline.steps.append(('variance_threshold', VarianceThresholdTransformer(threshold=0.9)))

# NOTE(review): fit_transform is called on BOTH splits. If any pipeline step
# learns parameters from its input, the test split is being fitted on its own
# statistics. CustomSpectralPipeline is not visible from this notebook -
# confirm whether a transform-only entry point exists and use it for test_data.
train_processed = pipeline.fit_transform(train_data)
test_processed = pipeline.fit_transform(test_data)


Transforming samples:   0%|          | 0/308 [00:00<?, ?it/s]

Transforming samples: 100%|██████████| 308/308 [00:13<00:00, 23.41it/s]
Transforming samples: 100%|██████████| 78/78 [00:03<00:00, 23.34it/s]


In [106]:
# Fit the Norm3 scaler on the training frame and apply the fitted scaler to the
# test frame.
# NOTE(review): both frames are reassigned to their scaled versions, so running
# this cell twice feeds already-scaled data back into the scaler. Its execution
# count (106) is also out of order with the surrounding cells (93, 94) - verify
# the notebook still reproduces under Restart & Run All.
scaler = Norm3Scaler()
train_processed = scaler.fit_transform(train_processed)
test_processed = scaler.transform(test_processed)

In [94]:
# Display columns 200-299 (by position) of the processed training frame.
# NOTE(review): the execution counts show this cell ran (94) before the
# Norm3Scaler cell above it (106), so the stored output is the frame as it was
# before that scaling was applied.
train_processed.iloc[:, 200:300]

Unnamed: 0,251.45399,251.506,251.558,251.612,251.664,251.716,251.77,251.823,251.875,251.92799,...,256.181,256.233,256.285,256.33801,256.39001,256.44101,256.495,256.547,256.599,256.64999
0,9.864223e+11,1.283051e+12,1.446894e+12,1.662103e+12,1.923722e+12,1.866667e+12,1.274945e+12,8.138546e+11,7.603231e+11,8.702122e+11,...,2.055250e+11,3.500348e+11,5.545744e+11,7.313358e+11,6.843560e+11,4.679429e+11,2.780677e+11,1.653261e+11,1.627443e+11,1.725966e+11
1,9.340033e+11,1.195670e+12,1.331225e+12,1.533589e+12,1.764239e+12,1.713824e+12,1.185696e+12,7.686207e+11,7.199318e+11,8.200800e+11,...,2.113141e+11,3.569021e+11,5.563103e+11,7.183242e+11,6.631527e+11,4.534475e+11,2.681449e+11,1.552175e+11,1.572471e+11,1.672647e+11
2,1.068447e+12,1.387753e+12,1.546457e+12,1.788038e+12,2.082432e+12,2.002366e+12,1.356910e+12,8.704779e+11,8.152674e+11,9.348677e+11,...,2.105661e+11,3.587259e+11,5.745650e+11,7.609672e+11,7.220627e+11,4.937088e+11,2.912892e+11,1.749739e+11,1.726100e+11,1.808504e+11
3,1.121378e+12,1.457785e+12,1.633324e+12,1.876318e+12,2.187772e+12,2.122826e+12,1.425465e+12,9.047725e+11,8.460525e+11,9.701102e+11,...,2.212724e+11,3.727815e+11,5.998696e+11,7.948871e+11,7.468635e+11,5.111497e+11,2.994250e+11,1.767438e+11,1.780473e+11,1.851622e+11
4,9.975943e+11,1.292387e+12,1.447840e+12,1.650085e+12,1.902704e+12,1.826214e+12,1.269562e+12,8.640144e+11,8.108526e+11,9.042491e+11,...,2.682707e+11,4.533201e+11,7.376671e+11,9.783386e+11,9.242172e+11,6.347349e+11,3.763932e+11,2.325430e+11,2.257815e+11,2.409430e+11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1533,9.506193e+11,1.180871e+12,1.317148e+12,1.475714e+12,1.731064e+12,1.769919e+12,1.234682e+12,7.495081e+11,7.214264e+11,8.224720e+11,...,9.382661e+10,1.498440e+11,2.205922e+11,3.083025e+11,2.724146e+11,1.758873e+11,1.006016e+11,4.073272e+10,2.713110e+10,3.412601e+10
1534,9.937543e+11,1.250863e+12,1.398204e+12,1.570468e+12,1.851645e+12,1.896586e+12,1.299680e+12,7.553773e+11,7.266579e+11,8.416468e+11,...,7.418376e+10,1.104506e+11,1.593049e+11,2.209218e+11,1.892229e+11,1.252323e+11,7.313297e+10,3.285690e+10,2.243204e+10,2.559275e+10
1535,1.081016e+12,1.333999e+12,1.485179e+12,1.665291e+12,1.930948e+12,1.971524e+12,1.397003e+12,8.825402e+11,8.427643e+11,9.501161e+11,...,1.375247e+11,2.278977e+11,3.318931e+11,4.519318e+11,4.081070e+11,2.647552e+11,1.573848e+11,6.502384e+10,5.709795e+10,6.754603e+10
1536,9.476923e+11,1.185034e+12,1.320686e+12,1.482550e+12,1.734794e+12,1.780499e+12,1.223831e+12,7.286955e+11,6.994471e+11,8.082712e+11,...,7.494205e+10,1.159219e+11,1.665239e+11,2.326714e+11,1.976693e+11,1.302759e+11,7.667984e+10,3.708962e+10,2.319821e+10,2.541582e+10


In [95]:
def drop_non_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without the oxide and identifier columns.

    The columns removed are every entry of the module-level ``major_oxides``
    list plus ``"Sample Name"`` and ``"ID"``. ``df`` itself is left untouched.
    A ``KeyError`` is raised by pandas if any of those columns is missing.
    """
    identifier_columns = ["Sample Name", "ID"]
    unwanted = major_oxides + identifier_columns
    return df.drop(labels=unwanted, axis="columns")

In [119]:
def generate_exponential_quantiles(min_quantile, max_quantile, num_points):
    """Return ``num_points`` values spaced evenly on a logarithmic scale.

    The values run from ``min_quantile`` to ``max_quantile`` (both included),
    with a constant ratio between consecutive entries.

    Parameters:
        min_quantile: First value of the sequence; must be strictly positive.
        max_quantile: Last value of the sequence; must be strictly positive.
        num_points: Number of values to generate.

    Returns:
        numpy.ndarray holding the ``num_points`` log-spaced values.

    Raises:
        ValueError: If either bound is not strictly positive. The logarithm
            is undefined there and the result would silently be ``nan``/``inf``.
    """
    bounds_are_positive = np.all(np.greater(min_quantile, 0)) and np.all(
        np.greater(max_quantile, 0)
    )
    if not bounds_are_positive:
        raise ValueError(
            "min_quantile and max_quantile must be strictly positive, got "
            f"{min_quantile!r} and {max_quantile!r}"
        )

    # Evenly spaced points in log space map to geometrically spaced values.
    exponent_space = np.linspace(np.log(min_quantile), np.log(max_quantile), num_points)
    return np.exp(exponent_space)

# Build the log-spaced grid of quantile levels used by the following cells.
min_q = 1e-9  # starting quantile
max_q = 1e-5   # ending quantile
points = 100   # number of points

quantile_values = generate_exponential_quantiles(min_q, max_q, points)
# Printing the full 100-element array; the stored output below is truncated.
print(quantile_values)

[1.00000000e-09 1.09749877e-09 1.20450354e-09 1.32194115e-09
 1.45082878e-09 1.59228279e-09 1.74752840e-09 1.91791026e-09
 2.10490414e-09 2.31012970e-09 2.53536449e-09 2.78255940e-09
 3.05385551e-09 3.35160265e-09 3.67837977e-09 4.03701726e-09
 4.43062146e-09 4.86260158e-09 5.33669923e-09 5.85702082e-09
 6.42807312e-09 7.05480231e-09 7.74263683e-09 8.49753436e-09
 9.32603347e-09 1.02353102e-08 1.12332403e-08 1.23284674e-08
 1.35304777e-08 1.48496826e-08 1.62975083e-08 1.78864953e-08
 1.96304065e-08 2.15443469e-08 2.36448941e-08 2.59502421e-08
 2.84803587e-08 3.12571585e-08 3.43046929e-08 3.76493581e-08
 4.13201240e-08 4.53487851e-08 4.97702356e-08 5.46227722e-08
 5.99484250e-08 6.57933225e-08 7.22080902e-08 7.92482898e-08
 8.69749003e-08 9.54548457e-08 1.04761575e-07 1.14975700e-07
 1.26185688e-07 1.38488637e-07 1.51991108e-07 1.66810054e-07
 1.83073828e-07 2.00923300e-07 2.20513074e-07 2.42012826e-07
 2.65608778e-07 2.91505306e-07 3.19926714e-07 3.51119173e-07
 3.85352859e-07 4.229242

In [120]:
# Compute, at every level in quantile_values, the quantile of all test-set
# values pooled into one flat series. These values are used as the
# VarianceThreshold thresholds in the sweep cell that follows.
# NOTE(review): the thresholds are quantiles of raw values taken from the TEST
# split, yet they are later applied as variance thresholds fitted on the
# training split - confirm this is the intended statistic and data source.
# NOTE(review): the stored output shows 0.0 at both the lowest and the highest
# level, so every threshold in the sweep is 0.0.

# Global pandas display options: they stay in effect for all later cells.
pd.set_option('display.precision', 10)
pd.set_option('display.float_format', '{:.10f}'.format)

# Keep only the columns that remain after removing oxides and identifiers.
train = drop_non_numeric_columns(train_processed)
test = drop_non_numeric_columns(test_processed)

# Columns whose sum over the training rows is exactly zero are removed from
# the test frame before pooling.
zero_sum_columns = train.columns[train.sum() == 0]
test_no_zero_columns = test.drop(columns=zero_sum_columns)

# Flatten the remaining test values into one 1-D series and take the quantiles.
flattened_series = test_no_zero_columns.values.ravel()
quantiles = pd.Series(flattened_series).quantile(quantile_values)
quantiles

0.0000000010   0.0000000000
0.0000000011   0.0000000000
0.0000000012   0.0000000000
0.0000000013   0.0000000000
0.0000000015   0.0000000000
                   ...     
0.0000068926   0.0000000000
0.0000075646   0.0000000000
0.0000083022   0.0000000000
0.0000091116   0.0000000000
0.0000100000   0.0000000000
Length: 100, dtype: float64

In [121]:
# NOTE: these imports would normally live in the notebook's import cell; they
# stay here so that this cell keeps running on its own.
from warnings import simplefilter

import matplotlib.pyplot as plt

# The zero columns are now attached with a single concat, which no longer
# fragments the frame. The filter is kept so later cells see the same warning
# configuration as before.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)


def _restore_zero_columns(values, kept_columns, zero_columns):
    """Wrap selector output in a DataFrame and append the all-zero columns.

    The zero columns are appended after the kept columns, so the column order
    differs from the original frame.
    """
    selected = pd.DataFrame(values, columns=kept_columns)
    zeros = pd.DataFrame(0, index=selected.index, columns=zero_columns)
    return pd.concat([selected, zeros], axis=1)


num_dropped_colums_per_iteration = []

# Loop-invariant work, hoisted out of the sweep: columns whose training sum is
# zero are removed before the selector sees the data.
zero_sum_columns = train.columns[train.sum(axis=0) == 0]
train_reduced = train.drop(columns=zero_sum_columns)
test_reduced = test.drop(columns=zero_sum_columns)

for threshold in quantiles.values:
    # Fit the selector on the training data only, then apply it to the test data.
    selector = VarianceThreshold(threshold=threshold)
    train_transformed = selector.fit_transform(train_reduced)
    test_transformed = selector.transform(test_reduced)

    # Boolean mask over the reduced columns: True where the column was kept.
    support_mask = selector.get_support()

    # Columns removed by the selector (after the zero-sum columns were set aside).
    dropped_variance_columns = train_reduced.columns[~support_mask]

    # Rebuild labelled frames and add the zero-sum columns back as zeros.
    train_transformed_df = _restore_zero_columns(
        train_transformed, train_reduced.columns[support_mask], zero_sum_columns
    )
    test_transformed_df = _restore_zero_columns(
        test_transformed, test_reduced.columns[support_mask], zero_sum_columns
    )

    # Shapes before and after the transformation, for this threshold.
    print(f"Threshold: {threshold}")
    print(f"Original Training data shape: {train.shape}")
    print(f"Original Test data shape: {test.shape}")
    print(f"Transformed Training data shape: {train_transformed_df.shape}")
    print(f"Transformed Test data shape: {test_transformed_df.shape}")
    print(f"Sum: {train_transformed_df.sum().sum()}")
    print(f"Number of columns dropped due to zero variance: {len(dropped_variance_columns)}")

    num_dropped_colums_per_iteration.append(len(dropped_variance_columns))

# Plot the number of dropped columns against the threshold that produced it.
fig, ax = plt.subplots()
ax.plot(quantiles, num_dropped_colums_per_iteration)
ax.set_xlabel("Threshold")
ax.set_ylabel("Number of dropped columns")
ax.set_title("Number of dropped columns per threshold")
plt.show()

Threshold: 0.0
Original Training data shape: (1538, 6144)
Original Test data shape: (390, 6144)
Transformed Training data shape: (1538, 6144)
Transformed Test data shape: (390, 6144)
Sum: 4614.0
Number of columns dropped due to zero variance: 0
Threshold: 0.0
Original Training data shape: (1538, 6144)
Original Test data shape: (390, 6144)
Transformed Training data shape: (1538, 6144)
Transformed Test data shape: (390, 6144)
Sum: 4614.0
Number of columns dropped due to zero variance: 0
Threshold: 0.0
Original Training data shape: (1538, 6144)
Original Test data shape: (390, 6144)
Transformed Training data shape: (1538, 6144)
Transformed Test data shape: (390, 6144)
Sum: 4614.0
Number of columns dropped due to zero variance: 0
Threshold: 0.0
Original Training data shape: (1538, 6144)
Original Test data shape: (390, 6144)
Transformed Training data shape: (1538, 6144)
Transformed Test data shape: (390, 6144)
Sum: 4614.0
Number of columns dropped due to zero variance: 0
Threshold: 0.0
Origi

In [113]:
# Sanity check: compare the grand total of the training frame before and after
# the threshold sweep (train_transformed_df holds the last iteration's result).
# NOTE(review): the slice below is not the last expression of the cell, so it
# is evaluated but never displayed - move it to its own cell if the table is
# wanted. The execution count (113) also predates the sweep cell's (121), so
# the stored totals come from an earlier run.
train_transformed_df.iloc[:, 200:300]
print(train.sum(axis=0).sum())
print(train_transformed_df.sum(axis=0).sum())

4614.0
390.7375486127391
