In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from lib import full_flow_dataloader
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import VarianceThreshold
from lib.config import AppConfig
from lib.data_handling import load_split_data, CustomSpectralPipeline, WavelengthMaskTransformer, NonNegativeTransformer, SpectralDataReshaper
from lib.reproduction import major_oxides, masks

from lib.norms import Norm1Scaler, Norm3Scaler


In [None]:
# Resolve the data locations from the application configuration.
config = AppConfig()

composition_data_loc = config.composition_data_path
dataset_loc = config.data_path

# Load the train/test split from the dataset location (passed as a string).
# `average_shots=True` is forwarded to the loader; presumably it averages the
# individual shots per sample -- confirm in lib.data_handling.
# if not train_data or not test_data: # type: ignore
train_data, test_data = load_split_data(
	str(dataset_loc), average_shots=True
)

In [None]:
# Build the project's spectral preprocessing pipeline from the masks, the
# composition data location and the list of major oxides.
pipeline = CustomSpectralPipeline(
	masks=masks,
	composition_data_loc=composition_data_loc,
	major_oxides=major_oxides,
)

# Disabled experiments: inserting a variance-threshold step into the pipeline.
# (VarianceThresholdTransformer is not defined or imported in this notebook.)
# pipeline.pipeline.steps.insert(-2, ('variance_threshold', VarianceThresholdTransformer(threshold=0)))
# pipeline.pipeline.steps.append(('variance_threshold', VarianceThresholdTransformer(threshold=0.9)))

train_processed = pipeline.fit_transform(train_data)
# NOTE(review): the test split is passed through `fit_transform` as well, i.e.
# the pipeline is re-fitted on test data. This is harmless only if every step is
# stateless -- confirm against CustomSpectralPipeline, and prefer a plain
# `transform` here if the pipeline exposes one.
test_processed = pipeline.fit_transform(test_data)


In [None]:
# Fit the Norm3Scaler on the training frame, then apply the fitted scaler to
# the test frame.
# NOTE: both names are rebound to the scaled result, so re-running this cell
# without re-running the previous one scales already-scaled data.
scaler = Norm3Scaler()
train_processed = scaler.fit_transform(train_processed)
test_processed = scaler.transform(test_processed)

In [None]:
# Preview the columns at positions 200-299 of the scaled training frame.
train_processed.iloc[:, 200:300]

In [None]:
def drop_non_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
	"""Return a copy of `df` without the oxide target and identifier columns.

	The columns removed are every entry of `major_oxides` plus "Sample Name"
	and "ID"; pandas raises a KeyError if any of them is missing from `df`.
	"""
	identifier_columns = ["Sample Name", "ID"]
	return df.drop(columns=major_oxides + identifier_columns)

In [None]:
def generate_quantiles(min_quantile, max_quantile, num_points):
    """Return `num_points` evenly spaced quantile levels.

    The levels run linearly from `min_quantile` to `max_quantile`, both
    endpoints included, and are returned as a NumPy array.
    """
    # A log-spaced variant (np.exp over a linspace of the log endpoints) was
    # tried here previously; the linear spacing is the one in use.
    quantile_levels = np.linspace(start=min_quantile, stop=max_quantile, num=num_points)
    return quantile_levels

# Range and resolution of the quantile levels that are evaluated further down.
min_q = 1e-9  # starting quantile
max_q = 0.15   # ending quantile
points = 100   # number of points

# Quantile levels (not yet data values) in [min_q, max_q].
quantile_values = generate_quantiles(min_q, max_q, points)
print(quantile_values)

In [None]:
# Despite the `_non_numeric` suffix, these frames are what remains AFTER the
# oxide and identifier columns have been dropped.
train_non_numeric = drop_non_numeric_columns(train_processed)
test_non_numeric = drop_non_numeric_columns(test_processed)

# Columns whose sum over the training rows is exactly zero are excluded from
# the test frame before the quantiles are computed.
zero_sum_columns = train_non_numeric.columns[train_non_numeric.sum() == 0]
test_no_zero_columns = test_non_numeric.drop(columns=zero_sum_columns)

# Flatten every remaining test value into one series and read off the values at
# the requested quantile levels; these values become the candidate thresholds.
# NOTE(review): the candidates are derived from the TEST split, and they are
# quantiles of raw values that are later used as VARIANCE thresholds -- confirm
# both choices are intentional.
flattened_series = test_no_zero_columns.values.ravel()
quantiles = pd.Series(flattened_series).quantile(quantile_values)
quantiles

In [None]:
from warnings import simplefilter
# Silence pandas' PerformanceWarning for the rest of the session; presumably it
# is triggered by the column-by-column DataFrame insertions further down.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [None]:
import matplotlib.pyplot as plt

# Sweep the candidate thresholds and record, for each one, how many columns
# VarianceThreshold removes from the training data.
num_dropped_colums_per_iteration = {}  # threshold -> number of dropped columns

# The zero-sum columns (sum over the training rows is exactly zero) and the
# frames without them do not depend on the threshold, so they are computed once
# instead of on every iteration.
zero_sum_columns = train_non_numeric.columns[(train_non_numeric.sum(axis=0) == 0)]
train_reduced = train_non_numeric.drop(columns=zero_sum_columns)
test_reduced = test_non_numeric.drop(columns=zero_sum_columns)

for threshold in quantiles.values:
	# Fit on the training data only; the test data is filtered with the same mask.
	selector = VarianceThreshold(threshold=threshold)
	train_transformed = selector.fit_transform(train_reduced)
	test_transformed = selector.transform(test_reduced)
	support_mask = selector.get_support()

	# Rebuild DataFrames from the numpy arrays returned by VarianceThreshold.
	train_transformed_df = pd.DataFrame(train_transformed, columns=train_reduced.columns[support_mask])
	test_transformed_df = pd.DataFrame(test_transformed, columns=test_reduced.columns[support_mask])

	# Re-attach the zero-sum columns (filled with 0) in a single concat rather
	# than one insertion per column, which fragments the frame.
	train_transformed_df = pd.concat(
		[train_transformed_df, pd.DataFrame(0, index=train_transformed_df.index, columns=zero_sum_columns)],
		axis=1,
	)
	test_transformed_df = pd.concat(
		[test_transformed_df, pd.DataFrame(0, index=test_transformed_df.index, columns=zero_sum_columns)],
		axis=1,
	)

	# Columns removed by the variance threshold (zero-sum columns excluded).
	dropped_variance_columns = train_reduced.columns[~support_mask]
	num_dropped_colums_per_iteration[threshold] = len(dropped_variance_columns)

# `selector`, `threshold` and the `*_transformed_df` frames intentionally keep
# the state of the LAST threshold; later cells inspect them.

# Invert the sweep result: number of dropped columns -> every threshold that
# produced that count, in sweep order.
thresholds_per_num_dropped_columns = {}

# The loop variable is deliberately still called `threshold`: a later plotting
# cell reads whatever value this loop leaves bound to that name.
for threshold, num_dropped_columns in num_dropped_colums_per_iteration.items():
	thresholds_per_num_dropped_columns.setdefault(num_dropped_columns, []).append(threshold)

# Number of distinct thresholds that were evaluated.
print(len(num_dropped_colums_per_iteration))

In [None]:
import json

# Persist the dropped-count -> thresholds mapping for later inspection.
with open("num_dropped_cols.json", "w") as out_file:
	json.dump(thresholds_per_num_dropped_columns, out_file, indent=4)

# Keep one representative threshold (the first one seen) per distinct number of
# dropped columns; thresholds that drop the same number of columns are skipped.
unique = [thresholds[0] for thresholds in thresholds_per_num_dropped_columns.values()]

len(unique)

In [None]:
# Sanity check on the frames left over from the last sweep iteration.
# A bare expression is only rendered when it is the LAST statement of a cell,
# so the slice is passed to `display` explicitly to make it show up.
display(train_transformed_df.iloc[:, 200:300])

# The two grand totals should match if thresholding removed no non-zero signal.
print(train_non_numeric.sum(axis=0).sum())
print(train_transformed_df.sum(axis=0).sum())

In [None]:
# Per-column variances seen by the most recently fitted selector, with the
# value currently bound to `threshold` drawn as a dashed red reference line.
fig, ax = plt.subplots()
ax.plot(selector.variances_)
ax.set_xlabel("Column index")
ax.set_ylabel("Variance")
ax.set_title("Variance of columns")
ax.axhline(y=threshold, color='r', linestyle='--')

plt.show()

In [None]:
from lib.full_flow_dataloader import load_and_scale_data
# Reload the train/test frames through the project's full-flow loader.
# NOTE(review): `norm = 3` presumably selects the same normalisation as the
# Norm3Scaler used above -- confirm in lib.full_flow_dataloader.
norm = 3

train, test = load_and_scale_data(norm)

In [None]:
class ZeroSumVarianceThreshold(BaseEstimator, TransformerMixin):
    """Variance-threshold feature selector that passes zero-sum columns through.

    Columns whose sum over the fitting data is exactly zero are kept out of the
    variance thresholding and re-appended, filled with 0, to the transformed
    output. All other columns are filtered with scikit-learn's
    ``VarianceThreshold``.

    Parameters
    ----------
    threshold : float, default=0.0
        Forwarded to ``VarianceThreshold`` when ``fit`` is called.

    Attributes
    ----------
    zero_sum_columns : pandas.Index or None
        Labels of the zero-sum columns found by ``fit``; None before fitting.
    variance_selector : VarianceThreshold or None
        Selector fitted on the remaining columns; None before fitting.
    """

    def __init__(self, threshold=0.0):
        self.threshold = threshold
        self.zero_sum_columns = None
        self.variance_selector = None

    def fit(self, X, y=None):
        """Learn the zero-sum columns and fit the variance selector.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame; must expose ``columns``, ``sum`` and ``drop``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # A zero sum identifies an all-zero column only when the values are
        # non-negative -- assumes non-negative input, TODO confirm upstream.
        self.zero_sum_columns = X.columns[X.sum(axis=0) == 0]

        # The selector only ever sees the columns that carry some signal.
        reduced_X = X.drop(columns=self.zero_sum_columns)
        self.variance_selector = VarianceThreshold(threshold=self.threshold)
        self.variance_selector.fit(reduced_X)

        return self

    def transform(self, X):
        """Drop low-variance columns and re-append the zero-sum columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the same columns as the one passed to ``fit``.

        Returns
        -------
        pandas.DataFrame
            The columns kept by the selector followed by the zero-sum columns
            (all 0), indexed like ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called. It subclasses ValueError, which is
            what the failing pandas call raised here before.
        """
        if self.zero_sum_columns is None or self.variance_selector is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This ZeroSumVarianceThreshold instance is not fitted yet; call 'fit' first."
            )

        reduced_X = X.drop(columns=self.zero_sum_columns)
        kept_values = self.variance_selector.transform(reduced_X)
        kept_columns = reduced_X.columns[self.variance_selector.get_support()]
        kept_frame = pd.DataFrame(kept_values, index=X.index, columns=kept_columns)

        if len(self.zero_sum_columns) == 0:
            return kept_frame

        # Attach all zero-sum columns in one concat; inserting them one at a
        # time fragments the frame and triggers pandas' PerformanceWarning.
        zero_frame = pd.DataFrame(0, index=X.index, columns=self.zero_sum_columns)
        return pd.concat([kept_frame, zero_frame], axis=1)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import make_scorer, mean_squared_error
import json
import numpy as np


def rmse_scorer(y_true, y_pred):
    """Root-mean-squared error between the true and the predicted targets."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


# Scorer object for model selection. With greater_is_better=False the reported
# score is the negated RMSE, so it has to be negated again when read back.
rmse = make_scorer(rmse_scorer, greater_is_better=False)

# Fixed SVR hyper-parameters; only the variance threshold is tuned below.
svr_params = {
    'kernel': 'poly',
    'degree': 2,
    'C': 100,
    'epsilon': 0.1,
    'coef0': 1.0,
    'gamma': 'scale'
}

# Pipeline: variance-based feature selection followed by the SVR.
pipe = Pipeline([
    ('var_thresh', ZeroSumVarianceThreshold()),
    ('svm', SVR(**svr_params)),
])

# Candidate thresholds: one representative per distinct number of dropped columns.
param_grid = {
    'var_thresh__threshold': unique
}

drop_cols = major_oxides + ["ID", "Sample Name"]

X_train = train.drop(columns=drop_cols)
y_train = train[major_oxides]

X_test = test.drop(columns=drop_cols)
y_test = test[major_oxides]

results = {}

for target in major_oxides:
    print(f"Optimizing for {target}")
    results[target] = {}

    # 5-fold grid search over the variance threshold, scored with the shared
    # (negated) RMSE scorer defined next to rmse_scorer.
    grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring=rmse, verbose=10, n_jobs=-1)
    grid_search.fit(X_train, y_train[target])

    # Baseline: the same SVR without variance thresholding, scored on the test set.
    svm = SVR(**svr_params)
    svm.fit(X_train, y_train[target])
    y_pred = svm.predict(X_test)
    rmse_no_variance = float(rmse_scorer(y_test[target], y_pred))

    print("RMSE without variance thresholding:", rmse_no_variance)
    results[target]['rmse_no_variance'] = rmse_no_variance

    # Score the refitted best pipeline on the SAME test set. The grid search's
    # best_score_ is a cross-validation RMSE on the training data and is not
    # directly comparable with the baseline's test RMSE.
    y_pred_best = grid_search.predict(X_test)
    rmse_with_variance = float(rmse_scorer(y_test[target], y_pred_best))

    print("RMSE with best variance threshold (test set):", rmse_with_variance)
    results[target]['rmse_with_variance'] = rmse_with_variance

    print(f"Best parameters: {grid_search.best_params_['var_thresh__threshold']:.20f}")
    print("Best RMSE:", -grid_search.best_score_)

    # Only plain floats are stored so the results stay JSON-serialisable.
    results[target]['grid_search'] = {
        "variance_threshold": float(grid_search.best_params_["var_thresh__threshold"]),
        "best_rmse": float(-grid_search.best_score_)
    }

    # Rewrite the results file after every oxide so partial runs are not lost.
    with open("var_threshold_results.json", "w") as f:
        json.dump(results, f, indent=4)

In [None]:
# Summarise the best variance threshold and its cross-validation RMSE per oxide.
# `results[oxide]` is a plain dict (not a GridSearchCV object), so the values
# are read from its 'grid_search' entry; 'best_rmse' is already sign-corrected.
for oxide in major_oxides:
	best = results[oxide]["grid_search"]
	print(f"{oxide}: {best['variance_threshold']:.20f}. RMSE: {best['best_rmse']}")