# Evaluate Train/Test Splits

This notebook evaluates the train/test splits we're using to establish a baseline.

We'd like the train set to have the extreme values, and the test set to have the more common values.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from dotenv import dotenv_values

# Read the two data locations this notebook needs from the .env file.
env = dotenv_values()
comp_data_loc = env.get("COMPOSITION_DATA_PATH")
dataset_loc = env.get("DATA_PATH")

# Collect every required key that is absent or empty so they are all reported at once.
missing_keys = [
    key
    for key, value in (
        ("COMPOSITION_DATA_PATH", comp_data_loc),
        ("DATA_PATH", dataset_loc),
    )
    if not value
]

# Raise instead of print + exit(1): inside a notebook, exit() acts on the kernel
# session rather than marking the cell as failed, whereas an exception stops
# "Run All" at this cell with a visible traceback.
if missing_keys:
    raise RuntimeError(
        "Please set " + " and ".join(missing_keys) + " in .env file"
    )

In [None]:
from lib.data_handling import CompositionData


# Construct the composition-data object from the COMPOSITION_DATA_PATH value
# loaded from .env; the value is passed through str() before being handed to
# the constructor.
cd = CompositionData(str(comp_data_loc))

In [None]:
from train_test_split import get_all_samples

# Fetch the samples via get_all_samples, given the composition-data object and
# the dataset location (DATA_PATH from .env, passed as a string).
# NOTE(review): the type/shape of `samples` is not visible here -- confirm in
# train_test_split.get_all_samples.
samples = get_all_samples(cd, str(dataset_loc))

In [None]:
from train_test_split import filter_samples


# Run filter_samples over the samples with an empty list as the second argument.
# NOTE(review): presumably the list names things to exclude, so [] would keep
# everything -- confirm against train_test_split.filter_samples.
# NOTE(review): `filtered_samples` is not used by any later code visible in
# this notebook.
filtered_samples = filter_samples(samples, [])

In [None]:
# Add the necessary imports
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

oxide = 'SiO2'      # column to stratify on and to plot
percentage = 0.05   # fraction of rows in each tail that counts as "extreme"
n_splits = 5        # number of cross-validation folds
n_bins = 4          # quantile bins used as stratification labels

# ignore futurewarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# TODO(review): `df` is not created in any cell visible in this notebook, so a
# fresh "Restart & Run All" stops here with a NameError. Presumably it should be
# derived from `filtered_samples` -- confirm and build it explicitly.
bin_col = f'{oxide}_bin'
df[bin_col] = pd.qcut(df[oxide], q=n_bins, labels=False)

# Identify extremes: rows strictly below the `percentage` quantile or strictly
# above the `1 - percentage` quantile of the oxide column.
lower_cut = df[oxide].quantile(percentage)
upper_cut = df[oxide].quantile(1 - percentage)
extreme_indices = df[(df[oxide] > upper_cut) | (df[oxide] < lower_cut)].index
non_extreme_df = df.drop(extreme_indices)

# Stratified, shuffled splitter with a fixed seed so folds are reproducible.
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)


def build_folds(core_df, strat_col, extreme_idx, splitter):
    """Return one (train_labels, test_labels) pair of index labels per fold.

    Parameters
    ----------
    core_df : DataFrame holding only the non-extreme rows; it is what gets split.
    strat_col : name of the column in ``core_df`` used as stratification label.
    extreme_idx : index labels of the extreme rows.
    splitter : scikit-learn splitter exposing ``split`` and ``get_n_splits``.

    Returns
    -------
    list of (train_labels, test_labels); ``train_labels`` is a NumPy array made
    of the fold's non-extreme training labels followed by that fold's chunk of
    extreme labels, ``test_labels`` is an Index of non-extreme labels only.
    """
    # Split the extreme labels once into as many chunks as there are folds.
    # NOTE(review): fold k receives only chunk k, so the other extreme rows are
    # in neither train nor test for that fold. Confirm this is intended, given
    # the stated goal of keeping the extreme values in the train set.
    extreme_parts = np.array_split(np.asarray(extreme_idx), splitter.get_n_splits())

    folds = []
    for fold_no, (train_pos, test_pos) in enumerate(
        splitter.split(core_df, core_df[strat_col])
    ):
        # split() yields positions into core_df; translate them to index labels.
        train_labels = np.concatenate(
            [core_df.index[train_pos], extreme_parts[fold_no]]
        )
        folds.append((train_labels, core_df.index[test_pos]))
    return folds


# Build the folds once and reuse them for both the report and the plots.
folds = build_folds(non_extreme_df, bin_col, extreme_indices, kf)

# Report sizes and bin proportions for every fold.
for i, (train_index, test_index) in enumerate(folds):
    train, test = df.loc[train_index], df.loc[test_index]
    print(f"Fold {i+1}")
    print("Train indices:", train_index, "Test indices:", test_index)
    print("Train set size:", len(train), "Test set size:", len(test))
    print(f"Train {oxide} distribution:\n", train[bin_col].value_counts(normalize=True))
    print(f"Test {oxide} distribution:\n", test[bin_col].value_counts(normalize=True))
    print("\n")

# Analysis of the results
# Plot the train and test distribution of the oxide for each fold, one panel per fold.
n_cols = 3
n_rows = int(np.ceil(len(folds) / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 10), squeeze=False)
panels = axes.ravel()

for i, (train_index, test_index) in enumerate(folds):
    train, test = df.loc[train_index], df.loc[test_index]
    ax = panels[i]
    sns.histplot(train[oxide], kde=True, color='blue', label='Train', ax=ax)
    sns.histplot(test[oxide], kde=True, color='red', label='Test', ax=ax)
    ax.set_title(f'Fold {i+1} {oxide} Distribution')
    ax.legend()

# Hide grid cells that have no fold so the figure shows only populated panels.
for unused_ax in panels[len(folds):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()