# Evaluate Train/Test splits

This notebook evaluates the train/test splits we're using to establish a baseline.

We'd like the train set to have the extreme values, and the test set to have the more common values.

In [None]:
# Enable autoreload so edits to imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from dotenv import dotenv_values

# Read the data locations from the .env file.
env = dotenv_values()
comp_data_loc = env.get("COMPOSITION_DATA_PATH")
dataset_loc = env.get("DATA_PATH")

# Fail with an exception instead of print + exit(): in a notebook, exit() acts
# on the kernel rather than simply halting this run, whereas an exception stops
# execution at this cell with a visible message. All missing keys are reported
# together so the .env file can be fixed in one pass.
missing_vars = [
    name
    for name, value in (
        ("COMPOSITION_DATA_PATH", comp_data_loc),
        ("DATA_PATH", dataset_loc),
    )
    if not value
]
if missing_vars:
    raise RuntimeError(f"Please set {', '.join(missing_vars)} in .env file")

In [None]:
from lib.data_handling import CompositionData


# Construct the CompositionData object from the COMPOSITION_DATA_PATH location.
cd = CompositionData(str(comp_data_loc))

In [None]:
from train_test_split import get_all_samples

# Gather the samples from the DATA_PATH location, passing the composition data along.
samples = get_all_samples(cd, str(dataset_loc))

In [None]:
from train_test_split import CalibrationDataFilter, filter_samples


# NOTE(review): the filter list is empty, so presumably no samples are dropped
# here, and the imported CalibrationDataFilter is unused — confirm this is intended.
filtered_samples = filter_samples(samples, [])

In [None]:
from train_test_split import create_train_test_split_with_extremes

# Build the train/test split from the filtered samples and display the result.
# NOTE(review): presumably a pandas DataFrame with a "train_test" column, given
# how the following cells index and plot it — confirm.
df = create_train_test_split_with_extremes(filtered_samples)
df

In [None]:
# Show how many rows carry each distinct value of the train_test column.
df["train_test"].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from lib.reproduction import major_oxides
import warnings

# ignore futurewarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Derive the grid from the number of oxides instead of hard-coding 2x4, so the
# cell keeps working if major_oxides ever holds more than eight entries.
oxides = list(major_oxides)
n_cols = 4
n_rows = max(1, (len(oxides) + n_cols - 1) // n_cols)  # ceiling division, at least one row

# 5 inches per grid cell reproduces the original 20x10 figure for a 2x4 grid.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False
)
flat_axes = axes.ravel()

# One box plot per oxide, split by the train_test column of df
for ax, oxide in zip(flat_axes, oxides):
    sns.boxplot(x='train_test', y=oxide, data=df, ax=ax)
    ax.set_title(f'Box Plot of {oxide}')

# Hide grid cells that have no oxide to show
for ax in flat_axes[len(oxides):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()