**Random data partitioning into calibration, validation and test sets**

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Random partition of the cleaned dataset into calibration (Cal), validation
# (Val) and test sets. Each set is written to its own CSV file with the sample
# ID as the first column, followed by every other column of the input file.

# Load the dataset
file_path = "/content/spectra and reference parameters.csv"  # Update path if needed
data = pd.read_csv(file_path)

# Remove rows with missing values (a row is dropped if ANY of its columns is NaN)
data_clean = data.dropna()
if data_clean.empty:
    # Fail here with a clear message instead of deep inside train_test_split.
    raise ValueError(
        f"No complete rows left after dropna() on {file_path!r}; "
        "check the file for empty or mostly-empty columns."
    )

# Separate the sample ID (first column) from all remaining columns
ID_clean = data_clean.iloc[:, 0]
X_clean = data_clean.drop(columns=data_clean.columns[0])

# Define split ratios
cal_ratio = 0.7   # fraction of samples in the calibration set (user choice)
val_ratio = 0.15  # fraction of samples in the validation set (user choice)
# 1 - (0.7 + 0.15) evaluates to 0.15000000000000002 in floating point.
# train_test_split converts a fractional test_size with ceil(test_size * n),
# so that noise can add one extra test sample whenever test_ratio * n should
# be a whole number (e.g. n = 1000 -> 151 instead of 150). Rounding removes it.
test_ratio = round(1 - (cal_ratio + val_ratio), 10)

if min(cal_ratio, val_ratio, test_ratio) <= 0:
    raise ValueError(
        "cal_ratio and val_ratio must both be > 0 and sum to less than 1; "
        f"got cal_ratio={cal_ratio}, val_ratio={val_ratio} "
        f"(test_ratio={test_ratio})."
    )

# Step 1: Split off Test set
X_temp, X_Test, ID_temp, ID_Test = train_test_split(
    X_clean, ID_clean, test_size=test_ratio, random_state=42
)

# Step 2: Split remaining into Cal and Val
# val_ratio is expressed relative to the full dataset, so rescale it to the
# fraction of the remaining (Cal + Val) samples.
X_Cal, X_Val, ID_Cal, ID_Val = train_test_split(
    X_temp, ID_temp, test_size=val_ratio / (cal_ratio + val_ratio), random_state=42
)

# Sanity checks: every cleaned row lands in exactly one of the three sets.
assert len(ID_Cal) + len(ID_Val) + len(ID_Test) == len(data_clean), \
    "split sizes do not add up to the number of cleaned rows"
assert pd.concat([ID_Cal, ID_Val, ID_Test]).index.is_unique, \
    "a row was assigned to more than one set"

# Combine ID and features for each set
# (indices are reset on both parts so the column-wise concat aligns by position)
Cal_df = pd.concat([ID_Cal.reset_index(drop=True), X_Cal.reset_index(drop=True)], axis=1)
Val_df = pd.concat([ID_Val.reset_index(drop=True), X_Val.reset_index(drop=True)], axis=1)
Test_df = pd.concat([ID_Test.reset_index(drop=True), X_Test.reset_index(drop=True)], axis=1)

# Save to CSV
Cal_df.to_csv("/content/Cal_Rand.csv", index=False)
Val_df.to_csv("/content/Val_Rand.csv", index=False)
Test_df.to_csv("/content/Test_Rand.csv", index=False)

# Print summary
print("Random Partition Completed:")
print("Calibration set (Cal) size:", Cal_df.shape)
print("Validation set (Val) size:", Val_df.shape)
print("Test set size:", Test_df.shape)


Random Partition Completed:
Calibration set (Cal) size: (858, 308)
Validation set (Val) size: (185, 308)
Test set size: (185, 308)


**Stratified sampling (for fertility classification)**

Before running this code, prepare the CSV file as follows: remove the extra columns (mass, major diameter, minor diameter, thickness, yolk mass, shell strength) so that the file contains the sample ID (1st column), the fertility status (2nd column), and then the spectral variable columns. Encode the fertility status in the 2nd column as 1 = fertile, 0 = infertile.

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Stratified partition into calibration (Cal), validation (Val) and test sets.
# Both splits are stratified on the fertility label (second column), and each
# set is written to its own CSV as: ID, fertility label, feature columns.

# === Load the dataset === #
new_file_path = '/content/Data_all_fertility.csv'  # <<== (Change file path if needed)
new_data = pd.read_csv(new_file_path)

# Remove rows with missing values (a row is dropped if ANY of its columns is NaN)
new_data_clean = new_data.dropna()
if new_data_clean.empty:
    # Fail here with a clear message instead of deep inside train_test_split.
    raise ValueError(
        f"No complete rows left after dropna() on {new_file_path!r}; "
        "check the file for empty or mostly-empty columns."
    )

# Separate ID, Fertility label, and Features
# NOTE: ID_clean, X_clean and the *_ratio / *_Cal / *_Val / *_Test names below
# rebind variables that the random-partition cell also defines.
ID_clean = new_data_clean.iloc[:, 0]           # First column = ID
fertility_label = new_data_clean.iloc[:, 1]    # Second column = Fertility
X_clean = new_data_clean.drop(columns=[new_data_clean.columns[0], new_data_clean.columns[1]])  # Features only

# === Set Split Ratios === #
cal_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# The three ratios are set independently, so make sure they describe a full
# partition; otherwise the split below would silently differ from them.
if min(cal_ratio, val_ratio, test_ratio) <= 0 or abs(cal_ratio + val_ratio + test_ratio - 1) > 1e-9:
    raise ValueError(
        "cal_ratio, val_ratio and test_ratio must all be > 0 and sum to 1; "
        f"got {cal_ratio}, {val_ratio}, {test_ratio}."
    )

# === First Split: Cal vs Temp (Val + Test) === #
# 1 - 0.7 evaluates to 0.30000000000000004 in floating point. train_test_split
# converts a fractional test_size with ceil(test_size * n), so that noise can
# add one extra held-out sample whenever the product should be a whole number
# (e.g. n = 1000 -> 301 instead of 300). Rounding removes it.
temp_ratio = round(1 - cal_ratio, 10)

X_Cal, X_temp, fertility_Cal, fertility_temp, ID_Cal, ID_temp = train_test_split(
    X_clean, fertility_label, ID_clean,
    test_size=temp_ratio,      # 30% goes to temp
    stratify=fertility_label,
    random_state=42
)

# === Second Split: Temp into Val and Test === #
val_ratio_adjusted = val_ratio / (val_ratio + test_ratio)  # 0.5

X_Val, X_Test, fertility_Val, fertility_Test, ID_Val, ID_Test = train_test_split(
    X_temp, fertility_temp, ID_temp,
    test_size=1 - val_ratio_adjusted,  # 0.5 of temp becomes test
    stratify=fertility_temp,
    random_state=42
)

# Sanity checks: every cleaned row lands in exactly one of the three sets.
assert len(ID_Cal) + len(ID_Val) + len(ID_Test) == len(new_data_clean), \
    "split sizes do not add up to the number of cleaned rows"
assert pd.concat([ID_Cal, ID_Val, ID_Test]).index.is_unique, \
    "a row was assigned to more than one set"

# === Save datasets === #
# Indices are reset on every part so the column-wise concat aligns by position.
for split_name, (ID_split, Fertility_split, X_split) in {
    'Cal_Stratified': (ID_Cal, fertility_Cal, X_Cal),
    'Val_Stratified': (ID_Val, fertility_Val, X_Val),
    'Test_Stratified': (ID_Test, fertility_Test, X_Test)
}.items():
    df_split = pd.concat([
        ID_split.reset_index(drop=True),
        Fertility_split.reset_index(drop=True),
        X_split.reset_index(drop=True)
    ], axis=1)
    df_split.to_csv(f'/content/{split_name}.csv', index=False)

# === Print split summary === #
print("Stratified Split Sizes:")
print(f"Cal: {X_Cal.shape}, Val: {X_Val.shape}, Test: {X_Test.shape}")


Stratified Split Sizes:
Cal: (859, 300), Val: (184, 300), Test: (185, 300)
