In [None]:
### Get the 5-fold stratified sets

import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Load the yield table; it must contain a 'Yield_24h' column.
file_path = "Sulfonyl_Fluoride_Yields.xlsx"  # Replace with your file path
df = pd.read_excel(file_path)

# Binary stratification label: 1 where Yield_24h exceeds 80, otherwise 0.
# It is kept as a standalone Series so that no temporary column has to be
# added to df and dropped again afterwards.
yield_above_80 = (df['Yield_24h'] > 80).astype(int)

# Shuffled 5-fold splitter with a fixed seed so the assignment is reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# -1 marks "not yet assigned"; every row is overwritten in the loop below.
df['Fold'] = -1
fold_col = df.columns.get_loc('Fold')

# skf.split yields *positional* row indices, so they are written with iloc.
# Label-based .loc only gives the same result while the frame carries a
# default 0..n-1 index, which is not guaranteed if rows are ever filtered or
# re-indexed before this point.
for fold_number, (_train_idx, test_idx) in enumerate(skf.split(df, yield_above_80)):
    df.iloc[test_idx, fold_col] = fold_number

# Each row appears in the test part of exactly one split, so no -1 may remain.
assert (df['Fold'] >= 0).all(), "every row should be assigned to a fold"

# Save the result to a new Excel file (optional)
df.to_excel("Stratified_Folds_Yield.xlsx", index=False)

# Display the first few rows
print(df.head())

   ID  Yield_24h  dipole(Debye)_max  pyramidalization_Gavrish_S(°)_max  Fold
0   0        1.5             6.1310                           5.098881     2
1   6       23.1             3.8998                           5.047614     4
2   8       76.0             6.5148                           4.937596     2
3  11       62.9             2.5124                           4.999035     4
4  28        4.5             6.1768                           5.090402     2


In [7]:
### Evaluate the accuracy of each set

# Reload the table that carries the per-row 'Fold' assignment.
df = pd.read_excel("Stratified_Folds_Yield.xlsx")

# Exclusive upper bounds used by evaluate_set: a row is inside the quadrant
# only when its dipole value AND its pyramidalization value are both strictly
# below the corresponding threshold.
dipole_threshold, pyram_threshold = 5.87, 4.95

def evaluate_set(df, dipole_max=None, pyram_max=None):
    """Print how well the two-descriptor quadrant rule predicts Yield_24h > 80.

    A row is "in the quadrant" when both 'dipole(Debye)_max' and
    'pyramidalization_Gavrish_S(°)_max' are strictly below their thresholds.
    A row counts as correct when quadrant membership agrees with
    ``Yield_24h > 80`` (both true or both false) and as incorrect otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to score. Must contain the columns 'dipole(Debye)_max',
        'pyramidalization_Gavrish_S(°)_max' and 'Yield_24h' unless it is empty.
    dipole_max, pyram_max : float, optional
        Exclusive upper bounds for the two descriptors. When omitted, the
        module-level ``dipole_threshold`` / ``pyram_threshold`` are used, which
        is the behaviour of the original single-argument call.

    Returns
    -------
    None
        The counts and the accuracy are printed, not returned.
    """
    total = len(df)
    if total == 0:
        # Nothing to score: report it instead of dividing by zero.
        print("Correct = 0, Incorrect = 0, Accuracy = n/a")
        return

    if dipole_max is None:
        dipole_max = dipole_threshold
    if pyram_max is None:
        pyram_max = pyram_threshold

    # Comparisons against NaN evaluate to False, so a row with a missing
    # descriptor is outside the quadrant and a missing yield is "not above 80",
    # exactly as in a row-by-row evaluation.
    in_quadrant = (df['dipole(Debye)_max'] < dipole_max) & (
        df['pyramidalization_Gavrish_S(°)_max'] < pyram_max
    )
    above_80 = df['Yield_24h'] > 80

    # Correct means the rule and the measured outcome agree.
    correct = int((in_quadrant == above_80).sum())
    incorrect = total - correct

    print(f"Correct = {correct}, Incorrect = {incorrect}, Accuracy = {correct / total:.2f}")


# Score the threshold rule on every train/validation split defined by 'Fold'.
# The fold ids are read from the column itself rather than hard-coded as
# range(5), so the loop stays correct if the number of folds used to build the
# file changes.
for fold_id in sorted(df['Fold'].unique()):
    # Validation = rows of the current fold; training = all remaining rows.
    training_set = df[df['Fold'] != fold_id]
    validation_set = df[df['Fold'] == fold_id]

    print("----- FOLD", fold_id, "-----")
    print("TRAINING SET")
    evaluate_set(training_set)
    print("VALIDATION SET")
    evaluate_set(validation_set)



----- FOLD 0 -----
TRAINING SET
Correct = 45, Incorrect = 3, Accuracy = 0.94
VALIDATION SET
Correct = 11, Incorrect = 2, Accuracy = 0.85
----- FOLD 1 -----
TRAINING SET
Correct = 45, Incorrect = 4, Accuracy = 0.92
VALIDATION SET
Correct = 11, Incorrect = 1, Accuracy = 0.92
----- FOLD 2 -----
TRAINING SET
Correct = 44, Incorrect = 5, Accuracy = 0.90
VALIDATION SET
Correct = 12, Incorrect = 0, Accuracy = 1.00
----- FOLD 3 -----
TRAINING SET
Correct = 45, Incorrect = 4, Accuracy = 0.92
VALIDATION SET
Correct = 11, Incorrect = 1, Accuracy = 0.92
----- FOLD 4 -----
TRAINING SET
Correct = 45, Incorrect = 4, Accuracy = 0.92
VALIDATION SET
Correct = 11, Incorrect = 1, Accuracy = 0.92
