# Splitting data
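Split the original competition training data into development train, validation, and test sets (roughly 60% / 20% / 20%), then check that the splits have similar target distributions and missing-value rates.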

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path


# Seed shared by every random split in this notebook, so reruns are reproducible.
SEED = 42
# Root folder of the datasets, relative to the notebook's working directory.
DATA_DIR = Path("../../data")

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the full labelled training table.
csv_path = DATA_DIR / "neurips-open-polymer-prediction-2025/train.csv"
train_df = pd.read_csv(csv_path)

# 1. Split off 20% of all rows for dev_test.
temp_df, dev_test = train_test_split(
    train_df,
    test_size=0.2,
    random_state=SEED,  # for reproducibility
    shuffle=True,
)

# 2. Split the remaining 80% into 75% train / 25% valid -> 0.6 / 0.2 overall.
dev_train, dev_val = train_test_split(
    temp_df,
    test_size=0.25,  # 0.25 * 0.8 = 0.2 of the original
    random_state=SEED,
    shuffle=True,
)

# Sanity checks: the three splits must cover every row of train_df exactly once.
# The splits keep train_df's row labels, so a duplicate label would mean a row
# landed in more than one split.
assert len(dev_train) + len(dev_val) + len(dev_test) == len(train_df)
assert dev_train.index.append([dev_val.index, dev_test.index]).is_unique

# Verify sizes
print(f"Total rows:   {len(train_df)}")
print(f"Dev train:    {len(dev_train)} ({len(dev_train) / len(train_df):.2%})")
print(f"Dev valid:    {len(dev_val)} ({len(dev_val) / len(train_df):.2%})")
print(f"Dev test:     {len(dev_test)} ({len(dev_test) / len(train_df):.2%})")
# Take the first three rows before converting, instead of listing the whole column.
print(f"Polymer example:{dev_train['SMILES'].head(3).to_list()}")
print(f"Columns:{dev_train.columns}")

Total rows:   7973
Dev train:    4783 (59.99%)
Dev valid:    1595 (20.01%)
Dev test:     1595 (20.01%)
Polymer example:['*Nc1ccc(CC(CC(C)(C)c2ccc(N*)cc2)=C(C)C)cc1', '*CC(*)(CC(=O)OC)C(=O)OC12CC3CC(C)(CC(C)(C3)C1)C2', '*OP(=O)(Oc1c(Cl)cc(Cl)cc1Cl)Oc1c(Cl)c(Cl)c(*)c(Cl)c1Cl']
Columns:Index(['id', 'SMILES', 'Tg', 'FFV', 'Tc', 'Density', 'Rg'], dtype='object')


In [6]:
# Summary statistics of the target columns over the full dataset; "id" is an
# identifier, so it is left out of the summary.
train_df.drop("id", axis="columns").describe()

Unnamed: 0,Tg,FFV,Tc,Density,Rg
count,511.0,7030.0,737.0,613.0,614.0
mean,96.452314,0.367212,0.256334,0.985484,16.419787
std,111.228279,0.029609,0.089538,0.146189,4.60864
min,-148.029738,0.226992,0.0465,0.748691,9.728355
25%,13.674509,0.349549,0.186,0.890243,12.540328
50%,74.040183,0.364264,0.236,0.948193,15.052194
75%,161.147595,0.38079,0.3305,1.062096,20.411067
max,472.25,0.777097,0.524,1.840999,34.672906


In [7]:
# Same summary for the held-out dev_test split, for comparison with the full
# dataset summary.
dev_test.drop("id", axis="columns").describe()

Unnamed: 0,Tg,FFV,Tc,Density,Rg
count,87.0,1419.0,145.0,123.0,124.0
mean,88.357301,0.366495,0.265006,0.974297,16.444188
std,102.567914,0.02686,0.088128,0.141211,4.954409
min,-77.911077,0.271584,0.100667,0.757428,9.767195
25%,11.463709,0.349499,0.193,0.885849,12.380452
50%,70.428842,0.363664,0.244,0.946344,14.813247
75%,136.154956,0.380145,0.335333,1.050418,20.580689
max,384.637936,0.528203,0.507,1.840999,34.487303


In [8]:
# Kolmogorov-Smirnov (K-S) Test
from scipy.stats import ks_2samp


def ks_test_feature(set_a, set_b):
    """Run a two-sample Kolmogorov-Smirnov test on two columns.

    Missing values are dropped from each input before the test, so the
    comparison only uses the observed values of each sample.

    Args:
        set_a: First sample; must provide ``dropna()`` (the notebook passes
            DataFrame columns).
        set_b: Second sample, same requirements as ``set_a``.

    Returns:
        A ``(statistic, p_value)`` tuple as reported by ``ks_2samp``.
    """
    observed_a = set_a.dropna()
    observed_b = set_b.dropna()
    result = ks_2samp(observed_a, observed_b)
    return result.statistic, result.pvalue


In [9]:
# Compare the distribution of each target column in dev_train against dev_test.
# "id" and "SMILES" are not numeric targets, so they are excluded.
for feature in dev_train.columns.drop(["id", "SMILES"]):
    stat, p_value = ks_test_feature(dev_train[feature], dev_test[feature])
    print(f"K-S Test for {feature}: Statistic={stat:.4f}, p-value={p_value:.4f}")

K-S Test for Tg: Statistic=0.0845, p-value=0.6785
K-S Test for FFV: Statistic=0.0179, p-value=0.8790
K-S Test for Tc: Statistic=0.1176, p-value=0.0868
K-S Test for Density: Statistic=0.0851, p-value=0.4810
K-S Test for Rg: Statistic=0.0653, p-value=0.7921


In [10]:

# Compare the distribution of each target column in dev_train against dev_val.
# "id" and "SMILES" are not numeric targets, so they are excluded.
# NOTE(review): in the saved output Tc is the only target with p < 0.05
# (p=0.0141). With five targets tested this may be a multiple-comparison
# artifact; confirm before relying on Tc validation scores.
for feature in dev_train.columns.drop(["id", "SMILES"]):
    stat, p_value = ks_test_feature(dev_train[feature], dev_val[feature])
    print(f"K-S Test for {feature}: Statistic={stat:.4f}, p-value={p_value:.4f}")


K-S Test for Tg: Statistic=0.1307, p-value=0.1093
K-S Test for FFV: Statistic=0.0195, p-value=0.8092
K-S Test for Tc: Statistic=0.1514, p-value=0.0141
K-S Test for Density: Statistic=0.0753, p-value=0.6707
K-S Test for Rg: Statistic=0.0786, p-value=0.6185


In [11]:
import numpy as np
from statsmodels.stats.proportion import proportions_ztest


def run_proportions_ztest(set_a, set_b):
    """Test whether two columns have the same share of non-missing values.

    For each input, the number of non-null entries is the "success" count and
    the total number of entries is the number of observations; the two
    proportions are then compared with ``proportions_ztest``.

    Args:
        set_a: First column; must provide ``notna()`` and ``size``.
        set_b: Second column, same requirements as ``set_a``.

    Returns:
        A ``(statistic, p_value)`` tuple from ``proportions_ztest``.
    """
    samples = (set_a, set_b)
    present = np.array([sample.notna().sum() for sample in samples])
    totals = np.array([sample.size for sample in samples])
    z_stat, p_val = proportions_ztest(present, totals)
    return z_stat, p_val


In [12]:
# Check that each target is labelled (non-missing) at a similar rate in
# dev_train and dev_test.
for feature in dev_train.columns.drop(["id", "SMILES"]):
    stat, pval = run_proportions_ztest(dev_train[feature], dev_test[feature])
    print(f"Proportions Z-Test for {feature}: Statistic={stat:.4f}, p-value={pval:.4f}")

Proportions Z-Test for Tg: Statistic=1.5541, p-value=0.1202
Proportions Z-Test for FFV: Statistic=-0.9938, p-value=0.3203
Proportions Z-Test for Tc: Statistic=0.4999, p-value=0.6172
Proportions Z-Test for Density: Statistic=0.1926, p-value=0.8472
Proportions Z-Test for Rg: Statistic=0.1118, p-value=0.9110


In [13]:
# Check that each target is labelled (non-missing) at a similar rate in
# dev_train and dev_val.
for feature in dev_train.columns.drop(["id", "SMILES"]):
    stat, pval = run_proportions_ztest(dev_train[feature], dev_val[feature])
    print(f"Proportions Z-Test for {feature}: Statistic={stat:.4f}, p-value={pval:.4f}")

Proportions Z-Test for Tg: Statistic=-0.5765, p-value=0.5643
Proportions Z-Test for FFV: Statistic=0.2836, p-value=0.7767
Proportions Z-Test for Tc: Statistic=1.1007, p-value=0.2710
Proportions Z-Test for Density: Statistic=0.9270, p-value=0.3539
Proportions Z-Test for Rg: Statistic=0.9270, p-value=0.3539
