# Splitting data
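Split the original training data into train (60%), validation (20%), and test (20%) sets, then check that the splits have similar target distributions and missing-value rates.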

In [2]:
%load_ext autoreload
%autoreload 2

In [52]:
# Base directory of the data files, given relative to this notebook's working directory.
DATA_DIR = "../../data"
# Seed passed as `random_state` to every train_test_split call so the splits are reproducible.
SEED = 2

In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the full labelled training set; every split below is carved out of this frame.
csv_path = DATA_DIR + "/neurips-open-polymer-prediction-2025/train.csv"
train_df = pd.read_csv(csv_path)

# 1. split off 20% for dev_test (temp_df keeps the remaining 80%)
temp_df, dev_test = train_test_split(
    train_df,
    test_size=0.2,
    random_state=SEED,  # for reproducibility
    shuffle=True
)

# 2. split the remaining 80% into 75% train / 25% valid → 0.6 / 0.2 overall
dev_train, dev_val = train_test_split(
    temp_df,
    test_size=0.25,  # 0.25 * 0.8 = 0.2 of the original
    random_state=SEED,
    shuffle=True
)

# Verify sizes: report each split's row count and its share of the full dataset
print(f"Total rows:   {len(train_df)}")
print(f"Dev train:    {len(dev_train)} ({len(dev_train) / len(train_df):.2%})")
print(f"Dev valid:    {len(dev_val)} ({len(dev_val) / len(train_df):.2%})")
print(f"Dev test:     {len(dev_test)} ({len(dev_test) / len(train_df):.2%})")
# Show the first three SMILES strings and the column names as a quick sanity check
print(f"Polymer example:{dev_train['SMILES'].to_list()[:3]}")
print(f"Columns:{dev_train.columns}")

Total rows:   7973
Dev train:    4783 (59.99%)
Dev valid:    1595 (20.01%)
Dev test:     1595 (20.01%)
Polymer example:['*CC(C)=CC(C)S(*)(=O)=O', '*C=C[Ge](C=C[Si](*)(c1ccccc1)c1ccccc1)(c1ccccc1)c1ccccc1', '*CCCOC(=O)c1ccc2cc(C(=O)O*)ccc2c1']
Columns:Index(['id', 'SMILES', 'Tg', 'FFV', 'Tc', 'Density', 'Rg'], dtype='object')


In [54]:
# Summary statistics of the full dataset (identifier column excluded), as a baseline for the splits.
train_df.drop("id", axis="columns").describe()

Unnamed: 0,Tg,FFV,Tc,Density,Rg
count,511.0,7030.0,737.0,613.0,614.0
mean,96.452314,0.367212,0.256334,0.985484,16.419787
std,111.228279,0.029609,0.089538,0.146189,4.60864
min,-148.029738,0.226992,0.0465,0.748691,9.728355
25%,13.674509,0.349549,0.186,0.890243,12.540328
50%,74.040183,0.364264,0.236,0.948193,15.052194
75%,161.147595,0.38079,0.3305,1.062096,20.411067
max,472.25,0.777097,0.524,1.840999,34.672906


In [55]:
# Summary statistics of the held-out test split (identifier column excluded).
dev_test.drop("id", axis="columns").describe()

Unnamed: 0,Tg,FFV,Tc,Density,Rg
count,105.0,1403.0,144.0,125.0,125.0
mean,96.746767,0.36732,0.260388,0.981278,16.921804
std,110.425965,0.028496,0.084556,0.153701,4.978499
min,-148.029738,0.245613,0.0735,0.775,9.943107
25%,19.426745,0.349822,0.190375,0.89408,12.71172
50%,72.320816,0.364874,0.247833,0.946533,16.272229
75%,167.964232,0.380817,0.335083,1.041526,20.5503
max,456.35,0.500778,0.422,1.840999,34.672906


In [56]:
# Kolmogorov-Smirnov (K-S) Test
from scipy.stats import ks_2samp


def ks_test_feature(set_a, set_b):
    """Run a two-sample Kolmogorov-Smirnov test on two series, ignoring missing values.

    Args:
        set_a: First sample; must support ``.dropna()`` (e.g. a pandas Series).
        set_b: Second sample; must support ``.dropna()``.

    Returns:
        A ``(statistic, p_value)`` tuple as produced by ``scipy.stats.ks_2samp``.
    """
    # Drop missing entries first so only observed values enter the test.
    observed_a = set_a.dropna()
    observed_b = set_b.dropna()
    statistic, pvalue = ks_2samp(observed_a, observed_b)
    return statistic, pvalue


In [57]:
# Compare every target column's distribution in the training split against the test split.
# A small p-value would indicate the two splits are distributed differently for that target.
for feature in dev_train.columns.drop(["id", "SMILES"]):
    stat, p_value = ks_test_feature(dev_train[feature], dev_test[feature])
    print(f"K-S Test for {feature}: Statistic={stat:.4f}, p-value={p_value:.4f}")

K-S Test for Tg: Statistic=0.0685, p-value=0.8279
K-S Test for FFV: Statistic=0.0312, p-value=0.2501
K-S Test for Tc: Statistic=0.0493, p-value=0.9384
K-S Test for Density: Statistic=0.1040, p-value=0.2541
K-S Test for Rg: Statistic=0.1078, p-value=0.2076


In [58]:

# Compare every target column's distribution in the training split against the validation split.
# A small p-value would indicate the two splits are distributed differently for that target.
for feature in dev_train.columns.drop(["id", "SMILES"]):
    stat, p_value = ks_test_feature(dev_train[feature], dev_val[feature])
    print(f"K-S Test for {feature}: Statistic={stat:.4f}, p-value={p_value:.4f}")


K-S Test for Tg: Statistic=0.1003, p-value=0.3905
K-S Test for FFV: Statistic=0.0295, p-value=0.3063
K-S Test for Tc: Statistic=0.0752, p-value=0.5699
K-S Test for Density: Statistic=0.0526, p-value=0.9568
K-S Test for Rg: Statistic=0.0816, p-value=0.5752


In [59]:
import numpy as np
from statsmodels.stats.proportion import proportions_ztest


def run_proportions_ztest(set_a, set_b):
    """Test whether two series have the same proportion of non-missing values.

    Args:
        set_a: First series; must support ``.notna()`` and ``.size``.
        set_b: Second series; must support ``.notna()`` and ``.size``.

    Returns:
        The ``(statistic, p_value)`` pair returned by
        ``statsmodels.stats.proportion.proportions_ztest``.
    """
    samples = (set_a, set_b)
    # "Successes" are the observed (non-missing) entries in each series.
    observed_counts = np.array([sample.notna().sum() for sample in samples])
    # Number of trials is the full length of each series, missing values included.
    total_counts = np.array([sample.size for sample in samples])
    z_stat, p_val = proportions_ztest(observed_counts, total_counts)
    return z_stat, p_val


In [60]:
# Check that each target column is missing at a similar rate in the training and test splits.
for feature in dev_train.columns.drop(["id", "SMILES"]):
    stat, pval = run_proportions_ztest(dev_train[feature], dev_test[feature])
    print(f"Proportions Z-Test for {feature}: Statistic={stat:.4f}, p-value={pval:.4f}")

Proportions Z-Test for Tg: Statistic=-0.3507, p-value=0.7258
Proportions Z-Test for FFV: Statistic=0.0614, p-value=0.9510
Proportions Z-Test for Tc: Statistic=0.6717, p-value=0.5018
Proportions Z-Test for Density: Statistic=0.0042, p-value=0.9966
Proportions Z-Test for Rg: Statistic=0.0311, p-value=0.9752


In [61]:
# Check that each target column is missing at a similar rate in the training and validation splits.
for feature in dev_train.columns.drop(["id", "SMILES"]):
    stat, pval = run_proportions_ztest(dev_train[feature], dev_val[feature])
    print(f"Proportions Z-Test for {feature}: Statistic={stat:.4f}, p-value={pval:.4f}")

Proportions Z-Test for Tg: Statistic=-0.1739, p-value=0.8620
Proportions Z-Test for FFV: Statistic=-0.8799, p-value=0.3789
Proportions Z-Test for Tc: Statistic=1.4235, p-value=0.1546
Proportions Z-Test for Density: Statistic=0.9831, p-value=0.3255
Proportions Z-Test for Rg: Statistic=1.0094, p-value=0.3128
