# NeurIPS - Open Polymer Prediction 2025
---

In [1]:
print("Installing Local Modules")
# !pip install --upgrade pip
# !pip install rdkit
!pip install /kaggle/input/psmiles-module/psmiles_package/*
# in a Kaggle notebook bash cell
!rm -rf /kaggle/working/canonicalize_psmiles /kaggle/working/psmiles
!cp -r /kaggle/input/psmiles-module/psmiles_package/canonicalize-psmiles-0.1.2/canonicalize-psmiles/canonicalize_psmiles \
      /kaggle/working/canonicalize_psmiles
!cp -r /kaggle/input/psmiles-module/psmiles_package/psmiles-0.6.10/psmiles-0.6.10/psmiles/psmiles \
      /kaggle/working/psmiles

Installing Local Modules
[31mERROR: Directory '/kaggle/input/psmiles-module/psmiles_package/canonicalize-psmiles-0.1.2' is not installable. Neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import random
import networkx as nx
from tqdm import tqdm
tqdm.pandas()

import pickle
import gc
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

from rdkit import RDLogger
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdmolops
from rdkit import DataStructs
RDLogger.DisableLog('rdApp.*')

import lightgbm as lgb
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import sys
sys.path.insert(0, "/kaggle/working")

from canonicalize_psmiles.canonicalize import canonicalize as ext_canonicalize
from psmiles.helper      import in_ipynb
from psmiles.psmiles     import PolymerSmiles
from psmiles import PolymerSmiles as PS

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm.auto import tqdm
from gensim.models import Word2Vec
print("Done")

Done


# Data Preprocessing
---


In [3]:
targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

## Seting Up Training Data via NeurIPS Dataset and External Sources

In [4]:
train_df = pd.read_csv("/kaggle/input/neurips-open-polymer-prediction-2025/train.csv")

failed_smiles = []
def make_smile_canonical(smile):
    try:
        ## RDKit
        mol = Chem.MolFromSmiles(smile)
        canon_smile = Chem.MolToSmiles(mol, canonical=True)
        return canon_smile
        ## PSMILES
        # mol = PS(smile)
        # mol.canonicalize
        # return mol.psmiles
    except Exception as e:
        failed_smiles.append(smile)
        return np.nan

# Apply with progress bar
print("Canonicalising NeurIPS train.csv and supplements datasets....")
train_df['SMILES'] = train_df['SMILES'].progress_apply(make_smile_canonical)

## Training Supplement Data by NeurIPS
data_supp_1 = pd.read_csv("/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv")
data_supp_1 = data_supp_1.rename(columns={'TC_mean': 'Tc'})
data_supp_1['SMILES'] = data_supp_1['SMILES'].progress_apply(make_smile_canonical)

# data_supp_2 = pd.read_csv("/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset2.csv")
# data_supp_2['SMILES'] = data_supp_2['SMILES'].progress_apply(make_smile_canonical)

data_supp_3 = pd.read_csv("/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv")
data_supp_3['SMILES'] = data_supp_3['SMILES'].progress_apply(make_smile_canonical)

data_supp_4 = pd.read_csv("/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv")
data_supp_4['SMILES'] = data_supp_4['SMILES'].progress_apply(make_smile_canonical)

# Loading external datasets
## https://www.kaggle.com/datasets/minatoyukinaxlisa/tc-smiles
data_tc = pd.read_csv("/kaggle/input/augmented-polymer-data/Tc_SMILES.csv")
print("Canonicalising Kaggle Tc SMILES....")
data_tc['SMILES'] = data_tc['SMILES'].progress_apply(make_smile_canonical)

## https://www.sciencedirect.com/science/article/pii/S2590159123000377#ec0005
data_tg1 = pd.read_excel("/kaggle/input/augmented-polymer-data/tg_data_1.xlsx")
data_tg1 = data_tg1.rename(columns={'Tg [K]': 'Tg'})
data_tg1['Tg'] = data_tg1['Tg'] - 273.15
print("Canonicalising Science Direct Tg [K]....")
data_tg1['SMILES'] = data_tg1['SMILES'].progress_apply(make_smile_canonical)

## https://springernature.figshare.com/articles/dataset/dataset_with_glass_transition_temperature/24219958?file=42507037
data_tg2 = pd.read_csv("/kaggle/input/augmented-polymer-data/tg_data_2.csv", usecols=['SMILES', 'Tg'])
print("Canonicalising Springer Tg....")
data_tg2['SMILES'] = data_tg2['SMILES'].progress_apply(make_smile_canonical)

## https://github.com/Duke-MatSci/ChemProps
data_dnst = pd.read_excel("/kaggle/input/augmented-polymer-data/density_data.xlsx")
data_dnst = data_dnst.rename(columns={'density(g/cm3)': 'Density'})[['SMILES', 'Density']]
print("Canonicalising GitHub Density Dataset....")
data_dnst['SMILES'] = data_dnst['SMILES'].progress_apply(make_smile_canonical)
data_dnst = data_dnst[(data_dnst['SMILES'].notnull()) & (data_dnst['Density'].notnull()) & (data_dnst['Density'] != 'nylon')]
data_dnst['Density'] = data_dnst['Density'].astype('float64')
data_dnst['Density'] -= 0.118

# Failed SMILES parsing and canonicalize
if failed_smiles:
    print(f"\nFailed to parse {len(failed_smiles)} SMILES. Examples:")
    for i, smile in enumerate(set(failed_smiles)):
        print(f"{i+1}. {smile}")
else:
    print("\nAll SMILES parsed successfully!")

# Adding external data to the `train_df` or the NeurIPS training data
def add_extra_data(df_train, df_extra, target):
    n_samples_before = len(df_train[df_train[target].notnull()])

    df_extra['SMILES'] = df_extra['SMILES'].apply(lambda s: make_smile_canonical(s))
    df_extra = df_extra.groupby('SMILES', as_index=False)[target].mean()
    cross_smiles = set(df_extra['SMILES']) & set(df_train['SMILES'])
    unique_smiles_extra = set(df_extra['SMILES']) - set(df_train['SMILES'])

    # Make priority target value from competition's df
    for smile in df_train[df_train[target].notnull()]['SMILES'].tolist():
        if smile in cross_smiles:
            cross_smiles.remove(smile)

    # Imput missing values for competition's SMILES
    for smile in cross_smiles:
        df_train.loc[df_train['SMILES']==smile, target] = df_extra[df_extra['SMILES']==smile][target].values[0]
    
    df_train = pd.concat([df_train, df_extra[df_extra['SMILES'].isin(unique_smiles_extra)]], axis=0).reset_index(drop=True)

    n_samples_after = len(df_train[df_train[target].notnull()])
    print(f'\nFor target "{target}" added {n_samples_after-n_samples_before} new samples!')
    print(f'New unique SMILES: {len(unique_smiles_extra)}')
    return df_train

train_df = add_extra_data(train_df, data_supp_1, 'Tc')
# train_df = add_extra_data(train_df, data_supp_2, 'Density')
train_df = add_extra_data(train_df, data_supp_3, 'Tg')
train_df = add_extra_data(train_df, data_supp_4, 'FFV')
train_df = add_extra_data(train_df, data_tc, 'Tc')
train_df = add_extra_data(train_df, data_tg1, 'Tg')
train_df = add_extra_data(train_df, data_tg2, 'Tg')
train_df = add_extra_data(train_df, data_dnst, 'Density')

print('\n'*3, '--- SMILES for training ---', )
for t in targets:
    print(f'"{t}": {len(train_df[train_df[t].notnull()])}')

Canonicalising NeurIPS train.csv and supplements datasets....


100%|██████████| 7973/7973 [00:04<00:00, 1770.72it/s]
100%|██████████| 874/874 [00:00<00:00, 3274.39it/s]
100%|██████████| 46/46 [00:00<00:00, 1607.60it/s]
100%|██████████| 862/862 [00:00<00:00, 1473.99it/s]


Canonicalising Kaggle Tc SMILES....


100%|██████████| 874/874 [00:00<00:00, 3216.32it/s]


Canonicalising Science Direct Tg [K]....


100%|██████████| 501/501 [00:00<00:00, 3663.45it/s]


Canonicalising Springer Tg....


100%|██████████| 662/662 [00:00<00:00, 2145.17it/s]


Canonicalising GitHub Density Dataset....


100%|██████████| 787/787 [00:00<00:00, 5991.45it/s]


Failed to parse 6 SMILES. Examples:
1. *CN([R'])Cc2cc([R]c1cc(*)c(O)c(CN([R'])C*)c1)cc(*)c2O
2. *NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4
3. O=C=N[R1]N=C=O.O[R2]O.O[R3]O
4. *OC2OC(CO[R])C(OC1OC(CO[R])C(*)C(O[R])C1O[R])C(O[R])C2O[R]
5. *C(F)(F)CC(F)([R])C(*)(F)F
6. *O[Si](*)([R])[R]






For target "Tc" added 129 new samples!
New unique SMILES: 129

For target "Tg" added 46 new samples!
New unique SMILES: 46

For target "FFV" added 862 new samples!
New unique SMILES: 824

For target "Tc" added 0 new samples!
New unique SMILES: 0

For target "Tg" added 499 new samples!
New unique SMILES: 499

For target "Tg" added 105 new samples!
New unique SMILES: 90

For target "Density" added 634 new samples!
New unique SMILES: 519



 --- SMILES for training ---
"Tg": 1161
"FFV": 7892
"Tc": 866
"Density": 1247
"Rg": 614


## Generating Oligomers to Increase Target Data Points via Data Augmentation

In [5]:
error_creating_oligomer = []

def create_oligomer_with_smiles(monomer_smiles, chain_length, row):
    try:
        polymer = PS(monomer_smiles)
        oligomer = polymer.random_copolymer(polymer, ratio=0.5, units=chain_length)
        mol = oligomer.mol
        oligomer = Chem.MolToSmiles(mol, canonical=True)
        return oligomer
        
    except Exception as e:
        error_creating_oligomer.append([row['id'], chain_length, 1])
        return -1

def generate_oligomer_variations_smiles(monomer_smiles, row, num_variations):
    oligomers = []
    
    chain_lengths = list(range(1, num_variations + 1))  # 1-mer to 5-mer
    
    # Generate Oligomers
    for i in range(num_variations):
        chain_length = chain_lengths[i]
        
        try:
            # Use PSMILES to create oligomer
            oligomer_smiles = create_oligomer_with_smiles(monomer_smiles, chain_length, row) if (chain_length != 1) else monomer_smiles
            if (oligomer_smiles == -1):
                continue
                
            oligomer = {
                'monomer_id': row['id'],
                'oligomer_id': row['id'] if (chain_length == 1) else f"{row['id']}_oligo_{i+1:02d}",
                'chain_length': chain_length,
                'SMILES': oligomer_smiles,
                'monomer_SMILES': monomer_smiles,
                'Tg': row['Tg'],
                'FFV': row['FFV'],
                'Tc': row['Tc'],
                'Density': row['Density'],
                'Rg': row['Rg'],
            }
            oligomers.append(oligomer)
            
        except Exception as e:
            error_creating_oligomer.append([row['id'], chain_length, 2])
            continue
    
    return oligomers

def data_augmentation():
    print("SMILES-based Homo-Oligomer Generator")
    print("=" * 50)
    
    try:
        print(train_df.info())
        print("\n" + "="*80 + "\n")
        
        all_oligomers = []
        
        # Process each monomer
        for idx, row in tqdm(train_df.iterrows(), 
                        total=len(train_df), 
                        desc="Generating oligomers",
                        unit="monomer"):
            monomer_smiles = row['SMILES']
            
            # Generate oligomers using PSMILES
            oligomers = generate_oligomer_variations_smiles(monomer_smiles, row, 5)
            all_oligomers.extend(oligomers)

        print(f"Skipped {len(error_creating_oligomer)} SMILES monomers, due to error in Oligomers creation.")
        
        # Create comprehensive DataFrame
        oligomer_df = pd.DataFrame(all_oligomers)
        
        # Save results
        oligomer_df.to_csv('augmented_data.csv', index=False)
        print(f"Generated {len(all_oligomers)} oligomers saved to 'augmented_data.csv'")
        
        # Advanced analysis
        print("\nDetailed Analysis:")
        print(f"Total oligomers generated: {len(all_oligomers)}")
        
        # Chain length distribution
        print("\nChain Length Distribution:")
        chain_dist = oligomer_df['chain_length'].value_counts().sort_index()
        for length, count in chain_dist.items():
            print(f"  {length}-mer: {count} oligomers")
        
        return oligomer_df
        
    except Exception as e:
        print(f"Unexpected error: {e}")
        return None

def demo_psmiles_functionality():
    print("PSMILES Functionality Demo")
    print("-" * 30)
    
    # Sample monomer with attachment points
    sample_monomer = "*Oc1ccc(C=C)cc1*"
    
    try:
        polymer = PS(sample_monomer)
        print(f"Sample monomer: {sample_monomer}")
        
        # Generate oligomers of different lengths
        for length in [2, 5]:
            oligomer = polymer.random_copolymer(polymer, ratio=0.5, units=length)
            mol = oligomer.mol
            oligomer = Chem.MolToSmiles(mol, canonical=True)
            print(f"{length}-mer: {oligomer}")
            
    except Exception as e:
        print(f"Demo error: {e}")

# Initiate the Augmentation process
# demo_psmiles_functionality()
# print("\n")

# train_df = data_augmentation()
# print("\nAugmentation Completed")

# print('\n'*1, '--- SMILES for training ---', )
# for t in targets:
#     print(f'"{t}": {len(train_df[train_df[t].notnull()])}')

In [6]:
# Identify the duplicates in the 'SMILES' column
duplicates = train_df[train_df.duplicated(subset=['SMILES'], keep=False)]

print("Duplicate SMILES rows before dropping:")
temp = []
for i, row in duplicates.iterrows():
    temp.append([i, row['SMILES'], row['chain_length'], row['Tg'], row['FFV'], row['Tc'], row['Density'], row['Rg']])
temp = sorted(temp, key=lambda x: x[1])
for i in temp:
    print(i[1:])

Duplicate SMILES rows before dropping:


In [7]:
# Group by 'SMILES' and aggregate columns
df = train_df.groupby('SMILES', as_index=False).agg({
    'Tg': lambda x: x.mean(skipna=True) if x.notna().any() else np.nan,
    'FFV': lambda x: x.mean(skipna=True) if x.notna().any() else np.nan,
    'Tc': lambda x: x.mean(skipna=True) if x.notna().any() else np.nan,
    'Density': lambda x: x.mean(skipna=True) if x.notna().any() else np.nan,
    'Rg': lambda x: x.mean(skipna=True) if x.notna().any() else np.nan,
    **{
        col: 'first'
        for col in train_df.columns
        if col not in ['SMILES', 'Tg', 'FFV', 'Tc', 'Density', 'Rg']
    }
})

# Reorder columns to match original structure
train_df = df[train_df.columns]

# print("Train DF (no duplicates in SMILES, combined values):\n")
# print(train_df.info)

# Save results
# train_df.to_csv('augmented_data.csv', index=False)
# print(f"\nGenerated a total of {len(train_df)} unique oligomers of different monomers and saved to 'augmented_data.csv'")

In [8]:
# train_df.drop(['monomer_id', 'chain_length', 'monomer_SMILES'], axis=1, inplace=True)
# train_df

## Generating and Integrating Molecular Properties and Descriptors for Enhanced Predictive Modeling

In [9]:
useless_cols = [    
    "MaxPartialCharge",
    # Nan Data
    'BCUT2D_MWHI',
    'BCUT2D_MWLOW',
    'BCUT2D_CHGHI',
    'BCUT2D_CHGLO',
    'BCUT2D_LOGPHI',
    'BCUT2D_LOGPLOW',
    'BCUT2D_MRHI',
    'BCUT2D_MRLOW',

    # Constant data
    'NumRadicalElectrons',
    'SMR_VSA8',
    'SlogP_VSA9',
    'fr_barbitur',
    'fr_benzodiazepine',
    'fr_dihydropyridine',
    'fr_epoxide',
    'fr_isothiocyan',
    'fr_lactam',
    'fr_nitroso',
    'fr_prisulfonamd',
    'fr_thiocyan',

    # High correlated data >0.95
    'MaxEStateIndex',
    'HeavyAtomMolWt',
    'ExactMolWt',
    'NumValenceElectrons',
    'Chi0',
    'Chi0n',
    'Chi0v',
    'Chi1',
    'Chi1n',
    'Chi1v',
    'Chi2n',
    'Kappa1',
    'LabuteASA',
    'HeavyAtomCount',
    'MolMR',
    'Chi3n',
    'BertzCT',
    'Chi2v',
    'Chi4n',
    'HallKierAlpha',
    'Chi3v',
    'Chi4v',
    'MinAbsPartialCharge',
    'MinPartialCharge',
    'MaxAbsPartialCharge',
    'FpDensityMorgan2',
    'FpDensityMorgan3',
    'Phi',
    'Kappa3',
    'fr_nitrile',
    'SlogP_VSA6',
    'NumAromaticCarbocycles',
    'NumAromaticRings',
    'fr_benzene',
    'VSA_EState6',
    'NOCount',
    'fr_C_O',
    'fr_C_O_noCOO',
    'NumHDonors',
    'fr_amide',
    'fr_Nhpyrrole',
    'fr_phenol',
    'fr_phenol_noOrthoHbond',
    'fr_COO2',
    'fr_halogen',
    'fr_diazo',
    'fr_nitro_arom',
    'fr_phos_ester'
]

def compute_all_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * len(desc_names)
    return [desc[1](mol) for desc in Descriptors.descList if desc[0] not in useless_cols]

def compute_graph_features(smiles, graph_feats):
    mol = Chem.MolFromSmiles(smiles)
    adj = rdmolops.GetAdjacencyMatrix(mol)
    G = nx.from_numpy_array(adj)

    graph_feats['graph_diameter'].append(nx.diameter(G) if nx.is_connected(G) else 0)
    graph_feats['avg_shortest_path'].append(nx.average_shortest_path_length(G) if nx.is_connected(G) else 0)
    graph_feats['num_cycles'].append(len(list(nx.cycle_basis(G))))

def rdkit_morgan_fingerprint(smiles_list, radius=2, nBits=150):
    embeddings = []
    for smiles in tqdm(smiles_list, desc="🧬 Morgan FP Embeddings"):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            embeddings.append(np.array([None] * nBits))
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
        arr = np.zeros((nBits,), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp, arr)
        embeddings.append(arr)
    return embeddings, [f'embedding_{i}' for i in range(nBits)]

failed = 0
def compute_embeddings(smiles_list, embedding_type='chemberta'):
    if embedding_type == 'chemberta':
        tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/chembert-model-smiles-transformer/tensorflow2/default/2/ChemBERTa-77M-MTR/ChemBERTa-77M-MTR")
        model = AutoModel.from_pretrained("/kaggle/input/chembert-model-smiles-transformer/tensorflow2/default/2/ChemBERTa-77M-MTR/ChemBERTa-77M-MTR")
        embeddings = []
        for smiles in tqdm(smiles_list, desc="🧬 ChemBERTa Embeddings"):
            try:
                inputs = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=512)
                with torch.no_grad():
                    outputs = model(**inputs)
                emb = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
                embeddings.append(emb)
            except Exception as e:
                print(f"Error computing ChemBERTa embedding for SMILES '{smiles}': {e}")
                embeddings.append(np.array([None] * 384))
        return embeddings, [f'embedding_{i}' for i in range(384)]
    elif embedding_type == 'morgan':
        return rdkit_morgan_fingerprint(smiles_list)
    elif embedding_type == 'polybert':
        tokenizer = AutoTokenizer.from_pretrained("kuelumbus/polyBERT")  # Replace with PolyBERT model if available
        model = AutoModel.from_pretrained("kuelumbus/polyBERT")  # Replace with PolyBERT model if available
        embeddings = []
        for smiles in tqdm(smiles_list, desc="🧬 PolyBERT Embeddings"):
            try:
                inputs = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=512)
                with torch.no_grad():
                    outputs = model(**inputs)
                emb = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
                embeddings.append(emb)
            except Exception as e:
                print(f"Error computing PolyBERT embedding for SMILES '{smiles}': {e}")
                embeddings.append(np.array([None] * 600))  # Adjust size based on model output
        return embeddings, [f'embedding_{i}' for i in range(600)]
    else:
        raise ValueError("Unsupported embedding type. Choose 'chemberta' or 'mol2vec'.")

def preprocessing(df):
    desc_names = [desc[0] for desc in Descriptors.descList if desc[0] not in useless_cols]
    print(f"Computing Different Descriptors of all {len(train_df)} Polymer SMILES in `train_df`: ")
    descriptors = [compute_all_descriptors(smi) for smi in tqdm(df['SMILES'].to_list(), desc="🔬 Descriptors")]

    print(f"\nComputing the Graph Features of all {len(train_df)} Polymer SMILES in `train_df`: ")
    graph_feats = {'graph_diameter': [], 'avg_shortest_path': [], 'num_cycles': []}
    for smile in tqdm(df['SMILES'], desc="🌐 Graph features"):
         compute_graph_features(smile, graph_feats)

    # embedding_type = "polybert"
    # print(f"\nComputing {embedding_type.capitalize()} Embeddings for all {len(df)} Polymer SMILES in `df`: ")
    # embeddings, embedding_cols = compute_embeddings(df['SMILES'].to_list(), embedding_type)
        
    result = pd.concat(
        [
            pd.DataFrame(descriptors, columns=desc_names),
            pd.DataFrame(graph_feats),
            # pd.DataFrame(embeddings, columns=embedding_cols)
        ],
        axis=1
    )

    result = result.replace([-np.inf, np.inf], np.nan)
    return result

In [10]:
# train_df = pd.read_csv("/kaggle/input/augmented-polymer-data/augmented_data_with_descriptors_v3_5.csv")
# train_df = train_df.rename(columns={'oligomer_id': 'id'})
# train_df = train_df.drop(['MaxPartialCharge'], axis=1)
train_df = pd.concat([train_df, preprocessing(train_df)], axis=1)
train_df['Ipc'] = np.log10(train_df['Ipc'])
train_df = train_df.replace([-np.inf, np.inf], np.nan)
print(f"",sum(train_df[train_df.columns[7:]].isna().sum()))
train_df = train_df.dropna(subset=train_df.columns[7:])

# Find constant columns for each target
all_features = train_df.columns[7:].tolist()
features = {}
for target in targets:
    const_descs = []
    for col in train_df.columns.drop(targets):
        if train_df[train_df[target].notnull()][col].nunique() == 1:
            const_descs.append(col)
    features[target] = [f for f in all_features if f not in const_descs] 

for target in features:
    print(f"Features for {target}:")
    print(features[target])
    print()
print("Done")

Computing Different Descriptors of all 10080 Polymer SMILES in `train_df`: 


🔬 Descriptors:   0%|          | 0/10080 [00:00<?, ?it/s]


Computing the Graph Features of all 10080 Polymer SMILES in `train_df`: 


🌐 Graph features:   0%|          | 0/10080 [00:00<?, ?it/s]

 1
Features for Tg:
['MaxAbsEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MolWt', 'FpDensityMorgan1', 'AvgIpc', 'BalabanJ', 'Ipc', 'Kappa2', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA7', 'SlogP_VSA8', 'TPSA', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState7', 'VSA_EState8', 'VSA_EState9', 'FractionCSP3', 'NHOHCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'Nu

In [11]:
test_df = pd.read_csv("/kaggle/input/neurips-open-polymer-prediction-2025/test.csv")

# Apply with progress bar
print("Canonicalising NeurIPS test.csv....")
test_df['SMILES'] = test_df['SMILES'].progress_apply(make_smile_canonical)
test_df = pd.concat([test_df, preprocessing(test_df)], axis=1)
test_df['Ipc']=np.log10(test_df['Ipc'])
ID = test_df['id'].copy()
SMILES = test_df['SMILES'].copy()
test_df = test_df.drop(columns=['id', 'SMILES'])
test_df = test_df.replace([-np.inf, np.inf], np.nan)
print(f"",sum(test_df[test_df.isna()].sum()))
test_df = test_df.dropna()
test_df

Canonicalising NeurIPS test.csv....


100%|██████████| 3/3 [00:00<00:00, 924.67it/s]

Computing Different Descriptors of all 10079 Polymer SMILES in `train_df`: 





🔬 Descriptors:   0%|          | 0/3 [00:00<?, ?it/s]


Computing the Graph Features of all 10079 Polymer SMILES in `train_df`: 


🌐 Graph features:   0%|          | 0/3 [00:00<?, ?it/s]

 0.0


Unnamed: 0,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,FpDensityMorgan1,AvgIpc,BalabanJ,Ipc,Kappa2,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA5,SlogP_VSA7,SlogP_VSA8,TPSA,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState1,VSA_EState10,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,NHOHCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAmideBonds,NumAromaticHeterocycles,NumAtomStereoCenters,NumBridgeheadAtoms,NumHAcceptors,NumHeteroatoms,NumHeterocycles,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,NumSpiroAtoms,NumUnspecifiedAtomStereoCenters,RingCount,MolLogP,fr_Al_COO,fr_Al_OH,fr_Al_OH_noTert,fr_ArN,fr_Ar_COO,fr_Ar_N,fr_Ar_NH,fr_Ar_OH,fr_COO,fr_C_S,fr_HOCCN,fr_Imine,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_Ndealkylation1,fr_Ndealkylation2,fr_SH,fr_aldehyde,fr_alkyl_carbamate,fr_alkyl_halide,fr_allylic_oxid,fr_amidine,fr_aniline,fr_aryl_methyl,fr_azide,fr_azo,fr_bicyclic,fr_ester,fr_ether,fr_furan,fr_guanido,fr_hdrzine,fr_hdrzone,fr_imidazole,fr_imide,fr_isocyan,fr_ketone,fr_ketone_Topliss,fr_lactone,fr_methoxy,fr_morpholine,fr_nitro,fr_nitro_arom_nonortho,fr_oxazole,fr_oxime,fr_para_hydroxylation,fr_phos_acid,fr_piperdine,fr_piperzine,fr_priamide,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea,graph_diameter,avg_shortest_path,num_cycles
0,14.296609,0.08466,-5.63114,0.133192,13.384615,540.463,0.564103,2.929406,1.499214,8.923694,9.93719,0.0,0.0,0.0,0.0,0.0,219.216066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.185881,18.495568,0.0,0.0,10.202815,17.767588,0.0,119.315679,17.248535,15.909757,26.34249,17.248535,0.0,24.781798,5.41499,0.0,22.253806,0.0,0.0,43.18,28.89449,26.34249,0.0,5.749512,11.312963,53.777533,18.347335,30.480069,24.265468,10.202815,9.84339,90.894047,0.0,0.0,13.34964,-4.418505,1.260589,-8.142157,0.0,0.0,0.103448,0,0,0,0,0,0,0,0,4,12,0,8,0,0,0,0,0,4,7.3603,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,8.673171,4
1,13.208391,0.079396,-0.162743,0.195542,11.74359,510.589,0.589744,3.012648,1.260496,9.25151,9.730437,0.0,0.0,0.0,0.0,0.0,228.284372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.432465,17.632857,0.0,0.0,0.0,19.262465,0.0,154.70805,17.248535,15.909757,0.0,17.248535,0.0,11.56649,5.41499,0.0,56.817258,0.0,0.0,52.6,0.0,9.589074,0.0,16.98148,28.003318,11.499024,17.19327,48.530937,48.530937,38.112943,9.84339,6.052027,0.0,26.157368,5.538316,4.403457,2.093864,0.0,4.413789,0.0,0.085714,0,0,0,0,0,0,0,0,4,6,0,9,0,0,0,0,0,5,7.2845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25,8.836585,5
2,13.556487,0.203889,-0.654083,0.137097,14.454545,586.644,0.568182,3.447207,1.009918,10.46159,11.448546,4.736863,0.0,0.0,0.0,0.0,244.99538,0.0,0.0,0.0,0.0,0.0,6.066367,0.0,0.0,28.651875,35.382472,0.0,5.316789,0.0,38.52493,18.113674,107.182945,22.625927,20.440003,5.687386,11.499024,0.0,42.159271,0.0,0.0,79.956884,0.0,11.126903,93.22,11.814359,19.178149,0.0,17.377811,45.861038,50.881324,16.283065,54.597304,12.132734,18.199101,9.473726,11.392345,0.0,53.510308,1.333487,2.969959,0.224556,6.47942,1.314182,0.0,0.222222,0,0,2,2,4,0,0,0,6,10,2,13,0,0,0,0,0,6,6.1875,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,26,9.912077,6


In [12]:
print('--- SMILES for training ---', )
for t in targets:
    print(f'{t}: {len(train_df[train_df[t].notnull()])}', end="\n")
    print(f'Min {t}: {min(train_df[train_df[t].notnull()][t])}', end=", ")
    print(f'Max {t}: {max(train_df[train_df[t].notnull()][t])}', end="\n")

--- SMILES for training ---
Tg: 1160
Min Tg: -255.14999999999998, Max Tg: 472.25
FFV: 7892
Min FFV: 0.2269924, Max FFV: 0.77709707
Tc: 866
Min Tc: 0.0465, Max Tc: 1.59
Density: 1247
Min Density: 0.748691234, Max Density: 1.9820000000000002
Rg: 614
Min Rg: 9.7283551, Max Rg: 34.672905605


# Model Implementation
---


In [13]:
def model_seed_1(train_d, test_d, model, target, submission=False):
    print(f"Processing {target} Seed 1:", end = " ")
    X = train_d.drop([target, 'id'], axis=1)
    y = train_d[target].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=10
    )

    Model = model(random_state=21)
    if submission == False:
        Model.fit(X_train, y_train)
        y_pred = Model.predict(X_test)
        preds = Model.predict(X)
        print("Done")
        return mean_absolute_error(y_pred, y_test), preds
    if submission == True:
        Model.fit(X, y)
        submission = Model.predict(test_d)
        print("Done")
        return submission

def model_seed_2(train_d, test_d, model, target, submission=False):
    print(f"Processing {target} Seed 2:", end = " ")
    X = train_d.drop([target, 'id'], axis=1)
    y = train_d[target].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=10
    )

    Model = model(random_state=42)
    if submission == False:
        Model.fit(X_train, y_train)
        y_pred = Model.predict(X_test)
        preds = Model.predict(X)
        print("Done")
        return mean_absolute_error(y_pred, y_test), preds
    if submission == True:
        Model.fit(X, y)
        submission = Model.predict(test_d)
        print("Done")
        return submission

def model_seed_3(train_d, test_d, model, target, submission=False):
    print(f"Processing {target} Seed 3:", end = " ")
    X = train_d.drop([target, 'id'], axis=1)
    y = train_d[target].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=10
    )

    Model = model(random_state=100)
    if submission == False:
        Model.fit(X_train, y_train)
        y_pred = Model.predict(X_test)
        preds = Model.predict(X)
        print("Done")
        return mean_absolute_error(y_pred, y_test), preds
    if submission == True:
        Model.fit(X, y)
        submission = Model.predict(test_d)
        print("Done")
        return submission

def model_seed_4(train_d, test_d, model, target, submission=False):
    print(f"Processing {target} Seed 4:", end = " ")
    X = train_d.drop([target, 'id'], axis=1)
    y = train_d[target].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=10
    )

    Model = model(random_state=456)
    if submission == False:
        Model.fit(X_train, y_train)
        y_pred = Model.predict(X_test)
        preds = Model.predict(X)
        print("Done")
        return mean_absolute_error(y_pred, y_test), preds
    if submission == True:
        Model.fit(X, y)
        submission = Model.predict(test_d)
        print("Done")
        return submission

def model_seed_5(train_d, test_d, model, target, submission=False):
    print(f"Processing {target} Seed 5:", end = " ")
    X = train_d.drop([target, 'id'], axis=1)
    y = train_d[target].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=10
    )

    Model = model(random_state=666)
    if submission == False:
        Model.fit(X_train, y_train)
        y_pred = Model.predict(X_test)
        preds = Model.predict(X)
        print("Done")
        return mean_absolute_error(y_pred, y_test), preds
    if submission == True:
        Model.fit(X, y)
        submission = Model.predict(test_d)
        print("Done")
        return submission

In [14]:
MINMAX_DICT = {}
for prop in targets:
    vals = train_df.loc[train_df[prop].notnull(), prop]
    MINMAX_DICT[prop] = [vals.min(), vals.max()]

NULL_FOR_SUBMISSION = -9999

def compute_and_plot_maes(prop_dfs, targets, MINMAX_DICT, NULL_FOR_SUBMISSION=-9999):
    per_maes, counts = {}, {}
    for prop in targets:
        df = prop_dfs[prop]
        y_true = df[prop].values
        y_pred = df[f"{prop}_pred"].values
        error = np.abs(y_true - y_pred) / (MINMAX_DICT[prop][1] - MINMAX_DICT[prop][0])
        per_maes[prop] = np.mean(error)
        counts[prop] = len(y_true)
        
        line_min = min(y_true.min(), y_pred.min())
        line_max = max(y_true.max(), y_pred.max())
        sns.scatterplot(x=y_true, y=y_pred, alpha=0.5)
        plt.plot([line_min, line_max], [line_min, line_max],
                 color='red', linestyle='dashed', linewidth=2)
        plt.title(f"{prop}: scaled MAE = {per_maes[prop]:.5f}")
        plt.xlabel("true"); plt.ylabel("predicted")
        plt.show()
    
    cnt_arr = np.array([counts[prop] for prop in targets])
    weights = np.sqrt(1 / cnt_arr)
    weights = weights / weights.sum() * len(targets)
    overall = np.average(list(per_maes.values()), weights=weights)
    
    print("=== Per-property scaled MAEs ===")
    for prop in targets:
        print(f"{prop}: {per_maes[prop]:.5f}   (n={counts[prop]})")
    print(f"\nOverall weighted scaled MAE: {overall:.5f}")
    
    return per_maes, overall

## Training Model for all 5 Targets

In [15]:
tg = train_df[train_df['Tg'].notnull()].drop(columns=['SMILES', 'FFV', 'Tc', 'Density', 'Rg'])
ffv = train_df[train_df['FFV'].notnull()].drop(columns=['SMILES', 'Tg', 'Tc', 'Density', 'Rg'])
tc = train_df[train_df['Tc'].notnull()].drop(columns=['SMILES', 'Tg', 'FFV', 'Density', 'Rg'])
density = train_df[train_df['Density'].notnull()].drop(columns=['SMILES', 'Tg', 'FFV', 'Tc', 'Rg'])
rg = train_df[train_df['Rg'].notnull()].drop(columns=['SMILES', 'Tg', 'FFV', 'Tc', 'Density'])

In [16]:
def run_model_seeds(train_d, test_d, model, target, submission=False):
    seed_funcs = [model_seed_1, model_seed_2, model_seed_3, model_seed_4, model_seed_5]

    if submission:
        predictions = []
        for seed_fn in tqdm(seed_funcs, desc=f"Generating submission for {target}"):
            preds = seed_fn(train_d, test_d, model, target, submission=True)
            predictions.append(preds)
        return sum(predictions) / len(predictions)
    
    else:
        maes = []
        preds_list = []
        for seed_fn in tqdm(seed_funcs, desc=f"Evaluating {target}"):
            mae, preds = seed_fn(train_d, test_d, model, target, submission=False)
            maes.append(mae)
            preds_list.append(preds)
        avg_preds = sum(preds_list) / len(preds_list)
        avg_mae = sum(maes) / len(maes)
        return avg_mae, avg_preds

In [17]:
# tg_mae, tg_preds = run_model_seeds(tg, test_df, ExtraTreesRegressor, 'Tg', submission=False)
# tg['Tg_pred'] = tg_preds
# ffv_mae, ffv_preds = run_model_seeds(ffv, test_df, ExtraTreesRegressor, 'FFV', submission=False)
# ffv['FFV_pred'] = ffv_preds
# tc_mae, tc_preds = run_model_seeds(tc, test_df, ExtraTreesRegressor, 'Tc', submission=False)
# tc['Tc_pred'] = tc_preds
# density_mae, density_preds = run_model_seeds(density, test_df, ExtraTreesRegressor, 'Density', submission=False)
# density['Density_pred'] = density_preds
# rg_mae, rg_preds = run_model_seeds(rg, test_df, ExtraTreesRegressor, 'Rg', submission=False)
# rg['Rg_pred'] = rg_preds

In [18]:
# prop_dfs = {
#     "Tg": tg,
#     "FFV": ffv,
#     "Tc": tc,
#     "Density": density,
#     "Rg": rg
# }

# per_maes, overall = compute_and_plot_maes(
#     prop_dfs, targets,
#     MINMAX_DICT=MINMAX_DICT,
#     NULL_FOR_SUBMISSION=NULL_FOR_SUBMISSION
# )

In [19]:
# 
tg = train_df[train_df['Tg'].notnull()].drop(columns=['SMILES', 'FFV', 'Tc', 'Density', 'Rg'])
ffv = train_df[train_df['FFV'].notnull()].drop(columns=['SMILES', 'Tg', 'Tc', 'Density', 'Rg'])
tc = train_df[train_df['Tc'].notnull()].drop(columns=['SMILES', 'Tg', 'FFV', 'Density', 'Rg'])
density = train_df[train_df['Density'].notnull()].drop(columns=['SMILES', 'Tg', 'FFV', 'Tc', 'Rg'])
rg = train_df[train_df['Rg'].notnull()].drop(columns=['SMILES', 'Tg', 'FFV', 'Tc', 'Density'])

# 
tg_result = run_model_seeds(tg, test_df, ExtraTreesRegressor, 'Tg', submission=True)
ffv_result = run_model_seeds(ffv, test_df, ExtraTreesRegressor, 'FFV', submission=True)
tc_result = run_model_seeds(tc, test_df, ExtraTreesRegressor, 'Tc', submission=True)
density_result = run_model_seeds(density, test_df, ExtraTreesRegressor, 'Density', submission=True)
rg_result = run_model_seeds(rg, test_df, ExtraTreesRegressor, 'Rg', submission=True)

Generating submission for Tg:   0%|          | 0/5 [00:00<?, ?it/s]

Processing Tg Seed 1: Done
Processing Tg Seed 2: Done
Processing Tg Seed 3: Done
Processing Tg Seed 4: Done
Processing Tg Seed 5: Done


Generating submission for FFV:   0%|          | 0/5 [00:00<?, ?it/s]

Processing FFV Seed 1: Done
Processing FFV Seed 2: Done
Processing FFV Seed 3: Done
Processing FFV Seed 4: Done
Processing FFV Seed 5: Done


Generating submission for Tc:   0%|          | 0/5 [00:00<?, ?it/s]

Processing Tc Seed 1: Done
Processing Tc Seed 2: Done
Processing Tc Seed 3: Done
Processing Tc Seed 4: Done
Processing Tc Seed 5: Done


Generating submission for Density:   0%|          | 0/5 [00:00<?, ?it/s]

Processing Density Seed 1: Done
Processing Density Seed 2: Done
Processing Density Seed 3: Done
Processing Density Seed 4: Done
Processing Density Seed 5: Done


Generating submission for Rg:   0%|          | 0/5 [00:00<?, ?it/s]

Processing Rg Seed 1: Done
Processing Rg Seed 2: Done
Processing Rg Seed 3: Done
Processing Rg Seed 4: Done
Processing Rg Seed 5: Done


In [20]:
sub = {
    'id': ID,
    'SMILES': SMILES,
    'Tg': tg_result,
    'FFV': ffv_result,
    'Tc': tc_result,
    'Density': density_result,
    'Rg': rg_result
}
test_df = pd.DataFrame(sub)
test_df

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,1109053969,*Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)...,158.488945,0.37283,0.181654,1.139763,20.714171
1,1422188626,*Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c...,166.650607,0.374475,0.235625,1.110505,19.840821
2,2032016830,*c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6...,99.196068,0.350464,0.267246,1.086206,20.802427


In [21]:
# for t in targets:
#     for s in train_df[train_df[t].notnull()]['SMILES']:
#         if s in test_df['SMILES'].tolist():
#             test_df.loc[test_df['SMILES']==s, t] = train_df[train_df['SMILES']==s][t].values[0]
test_df[['id'] + targets].to_csv('submission.csv', index=False)

In [22]:
ans = pd.read_csv("/kaggle/working/submission.csv")
ans

Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,158.488945,0.37283,0.181654,1.139763,20.714171
1,1422188626,166.650607,0.374475,0.235625,1.110505,19.840821
2,2032016830,99.196068,0.350464,0.267246,1.086206,20.802427


---