This notebook is based on my previous notebook [Baseline_For_Beginners](https://www.kaggle.com/code/adamlogman/baseline-for-beginners),and [Dmitry Uarov](https://www.kaggle.com/dmitryuarov) 's [NeurIPS | Baseline + External data](https://www.kaggle.com/code/dmitryuarov/neurips-baseline-external-data) notebook with modifications to the model

# Import Dependencies 

In [1]:
!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3


In [2]:
from collections import OrderedDict
import pandas as pd; pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor,ExtraTreesRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import networkx as nx
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdmolops
from rdkit import Chem
import warnings; warnings.filterwarnings("ignore")

class CFG:
    TARGETS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
    SEED = 42
    FOLDS = 5

useless_cols = [   
    'MaxPartialCharge', 
    # Nan data
    'BCUT2D_MWHI',
    'BCUT2D_MWLOW',
    'BCUT2D_CHGHI',
    'BCUT2D_CHGLO',
    'BCUT2D_LOGPHI',
    'BCUT2D_LOGPLOW',
    'BCUT2D_MRHI',
    'BCUT2D_MRLOW',

    # Constant data
    'NumRadicalElectrons',
    'SMR_VSA8',
    'SlogP_VSA9',
    'fr_barbitur',
    'fr_benzodiazepine',
    'fr_dihydropyridine',
    'fr_epoxide',
    'fr_isothiocyan',
    'fr_lactam',
    'fr_nitroso',
    'fr_prisulfonamd',
    'fr_thiocyan',

    # High correlated data >0.95
    'MaxEStateIndex',
    'HeavyAtomMolWt',
    'ExactMolWt',
    'NumValenceElectrons',
    'Chi0',
    'Chi0n',
    'Chi0v',
    'Chi1',
    'Chi1n',
    'Chi1v',
    'Chi2n',
    'Kappa1',
    'LabuteASA',
    'HeavyAtomCount',
    'MolMR',
    'Chi3n',
    'BertzCT',
    'Chi2v',
    'Chi4n',
    'HallKierAlpha',
    'Chi3v',
    'Chi4v',
    'MinAbsPartialCharge',
    'MinPartialCharge',
    'MaxAbsPartialCharge',
    'FpDensityMorgan2',
    'FpDensityMorgan3',
    'Phi',
    'Kappa3',
    'fr_nitrile',
    'SlogP_VSA6',
    'NumAromaticCarbocycles',
    'NumAromaticRings',
    'fr_benzene',
    'VSA_EState6',
    'NOCount',
    'fr_C_O',
    'fr_C_O_noCOO',
    'NumHDonors',
    'fr_amide',
    'fr_Nhpyrrole',
    'fr_phenol',
    'fr_phenol_noOrthoHbond',
    'fr_COO2',
    'fr_halogen',
    'fr_diazo',
    'fr_nitro_arom',
    'fr_phos_ester'
]


In [3]:
def make_smile_canonical(smile): # To avoid duplicates, for example: canonical '*C=C(*)C' == '*C(=C*)C'
    try:
        mol = Chem.MolFromSmiles(smile)
        canon_smile = Chem.MolToSmiles(mol, canonical=True)
        return canon_smile
    except:
        return np.nan

def compute_all_descriptors(smiles, desc_names):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * len(desc_names)
    return [desc[1](mol) for desc in Descriptors.descList if desc[0] not in useless_cols]

def compute_graph_features(smiles, graph_feats):
    mol = Chem.MolFromSmiles(smiles)
    adj = rdmolops.GetAdjacencyMatrix(mol)
    G = nx.from_numpy_array(adj)
    graph_feats['graph_diameter'].append(nx.diameter(G) if nx.is_connected(G) else 0)
    graph_feats['avg_shortest_path'].append(nx.average_shortest_path_length(G) if nx.is_connected(G) else 0)
    graph_feats['num_cycles'].append(len(list(nx.cycle_basis(G))))

# Read Files
### Main Files
train = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
test = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')
ss = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv')
ID = test['id'].copy()

### Extra Files
tc_smiles = pd.read_csv('/kaggle/input/tc-smiles/Tc_SMILES.csv')
tg_smiles = pd.read_csv('/kaggle/input/smiles-extra-data/JCIM_sup_bigsmiles.csv')
ktg_smiles = pd.read_excel('/kaggle/input/smiles-extra-data/data_tg3.xlsx')
de_smiles = pd.read_excel('/kaggle/input/smiles-extra-data/data_dnst1.xlsx')

# Preprocessing
train['SMILES'] = train['SMILES'].apply(lambda s: make_smile_canonical(s))
test['SMILES'] = test['SMILES'].apply(lambda s: make_smile_canonical(s))

ktg_smiles.rename(columns={'Tg [K]': 'Tg'}, inplace=True)
tg_smiles.rename(columns={'Tg (C)': 'Tg'}, inplace=True)
tc_smiles.rename(columns={'TC_mean': 'Tc'}, inplace=True)
de_smiles.rename(columns={'density(g/cm3)': 'Density'}, inplace=True)

de_smiles['SMILES'] = de_smiles['SMILES'].apply(lambda s: make_smile_canonical(s))
de_smiles = de_smiles[(de_smiles['SMILES'].notnull())&(de_smiles['Density'].notnull())&(de_smiles['Density'] != 'nylon')]
de_smiles['Density'] = de_smiles['Density'].astype('float64')
de_smiles['Density'] -= 0.118
ktg_smiles['Tg'] = ktg_smiles['Tg'] - 273.15

def preprocessing(df):
    desc_names = [desc[0] for desc in Descriptors.descList if desc[0] not in useless_cols]
    descriptors = [compute_all_descriptors(smi, desc_names) for smi in df['SMILES'].to_list()]
    graph_feats = {'graph_diameter': [], 'avg_shortest_path': [], 'num_cycles': []}
    for smile in df['SMILES']:
        compute_graph_features(smile, graph_feats)
    result = pd.concat(
        [
            pd.DataFrame(descriptors, columns=desc_names),
            pd.DataFrame(graph_feats)
        ],
        axis=1
    )
    result = result.replace([-np.inf, np.inf], np.nan)
    return result

# Feature Extraction 
def add_extra_data(df_train, df_extra, target):
    n_samples_before = len(df_train[df_train[target].notnull()])
    df_extra['SMILES'] = df_extra['SMILES'].apply(lambda s: make_smile_canonical(s))
    df_extra = df_extra.groupby('SMILES', as_index=False)[target].mean()
    cross_smiles = set(df_extra['SMILES']) & set(df_train['SMILES'])
    unique_smiles_extra = set(df_extra['SMILES']) - set(df_train['SMILES'])
    # Make priority target value from competition's df
    for smile in df_train[df_train[target].notnull()]['SMILES'].tolist():
        if smile in cross_smiles:
            cross_smiles.remove(smile)
    # Imput missing values for competition's SMILES
    for smile in cross_smiles:
        df_train.loc[df_train['SMILES']==smile, target] = df_extra[df_extra['SMILES']==smile][target].values[0]
    df_train = pd.concat([df_train, df_extra[df_extra['SMILES'].isin(unique_smiles_extra)]], axis=0).reset_index(drop=True)
    n_samples_after = len(df_train[df_train[target].notnull()])
    print(f'\nFor target "{target}" added {n_samples_after-n_samples_before} new samples!')
    print(f'New unique SMILES: {len(unique_smiles_extra)}')
    return df_train

train = add_extra_data(train, tc_smiles, 'Tc')
train = add_extra_data(train, tg_smiles, 'Tg')
train = add_extra_data(train, ktg_smiles, 'Tg')
train = add_extra_data(train, de_smiles, 'Density')

train = pd.concat([train, preprocessing(train)], axis=1)
test = pd.concat([test, preprocessing(test)], axis=1)

# Find constant columns for each target
all_features = train.columns[7:].tolist()
features = {}
for target in CFG.TARGETS:
    const_descs = []
    for col in train.columns.drop(CFG.TARGETS):
        if train[train[target].notnull()][col].nunique() == 1:
            const_descs.append(col)
    features[target] = [f for f in all_features if f not in const_descs]

def fill_ipc(data):   
    print(data.shape)
    data['Ipc'] = np.log10(data['Ipc'])  
    for n in data.columns[7:]:
        data[n] = data[n].replace(-np.inf, np.nan)
        data[n] = data[n].replace(np.inf, np.nan)    
        data[n].fillna(data[n].mean())
    return data

train = fill_ipc(train)
test = fill_ipc(test)


[14:18:18] SMILES Parse Error: syntax error while parsing: *O[Si](*)([R])[R]
[14:18:18] SMILES Parse Error: check for mistakes around position 12:
[14:18:18] *O[Si](*)([R])[R]
[14:18:18] ~~~~~~~~~~~^
[14:18:18] SMILES Parse Error: Failed parsing SMILES '*O[Si](*)([R])[R]' for input: '*O[Si](*)([R])[R]'
[14:18:18] SMILES Parse Error: syntax error while parsing: *NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4
[14:18:18] SMILES Parse Error: check for mistakes around position 28:
[14:18:18] c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=
[14:18:18] ~~~~~~~~~~~~~~~~~~~~^
[14:18:18] SMILES Parse Error: Failed parsing SMILES '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4' for input: '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4'
[14:18:18] SMILES Parse Error: syntax error while parsing: O=C=N[R1]N=C=O.O[R2]O.O[R3]O
[14:18:18] SMILES Parse Error: check for mistakes around position 7:
[14:18:18] O=C=N[R1]N=C=O.O[R2]O.O[R3]O
[14:18:18] ~~~~~~^
[14:18:18] SMILES Parse Error: F


For target "Tc" added 129 new samples!
New unique SMILES: 129

For target "Tg" added 151 new samples!
New unique SMILES: 136

For target "Tg" added 499 new samples!
New unique SMILES: 499

For target "Density" added 634 new samples!
New unique SMILES: 524
(9261, 158)
(3, 153)


In [4]:
trg_models = OrderedDict({
    'Tg': ExtraTreesRegressor, 
    'FFV': ExtraTreesRegressor, 
    'Tc': CatBoostRegressor, 
    'Density': ExtraTreesRegressor, 
    'Rg': ExtraTreesRegressor,
})

# Data Preparation For Model Training
# We'll separate train to be one model for each target variable.
# We will drop the rows with missing values related to that target after separation.
#This is important , dropping them beforehand would result Null for all data.
trn_targets = {}
for target in trg_models.keys():
    trn_targets[target] = train[['SMILES', target]].copy().dropna()

train = train.drop(['id', 'Tg', 'FFV', 'Tc', 'Density', 'Rg'], axis=1)
test = test.drop(['id', 'SMILES'], axis=1)

for target, data in trn_targets.items():
    trn_targets[target] = data.merge(train, on='SMILES', how='left')

for target, data in trn_targets.items():
    trn_targets[target] = data.drop('SMILES', axis=1).dropna()

# Model. Let’s define a reusable function to train and evaluate our machine learning model.
def model(train_d, test_d, model, target, submission=False):
    # We divide the data into training and validation sets for model evaluation
    X = train_d.drop(target, axis=1)
    y = train_d[target].copy()
    Model = model()
    if submission:
        Model.fit(X, y)
        submission = Model.predict(test_d)
        return submission
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
        Model.fit(X_train, y_train)
        y_pred = Model.predict(X_test)
        return mean_absolute_error(y_pred, y_test)         # We assess our model performance using MAE metric

# Model Evaluation
train_mae = OrderedDict()
for target, data in trn_targets.items():
    train_mae[target] = model(data, test, trg_models[target], target, submission=False)
display(train_mae)

# Finally, we use the model to predict on the test set and prepare the submission file.
sub = OrderedDict({'id': ID})
for target, data in trn_targets.items():
    sub[target] = model(data, test, trg_models[target], target, submission=True)

# Submission
submission = pd.DataFrame(sub)
submission.to_csv('submission.csv', index=False)
display(submission)


Learning rate set to 0.038629
0:	learn: 0.1023207	total: 60.3ms	remaining: 1m
1:	learn: 0.1005159	total: 65.3ms	remaining: 32.6s
2:	learn: 0.0986318	total: 70.2ms	remaining: 23.3s
3:	learn: 0.0970325	total: 75ms	remaining: 18.7s
4:	learn: 0.0953473	total: 80.2ms	remaining: 16s
5:	learn: 0.0937419	total: 85.7ms	remaining: 14.2s
6:	learn: 0.0922845	total: 90.7ms	remaining: 12.9s
7:	learn: 0.0906400	total: 95.6ms	remaining: 11.9s
8:	learn: 0.0890473	total: 101ms	remaining: 11.1s
9:	learn: 0.0876884	total: 106ms	remaining: 10.5s
10:	learn: 0.0864398	total: 111ms	remaining: 9.96s
11:	learn: 0.0852053	total: 116ms	remaining: 9.56s
12:	learn: 0.0839479	total: 122ms	remaining: 9.23s
13:	learn: 0.0827617	total: 127ms	remaining: 8.93s
14:	learn: 0.0815484	total: 132ms	remaining: 8.66s
15:	learn: 0.0805342	total: 137ms	remaining: 8.42s
16:	learn: 0.0794173	total: 142ms	remaining: 8.21s
17:	learn: 0.0785678	total: 147ms	remaining: 8.01s
18:	learn: 0.0776779	total: 152ms	remaining: 7.83s
19:	learn:

OrderedDict([('Tg', 35.6646412917694),
             ('FFV', 0.006670058992549782),
             ('Tc', 0.0345094396198343),
             ('Density', 0.03956760186340001),
             ('Rg', 1.8130417533252041)])

Learning rate set to 0.040023
0:	learn: 0.0993555	total: 8.17ms	remaining: 8.17s
1:	learn: 0.0975259	total: 13.5ms	remaining: 6.72s
2:	learn: 0.0958017	total: 18.9ms	remaining: 6.28s
3:	learn: 0.0939474	total: 24.1ms	remaining: 6s
4:	learn: 0.0922230	total: 29.4ms	remaining: 5.85s
5:	learn: 0.0906151	total: 34.6ms	remaining: 5.73s
6:	learn: 0.0891982	total: 39.7ms	remaining: 5.63s
7:	learn: 0.0875773	total: 44.9ms	remaining: 5.57s
8:	learn: 0.0861490	total: 49.9ms	remaining: 5.49s
9:	learn: 0.0847840	total: 54.9ms	remaining: 5.44s
10:	learn: 0.0834475	total: 60.1ms	remaining: 5.41s
11:	learn: 0.0823576	total: 65.3ms	remaining: 5.37s
12:	learn: 0.0810865	total: 70.8ms	remaining: 5.37s
13:	learn: 0.0799694	total: 76.2ms	remaining: 5.37s
14:	learn: 0.0787994	total: 81.8ms	remaining: 5.37s
15:	learn: 0.0778174	total: 87.4ms	remaining: 5.37s
16:	learn: 0.0767702	total: 92.6ms	remaining: 5.36s
17:	learn: 0.0758928	total: 97.9ms	remaining: 5.34s
18:	learn: 0.0749596	total: 103ms	remaining: 5.

Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,149.516774,0.377104,0.210824,1.135852,20.288629
1,1422188626,163.002641,0.380319,0.236521,1.107824,20.083394
2,2032016830,92.515583,0.35019,0.24914,1.073782,19.988765
