In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/melting-point/sample_submission.csv
/kaggle/input/melting-point/train.csv
/kaggle/input/melting-point/test.csv


In [2]:
!pip install -qU rdkit

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
train = pd.read_csv('/kaggle/input/melting-point/train.csv')
test = pd.read_csv('/kaggle/input/melting-point/test.csv')
sample_sub = pd.read_csv('/kaggle/input/melting-point/sample_submission.csv')

In [4]:

from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, rdMolDescriptors
import warnings
warnings.filterwarnings('ignore')

def extract_molecular_features(smiles):
    """Extract comprehensive molecular descriptors from SMILES string"""
    mol = Chem.MolFromSmiles(smiles)
    
    if mol is None:
        # Return dictionary with NaN for invalid SMILES
        return {key: np.nan for key in ['MolWt', 'MolLogP', 'NumHDonors', 'NumHAcceptors', 
                'NumRotatableBonds', 'NumAromaticRings', 'TPSA', 'NumHeteroatoms']}
    
    features = {}
    
    # basic molecular properties
    features['MolWt'] = Descriptors.MolWt(mol)
    features['MolLogP'] = Descriptors.MolLogP(mol)
    features['NumHDonors'] = Descriptors.NumHDonors(mol)
    features['NumHAcceptors'] = Descriptors.NumHAcceptors(mol)
    features['NumRotatableBonds'] = Descriptors.NumRotatableBonds(mol)
    features['NumAromaticRings'] = Descriptors.NumAromaticRings(mol)
    features['NumAliphaticRings'] = Descriptors.NumAliphaticRings(mol)
    features['RingCount'] = Descriptors.RingCount(mol)
    features['TPSA'] = Descriptors.TPSA(mol)
    features['NumHeteroatoms'] = Descriptors.NumHeteroatoms(mol)
    
    features['BertzCT'] = Descriptors.BertzCT(mol)
    features['HallKierAlpha'] = Descriptors.HallKierAlpha(mol)
    features['Kappa1'] = Descriptors.Kappa1(mol)
    features['Kappa2'] = Descriptors.Kappa2(mol)
    features['Kappa3'] = Descriptors.Kappa3(mol)
    
    # electronic properties
    features['NumValenceElectrons'] = Descriptors.NumValenceElectrons(mol)
    features['NumRadicalElectrons'] = Descriptors.NumRadicalElectrons(mol)
    
    # surface area
    features['LabuteASA'] = Descriptors.LabuteASA(mol)
    
    # atom and bond counts
    features['NumAtoms'] = mol.GetNumAtoms()
    features['NumHeavyAtoms'] = mol.GetNumHeavyAtoms()
    features['NumBonds'] = mol.GetNumBonds()
    
    features['FormalCharge'] = Chem.GetFormalCharge(mol)
    
    try:
        features['FractionCsp3'] = Descriptors.FractionCSP3(mol)
    except:
        features['FractionCsp3'] = 0
    
    # Additional descriptors for melting point
    features['MolMR'] = Descriptors.MolMR(mol)
    features['Chi0v'] = Descriptors.Chi0v(mol)
    features['Chi1v'] = Descriptors.Chi1v(mol)
    features['Chi2v'] = Descriptors.Chi2v(mol)
    features['Chi3v'] = Descriptors.Chi3v(mol)
    features['Chi4v'] = Descriptors.Chi4v(mol)
    
    # Balaban index
    try:
        features['BalabanJ'] = Descriptors.BalabanJ(mol)
    except:
        features['BalabanJ'] = 0
    
    # EState indices
    features['MaxEStateIndex'] = Descriptors.MaxEStateIndex(mol)
    features['MinEStateIndex'] = Descriptors.MinEStateIndex(mol)
    
    # Number of specific atom types
    features['NumAliphaticCarbocycles'] = Descriptors.NumAliphaticCarbocycles(mol)
    features['NumAliphaticHeterocycles'] = Descriptors.NumAliphaticHeterocycles(mol)
    features['NumAromaticCarbocycles'] = Descriptors.NumAromaticCarbocycles(mol)
    features['NumAromaticHeterocycles'] = Descriptors.NumAromaticHeterocycles(mol)
    features['NumSaturatedCarbocycles'] = Descriptors.NumSaturatedCarbocycles(mol)
    features['NumSaturatedHeterocycles'] = Descriptors.NumSaturatedHeterocycles(mol)
    
    # Adding Morgan Fingerprint
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=512)
    for idx, bit in enumerate(fp):
        features[f'morgan_fp_{idx}'] = int(bit)
    
    return features

# Extracting features from train and test
print("="*60)
print("EXTRACTING MOLECULAR FEATURES FROM SMILES")
print("="*60)
print("\nThis may take 1-2 minutes...")

train_mol_features = train['SMILES'].apply(extract_molecular_features)
train_mol_df = pd.DataFrame(list(train_mol_features))

test_mol_features = test['SMILES'].apply(extract_molecular_features)
test_mol_df = pd.DataFrame(list(test_mol_features))

print(f"\n✓ Extracted {len(train_mol_df.columns)} molecular features")
print(f"  - Train shape: {train_mol_df.shape}")
print(f"  - Test shape: {test_mol_df.shape}")

# Handling any NaN values
train_mol_df = train_mol_df.fillna(0)
test_mol_df = test_mol_df.fillna(0)

# Showing some of the extracted features
print("\n" + "="*60)
print("SAMPLE MOLECULAR FEATURES")
print("="*60)
print("\nChemical descriptors:")
descriptor_cols = ['MolWt', 'MolLogP', 'NumHDonors', 'NumHAcceptors', 'TPSA', 
                   'NumAromaticRings', 'NumRotatableBonds', 'BertzCT']
print(train_mol_df[descriptor_cols].head())

print("\nCorrelation with melting point (Tm):")
correlations = train_mol_df.corrwith(train['Tm']).abs().sort_values(ascending=False)
print(correlations.head(15))

# Remove the columns that are not features
id_cols = ['id', 'SMILES', 'Tm']
if 'id_bin' in train.columns:
    id_cols.append('id_bin')

feature_cols = [col for col in train.columns if col not in id_cols]

X = train[feature_cols]
y = train['Tm']
X_test = test[feature_cols]

# Removing zero variance features from group features
zero_var_cols = X.columns[X.std() == 0].tolist()
if zero_var_cols:
    print(f"\nRemoving {len(zero_var_cols)} zero-variance Group features")
    X = X.drop(columns=zero_var_cols)
    X_test = X_test.drop(columns=zero_var_cols)

# combine with molecular features
X_combined = pd.concat([X, train_mol_df], axis=1)
X_test_combined = pd.concat([X_test, test_mol_df], axis=1)

print(f"\n" + "="*60)
print("COMBINED FEATURES")
print("="*60)
print(f"Group features: {X.shape[1]}")
print(f"Molecular features: {train_mol_df.shape[1]}")
print(f"Combined features: {X_combined.shape[1]}")

# remove zero variance features from combined set
zero_var_combined = X_combined.columns[X_combined.std() == 0].tolist()
if zero_var_combined:
    print(f"\nRemoving {len(zero_var_combined)} additional zero-variance features")
    X_combined = X_combined.drop(columns=zero_var_combined)
    X_test_combined = X_test_combined.drop(columns=zero_var_combined)

print(f"Final feature count: {X_combined.shape[1]}")

EXTRACTING MOLECULAR FEATURES FROM SMILES

This may take 1-2 minutes...





✓ Extracted 550 molecular features
  - Train shape: (2662, 550)
  - Test shape: (666, 550)

SAMPLE MOLECULAR FEATURES

Chemical descriptors:
     MolWt  MolLogP  NumHDonors  NumHAcceptors   TPSA  NumAromaticRings  \
0  162.032  2.42120           0              0   0.00                 0   
1  217.271  4.47430           1              0  15.79                 4   
2  160.220  2.36462           0              2  17.82                 2   
3   84.074  0.09430           1              1  37.30                 0   
4  118.245  2.49490           1              1   0.00                 0   

   NumRotatableBonds     BertzCT  
0                  0  179.793356  
1                  0  833.661406  
2                  1  401.130826  
3                  0  104.451830  
4                  3   33.161259  

Correlation with melting point (Tm):
BertzCT                   0.576550
NumBonds                  0.534738
NumHeavyAtoms             0.494154
NumAtoms                  0.494154
LabuteASA          

In [5]:

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.base import clone
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import inspect
import numpy as np
import pandas as pd


class RegressionTrainer:
    def __init__(self, model, n_folds=5, random_state=42):
        self.model = model
        self.n_folds = n_folds
        self.random_state = random_state
        self.kfold = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
        
    def train_predict(self, X_train, y_train, X_test, verbose=True):
        oof_preds = np.zeros(len(X_train))
        test_preds = np.zeros(len(X_test))
        
        for fold, (train_idx, val_idx) in enumerate(self.kfold.split(X_train), 1):
            if verbose:
                print(f"Fold {fold}/{self.n_folds}...", end=" ")
            
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
            
            fold_model = clone(self.model)
            
            # Check if model supports eval_set (XGBoost, LightGBM, CatBoost)
            if hasattr(fold_model, 'fit') and 'eval_set' in inspect.signature(fold_model.fit).parameters:
                # Handle different verbose parameter names
                fit_params = {'eval_set': [(X_val, y_val)]}
                
                # XGBoost and CatBoost use 'verbose'
                if 'verbose' in inspect.signature(fold_model.fit).parameters:
                    fit_params['verbose'] = False
                
                fold_model.fit(X_tr, y_tr, **fit_params)
            else:
                fold_model.fit(X_tr, y_tr)
            
            # Predict
            oof_preds[val_idx] = fold_model.predict(X_val)
            test_preds += fold_model.predict(X_test) / self.n_folds
            
            fold_mae = mean_absolute_error(y_val, oof_preds[val_idx])
            if verbose:
                print(f"MAE: {fold_mae:.4f}")
        
        overall_mae = mean_absolute_error(y_train, oof_preds)
        if verbose:
            print(f"\n{'='*50}")
            print(f"Overall OOF MAE: {overall_mae:.4f}")
            print(f"{'='*50}\n")
        
        return oof_preds, test_preds, overall_mae


print("\n" + "="*60)
print("TRAINING WITH MOLECULAR + GROUP FEATURES")
print("="*60)

# XGBoost with molecular features
print("\n🔬 XGBoost with Molecular Features")
print("-"*60)

xgb_mol = XGBRegressor(
    n_estimators=3000,
    max_depth=8,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
)

trainer = RegressionTrainer(xgb_mol, n_folds=5)
oof_xgb_mol, test_xgb_mol, mae_xgb_mol = trainer.train_predict(X_combined, y, X_test_combined)

# LightGBM with molecular features
print("\n LightGBM with Molecular Features")
print("-"*60)

lgb_mol = LGBMRegressor(
    n_estimators=3000,
    num_leaves=64,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=20,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    force_row_wise=True,
    verbosity=-1
)

trainer = RegressionTrainer(lgb_mol, n_folds=5)
oof_lgb_mol, test_lgb_mol, mae_lgb_mol = trainer.train_predict(X_combined, y, X_test_combined)

# CatBoost with molecular features
print("\n CatBoost with Molecular Features")
print("-"*60)

cat_mol = CatBoostRegressor(
    iterations=3000,
    depth=8,
    learning_rate=0.01,
    l2_leaf_reg=3,
    random_state=42,
    verbose=False
)

trainer = RegressionTrainer(cat_mol, n_folds=5)
oof_cat_mol, test_cat_mol, mae_cat_mol = trainer.train_predict(X_combined, y, X_test_combined)


print("\n" + "="*60)
print("CREATING ENSEMBLE")
print("="*60)

# OOF predictions
oof_stack = np.column_stack([oof_xgb_mol, oof_lgb_mol, oof_cat_mol])
test_stack = np.column_stack([test_xgb_mol, test_lgb_mol, test_cat_mol])

# best Ridge alpha
best_alpha = None
best_mae = float('inf')

print("\nOptimizing Ridge ensemble:")
for alpha in [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]:
    ridge = Ridge(alpha=alpha, random_state=42)
    ridge.fit(oof_stack, y)
    ridge_oof = ridge.predict(oof_stack)
    mae = mean_absolute_error(y, ridge_oof)
    
    print(f"  Alpha {alpha:7.3f}: MAE = {mae:.4f}")
    
    if mae < best_mae:
        best_mae = mae
        best_alpha = alpha

print(f"\n Best alpha: {best_alpha} (MAE: {best_mae:.4f})")

# Train final model
final_ridge = Ridge(alpha=best_alpha, random_state=42)
final_ridge.fit(oof_stack, y)
ensemble_test = final_ridge.predict(test_stack)

# Show weights
print("\n Ensemble Weights:")
model_names = ['XGBoost', 'LightGBM', 'CatBoost']
for name, coef in zip(model_names, final_ridge.coef_):
    print(f"  {name:12s}: {coef:8.4f}")
print(f"  {'Intercept':12s}: {final_ridge.intercept_:8.4f}")

# submission
submission = pd.DataFrame({
    'id': test['id'],
    'Tm': ensemble_test
})

submission.to_csv('submission.csv', index=False)

print("\n" + "="*60)
print(" SUBMISSION CREATED!")
print("="*60)
print(f"\n Expected CV Score: {best_mae:.4f}")
print(f" Previous best (without molecular features): 36.19")
print(f" Improvement: {36.19 - best_mae:.4f} ({((36.19 - best_mae)/36.19)*100:.1f}%)")

print(f"\nPrediction range: {ensemble_test.min():.1f}K to {ensemble_test.max():.1f}K")
print(f"Training range: {y.min():.1f}K to {y.max():.1f}K")

print("\n File saved as: submission.csv")
print("\nFirst 10 predictions:")
print(submission.head(10))


TRAINING WITH MOLECULAR + GROUP FEATURES

🔬 XGBoost with Molecular Features
------------------------------------------------------------
Fold 1/5... MAE: 29.3076
Fold 2/5... MAE: 28.4346
Fold 3/5... MAE: 29.4964
Fold 4/5... MAE: 28.8042
Fold 5/5... MAE: 27.9506

Overall OOF MAE: 28.7987


 LightGBM with Molecular Features
------------------------------------------------------------
Fold 1/5... MAE: 30.9321
Fold 2/5... MAE: 30.5195
Fold 3/5... MAE: 31.2283
Fold 4/5... MAE: 30.7437
Fold 5/5... MAE: 29.7274

Overall OOF MAE: 30.6303


 CatBoost with Molecular Features
------------------------------------------------------------
Fold 1/5... MAE: 29.6296
Fold 2/5... MAE: 29.0399
Fold 3/5... MAE: 29.0321
Fold 4/5... MAE: 28.8693
Fold 5/5... MAE: 28.4976

Overall OOF MAE: 29.0139


CREATING ENSEMBLE

Optimizing Ridge ensemble:
  Alpha   0.001: MAE = 28.5766
  Alpha   0.010: MAE = 28.5766
  Alpha   0.100: MAE = 28.5766
  Alpha   1.000: MAE = 28.5766
  Alpha  10.000: MAE = 28.5766
  Alpha 100.