# Chemprop scikit-learn Estimator Example
Demonstrates the use of `ChempropMoleculeTransformer` and `ChempropRegressor` in common scikit-learn workflows, including cross-validation, hyperparameter tuning, and model persistence.

## Defaults
Namespace(smiles_columns=None, reaction_columns=None, no_header_row=False, num_workers=0, batch_size=64, accelerator='auto', devices='auto', rxn_mode='REAC_DIFF', multi_hot_atom_featurizer_mode='V2', keep_h=False, add_h=False, ignore_stereo=False, reorder_atoms=False, molecule_featurizers=None, descriptors_path=None, no_descriptor_scaling=False, no_atom_feature_scaling=False, no_atom_descriptor_scaling=False, no_bond_feature_scaling=False, no_bond_descriptor_scaling=False, atom_features_path=None, atom_descriptors_path=None, bond_features_path=None, bond_descriptors_path=None, constraints_path=None, constraints_to_targets=None, config_path=None, data_path=None, output_dir=None, remove_checkpoints=False, checkpoint=None, freeze_encoder=False, model_frzn=None, frzn_ffn_layers=0, from_foundation=None, ensemble_size=1, message_hidden_dim=300, message_bias=False, depth=3, undirected=False, dropout=0.0, mpn_shared=False, aggregation='norm', aggregation_norm=100, atom_messages=False, activation='RELU', activation_args=None, ffn_hidden_dim=300, ffn_num_layers=1, batch_norm=False, multiclass_num_classes=3, atom_task_weights=None, atom_ffn_hidden_dim=300, atom_ffn_num_layers=1, atom_multiclass_num_classes=3, bond_task_weights=None, bond_ffn_hidden_dim=300, bond_ffn_num_layers=1, bond_multiclass_num_classes=3, atom_constrainer_ffn_hidden_dim=300, atom_constrainer_ffn_num_layers=1, bond_constrainer_ffn_hidden_dim=300, bond_constrainer_ffn_num_layers=1, weight_column=None, target_columns=None, mol_target_columns=None, atom_target_columns=None, bond_target_columns=None, ignore_columns=None, no_cache=False, splits_column=None, task_type='regression', loss_function=None, v_kl=0.0, eps=1e-08, alpha=0.1, metrics=None, tracking_metric='val_loss', show_individual_scores=False, task_weights=None, warmup_epochs=2, init_lr=0.0001, max_lr=0.001, final_lr=0.0001, epochs=50, patience=None, grad_clip=None, class_balance=False, split='RANDOM', split_sizes=[0.8, 0.1, 0.1], 
split_key_molecule=0, num_replicates=1, num_folds=None, save_smiles_splits=False, splits_file=None, data_seed=0, pytorch_seed=None)

In [None]:
from chemprop.sklearn_integration.chemprop_lightning_estimator import ChempropMoleculeTransformer, ChempropRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
import numpy as np
import joblib

# Sample data, written as (SMILES, target) records so each molecule sits next
# to its regression value; the two arrays are then split out column by column.
sample_pairs = [
    ("CCO", 0.50),
    ("CCN", 0.60),
    ("CCC", 0.55),
    ("COC", 0.58),
    ("CNC", 0.52),
    ("CCCl", 0.62),
    ("CCBr", 0.65),
    ("CCF", 0.57),
    ("CCI", 0.59),
    ("CC=O", 0.61),
    ("CC#N", 0.56),
    ("CC(C)O", 0.60),
    ("CC(C)N", 0.54),
    ("CC(C)C", 0.53),
    ("COC(C)", 0.62),
    ("CN(C)C", 0.63),
    ("C1CCCCC1", 0.45),
    ("C1=CC=CC=C1", 0.40),
    ("CC(C)(C)O", 0.64),
    ("CC(C)(C)N", 0.66),
    ("COCCO", 0.59),
    ("CCOC(=O)C", 0.51),
    ("CCN(CC)CC", 0.48),
    ("CN1CCCC1", 0.46),
    ("C(CO)N", 0.49),
]

# Inputs: 1-D array of SMILES strings.
X = np.array([smiles for smiles, _ in sample_pairs])
# Targets: 1-D float array in the same order as X.
y = np.array([target for _, target in sample_pairs])

ImportError: cannot import name 'ChempropReactionTransformer' from 'chemprop.sklearn_integration.chemprop_lightning_estimator' (C:\Users\jxl05\Downloads\chemprop\chemprop\sklearn_integration\chemprop_lightning_estimator.py)

## Cross-Validation Example

In [None]:
# Two named steps: the Chemprop featurizer runs first and its output is passed
# to the Chemprop regressor.
pipeline_steps = [
    ("featurizer", ChempropMoleculeTransformer()),
    ("regressor", ChempropRegressor(n_epochs=20)),
]
pipeline = Pipeline(steps=pipeline_steps)

# 5-fold cross-validation. The scorer returns negated MSE, so the sign is
# flipped when reporting.
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    cv=5,
    scoring='neg_mean_squared_error',
)
print("Cross-validation MSE scores:", -scores)
print("Average MSE:", -scores.mean())

## Hyperparameter Tuning Example with GridSearchCV

In [None]:
# Search space for the pipeline's "regressor" step; the "regressor__" prefix
# addresses parameters of that step.
param_grid = {
    'regressor__batch_size': [8, 16],
    'regressor__n_epochs': [3, 5],
}

# Exhaustive search over the grid with 3-fold cross-validation.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
)
grid.fit(X, y)

# best_score_ is a negated MSE under this scorer; negate it back for display.
print("Best parameters:", grid.best_params_)
print("Best score (MSE):", -grid.best_score_)

## Save and Reload Trained Pipeline

In [5]:
# Write the best pipeline found by the grid search to disk, then read it back.
model_path = 'best_chemprop_pipeline.pkl'
joblib.dump(grid.best_estimator_, model_path)
loaded = joblib.load(model_path)

# Predict with the restored pipeline to confirm it is usable after reloading.
query_smiles = ["CCO", "CC(=O)O"]
preds = loaded.predict(query_smiles)
print("Predictions from reloaded model:", preds)

Predictions from reloaded model: [0.16074981 0.13957587]


