# Chemprop scikit-learn Estimator Example
This notebook demonstrates how to use `ChempropTransformer` and `ChempropRegressor` in common scikit-learn workflows, including cross-validation, hyperparameter tuning, and model persistence.

In [None]:
from chemprop.sklearn_integration.chemprop_estimator import ChempropTransformer, ChempropRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
import numpy as np
import joblib

# Sample data: each record pairs a SMILES string with its numeric regression
# target, so inputs and labels stay aligned when entries are added or removed.
samples = [
    ("CCO", 0.50),
    ("CCN", 0.60),
    ("CCC", 0.55),
    ("COC", 0.58),
    ("CNC", 0.52),
    ("CCCl", 0.62),
    ("CCBr", 0.65),
    ("CCF", 0.57),
    ("CCI", 0.59),
    ("CC=O", 0.61),
    ("CC#N", 0.56),
    ("CC(C)O", 0.60),
    ("CC(C)N", 0.54),
    ("CC(C)C", 0.53),
    ("COC(C)", 0.62),
    ("CN(C)C", 0.63),
    ("C1CCCCC1", 0.45),
    ("C1=CC=CC=C1", 0.40),
    ("CC(C)(C)O", 0.64),
    ("CC(C)(C)N", 0.66),
    ("COCCO", 0.59),
    ("CCOC(=O)C", 0.51),
    ("CCN(CC)CC", 0.48),
    ("CN1CCCC1", 0.46),
    ("C(CO)N", 0.49),
]

# X holds the molecule strings and y the matching targets, in the same order.
X = np.array([smiles for smiles, _ in samples])
y = np.array([target for _, target in samples])

## Cross-Validation Example

In [None]:
# Chain featurization and regression into one estimator so that
# cross_val_score can clone and refit the whole thing on every fold.
pipeline = Pipeline(
    steps=[
        ("featurizer", ChempropTransformer()),
        ("regressor", ChempropRegressor(n_epochs=5)),
    ]
)

# 5-fold cross-validation. The scorer returns *negative* MSE because
# scikit-learn scorers follow a "higher is better" convention.
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)

# Flip the sign back so the reported numbers are ordinary (positive) MSE values.
mse_per_fold = -scores
print("Cross-validation MSE scores:", mse_per_fold)
print("Average MSE:", -scores.mean())

## Hyperparameter Tuning Example with GridSearchCV

In [None]:
# Search space: keys use scikit-learn's "<step name>__<parameter>" syntax, so
# both entries are routed to the pipeline step named "regressor".
param_grid = dict(
    regressor__lr=[1e-2, 1e-3],
    regressor__n_epochs=[3, 5],
)

# Exhaustively evaluate every lr / n_epochs combination with 3-fold CV,
# ranking candidates by negative MSE (higher is better).
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid.fit(X, y)

# best_score_ is the negated MSE of the winning candidate; undo the negation
# before reporting it.
best_mse = -grid.best_score_
print("Best parameters:", grid.best_params_)
print("Best score (MSE):", best_mse)

## Save and Reload Trained Pipeline

In [None]:
# Persist the best pipeline found by the grid search, then read it back from
# disk to confirm that the reloaded object can still make predictions.
model_path = 'best_chemprop_pipeline.pkl'
joblib.dump(grid.best_estimator_, model_path)

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
loaded = joblib.load(model_path)

new_molecules = ["CCO", "CC(=O)O"]
preds = loaded.predict(new_molecules)
print("Predictions from reloaded model:", preds)