In [1]:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import joblib

# 1. Load the iris dataset and pull out the feature matrix and target labels
iris = load_iris()
X, y = iris.data, iris.target

# 2. Hold out 20% of the samples as a test set (seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 3. Two-step pipeline: standardise the features, then classify with a random forest
scaling_step = ('scaler', StandardScaler())
forest_step = ('classifier', RandomForestClassifier(random_state=42))
pipeline = Pipeline(steps=[scaling_step, forest_step])

# 4. Hyper-parameter candidates; the 'classifier__' prefix targets the
#    pipeline step named 'classifier'
param_grid = dict(
    classifier__n_estimators=[50, 100],
    classifier__max_depth=[3, 5, None],
)

# 5. Grid search over every combination, scored by accuracy with 5-fold CV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

# 6. Run the search using the training split only
grid_search.fit(X_train, y_train)

# 7. Report the winning hyper-parameters and the score they achieved in the search
best_params = grid_search.best_params_
best_cv_accuracy = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best CV Accuracy:", best_cv_accuracy)

# 8. Save the best model to file
# One constant for the artifact location, so the dump call, the load call and
# the log message cannot drift apart.
MODEL_PATH = 'best_rf_pipeline.joblib'
joblib.dump(grid_search.best_estimator_, MODEL_PATH)
print(f"Model saved to '{MODEL_PATH}'")

# 9. Later... Load the model
# NOTE: joblib files are pickle-based, so loading one can execute arbitrary
# code. Only load artifacts produced by this notebook or another trusted source.
loaded_model = joblib.load(MODEL_PATH)
print("Model loaded from file.")

# Sanity-check the save/load round trip: the reloaded pipeline must give the
# same predictions as the in-memory best estimator on the held-out split.
assert (
    loaded_model.predict(X_test) == grid_search.best_estimator_.predict(X_test)
).all(), "Reloaded model disagrees with the fitted model"

# 10. Use loaded model to predict
# Slicing with [:1] keeps the first test sample as a one-row 2-D array,
# the same shape the original reshape(1, -1) produced.
sample_input = X_test[:1]
prediction = loaded_model.predict(sample_input)
print("Prediction on test sample:", prediction)


Best Parameters: {'classifier__max_depth': 3, 'classifier__n_estimators': 50}
Best CV Accuracy: 0.95
Model saved to 'best_rf_pipeline.joblib'
Model loaded from file.
Prediction on test sample: [1]


In [None]:
# Loading data

# Creating a preprocessing + classification pipeline

# Performing GridSearchCV

# Saving the best model using joblib

# Loading it again and making predictions