In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read the raw mushroom table from disk.
df = pd.read_csv("mushrooms.csv")

# "class" is the label column; every other column is a feature.
# The features are one-hot encoded in a single step (the original author
# notes that the mushroom dataset consists entirely of string columns).
y = df["class"]
X = pd.get_dummies(df.drop(columns="class"))

# Base estimator for the search; the seed is fixed so runs are repeatable.
rf = RandomForestClassifier(random_state=42)

# Candidate hyperparameters: 5 x 4 x 5 = 100 combinations in total.
param_grid = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_depth": [5, 10, 20, 30],
    "min_samples_split": [2, 5, 10, 15, 20],
}

# Hold out 20% of the rows for the final evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Exhaustive 5-fold cross-validated search over the grid, scored by accuracy
# and parallelised with n_jobs=-1. Only the training portion is used here.
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
grid.fit(X_train, y_train)
print("Best parameters:", grid.best_params_)

# Score the held-out rows. With the default refit=True, grid.predict
# delegates to grid.best_estimator_, i.e. the winning configuration.
y_pred = grid.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)


Best parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Test Accuracy: 1.0


In [None]:
import pickle
import joblib

# Persist the winning estimator from the grid search using the standard
# pickle format; the context manager closes the file handle afterwards.
with open("rf_model.pkl", "wb") as model_file:
    pickle.dump(obj=grid.best_estimator_, file=model_file)

# Persist the one-hot encoded column labels of X next to the model.
# As the last expression of the cell, joblib.dump's return value is displayed.
joblib.dump(value=X.columns, filename="X_columns.pkl")

['X_columns.pkl']