Step 1 - Import Library

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import RandomForestClassifier # import RandomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

Step 2 - Data Preparation

Step 2.1 - Load Data

In [2]:
# Load data
df = pd.read_csv('mushrooms.csv')

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


Step 2.2  - Check Null Values

In [3]:
# Check null
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

Step 2.3 - Select The Features

In [32]:
# Features selection

# Features selection from 'cap-shape' to 'habitat'
X = df.iloc[:, 1:] 
y = df['class']
X = pd.get_dummies(X, drop_first=True) # Encode label

# Check features numbers and instance numbers
X.shape

(8124, 95)

Step 3 - Split Data

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

Step 4 - Train Decision Tree Classifier Model

In [31]:
# By default, DT in scikit-learn will use "Gini" as split criteria
# Read the documentation for more detail
dt = DecisionTreeClassifier(random_state=42)

# Fitting / train DT model
dt.fit(X_train, y_train)

# Predict test set
y_pred_dt = dt.predict(X_test)

#  Calculate the accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy: 1.00
Test set accuracy: 1.0


Step 5 - Train Random Forest Classifier Model

In [29]:
# We will use estimator=10 in this model
# You can read the documentation to understand the hyperparameter of RF in scikit-learn

rf = RandomForestClassifier(n_estimators=10, random_state=1)

# Fitting / training
rf.fit(X_train, y_train)

# Predict the test set
y_pred_rf = rf.predict(X_test)

#  Calculate the accuracy
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Test set accuracy: {:.2f}".format(acc_rf))
print(f"Test set accuracy: {acc_rf}")

Test set accuracy: 1.00
Test set accuracy: 1.0


Step 6 - Hyperparameter Tuning

In [42]:
from sklearn.model_selection import GridSearchCV

# Define Decision Tree model and hyperparameter grid for tuning
dt_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform GridSearchCV for Decision Tree
dt_grid_search = GridSearchCV(dt, dt_param_grid, cv=5, scoring='accuracy')
dt_grid_search.fit(X_train, y_train)

# Get the best hyperparameters for Decision Tree
best_params_dt = dt_grid_search.best_params_
best_accuracy_dt = dt_grid_search.best_score_
print("Best Decision Tree Parameters:", best_params_dt)
print("Best Decision Tree Accuracy:", best_accuracy_dt)


Best Decision Tree Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Decision Tree Accuracy: 0.9996923076923077


In [41]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform GridSearchCV for Decision Tree
rf_grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
rf_grid_search.fit(X_train, y_train)

# Get the best hyperparameters for Decision Tree
best_params_rf = rf_grid_search.best_params_
best_accuracy_rf = rf_grid_search.best_score_
print("Best Decision Tree Parameters:", best_params_rf)
print("Best Decision Tree Accuracy:", best_accuracy_rf)


Best Decision Tree Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}
Best Decision Tree Accuracy: 1.0


Step 7 - Evaluate Tuned Models

In [43]:

print("Decision Tree Accuracy:", best_accuracy_dt)
print("Random Forest Accuracy:", best_accuracy_rf)

Decision Tree Accuracy: 0.9996923076923077
Random Forest Accuracy: 1.0
