In [1]:
# import the libraries
import pandas as pd
import numpy as np

In [2]:
# import the dataset
df = pd.read_csv("original_dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   class                 61069 non-null  object 
 1   cap-diameter          61069 non-null  float64
 2   cap-shape             61069 non-null  object 
 3   cap-surface           46949 non-null  object 
 4   cap-color             61069 non-null  object 
 5   does-bruise-or-bleed  61069 non-null  object 
 6   gill-attachment       51185 non-null  object 
 7   gill-spacing          36006 non-null  object 
 8   gill-color            61069 non-null  object 
 9   stem-height           61069 non-null  float64
 10  stem-width            61069 non-null  float64
 11  stem-root             9531 non-null   object 
 12  stem-surface          22945 non-null  object 
 13  stem-color            61069 non-null  object 
 14  veil-type             3177 non-null   object 
 15  veil-color         

In [3]:
# checking the missing data
missing_data = df.isnull().sum()
print(missing_data)

class                       0
cap-diameter                0
cap-shape                   0
cap-surface             14120
cap-color                   0
does-bruise-or-bleed        0
gill-attachment          9884
gill-spacing            25063
gill-color                  0
stem-height                 0
stem-width                  0
stem-root               51538
stem-surface            38124
stem-color                  0
veil-type               57892
veil-color              53656
has-ring                    0
ring-type                2471
spore-print-color       54715
habitat                     0
season                      0
dtype: int64


In [4]:
# keep the columns which have the number of NAN values less than 30 percent
df_no_nan = df.loc[:, missing_data <= (len(df)*0.3)]

# Remove the rows which have NAN values
df_no_nan = df_no_nan.dropna()

# checking the missing data after removing the columns
missing_dropped = df_no_nan.isnull().sum()
print(missing_dropped)

class                   0
cap-diameter            0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-color              0
stem-height             0
stem-width              0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
season                  0
dtype: int64


In [5]:
# print the dataset
print(df_no_nan.head)

<bound method NDFrame.head of       class  cap-diameter cap-shape cap-surface cap-color  \
0         p         15.26         x           g         o   
1         p         16.60         x           g         o   
2         p         14.07         x           g         o   
3         p         14.17         f           h         e   
4         p         14.64         x           h         o   
...     ...           ...       ...         ...       ...   
61064     p          1.18         s           s         y   
61065     p          1.27         f           s         y   
61066     p          1.27         s           s         y   
61067     p          1.24         f           s         y   
61068     p          1.17         s           s         y   

      does-bruise-or-bleed gill-attachment gill-color  stem-height  \
0                        f               e          w        16.95   
1                        f               e          w        17.99   
2                        f 

In [6]:
# identify X (predictors) and y (target variable)
X = df_no_nan.iloc[:, 1:].values
y = df_no_nan.iloc[:,0].values
print("Predictors:", X)
print("Target value:", y)

Predictors: [[15.26 'x' 'g' ... 'g' 'd' 'w']
 [16.6 'x' 'g' ... 'g' 'd' 'u']
 [14.07 'x' 'g' ... 'g' 'd' 'w']
 ...
 [1.27 's' 's' ... 'f' 'd' 'u']
 [1.24 'f' 's' ... 'f' 'd' 'u']
 [1.17 's' 's' ... 'f' 'd' 'u']]
Target value: ['p' 'p' 'p' ... 'p' 'p' 'p']


In [7]:
# identify the categorical attributes in X
X_categorical_cols = [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13]

# encode the categorical variables in X (predictors) using OneHotEncoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(sparse=False), X_categorical_cols)], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)



[[0.0 0.0 0.0 ... 15.26 16.95 17.09]
 [0.0 0.0 0.0 ... 16.6 17.99 18.19]
 [0.0 0.0 0.0 ... 14.07 17.8 17.74]
 ...
 [0.0 0.0 0.0 ... 1.27 3.86 6.37]
 [0.0 0.0 1.0 ... 1.24 3.56 5.44]
 [0.0 0.0 0.0 ... 1.17 3.25 5.45]]


In [8]:
# encode the binary target variables (y)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
print(y)

[1 1 1 ... 1 1 1]


In [9]:
from Mushroom_project.Tree import train_test_split
# apply the train_test_split function to split the data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [20]:
from Mushroom_project.Tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(min_samples=2, max_depths=5)

In [None]:
classifier.fit(X_train, y_train)

In [14]:
# calculate train error and test error
train_error = classifier.zero_one_loss(X_train, y_train)
test_error = classifier.zero_one_loss(X_test, y_test)

print(f"Training Error: {train_error:.4f}")
print(f"Testing Error: {test_error:.4f}")

Training Error: 0.2280
Testing Error: 0.2297


In [15]:
# calculate train accuracy and test accuracy
y_train_pred = classifier.predict(X_train)
y_test_pred = classifier.predict(X_test)
train_accuracy = classifier.accuracy(y_train, y_train_pred)
test_accuracy = classifier.accuracy(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

Training Accuracy: 0.7720
Testing Accuracy: 0.7703


#### Hyperparameter Tuning

In [None]:
param_grid = {
    'max_depth': [5, 10, 15, 20, 25, 30],
    'min_samples_split': [5, 10, 15, 20],
    'criterion': ['gini', 'entropy', 'chi_square']
}

In [None]:
from Mushroom_project.Tree import grid_search_cv
best_params, best_accuracy_score = grid_search_cv(classifier=classifier, param_grid=param_grid, X_train=X_train, y_train=y_train, cv=5, n_jobs=-1)
print("Best parameters: ", best_params)
print("Best cross-validation score: ", best_accuracy_score)

In [None]:
better_classifier = DecisionTreeClassifier(min_samples=, max_depths=,criterion='')