In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# Load the dataset
df = pd.read_csv('/content/mushrooms.csv')
print(df.head())
print(f"Total number of rows: {len(df)}")

# Check for missing values and clean the data.
# "?" placeholders are converted to NaN *before* counting, so that the report
# below includes them instead of only counting cells that were already NaN.
df = df.replace("?", np.nan)
print("Missing values:\n", df.isnull().sum())

# Drop every row that has at least one missing value. Rebinding (rather than
# inplace=True) keeps the cell a simple chain from the freshly loaded frame.
df = df.dropna()

print("Dataset after cleaning:\n", df.head())
print(f"Total number of rows: {len(df)}")


  class cap-shape cap-surface cap-color bruises odor gill-attachment  \
0     p         x           s         n       t    p               f   
1     e         x           s         y       t    a               f   
2     e         b           s         w       t    l               f   
3     p         x           y         w       t    p               f   
4     e         x           s         g       f    n               f   

  gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0            c         n          k  ...                        s   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            c         n          n  ...                        s   
4            w         b          k  ...                        s   

  stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
0                      w                      w         p          w   
1       

In [None]:
# Split the data into training, validation, and testing sets.
# First hold out 20% of all rows for testing, stratified on the class label.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(df, df['class']))
df_train = df.iloc[train_index]
df_test = df.iloc[test_index]

# Then carve 25% of the remaining rows out as the validation set
# (0.25 * 0.8 = 20% of the full data), again stratified on the class label.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_index, val_index = next(split.split(df_train, df_train['class']))
df_train_final = df_train.iloc[train_index]
df_val = df_train.iloc[val_index]

# Separate the feature columns from the 'class' target for each subset.
train_X, train_y = df_train_final.drop(columns='class'), df_train_final['class']
val_X, val_y = df_val.drop(columns='class'), df_val['class']
test_X, test_y = df_test.drop(columns='class'), df_test['class']

print("Training set shape:", train_X.shape)
print("Validation set shape:", val_X.shape)
print("Testing set shape:", test_X.shape)


Training set shape: (3386, 22)
Validation set shape: (1129, 22)
Testing set shape: (1129, 22)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoding categorical variables
from sklearn.preprocessing import OrdinalEncoder

# The mapping category -> integer code is learned from the training features
# only. Categories that appear in the validation/test sets but not in the
# training set are encoded as -1 instead of raising a ValueError.
feature_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    dtype=np.int64,
)
feature_encoder.fit(train_X)


def _encode_features(features):
    """Return an integer-encoded copy of ``features`` with the same index and columns."""
    return pd.DataFrame(
        feature_encoder.transform(features),
        columns=features.columns,
        index=features.index,
    )


# Build new frames instead of assigning column-by-column into the split frames.
train_X = _encode_features(train_X)
val_X = _encode_features(val_X)
test_X = _encode_features(test_X)


In [None]:
# Count how often each distinct combination of values across all columns
# (i.e. each unique full row of df, 'class' included) occurs.
df.value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0,Unnamed: 22_level_0,count
class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,Unnamed: 23_level_1
p,x,y,y,f,f,f,c,b,p,e,b,k,k,p,p,p,w,o,l,h,y,p,1
e,b,s,n,f,n,f,c,b,w,e,b,y,y,n,n,p,w,t,p,w,y,d,1
e,b,s,n,f,n,f,c,b,w,e,b,y,y,n,n,p,w,t,p,w,y,p,1
e,b,s,w,t,a,f,c,b,g,e,c,s,s,w,w,p,w,o,p,k,n,g,1
e,b,s,w,t,a,f,c,b,g,e,c,s,s,w,w,p,w,o,p,k,n,m,1
e,b,s,w,t,a,f,c,b,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
e,b,s,w,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,s,g,1
e,b,s,w,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,m,1
e,b,s,w,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g,1
e,b,s,w,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,k,s,m,1


In [None]:
class Node:
    """A single node of the decision tree.

    Attributes:
        attr: name of the attribute this node splits on (None by default).
        pred: prediction stored at this node (None by default).
        class_label: value stored to identify this node among its siblings;
            the tree builder sets it to the parent-attribute value that leads here.
        children: list of child nodes, or None while no children are attached.
        isLeaf: True once the node has been marked as a leaf.
    """

    def __init__(self, attr=None, pred=None, class_label=None):
        # Values supplied by the caller.
        self.attr, self.pred, self.class_label = attr, pred, class_label
        # Structural state, filled in later while the tree is built.
        self.isLeaf = False
        self.children = None

class DecisionTreeClassifierID3():
    """ID3-style decision tree for categorical features.

    Each internal node splits on the attribute with the highest information
    gain and has one child per attribute value present in the node's data.
    Every node stores the majority label of its data in ``pred`` so that
    prediction can fall back to it when a feature value has no matching child.
    """

    def __init__(self):
        # Root Node of the fitted tree; None until train() has been called.
        self.root = None

    def isBalanced(self, df):
        """Return True when the label Series ``df`` contains exactly one distinct value."""
        return df.nunique() == 1

    def getEntropy(self, total, df):
        """Return the Shannon entropy (base 2) of the labels in ``df``.

        Args:
            total: denominator used to turn label counts into probabilities;
                for a proper entropy this must be ``len(df)``.
            df: Series of labels.
        """
        counts = df.value_counts()
        # Zero counts (possible with categorical dtypes) would yield 0 * log2(0) = NaN.
        probs = counts[counts > 0] / total
        # "0.0 - ..." rather than unary minus so a pure set gives 0.0, not -0.0.
        return float(0.0 - (probs * np.log2(probs)).sum())

    def gain(self, column, y):
        """Return the information gain of splitting labels ``y`` on ``column``.

        Args:
            column: Series with the attribute values, indexed like ``y``.
            y: Series with the labels.

        Returns:
            H(y) minus the size-weighted entropy of ``y`` within each group of
            equal ``column`` values; 0.0 for empty input.
        """
        total = len(column)
        if total == 0:
            return 0.0

        info_gain = self.getEntropy(total, y)
        for _, subset in y.groupby(column):
            # Probabilities inside a subset are relative to the subset's own
            # size, not the parent's; the parent size only enters the weight.
            weight = len(subset) / total
            info_gain -= weight * self.getEntropy(len(subset), subset)
        return info_gain

    def getMaxGain(self, X, y):
        """Print the information gain of every column of ``X`` and return the best one.

        Returns:
            ``(column_name, gain)`` for the column with the highest gain; on
            ties the first such column in ``X.columns`` order wins.
        """
        gain_dict = {}
        for col in X.columns:
            gain_dict[col] = self.gain(X[col], y)
            print(f"Information Gain for {col}: {gain_dict[col]:.4f}")

        return max(gain_dict.items(), key=lambda item: item[1])

    def buildTree(self, X, y, attr_classes, class_val=None, depth=0, max_depth=10):
        """Recursively build the subtree for the samples ``(X, y)``.

        Args:
            X: DataFrame with the attributes still available for splitting.
            y: Series with the labels of the same rows.
            attr_classes: mapping column -> list of values seen in training.
                Passed through unchanged to recursive calls; kept for
                interface compatibility.
            class_val: value of the parent's split attribute that leads to
                this node (None for the root).
            depth: depth of this node; the root is at depth 0.
            max_depth: depth at which nodes are forced to be leaves.

        Returns:
            The Node at the root of the built subtree.
        """
        # Set the label up front so that every node, including depth-limit
        # leaves, can be matched against a feature value during prediction.
        root = Node(class_label=class_val)

        if self.isBalanced(y):
            root.isLeaf = True
            root.pred = y.iloc[0]
            return root

        if depth >= max_depth or X is None or X.empty:
            # Cannot (or must not) split further: predict the majority label.
            root.isLeaf = True
            root.pred = y.mode()[0]
            return root

        maxGainCol = self.getMaxGain(X, y)[0]
        root.attr = maxGainCol
        # Always keep the majority label as a fallback for feature values
        # that have no child under this node.
        root.pred = y.mode()[0]
        root.children = []

        concat_df = pd.concat([X, y], axis=1)
        # Group by the bare column name (not a one-element list) so the group
        # keys are the plain attribute values rather than 1-tuples.
        for key, value in concat_df.groupby(maxGainCol):
            # y was concatenated last, so it is the final column of each group.
            new_X = value.iloc[:, :-1].drop(columns=maxGainCol)
            new_y = value.iloc[:, -1]
            root.children.append(
                self.buildTree(new_X, new_y, attr_classes, key, depth + 1, max_depth)
            )

        return root

    def printTree(self, root, num_spaces=0):
        """Print the subtree under ``root``, one node per line, tab-indented by depth.

        Each line shows ``<class_label> -> <pred>`` for nodes without children
        and ``<class_label> -> <attr>`` for nodes with children.
        """
        print("\t" * num_spaces, end="")
        print(root.class_label, "->", end=" ")
        if root.children is None:
            print(root.pred)
        else:
            print(root.attr)
            for child in root.children:
                self.printTree(child, num_spaces + 1)

    def treeDepth(self, root):
        """Return the number of edges on the longest path from ``root`` to a leaf."""
        if root.children is None:
            return 0
        return 1 + max(self.treeDepth(child) for child in root.children)

    def train(self, X, y, max_depth=5):
        """Fit the tree on features ``X`` and labels ``y`` and store it in ``self.root``.

        Args:
            X: DataFrame of categorical (or integer-encoded) features.
            y: Series of labels with the same index as ``X``.
            max_depth: maximum depth of the tree.
        """
        # Distinct non-missing values observed for each attribute.
        attr_classes = {col: X[col].dropna().unique().tolist() for col in X.columns}

        self.root = self.buildTree(X, y, attr_classes, max_depth=max_depth)

    def predict_one_example(self, X, root):
        """Return the predicted label for a single row ``X``, starting at ``root``.

        Walks down the tree following the child whose ``class_label`` equals
        the row's value for the node's split attribute. If no child matches,
        the current node's stored prediction is returned.
        """
        node = root
        while not node.isLeaf:
            val = X[node.attr]
            match = next((child for child in node.children if child.class_label == val), None)
            if match is None:
                return node.pred
            node = match
        return node.pred

    def predict(self, X):
        """Return a list with the predicted label for every row of DataFrame ``X``."""
        return [self.predict_one_example(X.iloc[i, :], self.root) for i in range(len(X))]

# Fit the hand-written ID3 tree on the training split
clf = DecisionTreeClassifierID3()
clf.train(train_X, train_y, max_depth=10)

# Show the learned tree, one node per line
print("\nCustom ID3 Decision Tree Structure:")
clf.printTree(clf.root)

# Predict labels for the held-out test rows
pred_y = clf.predict(test_X)

# Accuracy: share of test rows whose prediction equals the true label, in percent
n_correct = np.sum(np.array(pred_y) == test_y.values)
accuracy = n_correct / len(test_y) * 100
print(f"Custom ID3 Decision Tree Test accuracy: {accuracy:.2f}%")

# Report how deep the fitted tree is
tree_depth = clf.treeDepth(clf.root)
print(f"Depth of the Custom ID3 Decision Tree: {tree_depth}")


Information Gain for cap-shape: 0.0414
Information Gain for cap-surface: 0.1077
Information Gain for cap-color: 0.3478
Information Gain for bruises: 0.0641
Information Gain for odor: 0.4604
Information Gain for gill-attachment: 0.0026
Information Gain for gill-spacing: 0.0046
Information Gain for gill-size: 0.0422
Information Gain for gill-color: 0.4214
Information Gain for stalk-shape: 0.1229
Information Gain for stalk-root: 0.0850
Information Gain for stalk-surface-above-ring: 0.2407
Information Gain for stalk-surface-below-ring: 0.2528
Information Gain for stalk-color-above-ring: 0.2489
Information Gain for stalk-color-below-ring: 0.2502
Information Gain for veil-type: 0.0000
Information Gain for veil-color: 0.0013
Information Gain for ring-number: 0.0181
Information Gain for ring-type: 0.2173
Information Gain for spore-print-color: 0.3339
Information Gain for population: 0.2127
Information Gain for habitat: 0.2121
Information Gain for cap-shape: -0.3235
Information Gain for cap-sur

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fit scikit-learn's entropy-based decision tree on the training split
tree_clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
tree_clf.fit(train_X, train_y)

# Validation set: predict, then report accuracy as a percentage
val_pred = tree_clf.predict(val_X)
val_accuracy = 100 * accuracy_score(val_y, val_pred)
print(f"Validation accuracy using scikit-learn: {val_accuracy:.2f}%")

# Test set: predict, then report accuracy as a percentage
tree_pred = tree_clf.predict(test_X)
tree_accuracy = 100 * accuracy_score(test_y, tree_pred)
print(f"Test accuracy using scikit-learn: {tree_accuracy:.2f}%")

# Depth of the fitted scikit-learn tree
print(f"Depth of the Decision Tree: {tree_clf.get_depth()}")


Validation accuracy using scikit-learn: 100.00%
Test accuracy using scikit-learn: 100.00%
Depth of the Decision Tree: 5
