In [None]:
import pandas as pd
import numpy as np

# Load the raw mushroom data and preview it before any cleaning.
raw_df = pd.read_csv('mushrooms.csv')

print(raw_df.head())

# "?" is used as the missing-value marker in this file, so it must be converted
# to NaN *before* counting nulls; otherwise the report below under-counts.
df = raw_df.replace("?", np.nan)

print("Missing values:\n", df.isnull().sum())

# Drop incomplete rows without mutating in place, so re-running this cell
# always starts from the untouched raw frame.
df = df.dropna()

  class cap-shape cap-surface cap-color bruises odor gill-attachment  \
0     p         x           s         n       t    p               f   
1     e         x           s         y       t    a               f   
2     e         b           s         w       t    l               f   
3     p         x           y         w       t    p               f   
4     e         x           s         g       f    n               f   

  gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0            c         n          k  ...                        s   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            c         n          n  ...                        s   
4            w         b          k  ...                        s   

  stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
0                      w                      w         p          w   
1       

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stage 1: stratified 60/40 split of the cleaned data on the 'class' label.
split1 = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=42)
train_index, remaining_index = next(split1.split(df, df['class']))
train_df, remaining_df = df.iloc[train_index], df.iloc[remaining_index]

# Stage 2: halve the held-out 40% into validation and test sets.
split2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_index, test_index = next(split2.split(remaining_df, remaining_df['class']))
val_df, test_df = remaining_df.iloc[val_index], remaining_df.iloc[test_index]

# Separate the features from the target column for each partition.
train_X, train_y = train_df.drop(columns='class'), train_df['class']
val_X, val_y = val_df.drop(columns='class'), val_df['class']
test_X, test_y = test_df.drop(columns='class'), test_df['class']

for label, frame in (("Training", train_X), ("Validation", val_X), ("Testing", test_X)):
    print(f"{label} set shape:", frame.shape)

Training set shape: (3386, 22)
Validation set shape: (1129, 22)
Testing set shape: (1129, 22)


In [None]:
class Node:
    """A single node of the decision tree.

    Attributes:
        attr: Name of the attribute this node splits on (None until assigned).
        pred: Label predicted at this node.
        class_label: Value of the parent's split attribute that leads to this node.
        isLeaf: True when the node is a leaf; starts as False.
        children: List of child nodes, or None while the node has none.
    """

    def __init__(self, attr=None, pred=None, class_label=None) -> None:
        self.attr, self.pred, self.class_label = attr, pred, class_label
        # A fresh node is an internal node with no branches attached yet.
        self.isLeaf = False
        self.children = None


In [None]:
class DecisionTreeClassifierID3():
    """ID3 decision tree for categorical features, split by information gain.

    Every internal node gets one child per attribute value observed in the
    training data passed to ``train``. Nodes are ``Node`` instances.
    """

    def __init__(self, verbose=True):
        """Create an untrained classifier.

        Args:
            verbose: When True (default), print the information gain of every
                candidate attribute at every split while training.
        """
        self.root = None
        self.verbose = verbose

    def isBalanced(self, df):
        """Return True when the label Series ``df`` holds exactly one distinct value."""
        return len(df.unique()) == 1

    def getEntropy(self, df):
        """Return the Shannon entropy (in bits) of the label Series ``df``.

        An empty Series yields 0.
        """
        probs = df.value_counts(normalize=True)
        # Zero-frequency entries (e.g. unused categories of a categorical
        # dtype) would produce 0 * log2(0) = NaN, so leave them out.
        probs = probs[probs > 0]
        return float(-(probs * np.log2(probs)).sum())

    def gain(self, X, y, attr):
        """Return the information gain obtained by splitting ``y`` on ``X[attr]``."""
        n_rows = len(y)
        if n_rows == 0:
            # Nothing to split: avoids a division by zero below.
            return 0.0

        column = X[attr]
        weighted_entropy = 0.0
        for val in column.unique():
            sub_y = y[column == val]
            weighted_entropy += (len(sub_y) / n_rows) * self.getEntropy(sub_y)

        return self.getEntropy(y) - weighted_entropy

    def getMaxGain(self, X, y):
        """Return the column of ``X`` with the highest information gain.

        Ties are resolved in favour of the earliest column. Gains are printed
        when ``self.verbose`` is True.
        """
        gains = {}
        for attr in X.columns:
            gains[attr] = self.gain(X, y, attr)
            if self.verbose:
                print(f"Information Gain for {attr}: {gains[attr]:.4f}")
        return max(gains, key=gains.get)

    def buildTree(self, X, y, attr_classes, class_val=None, depth=0, max_depth=None,
                  default_pred=None):
        """Recursively build the subtree for the rows in ``X`` / ``y``.

        Args:
            X: Feature frame restricted to the rows reaching this node, with
                the attributes already used on the path removed.
            y: Labels aligned with ``X``.
            attr_classes: Mapping of attribute name to the list of values to
                branch on.
            class_val: Value of the parent's attribute that leads to this node.
            depth: Depth of this node (the root is 0).
            max_depth: Stop splitting once ``depth`` reaches this value; None
                means no limit.
            default_pred: Prediction to fall back on when ``y`` has no mode,
                i.e. no training row reached this branch.

        Returns:
            The ``Node`` at the root of the constructed subtree.
        """
        root = Node()
        root.class_label = class_val

        # Majority label at this node. A branch that no training row reaches
        # inherits its parent's majority instead of predicting None.
        modes = y.mode()
        node_pred = modes.iloc[0] if not modes.empty else default_pred
        root.pred = node_pred

        depth_exhausted = max_depth is not None and depth >= max_depth
        if self.isBalanced(y) or X.empty or depth_exhausted:
            root.isLeaf = True
            return root

        best_attr = self.getMaxGain(X, y)
        root.attr = best_attr
        root.children = []

        for val in attr_classes[best_attr]:
            # Compute the row mask once and reuse it for features and labels.
            mask = X[best_attr] == val
            child = self.buildTree(
                X[mask].drop(columns=[best_attr]),
                y[mask],
                attr_classes,
                val,
                depth + 1,
                max_depth,
                default_pred=node_pred,
            )
            root.children.append(child)

        return root

    def printTree(self, root, num_spaces=0):
        """Print the subtree under ``root``, one node per line, tab-indented by depth."""
        if root is None:
            return
        print("\t" * num_spaces, end="")
        if root.class_label is not None:
            print(f"[{root.class_label}] ", end="")
        if root.isLeaf:
            print(f"Leaf: {root.pred}")
        else:
            print(f"Attribute: {root.attr}")
            for child in root.children:
                self.printTree(child, num_spaces + 1)

    def train(self, X, y, max_depth=None):
        """Fit the tree on features ``X`` and labels ``y``.

        Args:
            X: DataFrame of categorical features.
            y: Series of class labels aligned with ``X``.
            max_depth: Optional depth limit (pre-pruning).
        """
        # Record every value seen per attribute so each split creates a branch
        # for all of them, even those absent from a particular subset.
        attr_classes = {col: X[col].unique().tolist() for col in X.columns}
        self.root = self.buildTree(X, y, attr_classes, max_depth=max_depth)

    def predict_one_example(self, x, root):
        """Return the prediction for the single example ``x`` starting at ``root``.

        When ``x`` carries an attribute value with no matching branch, the
        prediction stored on the current node is returned.
        """
        if root.isLeaf or root.children is None:
            return root.pred
        attr_value = x.get(root.attr)
        for child in root.children:
            if child.class_label == attr_value:
                return self.predict_one_example(x, child)
        return root.pred

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``."""
        return [self.predict_one_example(row, self.root) for _, row in X.iterrows()]

In [None]:
# Depth cap used while growing the tree (pre-pruning); also reused for the
# scikit-learn comparison further down.
max_depth = 6

print("\nTraining the Decision Tree with Pre-Pruning...")

# Fit the ID3 tree on the training partition only.
clf = DecisionTreeClassifierID3()
clf.train(X=train_X, y=train_y, max_depth=max_depth)


Training the Decision Tree with Pre-Pruning...
Information Gain for cap-shape: 0.0168
Information Gain for cap-surface: 0.0044
Information Gain for cap-color: 0.1952
Information Gain for bruises: 0.1291
Information Gain for odor: 0.8548
Information Gain for gill-attachment: 0.0033
Information Gain for gill-spacing: 0.0641
Information Gain for gill-size: 0.0296
Information Gain for gill-color: 0.2164
Information Gain for stalk-shape: 0.2719
Information Gain for stalk-root: 0.1013
Information Gain for stalk-surface-above-ring: 0.4209
Information Gain for stalk-surface-below-ring: 0.4078
Information Gain for stalk-color-above-ring: 0.2977
Information Gain for stalk-color-below-ring: 0.2744
Information Gain for veil-type: 0.0000
Information Gain for veil-color: 0.0012
Information Gain for ring-number: 0.0111
Information Gain for ring-type: 0.4631
Information Gain for spore-print-color: 0.5890
Information Gain for population: 0.1086
Information Gain for habitat: 0.1012
Information Gain for

In [None]:
def _predict_from(node, x):
    """Classify example ``x`` by walking the subtree rooted at ``node``.

    Mirrors ``DecisionTreeClassifierID3.predict_one_example`` so that pruning
    does not depend on a global classifier instance.
    """
    while not (node.isLeaf or node.children is None):
        value = x.get(node.attr)
        for child in node.children:
            if child.class_label == value:
                node = child
                break
        else:
            # No branch for this attribute value: use this node's prediction.
            return node.pred
    return node.pred


def prune_tree(node, X_val, y_val):
    """Reduced-error post-pruning of the subtree rooted at ``node``, in place.

    Children are pruned first (bottom-up). The node is then collapsed into a
    leaf predicting ``node.pred`` unless that would classify strictly fewer of
    the validation rows reaching it correctly than the current subtree does.
    A node reached by no validation rows is collapsed (tie at zero).

    Args:
        node: Tree node exposing ``attr``, ``children``, ``isLeaf``, ``pred``
            and, on its children, ``class_label``.
        X_val: Validation features for the rows that reach ``node``.
        y_val: Validation labels aligned with ``X_val``.

    Returns:
        None. The tree is modified in place.
    """
    if node.isLeaf or node.children is None:
        return

    # Bottom-up: prune each child on the validation rows routed to it.
    for child in node.children:
        reaches_child = X_val[node.attr] == child.class_label
        prune_tree(
            child,
            X_val[reaches_child].drop(columns=[node.attr]),
            y_val[reaches_child],
        )

    y_true = y_val.tolist()

    # Both candidates are scored on the same rows, so comparing hit counts is
    # equivalent to comparing accuracies and avoids NaN on an empty subset.
    subtree_hits = sum(
        _predict_from(node, x) == truth
        for (_, x), truth in zip(X_val.iterrows(), y_true)
    )
    leaf_hits = sum(node.pred == truth for truth in y_true)

    if leaf_hits >= subtree_hits:
        # Collapsing does not hurt validation performance: keep the simpler tree.
        node.isLeaf = True
        node.children = None

In [None]:
print("\nApplying Post-Pruning...")
# Prune the trained tree in place, scored against the validation partition.
prune_tree(node=clf.root, X_val=val_X, y_val=val_y)


Applying Post-Pruning...


In [None]:
print("\nDecision Tree Structure After Pruning:")
# Dump the tree from the root; each level is indented by one tab.
clf.printTree(root=clf.root)


Decision Tree Structure After Pruning:
Attribute: odor
	[f] Leaf: p
	[n] Attribute: spore-print-color
		[h] Leaf: None
		[k] Leaf: e
		[n] Leaf: e
		[w] Attribute: cap-color
			[g] Leaf: e
			[e] Leaf: None
			[n] Leaf: e
			[p] Leaf: e
			[y] Leaf: p
			[w] Leaf: p
			[b] Leaf: None
			[c] Leaf: e
		[u] Leaf: None
		[r] Leaf: p
	[c] Leaf: p
	[a] Leaf: e
	[p] Leaf: p
	[l] Leaf: e
	[m] Leaf: p


In [None]:
# Score the tree on the held-out test partition.
pred_y = clf.predict(test_X)

# Element-wise match between predicted and true labels, as a percentage.
hits = np.asarray(pred_y) == test_y.to_numpy()
accuracy = hits.mean() * 100
print(f"\nAccuracy on Test Set: {accuracy:.2f}%")


Accuracy on Test Set: 100.00%


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Baseline: scikit-learn's tree fitted on train + validation, scored on test.
combined_X = pd.concat([train_X, val_X])
combined_y = pd.concat([train_y, val_y])

# Encode the test features into a separate frame. Writing the integer codes
# back into test_X would mutate a slice of df, fail when this cell is re-run
# (the codes are not known categories), and break re-evaluation of the custom
# tree, which expects the raw categorical values.
test_X_encoded = test_X.copy()

label_encoders = {}
for column in combined_X.columns:
    le = LabelEncoder()
    combined_X[column] = le.fit_transform(combined_X[column])
    test_X_encoded[column] = le.transform(test_X[column])
    label_encoders[column] = le

le_target = LabelEncoder()
combined_y = le_target.fit_transform(combined_y)
test_y_encoded = le_target.transform(test_y)

sklearn_clf = DecisionTreeClassifier(random_state=42, max_depth=max_depth)
sklearn_clf.fit(combined_X, combined_y)

sklearn_pred = sklearn_clf.predict(test_X_encoded)

sklearn_accuracy = accuracy_score(test_y_encoded, sklearn_pred) * 100
print(f"Scikit-learn Decision Tree Accuracy with max_depth={max_depth}: {sklearn_accuracy:.2f}%")

Scikit-learn Decision Tree Accuracy with max_depth=6: 100.00%
