In [None]:
# import the libraries
import pandas as pd
import numpy as np

In [None]:
# Read the raw CSV into a DataFrame, then print a per-column summary
# (dtype and non-null count) so gaps in the data are visible early.
dataset_path = "original_dataset.csv"
df = pd.read_csv(dataset_path)
df.info()

In [None]:
# Count the NaN entries in each column of the raw frame.
# The resulting Series is indexed by column name.
missing_data = df.isna().sum()
print(missing_data)

In [None]:
# A column is kept when its NaN count is at most 30% of the row count.
max_missing_allowed = len(df) * 0.3
columns_to_keep = missing_data.le(max_missing_allowed)

# Drop the sparse columns first, then discard every row that still has a NaN.
df_no_nan = df.loc[:, columns_to_keep].dropna()

# Sanity check: every remaining column should now report zero missing values.
missing_dropped = df_no_nan.isna().sum()
print(missing_dropped)

In [None]:
# Preview the first rows of the cleaned dataset.
# `head` has to be called: printing the bare attribute shows the bound-method
# repr instead of the first-rows preview.
print(df_no_nan.head())

In [None]:
# The first column of the cleaned frame is the target; every other column is a predictor.
predictor_frame = df_no_nan.iloc[:, 1:]
target_series = df_no_nan.iloc[:, 0]

# Work with the underlying arrays from here on.
X = predictor_frame.values
y = target_series.values
print("Predictors:", X)
print("Target value:", y)

In [None]:
# Positional indices (within X) of the categorical predictor columns.
X_categorical_cols = [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13]

# One-hot encode the categorical predictors; all other columns pass through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword. Try the current spelling first and fall back for older installs so
# the cell runs on either side of the rename.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

ct = ColumnTransformer(
    transformers=[('encoder', one_hot_encoder, X_categorical_cols)],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))
print(X)

In [None]:
# Encode the target labels in y as integer class codes.
# The fitted encoder is kept in `le` so the codes can be mapped back to the
# original labels later (via le.classes_ / le.inverse_transform).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
print(y)

In [None]:
from Mushroom_project.Tree import train_test_split
# Split the encoded data with the project's own train_test_split.
# NOTE(review): test_size=0.2 / random_state=42 presumably mean a 20% hold-out
# with a fixed seed, as in scikit-learn — confirm against Mushroom_project.Tree.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from Mushroom_project.Tree import DecisionTreeClassifier
# Baseline tree with hand-picked hyperparameters; the grid search further
# down looks for better values.
classifier = DecisionTreeClassifier(
    min_samples=2,
    max_depths=5,
)

In [None]:
# Fit the baseline tree on the training split.
classifier.fit(X_train, y_train)

In [None]:
# 0-1 loss of the fitted baseline tree, evaluated on the training split
# first and the test split second.
train_error, test_error = (
    classifier.zero_one_loss(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print(f"Training Error: {train_error:.4f}")
print(f"Testing Error: {test_error:.4f}")

In [None]:
# Predict both splits with the baseline tree, then score each prediction
# against its true labels.
y_train_pred, y_test_pred = (
    classifier.predict(features) for features in (X_train, X_test)
)
train_accuracy = classifier.accuracy(y_train, y_train_pred)
test_accuracy = classifier.accuracy(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Sweep the maximum tree depth and record 0-1 loss and accuracy on both splits.
# From here on the four names below hold one entry per depth (they replace the
# scalar baseline values computed earlier in the notebook).
depths = range(1, 30)
train_error = []
train_accuracy = []
test_error = []
test_accuracy = []
for max_depth in depths:
    # Train a fresh tree for every depth so each point reflects that depth only.
    depth_classifier = DecisionTreeClassifier(min_samples=2, max_depths=max_depth)
    depth_classifier.fit(X_train, y_train)

    train_error.append(depth_classifier.zero_one_loss(X_train, y_train))
    test_error.append(depth_classifier.zero_one_loss(X_test, y_test))
    train_accuracy.append(
        depth_classifier.accuracy(y_train, depth_classifier.predict(X_train))
    )
    test_accuracy.append(
        depth_classifier.accuracy(y_test, depth_classifier.predict(X_test))
    )

# Plot against the actual depth values so the x-axis starts at 1, not at list index 0.
plt.plot(depths, train_error, label="Train Error")
plt.plot(depths, test_error, label="Test Error")
plt.xlabel("Max depth")
plt.ylabel("0-1 Loss")
plt.legend()
plt.show()

In [None]:
# Accuracy versus tree depth for both splits.
# The x-axis needs exactly one depth per recorded score, so it is derived from
# the data (the sweep starts at depth 1); a hard-coded range of a different
# length makes plt.plot raise a shape-mismatch error.
depth_axis = range(1, len(train_accuracy) + 1)
plt.plot(depth_axis, train_accuracy, label="Train accuracy")
plt.plot(depth_axis, test_accuracy, label="Test accuracy")
plt.xlabel("Max depth")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

#### Hyperparameter Tuning

In [None]:
# Candidate hyperparameter values explored by the grid search.
# Each list holds distinct values only: a repeated entry would make the search
# evaluate the same combination more than once.
# NOTE(review): the constructor calls in this notebook use `min_samples` /
# `max_depths`, while the keys here are `min_samples_split` / `max_depth` —
# confirm that grid_search_cv maps these names onto the classifier.
param_grid = {
    'max_depth': [2, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 15, 20],
    'criterion': ['gini', 'entropy', 'chi_square']
}

In [None]:
from Mushroom_project.Tree import grid_search_cv
# Run the project's grid search over param_grid on the training split
# (cv=5, n_jobs=-1) and unpack its two results: the winning parameters and
# their score.
search_outcome = grid_search_cv(
    classifier=classifier,
    param_grid=param_grid,
    X_train=X_train,
    y_train=y_train,
    cv=5,
    n_jobs=-1,
)
best_params, best_accuracy_score = search_outcome
print("Best parameters: ", best_params)
print("Best cross-validation score: ", best_accuracy_score)

In [None]:
# Build a tree from the hyperparameters selected by the grid search.
# NOTE(review): assumes best_params is a dict keyed like param_grid
# ('min_samples_split', 'max_depth', 'criterion') — confirm against
# grid_search_cv's return value.
better_classifier = DecisionTreeClassifier(
    min_samples=best_params['min_samples_split'],
    max_depths=best_params['max_depth'],
    criterion=best_params['criterion'],
)