# Decision Tree for Classification
In this notebook, we build a decision tree classifier with scikit-learn to classify samples from the breast cancer dataset.

## Import Libraries

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV       # for hyper-parameter tuning

## Load Dataset

In [2]:
# Load the breast cancer dataset that ships with scikit-learn.
dataset = datasets.load_breast_cancer()

# Separate the feature values (X) from the class labels (y).
X = dataset.data
y = dataset.target

In [3]:
# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible.
# NOTE(review): the split is not stratified -- consider `stratify=y` if the class
# proportions should be preserved in both subsets (this would change the results).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

## Fit Model

In [4]:
# Candidate hyper-parameter values; the grid search tries every combination.
param_grid = dict(
    # impurity measure used to rate candidate splits
    criterion=['gini', 'entropy'],
    # upper bound on the depth of the tree: 1, 10, 19, ..., 91
    max_depth=np.arange(1, 100, 9),
    # minimum number of samples a node must hold before it may be split
    min_samples_split=[2, 5, 10],
    # minimum number of samples that must end up in every leaf
    min_samples_leaf=[1, 2, 4],
)

# Base estimator; the fixed seed keeps tree construction reproducible.
clf = DecisionTreeClassifier(random_state=42)

# Exhaustive search over `param_grid`, scoring each candidate by accuracy
# under 5-fold cross-validation and using all available CPU cores (n_jobs=-1).
# `fit` returns the fitted search object, so it can be chained directly.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Results of the search: the winning parameter combination, its score,
# and the corresponding estimator.
best_clf = grid_search.best_estimator_
best_score = grid_search.best_score_
best_parameters = grid_search.best_params_

# Report the winning hyper-parameter combination, one "name: value" per line.
print("Best Parameters:")
for k, v in best_parameters.items():
    print(f"{k}: {v}")

# `best_score_` is the mean cross-validated accuracy of the best candidate
# (averaged over the validation folds), not the accuracy on the training set,
# so it is labelled accordingly.
print(f"Cross-Validation Accuracy: {best_score*100:.2f}")

Best Parameters:
criterion: entropy
max_depth: 10
min_samples_leaf: 1
min_samples_split: 10
Training Accuracy: 94.73


## Testing

In [5]:
# Predict labels for the held-out test samples with the best estimator.
y_pred = best_clf.predict(X_test)

# Accuracy is the fraction of predictions that match the true labels.
matches = (y_pred == y_test)
accuracy = np.mean(matches)
print(f"Accuracy: {accuracy*100:.2f}")

Accuracy: 95.61
