In [33]:
# Load required libraries and dependencies
%matplotlib inline
import pandas as pd
import numpy as np
import sklearn
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [25]:
# Load scikit-learn's built-in breast cancer dataset and keep the returned
# dataset object (data, target, feature names, ...) in `cancer`
cancer = load_breast_cancer()

In [28]:
# Quick look at the dataset: the keys of the loaded object, then the number of samples
print(cancer.keys(), len(cancer.data), sep='\n')

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
569


In [35]:
# Feature matrix: one named column per entry of cancer.feature_names
x = pd.DataFrame(cancer.data, columns=cancer.feature_names)
# Target values, kept as a single-column dataframe
y = pd.DataFrame(data=cancer.target)
# Display the feature matrix
x

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [32]:
# Hold-out split: 80% of the rows for training, the rest for testing.
# random_state is fixed so the same rows end up in each part on every run.
(x_train, x_test,
 y_train, y_test) = train_test_split(x, y, train_size=0.8, random_state=1)

In [31]:
# Sanity check of the split: row count of the full data, then of
# x_train, y_train, x_test and y_test (one number per line, in that order)
for part in (cancer.data, x_train, y_train, x_test, y_test):
    print(len(part))

569
455
455
114
114


The cancer dataset has 569 rows.<br>
80% of 569 is 455.2, rounded down to 455, which is the length of the training dataset.<br>
The remaining 114 rows (about 20% of 569) make up the test dataset.<br>

### Deciding on what values to choose for the parameters in the grid search:

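sample size = 569

size of the training data = 80% of 569 = 455.2 ~ 455

10% of 455 = 45.5 ~ 45 (used as the smallest value tried for `min_samples_split`)

5% of 455 = 22.75 ~ 23 (used as the smallest value tried for `min_samples_leaf`)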


In [36]:
# First (coarse) parameter grid for the grid search.
# min_samples_split and min_samples_leaf move in steps of 5, starting at
# 45 and 23 respectively.
param_grid1 = dict(
    criterion=['gini', 'entropy'],
    max_depth=np.arange(1, 11),                # depths 1..10
    min_samples_split=np.arange(45, 190, 5),   # 45, 50, ..., 185
    min_samples_leaf=np.arange(23, 45, 5),     # 23, 28, ..., 43
    min_impurity_decrease=[0, 0.0005, 0.001, 0.005, 0.01, 0.05],
)

In [37]:
# Build the first grid search: 5-fold cross-validation over param_grid1, scored on accuracy.
# random_state is fixed on the estimator because scikit-learn's decision tree randomly
# permutes the candidate features at every split; without a seed, ties between equally
# good splits can be broken differently on each run, so best_params_ / best_score_
# would not be reproducible. The seed (1) is the same one used for train_test_split.
gridSearch1 = GridSearchCV(
    DecisionTreeClassifier(random_state=1),
    param_grid1,
    cv=5,
    scoring='accuracy',
)

In [38]:
# Run the first grid search on the training data only (the test set stays untouched)
gridSearch1.fit(X=x_train, y=y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
                         'min_impurity_decrease': [0, 0.0005, 0.001, 0.005,
                                                   0.01, 0.05],
                         'min_samples_leaf': array([23, 28, 33, 38, 43]),
                         'min_samples_split': array([ 45,  50,  55,  60,  65,  70,  75,  80,  85,  90,  95, 100, 105,
       110, 115, 120, 125, 130, 135, 140, 145, 150, 155, 160, 165, 170,
       175, 180, 185])},
             scoring='accuracy')

In [39]:
# Report the best parameter combination of the first search and its mean CV accuracy
for label, value in (
    ('Initial parameters for best tree: ', gridSearch1.best_params_),
    ('Initial score for best tree (CV accuracy): ', gridSearch1.best_score_),
):
    print(label, value)

Initial parameters for best tree:  {'criterion': 'entropy', 'max_depth': 3, 'min_impurity_decrease': 0, 'min_samples_leaf': 23, 'min_samples_split': 45}
Initial score for best tree (CV accuracy):  0.9296703296703297


In [40]:
# Second (finer) parameter grid: every integer value in a narrower range,
# with the criterion restricted to 'entropy'.
param_grid2 = dict(
    criterion=['entropy'],
    max_depth=np.arange(1, 5),              # depths 1..4
    min_samples_split=np.arange(20, 71),    # 20, 21, ..., 70
    min_samples_leaf=np.arange(13, 34),     # 13, 14, ..., 33
    min_impurity_decrease=[0, 0.0005, 0.001],
)

In [41]:
# Build the second grid search: 5-fold cross-validation over param_grid2, scored on accuracy.
# random_state is fixed on the estimator because scikit-learn's decision tree randomly
# permutes the candidate features at every split; without a seed, ties between equally
# good splits can be broken differently on each run, so best_params_ / best_score_
# would not be reproducible. The seed (1) is the same one used for train_test_split.
gridSearch2 = GridSearchCV(
    DecisionTreeClassifier(random_state=1),
    param_grid2,
    cv=5,
    scoring='accuracy',
)

In [42]:
# Run the second grid search on the training data only (the test set stays untouched)
gridSearch2.fit(X=x_train, y=y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy'],
                         'max_depth': array([1, 2, 3, 4]),
                         'min_impurity_decrease': [0, 0.0005, 0.001],
                         'min_samples_leaf': array([13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
       30, 31, 32, 33]),
                         'min_samples_split': array([20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
       54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70])},
             scoring='accuracy')

In [43]:
# Report the best parameter combination of the second search and its mean CV accuracy
for label, value in (
    ('Improved parameters for best tree: ', gridSearch2.best_params_),
    ('Improved score for best tree (CV accuracy): ', gridSearch2.best_score_),
):
    print(label, value)

Improved parameters for best tree:  {'criterion': 'entropy', 'max_depth': 3, 'min_impurity_decrease': 0, 'min_samples_leaf': 22, 'min_samples_split': 21}
Improved score for best tree (CV accuracy):  0.9318681318681319


In [45]:
# Create the pre-pruned classification tree from the best parameters found by the
# second grid search. best_params_ is unpacked directly (instead of retyping the values
# from the printed output) so the model cannot drift out of sync with the search result
# if the grid, the data split or the search outcome changes.
# random_state=1 keeps the fitted tree reproducible.
tree_cancer_prepru = DecisionTreeClassifier(**gridSearch2.best_params_, random_state=1)

In [46]:
# Train the pre-pruned tree on the training split
tree_cancer_prepru.fit(X=x_train, y=y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=3,
                       min_impurity_decrease=0, min_samples_leaf=22,
                       min_samples_split=21, random_state=1)

In [None]:
# Report the test prediction error rate of the tree that you obtained
# Predict on the held-out test split, which was not used for the grid searches or the fit.
y_test_pred = tree_cancer_prepru.predict(x_test)
# y_test is a single-column dataframe; flatten it so both label arrays are 1-D.
# Error rate = 1 - accuracy (share of misclassified test samples).
test_error_rate = 1 - accuracy_score(np.ravel(y_test), y_test_pred)
print('Test prediction error rate: ', test_error_rate)
