In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from matplotlib import pyplot
from pandas.plotting import scatter_matrix
from sklearn.model_selection import  train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier

#declaring header names
winsconsin_headers = ['sample_code','c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses', 'tumor_class']

#read the data using read_csv class of pandas
wins_data = read_csv("venv/winsconsin_b_cancer (1).csv" ,names= winsconsin_headers)

print(wins_data.shape)

wins_data.drop('sample_code', axis=1, inplace=True)
print(wins_data.shape)
#check all datas are numbers and convert any non-numeric characters to null value
wins_data=wins_data.apply(pd.to_numeric, errors='coerce')
#print(wins_data.apply(pd.to_numeric, errors='coerce').info()) #this will give the datatype info after the conversion

#declaring a new header
new_winsconsin_headers = ['c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses', 'tumor_class']

#since ? cannot be converted to int we can convert all data to float
wins_data[new_winsconsin_headers] = wins_data[new_winsconsin_headers].applymap(float)
#print(wins_data.dtypes)

#use the simple imputer function to replace missing value
imputer = SimpleImputer (strategy = 'median') # replace most_frequent with median, mean and observe
imputer.fit(wins_data)
new_data = imputer.transform(wins_data)
#reassign the new data frame
wins_data = pd.DataFrame(new_data, columns=new_winsconsin_headers)

#recheck the data for missing values
win_empty_data = wins_data[wins_data.isna().any(axis=1)]
#print('\n These are the missing data \n ', win_empty_data)

#seperate the data into xtrain and y test groups  - training and target sets
train_headers = ['c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses']
target_header = ['tumor_class']

X = wins_data[train_headers]
y = wins_data[target_header]
#split the data into train and test -- split  using 60:40
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1,stratify=y)

#check the dimension of the train and test data
print('\n The total of training dataset', X_train.shape)
print('\n The total of test dataset', X_test.shape)

#instantiate the model
my_model = DecisionTreeClassifier()

#train the model to fit
my_model.fit(X_train, y_train)
#now let's predict the model
y_pred_train = my_model.predict(X_train)

y_pred = my_model.predict(X_test)

#compute the train accuracy
model_acc = accuracy_score(y_train, y_pred_train)
print("Model accuracy on Train data: {:.2f}".format(model_acc), '\n')


#compute test set accuracy
model_accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy on Test data: {:.2f}".format(model_accuracy), '\n')

#constructing a confusion matrix  of the test data
matrix_info = confusion_matrix(y_test, y_pred)
print("The Confusion Matrix: \n", matrix_info, '\n')

#construct the classification report
class_report = classification_report(y_test, y_pred)
print("Report of classification: \n", class_report)

(699, 11)
(699, 10)

 The total of training dataset (489, 9)

 The total of test dataset (210, 9)
Model accuracy on Train data: 1.00 

Model accuracy on Test data: 0.95 

The Confusion Matrix: 
 [[134   4]
 [  7  65]] 

Report of classification: 
               precision    recall  f1-score   support

         2.0       0.95      0.97      0.96       138
         4.0       0.94      0.90      0.92        72

    accuracy                           0.95       210
   macro avg       0.95      0.94      0.94       210
weighted avg       0.95      0.95      0.95       210



    Tuning the model

In [10]:
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from matplotlib import pyplot
from pandas.plotting import scatter_matrix
from sklearn.model_selection import  train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier

#declaring header names
winsconsin_headers = ['sample_code','c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses', 'tumor_class']

#read the data using read_csv class of pandas
wins_data = read_csv("venv/winsconsin_b_cancer (1).csv" ,names= winsconsin_headers)

print(wins_data.shape)

wins_data.drop('sample_code', axis=1, inplace=True)
print(wins_data.shape)
#check all datas are numbers and convert any non-numeric characters to null value
wins_data=wins_data.apply(pd.to_numeric, errors='coerce')
#print(wins_data.apply(pd.to_numeric, errors='coerce').info()) #this will give the datatype info after the conversion

#declaring a new header
new_winsconsin_headers = ['c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses', 'tumor_class']

#since ? cannot be converted to int we can convert all data to float
wins_data[new_winsconsin_headers] = wins_data[new_winsconsin_headers].applymap(float)
#print(wins_data.dtypes)

#use the simple imputer function to replace missing value
imputer = SimpleImputer (strategy = 'median') # replace most_frequent with median, mean and observe
imputer.fit(wins_data)
new_data = imputer.transform(wins_data)
#reassign the new data frame
wins_data = pd.DataFrame(new_data, columns=new_winsconsin_headers)

#recheck the data for missing values
win_empty_data = wins_data[wins_data.isna().any(axis=1)]
#print('\n These are the missing data \n ', win_empty_data)

#seperate the data into xtrain and y test groups  - training and target sets
train_headers = ['c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses']
target_header = ['tumor_class']

X = wins_data[train_headers]
y = wins_data[target_header]

#split the data into train and test -- split  using 60:40
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1,stratify=y)

#check the dimension of the train and test data
print('\n The total of training dataset', X_train.shape)
print('\n The total of test dataset', X_test.shape)

#declare a seed variable to ensure reproducibility
SEED =1
#instantiate the model and set the hyperparameters
my_model = DecisionTreeClassifier(max_depth=6, min_samples_leaf=0.01, random_state=SEED)#manually tuning our model

#train the model to fit
my_model.fit(X_train, y_train)
#now let's predict the model
y_pred_train = my_model.predict(X_train)

y_pred = my_model.predict(X_test)

#compute the train accuracy
model_acc = accuracy_score(y_train, y_pred_train)
print("Model accuracy on Train data: {:.2f}".format(model_acc), '\n')


#compute test set accuracy
model_accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy on Test data: {:.2f}".format(model_accuracy), '\n')

#constructing a confusion matrix  of the test data
matrix_info = confusion_matrix(y_test, y_pred)
print("The Confusion Matrix: \n", matrix_info, '\n')

#construct the classification report
class_report = classification_report(y_test, y_pred)
print("Report of classification: \n", class_report)

(699, 11)
(699, 10)

 The total of training dataset (629, 9)

 The total of test dataset (70, 9)
Model accuracy on Train data: 0.97 

Model accuracy on Test data: 0.89 

The Confusion Matrix: 
 [[41  5]
 [ 3 21]] 

Report of classification: 
               precision    recall  f1-score   support

         2.0       0.93      0.89      0.91        46
         4.0       0.81      0.88      0.84        24

    accuracy                           0.89        70
   macro avg       0.87      0.88      0.88        70
weighted avg       0.89      0.89      0.89        70



    Generalisation Error - overfitting and cv

In [24]:
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from matplotlib import pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.model_selection import  train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#declaring header names
winsconsin_headers = ['sample_code','c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses', 'tumor_class']

#read the data using read_csv class of pandas
wins_data = read_csv("venv/winsconsin_b_cancer (1).csv" ,names= winsconsin_headers)

print(wins_data.shape)

wins_data.drop('sample_code', axis=1, inplace=True)
print(wins_data.shape)
#check all datas are numbers and convert any non-numeric characters to null value
wins_data=wins_data.apply(pd.to_numeric, errors='coerce')
#print(wins_data.apply(pd.to_numeric, errors='coerce').info()) #this will give the datatype info after the conversion

#declaring a new header
new_winsconsin_headers = ['c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses', 'tumor_class']

#since ? cannot be converted to int we can convert all data to float
wins_data[new_winsconsin_headers] = wins_data[new_winsconsin_headers].applymap(float)
#print(wins_data.dtypes)

#use the simple imputer function to replace missing value
imputer = SimpleImputer (strategy = 'median') # replace most_frequent with median, mean and observe
imputer.fit(wins_data)
new_data = imputer.transform(wins_data)
#reassign the new data frame
wins_data = pd.DataFrame(new_data, columns=new_winsconsin_headers)

#recheck the data for missing values
win_empty_data = wins_data[wins_data.isna().any(axis=1)]
#print('\n These are the missing data \n ', win_empty_data)

#seperate the data into xtrain and y test groups  - training and target sets
train_headers = ['c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses']
target_header = ['tumor_class']

X = wins_data[train_headers]
y = wins_data[target_header]

#split the data into train and test -- split  using 60:40
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1,stratify=y)

#check the dimension of the train and test data
print('\n The total of training dataset', X_train.shape)
print('\n The total of test dataset', X_test.shape)

#declare a seed variable to ensure reproducibility
SEED =1
#instantiate the model and set the hyperparameters
#my_model = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.05, random_state=SEED, criterion = 'gini')#manually tuning our model
#my_model = DecisionTreeClassifier()
#the best hyperparameters
my_model = DecisionTreeClassifier(max_depth=4, min_samples_leaf=1, random_state=SEED, criterion = 'entropy', min_samples_split=8)
#train the model to fit
my_model.fit(X_train, y_train)

#declare the num of folds
num_folds = KFold(n_splits=10, random_state=1, shuffle=True) # test b changing splits to 3, 5 and 10

#compute the array containing the 10 folds and calculate the cross validation mean score
CV_scores = -cross_val_score(my_model, X_train, y_train, cv=num_folds)
#std for the computation range for this generalisation and mean to know how well the model generalises
print("\n Cross val mean: {:.3f} (std: {:.3f})".format(CV_scores.mean()*-1, CV_scores.std()), end="\n\n")

#now let's predict the model for training set
y_pred_train = my_model.predict(X_train)

#now let predict the model for the test set
y_pred = my_model.predict(X_test)

#compute the train accuracy
model_acc = accuracy_score(y_train, y_pred_train)
print("Model accuracy on Train data: {:.2f}".format(model_acc), '\n')


#compute test set accuracy
model_accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy on Test data: {:.2f}".format(model_accuracy), '\n')

#constructing a confusion matrix  of the test data
matrix_info = confusion_matrix(y_test, y_pred)
print("The Confusion Matrix: \n", matrix_info, '\n')

#construct the classification report
class_report = classification_report(y_test, y_pred)
print("Report of classification: \n", class_report)

#ploting the decision tree model
#tree_classif_label = ['2', '4']
#fig = plt.figure(figsize=(30,20))
#tree.plot_tree(my_model, feature_names=train_headers, class_names=tree_classif_label, filled=True, rounded=True, fontsize=14)
#fig.savefig('decisiontree.png') #save the image

(699, 11)
(699, 10)

 The total of training dataset (419, 9)

 The total of test dataset (280, 9)

 Cross val mean: 0.945 (std: 0.026)

Model accuracy on Train data: 0.97 

Model accuracy on Test data: 0.95 

The Confusion Matrix: 
 [[177   6]
 [  9  88]] 

Report of classification: 
               precision    recall  f1-score   support

         2.0       0.95      0.97      0.96       183
         4.0       0.94      0.91      0.92        97

    accuracy                           0.95       280
   macro avg       0.94      0.94      0.94       280
weighted avg       0.95      0.95      0.95       280



Tuning techniques to get the best hyperparameters

GRID SEARCH CROSS VALIDATION TUNING

In [20]:
#Grid search model tuning
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from matplotlib import pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.model_selection import  train_test_split, KFold, cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#declaring header names
winsconsin_headers = ['sample_code','c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses', 'tumor_class']

#read the data using read_csv class of pandas
wins_data = read_csv("venv/winsconsin_b_cancer (1).csv" ,names= winsconsin_headers)

print(wins_data.shape)

wins_data.drop('sample_code', axis=1, inplace=True)
print(wins_data.shape)
#check all datas are numbers and convert any non-numeric characters to null value
wins_data=wins_data.apply(pd.to_numeric, errors='coerce')
#print(wins_data.apply(pd.to_numeric, errors='coerce').info()) #this will give the datatype info after the conversion

#declaring a new header
new_winsconsin_headers = ['c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses', 'tumor_class']

#since ? cannot be converted to int we can convert all data to float
wins_data[new_winsconsin_headers] = wins_data[new_winsconsin_headers].applymap(float)
#print(wins_data.dtypes)

#use the simple imputer function to replace missing value
imputer = SimpleImputer (strategy = 'median') # replace most_frequent with median, mean and observe
imputer.fit(wins_data)
new_data = imputer.transform(wins_data)
#reassign the new data frame
wins_data = pd.DataFrame(new_data, columns=new_winsconsin_headers)

#recheck the data for missing values
win_empty_data = wins_data[wins_data.isna().any(axis=1)]
#print('\n These are the missing data \n ', win_empty_data)

#seperate the data into xtrain and y test groups  - training and target sets
train_headers = ['c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses']
target_header = ['tumor_class']

X = wins_data[train_headers]
y = wins_data[target_header]

#split the data into train and test -- split  using 60:40
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1,stratify=y)

#check the dimension of the train and test data
print('\n The total of training dataset', X_train.shape)
print('\n The total of test dataset', X_test.shape)

#declare a seed variable to ensure reproducibility
SEED =1
#instantiate the model and set the hyperparameters
#my_model = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.05, random_state=SEED, criterion = 'gini')#manually tuning our model
my_model = DecisionTreeClassifier(random_state= SEED)
print("\n Hyperparameters of default model \n", my_model.get_params(), '\n') #print the model default hyperparameters

#train the model to fit
my_model.fit(X_train, y_train)

#create a cross validation split
kfold_split = KFold(n_splits=10) # test b changing splits to 3, 5 and 10

#declare a dictionary value of hyperparameters and values
classifier_hyperpar = dict()
classifier_hyperpar['max_depth'] = [2,3,4,6,8,10]
classifier_hyperpar['min_samples_split'] = [2,4,6,8,9]
classifier_hyperpar['min_samples_leaf'] = [0.05,0.2,0.5,1]
classifier_hyperpar['criterion'] = ['gini', 'entropy']

#perform a grid search and fit the grid
classifier_grid = GridSearchCV(my_model,classifier_hyperpar,scoring='accuracy', n_jobs=-1,cv=kfold_split)
classifier_grid_fit = classifier_grid.fit(X,y)

#compute the array containing the 10 folds and calculate the cross validation mean score
CV_scores = -cross_val_score(my_model, X_train, y_train, cv=kfold_split)
#std for the computation range for this generalisation and mean to know how well the model generalises
print("\n Cross val mean: {:.3f} (std: {:.3f})".format(CV_scores.mean()*-1, CV_scores.std()), end="\n\n")

#we can print the hyperparameter tuning results
print('Best hyperparameters: %s' % classifier_grid_fit.best_params_)
print('Best max_depth: = ',  classifier_grid_fit.best_estimator_.get_params()['max_depth'])
print('Best min_samples_split: = ',  classifier_grid_fit.best_estimator_.get_params()['min_samples_split'])
print('Best min_samples_leaf: = ',  classifier_grid_fit.best_estimator_.get_params()['min_samples_leaf'])
print('Best criterion: = ',  classifier_grid_fit.best_estimator_.get_params()['criterion'])

#print best hyperparameters
print("suggested Best hyperparameters: \n", classifier_grid_fit.best_estimator_.get_params())

print('Best score: %s {:.3f}\n'.format(classifier_grid_fit.best_score_))


(699, 11)
(699, 10)

 The total of training dataset (419, 9)

 The total of test dataset (280, 9)

 Hyperparameters of default model 
 {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 1, 'splitter': 'best'} 


 Cross val mean: 0.935 (std: 0.026)

Best hyperparameters: {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 8}
Best max_depth: =  4
Best min_samples_split: =  8
Best min_samples_leaf: =  1
Best criterion: =  entropy
suggested Best hyperparameters: 
 {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 4, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 8, 'min_weight_fraction_leaf': 0.0, 'random_state': 1, 'splitter': 'best'}
Best score: %s 0.944



RANDOM SEARCH CROSS VALIDATION TUNING

In [27]:
#Randaom search tuning
#Grid search model tuning
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from matplotlib import pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.model_selection import  train_test_split, KFold, cross_val_score,RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#declaring header names
winsconsin_headers = ['sample_code','c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses', 'tumor_class']

#read the data using read_csv class of pandas
wins_data = read_csv("venv/winsconsin_b_cancer (1).csv" ,names= winsconsin_headers)

print(wins_data.shape)

wins_data.drop('sample_code', axis=1, inplace=True)
print(wins_data.shape)
#check all datas are numbers and convert any non-numeric characters to null value
wins_data=wins_data.apply(pd.to_numeric, errors='coerce')
#print(wins_data.apply(pd.to_numeric, errors='coerce').info()) #this will give the datatype info after the conversion

#declaring a new header
new_winsconsin_headers = ['c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses', 'tumor_class']

#since ? cannot be converted to int we can convert all data to float
wins_data[new_winsconsin_headers] = wins_data[new_winsconsin_headers].applymap(float)
#print(wins_data.dtypes)

#use the simple imputer function to replace missing value
imputer = SimpleImputer (strategy = 'median') # replace most_frequent with median, mean and observe
imputer.fit(wins_data)
new_data = imputer.transform(wins_data)
#reassign the new data frame
wins_data = pd.DataFrame(new_data, columns=new_winsconsin_headers)

#recheck the data for missing values
win_empty_data = wins_data[wins_data.isna().any(axis=1)]
#print('\n These are the missing data \n ', win_empty_data)

#seperate the data into xtrain and y test groups  - training and target sets
train_headers = ['c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses']
target_header = ['tumor_class']

X = wins_data[train_headers]
y = wins_data[target_header]

#split the data into train and test -- split  using 60:40
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1,stratify=y)

#check the dimension of the train and test data
print('\n The total of training dataset', X_train.shape)
print('\n The total of test dataset', X_test.shape)

#declare a seed variable to ensure reproducibility
SEED =1
#instantiate the model and set the hyperparameters
#my_model = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.05, random_state=SEED, criterion = 'gini')#manually tuning our model
my_model = DecisionTreeClassifier(random_state= SEED)
print("\n Hyperparameters of default model \n", my_model.get_params(), '\n') #print the model default hyperparameters

#train the model to fit
my_model.fit(X_train, y_train)

#create a cross validation split
kfold_split = KFold(n_splits=10) # test b changing splits to 3, 5 and 10

#declare a dictionary value of hyperparameters and values
classifier_hyperpar = dict()
classifier_hyperpar['max_depth'] = [2,3,4,6,8,10]
classifier_hyperpar['min_samples_split'] = [2,4,6,8,9]
classifier_hyperpar['min_samples_leaf'] = [0.05,0.2,0.5,1]
classifier_hyperpar['criterion'] = ['gini', 'entropy']

#perform a grid search and fit the grid
classifier_grid = RandomizedSearchCV(my_model,classifier_hyperpar,scoring='accuracy', n_jobs=-1,cv=kfold_split)
classifier_grid_fit = classifier_grid.fit(X,y)

#compute the array containing the 10 folds and calculate the cross validation mean score
CV_scores = -cross_val_score(my_model, X_train, y_train, cv=kfold_split)
#std for the computation range for this generalisation and mean to know how well the model generalises
print("\n Cross val mean: {:.3f} (std: {:.3f})".format(CV_scores.mean()*-1, CV_scores.std()), end="\n\n")

#we can print the hyperparameter tuning results
print('Best hyperparameters: %s' % classifier_grid_fit.best_params_)
print('Best max_depth: = ',  classifier_grid_fit.best_estimator_.get_params()['max_depth'])
print('Best min_samples_split: = ',  classifier_grid_fit.best_estimator_.get_params()['min_samples_split'])
print('Best min_samples_leaf: = ',  classifier_grid_fit.best_estimator_.get_params()['min_samples_leaf'])
print('Best criterion: = ',  classifier_grid_fit.best_estimator_.get_params()['criterion'])

#print best hyperparameters
print("suggested Best hyperparameters: \n", classifier_grid_fit.best_estimator_.get_params())

print('Best score: %s {:.3f}\n'.format(classifier_grid_fit.best_score_))


(699, 11)
(699, 10)

 The total of training dataset (419, 9)

 The total of test dataset (280, 9)

 Hyperparameters of default model 
 {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 1, 'splitter': 'best'} 


 Cross val mean: 0.935 (std: 0.026)

Best hyperparameters: {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 4, 'criterion': 'gini'}
Best max_depth: =  4
Best min_samples_split: =  2
Best min_samples_leaf: =  1
Best criterion: =  gini
suggested Best hyperparameters: 
 {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 4, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 1, 'splitter': 'best'}
Best score: %s 0.937



    ENSEMBLE LEARNING

In [34]:
#Ensemble - Hard voting
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from matplotlib import pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.model_selection import  train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#importing the necessary models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import VotingClassifier



#declaring header names
winsconsin_headers = ['sample_code','c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses', 'tumor_class']

#read the data using read_csv class of pandas
wins_data = read_csv("venv/winsconsin_b_cancer (1).csv" ,names= winsconsin_headers)

print(wins_data.shape)

wins_data.drop('sample_code', axis=1, inplace=True)
print(wins_data.shape)
#check all datas are numbers and convert any non-numeric characters to null value
wins_data=wins_data.apply(pd.to_numeric, errors='coerce')
#print(wins_data.apply(pd.to_numeric, errors='coerce').info()) #this will give the datatype info after the conversion

#declaring a new header
new_winsconsin_headers = ['c_thickness', 'uni_cell_size', 'uni_cell_shape', 'marg_adhesion', 'epi_cell_size', 'nuclei','bland_chromatin', 'normal_nucleoli', 'mitoses', 'tumor_class']

#since ? cannot be converted to int we can convert all data to float
wins_data[new_winsconsin_headers] = wins_data[new_winsconsin_headers].applymap(float)
#print(wins_data.dtypes)

#use the simple imputer function to replace missing value
imputer = SimpleImputer (strategy = 'median') # replace most_frequent with median, mean and observe
imputer.fit(wins_data)
new_data = imputer.transform(wins_data)
#reassign the new data frame
wins_data = pd.DataFrame(new_data, columns=new_winsconsin_headers)

#recheck the data for missing values
win_empty_data = wins_data[wins_data.isna().any(axis=1)]
#print('\n These are the missing data \n ', win_empty_data)

#seperate the data into xtrain and y test groups  - training and target sets
X, y = new_data[:, :-1], new_data[:, -1]

#declare a seed variable to ensure reproducibility
SEED =1
#split the data into train and test -- split  using 60:40
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1,stratify=y)

#check the dimension of the train and test data
print('\n The total of training dataset', X_train.shape)
print('\n The total of test dataset', X_test.shape)

#instantiate the models
lr = LogisticRegression(random_state=SEED)
knc = KNN()
dtc = DecisionTreeClassifier(random_state=SEED)

classifier_list = [('LogisticRegression:', lr),('K Nearest Neighbour: ', knc), ('DecisionTreeClassifier: ', dtc)]

#instantiate the cross validation cv
kfold_split = KFold(n_splits=10)


#a for loop to iterate through the models
for clsf_name, clsf in classifier_list:

    #fit each model
    clsf.fit(X_train, y_train)

    #computethe array containing the 10 folds
    CV_scores_clsf = -cross_val_score(clsf, X_train, y_train, cv=kfold_split)
    print("\n Cross Val mean: {:.3f} (std: {:.3f})".format(CV_scores_clsf.mean()*-1, CV_scores.std()), end='\n\n' )

    #predict and calculate the accuracy on test data for each model
    y_pred_test_clsf = clsf.predict(X_test)
    print('\n  {:s} Test : {:.3f}'.format(clsf_name, accuracy_score(y_test, y_pred_test_clsf)))

    #predict and calculate the accuracy on train data for each model
    y_pred_train_clsf = clsf.predict(X_train)
    print('\n  {:s} Train : {:.3f}'.format(clsf_name, accuracy_score(y_train, y_pred_train_clsf)))

    print('____________---------------------_______________')

#instantiate the voting classifier
vc = VotingClassifier(estimators=classifier_list)

#fit vc to the training sets and models
vc.fit(X_train, y_train)

#computethe array containing the 10 folds cv mses
CV_scores_vc = -cross_val_score(vc, X_train, y_train, cv=10)
print("\n Cross Val mean: {:.3f} (std: {:.3f})".format(CV_scores_vc.mean()*-1, CV_scores.std()), end='\n\n' )

#predict the label for training set vc
y_pred_vc_train = vc.predict(X_train)
print('\n Voting Classifier Train {:.3f}'.format(accuracy_score(y_train, y_pred_vc_train)))


(699, 11)
(699, 10)

 The total of training dataset (489, 9)

 The total of test dataset (210, 9)

 Cross Val mean: 0.971 (std: 0.026)


  LogisticRegression: Test : 0.943

  LogisticRegression: Train : 0.973
____________---------------------_______________

 Cross Val mean: 0.977 (std: 0.026)


  K Nearest Neighbour:  Test : 0.948

  K Nearest Neighbour:  Train : 0.984
____________---------------------_______________

 Cross Val mean: 0.949 (std: 0.026)


  DecisionTreeClassifier:  Test : 0.948

  DecisionTreeClassifier:  Train : 1.000
____________---------------------_______________

 Cross Val mean: 0.973 (std: 0.026)


 Voting Classifier Train 0.986
