In [214]:
#import packages needed 
import pandas as pd
import numpy as np
import seaborn as sns
from numpy import log, dot, e
from numpy.random import rand
import matplotlib.pyplot as plt
import math
from scipy.stats import entropy
plt.style.use('classic')
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
import tensorflow as tf
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [215]:
dataset = pd.read_csv("breast-cancer-wisconsin.data")

#Link to the dataset: https://archive.ics.uci.edu/ml/datasets/breast+cancer

In [216]:
#add column names
dataset.rename(columns = {'1000025':'id', 
                       '5':'clump thickness',
                        '1': 'cell size',
                         '1.1': 'cell shape',
                         '1.2': 'Marginal Adhesion',
                        '2': 'Single Epithelial Cell Size',
                         '1.3': 'Bare Nuclei',
                         '3': 'Bland Chromatin',
                         '1.4': 'Normal Nucleoli',
                         '1.5': 'Mitoses',
                         '2.1': 'result'},inplace =True)

In [217]:
# Replace all "?" with the average value in the Bare Nuclei columns mean to make the replaced data as accurate as possible  
dataset['Bare Nuclei'].replace({'?': '4'}, inplace = True)

In [218]:
#change the last object to an integer
dataset['Bare Nuclei'] = dataset['Bare Nuclei'].astype('int64')
dataset.info()
#lastly, map the class column as 0 and 1 instead of 2 and 4 so it is binary
dataset.result = dataset.result.map({2:0,4:1})

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   id                           698 non-null    int64
 1   clump thickness              698 non-null    int64
 2   cell size                    698 non-null    int64
 3   cell shape                   698 non-null    int64
 4   Marginal Adhesion            698 non-null    int64
 5   Single Epithelial Cell Size  698 non-null    int64
 6   Bare Nuclei                  698 non-null    int64
 7   Bland Chromatin              698 non-null    int64
 8   Normal Nucleoli              698 non-null    int64
 9   Mitoses                      698 non-null    int64
 10  result                       698 non-null    int64
dtypes: int64(11)
memory usage: 60.1 KB


In [219]:
#id and result are clearly unrelated, lets drop that column
dataset = dataset.iloc[: , 1:]

In [221]:
X = dataset[columns]
y = dataset.result

['clump thickness', 'cell size', 'cell shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses']


In [236]:
#split training data 70-30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.89, random_state=1)

In [237]:
#Try modifying one or more of the input parameters and describe what changes you notice in your results
clf_gini = DecisionTreeClassifier(criterion="gini", max_depth=3, splitter = "best")
clf_gini = clf.fit(X_train,y_train)
y_pred = clf_gini.predict(X_test)
print("With the GINI criterion and best splitter:\n",classification_report(y_test, y_pred))
print("-"*55)
clf_entropy = DecisionTreeClassifier(criterion="entropy", max_depth=3, splitter = "best")
clf_entropy = clf_entropy.fit(X_train,y_train)
y_pred = clf_entropy.predict(X_test)
print("With the entropy criterion and best splitter:\n",classification_report(y_test, y_pred))
print("-"*55)
clf_random_gini = DecisionTreeClassifier(criterion="gini", max_depth=3, splitter = "random")
clf_random_gini = clf_random_gini.fit(X_train,y_train)
y_pred = clf_random_gini.predict(X_test)
print("With the gini criterion and random splitter:\n",classification_report(y_test, y_pred))
print("-"*55)
clf_entropy_random = DecisionTreeClassifier(criterion="entropy", max_depth=5, splitter = "random")
clf_entropy_random = clf_entropy_random.fit(X_train,y_train)
y_pred = clf_entropy_random.predict(X_test)
print("With the entropy criterion and random splitter:\n",classification_report(y_test, y_pred))

With the GINI criterion and best splitter:
               precision    recall  f1-score   support

           0       0.98      0.91      0.94       412
           1       0.84      0.96      0.90       210

    accuracy                           0.93       622
   macro avg       0.91      0.93      0.92       622
weighted avg       0.93      0.93      0.93       622

-------------------------------------------------------
With the entropy criterion and best splitter:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96       412
           1       0.92      0.93      0.93       210

    accuracy                           0.95       622
   macro avg       0.94      0.94      0.94       622
weighted avg       0.95      0.95      0.95       622

-------------------------------------------------------
With the gini criterion and random splitter:
               precision    recall  f1-score   support

           0       0.94      0.97      0.9

When changing the paramaters I noticed very few differences in the accuracy of the model, the best performing model was a tie between "With the GINI criterion and best splitter" and "With the gini criterion and random splitter:" at 96% accuracy. This was surprising since the two paramaters are the polar opposites in the training I have done. 
The GINI index chooses the amount of probability that when the feature is classified incorrectly and Entropy is the randomness of the dataset. Another performance difference noticed here is with the random splitter, the model seems to be way more precise predicting the 0's than 1's.

In [234]:
#bagging classifier using the KNeighborsClassifier algorithm
model = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
k_fold_result_bagging = cross_val_score(model, X, y, cv=10,scoring='r2')
print(k_fold_result_bagging)
print(f"Mean 10-Fold : {np.mean(k_fold_result_bagging)}")
print(classification_report(y_test,y_pred))

[0.61956522 0.80978261 0.87318841 0.61956522 0.9365942  0.74637681
 0.87318841 1.         0.87222222 0.93611111]
Mean 10-Fold : 0.8286594202898552
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       415
           1       0.96      0.93      0.94       207

    accuracy                           0.96       622
   macro avg       0.96      0.96      0.96       622
weighted avg       0.96      0.96      0.96       622



In [235]:
# #Boost classifier using the DecisionTreeClassifier algorithm
adaboost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5, random_state=23), n_estimators=5, learning_rate=0.2, random_state=23)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
k_fold_result_boosting = cross_val_score(adaboost, X, y, cv=10,scoring='r2')
print(k_fold_result_boosting)
print(f"Mean 10-Fold : {np.mean(k_fold_result_boosting)}")
print(classification_report(y_test,y_pred))

[0.55615942 0.74637681 0.87318841 0.61956522 0.87318841 0.68297101
 0.80978261 1.         1.         0.87222222]
Mean 10-Fold : 0.8033454106280192
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       415
           1       0.95      0.91      0.93       207

    accuracy                           0.95       622
   macro avg       0.95      0.94      0.95       622
weighted avg       0.95      0.95      0.95       622



The k-fold cross validation is a popular machine learning procedure that estimates the skill of a model on data. As we can see from the results outputted, even though both models performed with incredible accuracies on the Wisconsin Breast Cancer dataset the k-fold scores were much lower.
The difference between the two mean k-fold results is not significant but some reasons why they are not the same are because they use seperate base algorithms and trivially they are seperate methods of ensemble learning, bagging uses the base learning algorithms and trians them seperately to average out a more accurate and precise result, Ada Boosting on the other hand penalizes wrong guesses and rewards correct ones, the idea is that every base learner in sequence will improve the results compared to the last one.


Compare the effectiveness of the three models implemented above. Clearly describe the metric you are using for comparison. Describe (with examples) Why is this metric(metrics) suited/appropriate for the problem at hand? How would a choice of a different metric impact your results? Can you demonstrate that?

From my experiment with the three ensemble learning models, all three did very well with predicting the benign and malignant tumors, the metric I will be using to compare the effectiveness of the three models is precision.
Precision is the ratio between true positives and true positives and false positives together, in this dataset that would be the correctly predicted tumors divided by all the tumors in the dataset. The precision metric helps us understand how well the model did because it tells us how often the model is correct when the tumor is malignant.

Comparing the three ensemble learning techniques using the precision metric, they all performed incredibly similary and in a very high percentage, with the least being 91% and as high as 96%, which is a very good precision score for any ensemble learning model. All the models seem to be doing a better job when predicting the benign tumors than the malignant ones, which is an expected outcome since there are more benign cases in the dataset. The only big discrepancy in the results is compared to the precision of 1's in the bagging and boosting methods, the decision tree underperformed by more than 10% in the entropy and random splitter case. 

I went with precision because on the dataset I have used is one where there are way less malignant tumors than benign ones, meaning being precise in this case will determine how the model truly performed.

If I went with accuracy it could be misleading if a model only guessed the malignant tumors correctly and none of the benign ones, it would have a high accuracy number when in reality it is not a very high performing model for this problem. 
