In [105]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Question 1 Cancer Diagnosis Using Machine Learning

## Define features and label columns

In [106]:
df  = pd.read_csv("https://github.com/mpourhoma/CS4661/raw/master/Cancer.csv")
df[0::10]

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Malignant_Cancer
0,5,1,1,1,2,1,3,1,1,0
10,5,3,3,3,2,3,4,4,1,1
20,5,4,4,9,2,10,5,6,1,1
30,9,5,8,1,2,3,2,1,5,1
40,5,3,5,5,3,3,4,10,1,1
50,5,1,3,1,2,1,2,1,1,0
60,2,2,2,1,1,1,7,1,1,0
70,1,1,1,1,2,1,3,1,1,0
80,10,3,5,1,10,5,3,10,2,1
90,1,3,1,2,2,2,5,3,2,0


In [107]:
feature_cols = ['Clump_Thickness','Uniformity_of_Cell_Size','Uniformity_of_Cell_Shape',
                    'Marginal_Adhesion','Single_Epithelial_Cell_Size','Bare_Nuclei','Bland_Chromatin','Normal_Nucleoli','Mitoses']

X = df[feature_cols] 
print(X.shape)


y = df['Malignant_Cancer']
print(y.shape)

(150, 9)
(150,)


## Split into training and testing sets

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=3)

## Fit training data on DT object

In [109]:
my_decisiontree = DecisionTreeClassifier(random_state=3)

my_decisiontree.fit(X_train, y_train)

DecisionTreeClassifier(random_state=3)

## Find DT accuracy score

In [110]:
y_predict_dt = my_decisiontree.predict(X_test)

score_dt = accuracy_score(y_test, y_predict_dt)

In [111]:
print(score_dt)

0.8301886792452831


## “Bagging” with 19 decision tree classifiers

## Find bootsrapping size

In [112]:
bootstarp_size = int(0.8 * len(X))
print(bootstarp_size)
print(len(X_train))
hold_scores = []
print(X_train.head)

120
97
<bound method NDFrame.head of      Clump_Thickness  Uniformity_of_Cell_Size  Uniformity_of_Cell_Shape  \
50                 5                        1                         3   
73                 1                        1                         1   
34                10                        5                         5   
136               10                       10                        10   
19                10                       10                        10   
..               ...                      ...                       ...   
107                8                        8                         7   
21                 2                        5                         3   
0                  5                        1                         1   
131                2                        1                         1   
106                3                        4                         5   

     Marginal_Adhesion  Single_Epithelial_Cell_Size  Bare_Nucl

## Create 19 base tress and predictions

In [113]:
#each loop creat bootsrtap sample, define new base decision tree classifier for each reample,  Perform prediction and save results
for i in range(1, 20):
    X_train2, y_train2 = resample(X_train, y_train, n_samples = bootstarp_size, random_state= i, replace = True)
    Base_DecisionTree = DecisionTreeClassifier(random_state=3)
    my_decisiontree.fit(X_train2, y_train2)
    y_predict_dt = my_decisiontree.predict(X_test)
    hold_scores.append(y_predict_dt) 

print(y_predict_dt)
print(len(y_predict_dt))

[1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0
 0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1]
53


In [114]:
print(len(hold_scores))
print(hold_scores)

19
[array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 1], dtype=int64), array([1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1], dtype=int64), array([0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 1], dtype=int64), array([0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1], dtype=int64), array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 1], dtype=int64), array([0, 1, 1, 1, 1,

In [115]:
hold_votes = np.array([[0] * 2] * 53)
print(len(hold_votes))
hold_votes
print(hold_votes)



53
[[0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]]


## Go through prediction matrix and vote on what should be the prediction

In [116]:

for i in range(len(hold_scores)):
    for j in range(53):
        if hold_scores[i][j] == 0:
            hold_votes[j][0] += 1
        elif hold_scores[i][j] == 1:
            hold_votes[j][1] += 1


In [117]:
print(hold_votes)

[[ 8 11]
 [ 0 19]
 [ 3 16]
 [ 0 19]
 [ 0 19]
 [19  0]
 [ 1 18]
 [17  2]
 [ 0 19]
 [ 0 19]
 [19  0]
 [ 7 12]
 [ 4 15]
 [ 0 19]
 [19  0]
 [ 0 19]
 [19  0]
 [17  2]
 [ 0 19]
 [ 0 19]
 [19  0]
 [ 0 19]
 [19  0]
 [19  0]
 [ 0 19]
 [ 0 19]
 [ 1 18]
 [ 0 19]
 [19  0]
 [ 0 19]
 [13  6]
 [ 1 18]
 [19  0]
 [19  0]
 [19  0]
 [19  0]
 [19  0]
 [19  0]
 [ 0 19]
 [19  0]
 [19  0]
 [19  0]
 [ 2 17]
 [ 2 17]
 [19  0]
 [16  3]
 [ 0 19]
 [10  9]
 [ 1 18]
 [ 0 19]
 [18  1]
 [ 0 19]
 [ 0 19]]


In [118]:
final_votes = np.empty(53, dtype=int)
print(y_test)


47     1
3      0
31     1
25     1
15     1
118    0
89     1
6      0
103    1
65     1
88     0
38     1
92     1
53     1
140    0
40     1
72     0
135    0
113    1
42     1
126    0
112    1
141    0
76     0
5      1
109    1
134    1
67     1
57     0
86     1
14     1
24     1
139    0
58     0
8      0
27     0
77     0
41     0
111    1
68     0
128    0
82     0
22     1
17     1
127    0
51     1
23     1
83     1
142    0
84     1
45     1
35     1
101    1
Name: Malignant_Cancer, dtype: int64


In [119]:
for i in range(len(hold_votes)):
    if hold_votes[i][0] < hold_votes[i][1]:
        final_votes[i] = 1
    else:
        final_votes[i] = 0


In [120]:
bagging_score = accuracy_score(y_test, final_votes)

print(bagging_score)

0.8867924528301887


### Bagging resulted in higher score of 0.8867924528301887 opposed to regular Decision Tree of 0.8301886792452831

## Using Random Forest

In [121]:
my_RandomForest = RandomForestClassifier(n_estimators = 19, bootstrap = True, random_state=3)

In [122]:
my_RandomForest.fit(X_train, y_train)

RandomForestClassifier(n_estimators=19, random_state=3)

In [123]:
rf_predict = my_RandomForest.predict(X_test)

In [124]:
rf_score = accuracy_score(y_test, rf_predict)
print(rf_score)

0.9245283018867925


### Random Forest resulted in higher score of 0.9245283018867925 opposed to regular Bagging of 0.8867924528301887