In [2]:
# Reading the dataset
import pandas as pd
import numpy as np

from sklearn import tree
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix

data_csv = "../breast-cancer-wisconsin.csv"
df = pd.read_csv(data_csv, names=['ID','Clump','U_Cell_size','U_Cell_shape','Marginal_Adhesion','SE_epitelial_cell_size','Bare_nuclei','bland_chromatin','Normal_Nucleoli','Mitoses','Class'])
print('Dataset shape: ', df.shape)
print(df.dtypes)
df.head()

Dataset shape:  (699, 11)
ID                         int64
Clump                      int64
U_Cell_size                int64
U_Cell_shape               int64
Marginal_Adhesion          int64
SE_epitelial_cell_size     int64
Bare_nuclei               object
bland_chromatin            int64
Normal_Nucleoli            int64
Mitoses                    int64
Class                      int64
dtype: object


Unnamed: 0,ID,Clump,U_Cell_size,U_Cell_shape,Marginal_Adhesion,SE_epitelial_cell_size,Bare_nuclei,bland_chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


#  Attribute                     Domain
-- -----------------------------------------
1. Sample code number            id number
2. Clump Thickness               1 - 10
3. Uniformity of Cell Size       1 - 10
4. Uniformity of Cell Shape      1 - 10
5. Marginal Adhesion             1 - 10
6. Single Epithelial Cell Size   1 - 10
7. Bare Nuclei                   1 - 10
8. Bland Chromatin               1 - 10
9. Normal Nucleoli               1 - 10
10. Mitoses                       1 - 10
11. Class:                        (0 for benign, 1 for malignant)

In [3]:
# Change bare nuclei values to numeric only and check the numbers of NAN
df['Bare_nuclei'] = pd.to_numeric(df['Bare_nuclei'], errors='coerce', downcast='integer')
print(df['Bare_nuclei'].isnull().values.sum())

# Drop the lines with null values
df = df.dropna()
print(df['Bare_nuclei'].isnull().values.sum())

# Removing ID column since it won't be considered to the training
df.pop('ID')

# So, the dataset removing null values and ID column has the size:
print('New dataset shape: ', df.shape)

16
0
New dataset shape:  (683, 10)


In [4]:
#Values distribution
for name, values in df.iteritems():
    print (name, '\nMin Value:  ', np.min(values), '\nMax Value: ', np.max(values), '\n\n')

Clump 
Min Value:   1 
Max Value:  10 


U_Cell_size 
Min Value:   1 
Max Value:  10 


U_Cell_shape 
Min Value:   1 
Max Value:  10 


Marginal_Adhesion 
Min Value:   1 
Max Value:  10 


SE_epitelial_cell_size 
Min Value:   1 
Max Value:  10 


Bare_nuclei 
Min Value:   1.0 
Max Value:  10.0 


bland_chromatin 
Min Value:   1 
Max Value:  10 


Normal_Nucleoli 
Min Value:   1 
Max Value:  10 


Mitoses 
Min Value:   1 
Max Value:  10 


Class 
Min Value:   2 
Max Value:  4 




In [5]:
# Changing Class label from 2 or 4 to 0 or 1
df.loc[:, 'Class'] = np.where(df['Class']==2, 0, 1)
print ('Class\nMin Value:  ', np.min(df['Class']), '\nMax Value: ', np.max(df['Class']), '\n\n')

Class
Min Value:   0 
Max Value:  1 




In [6]:
print(f"Total     = {len(df)} -> 100%")
print(f"Benign    = {len(df[df.Class == 0])} -> {len(df[df.Class == 0])/len(df) *100}%")
print(f"Malignant = {len(df[df.Class == 1])} -> {len(df[df.Class == 1])/len(df) *100}%")

Total     = 683 -> 100%
Benign    = 444 -> 65.00732064421669%
Malignant = 239 -> 34.99267935578331%


# (Todo Issue #4) Here we should add the correlation plots correspondents to the paper figures 13 and 14

In [7]:
# Correlation code to plot figures

In [8]:
# Separate entries from outputs
dataset = df.to_numpy(dtype=np.int) # Converting from Pandas dataframe to Numpy
entries = dataset[:, 0:9]
outputs = dataset[:, 9]
print(entries.shape)
print(outputs.shape)


(683, 9)
(683,)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dataset = df.to_numpy(dtype=np.int) # Converting from Pandas dataframe to Numpy


In [9]:
# Split dataset between train and test
seed = 10 # Set seed to get invariant results
test_size = 0.34
x_train, x_test, y_train, y_test = train_test_split(entries, outputs, test_size=test_size, random_state=seed)
print('Train dataset shape:\nEntries: ', x_train.shape, '\nOutput: ', y_train.shape, '\n\n')
print('Test dataset shape:\nEntries: ', x_test.shape, '\nOutput: ', y_test.shape)

Train dataset shape:
Entries:  (450, 9) 
Output:  (450,) 


Test dataset shape:
Entries:  (233, 9) 
Output:  (233,)


In [10]:
# Create 10-fold validation set for training
K = 10
kf = KFold(n_splits=10, shuffle=True, random_state=seed)

# Decision Tree Classifier

In [11]:
# Decision Tree classifier
dt = tree.DecisionTreeClassifier(random_state=seed)

In [12]:
# Training the decision tree using cross-validation
for train_indexes, valid_indexes in kf.split(x_train):
    dt.fit(x_train[train_indexes], y_train[train_indexes])
    y_valid_pred = dt.predict(x_train[valid_indexes])
    print("Validation Accuracy = ", accuracy_score(y_train[valid_indexes], y_valid_pred), 
            ", Train dataset shape: ", x_train[train_indexes].shape, 
            "Validation dataset shape: ", y_train[valid_indexes].shape)


Validation Accuracy =  0.9555555555555556 , Train dataset shape:  (405, 9) Validation dataset shape:  (45,)
Validation Accuracy =  0.9777777777777777 , Train dataset shape:  (405, 9) Validation dataset shape:  (45,)
Validation Accuracy =  0.9333333333333333 , Train dataset shape:  (405, 9) Validation dataset shape:  (45,)
Validation Accuracy =  0.9555555555555556 , Train dataset shape:  (405, 9) Validation dataset shape:  (45,)
Validation Accuracy =  0.8888888888888888 , Train dataset shape:  (405, 9) Validation dataset shape:  (45,)
Validation Accuracy =  0.9555555555555556 , Train dataset shape:  (405, 9) Validation dataset shape:  (45,)
Validation Accuracy =  0.8666666666666667 , Train dataset shape:  (405, 9) Validation dataset shape:  (45,)
Validation Accuracy =  0.8888888888888888 , Train dataset shape:  (405, 9) Validation dataset shape:  (45,)
Validation Accuracy =  0.9777777777777777 , Train dataset shape:  (405, 9) Validation dataset shape:  (45,)
Validation Accuracy =  0.955

In [13]:
# Testing the decision tree
y_test = dt.predict(x_test)
print("Test Accuracy = ", accuracy_score(y_train[valid_indexes], y_valid_pred))

Test Accuracy =  0.9555555555555556


# (Todo Issue #5) Calculates the confusion matrix and define the variables

In [19]:
# Code calculating decision matrix, defining variables from issue #5 and show a confusion matrix
tn, fp, fn, tp = confusion_matrix(outputs, dt.predict(entries)).ravel()
print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive:", tp)

True Negative:  441
False Positive:  3
False Negative:  13
True Positive: 226


# (Todo Issue #6) Calculates the performance metrics

In [22]:
# Code to calculate performance metrics as described on issue #6

# 4.1 accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)
print("Accuracy: ", accuracy)

# 4.2 precision
precision = tp / (tp + fp)
print("Precision: ", precision)

# 4.3 specificity
specificity = tn / (tn + fp)
print("Specificity: ", specificity)

# 4.4 TP rate
tp_rate = tp / (tp + fn)
print("TP Rate: ", tp_rate)

# 4.5 FP rate
fp_rate = fp / (fp + tn)
print("FP Rate: ", fp_rate)

# 4.6 NPV
npv = tn / (tn + fn)
print("NPV: ", npv)

# 4.7 Rate of Misclassification
misclassification_rate = (fp + fn) / (tp + tn + fp + fn)
print("Rate of Misclassification: ", misclassification_rate)

# 4.8 F1 Score
f1_score = (precision * tp_rate) / (precision + tp_rate)
print("F1 Score: ",  f1_score)

Accuracy:  0.9765739385065886
Precision:  0.9868995633187773
Specificity:  0.9932432432432432
TP Rate:  0.9456066945606695
FP Rate:  0.006756756756756757
NPV:  0.9713656387665198
Rate of Misclassification:  0.02342606149341142
F1 Score:  0.4829059829059829


# (Todo Issue #7) Create ROC Curves

In [15]:
# Code to create ROC curve

# Paper negative points  (Let's make a list to use after)
- Does not mention the null values present at `Bare_nuclei` atribute
- Does not treat the proportion of benign and malign samples at training and test dataset
- We assume that `ID` column is not used to the training, but paper seems to consider it
- Normalization of entry values could improve the performance for some methods as NN
- Use a repeated K-fold cross validation could improve the performance
- Use grid-search to tune decision tree parameters could be implemented
- Does not specify any of the Decision tree parameters used (or they just use the default implementation without set any parameters)
- Calculates the decision matrix over the entire dataset