In [108]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import svm
import sklearn as sk
from sklearn.neural_network import MLPClassifier

In [2]:
features = pd.read_csv('BreastCancerDataset.csv')
print(features.head(5))
print(features.describe())


         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_worst  smoothness

In [3]:
features = pd.get_dummies(features)
print(features)

           id  radius_mean  texture_mean  perimeter_mean  area_mean  \
0      842302       17.990         10.38          122.80     1001.0   
1      842517       20.570         17.77          132.90     1326.0   
2    84300903       19.690         21.25          130.00     1203.0   
3    84348301       11.420         20.38           77.58      386.1   
4    84358402       20.290         14.34          135.10     1297.0   
5      843786       12.450         15.70           82.57      477.1   
6      844359       18.250         19.98          119.60     1040.0   
7    84458202       13.710         20.83           90.20      577.9   
8      844981       13.000         21.82           87.50      519.8   
9    84501001       12.460         24.04           83.97      475.9   
10     845636       16.020         23.24          102.70      797.8   
11   84610002       15.780         17.89          103.60      781.0   
12     846226       19.170         24.80          132.40     1123.0   
13    

In [4]:
#Labels are the values we want to predict
labels = np.array(features['diagnosis_M'])

# Remove unnessecary data from the features
# axis 1 refers to the columns
features = features.drop('diagnosis_M', axis = 1)
features = features.drop('diagnosis_B', axis = 1)
features = features.drop('id', axis = 1)
features = features.drop('Unnamed: 32', axis = 1)

In [5]:
# Saving feature names for later use
feature_list = list(features.columns)
print(feature_list)

['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']


In [15]:
# Convert to numpy array
features = np.array(features)

'''

Split the data into training and testing sets

We will split the data into two: 
    - 80% will be used to train the model
    - 20% will be used to test the model
    
Train_features represents the feature columns of the training data. It will be used to predict train_labels (diagnosis)
Test_features will be used to predict test_labels

'''

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.20, random_state = 42)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (455, 30)
Training Labels Shape: (455,)
Testing Features Shape: (114, 30)
Testing Labels Shape: (114,)


In [7]:
print(train_features)
print()
print(train_labels)


[[9.029e+00 1.733e+01 5.879e+01 ... 1.750e-01 4.228e-01 1.175e-01]
 [2.109e+01 2.657e+01 1.427e+02 ... 2.903e-01 4.098e-01 1.284e-01]
 [9.173e+00 1.386e+01 5.920e+01 ... 5.087e-02 3.282e-01 8.490e-02]
 ...
 [1.429e+01 1.682e+01 9.030e+01 ... 3.333e-02 2.458e-01 6.120e-02]
 [1.398e+01 1.962e+01 9.112e+01 ... 1.827e-01 3.179e-01 1.055e-01]
 [1.218e+01 2.052e+01 7.722e+01 ... 7.431e-02 2.694e-01 6.878e-02]]

[0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 1 1 0 1
 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0
 1 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0
 0 1 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 1 0 1 0 1 0 0 1 0 0 0 1 0 1 0 1 0 1
 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0
 1 1 0 0 1 0 1 1 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 0 1 1
 0 0 1 1 1 0 0 0 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 0 1
 1 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 1 1 0 1 0 0

In [114]:
#Initialize naive bayes model
model = GaussianNB()

# Train the model
model.fit(train_features, train_labels)

# Use the predict method on the test data
predictions = model.predict(test_features)

In [121]:
print(test_labels)
print(predictions)

print('\nAccuracy:', str(accuracy_score(test_labels, predictions) * 100) + '%')
print(confusion_matrix(test_labels, predictions))

scores = cross_val_score(model, features, labels, scoring='accuracy', cv=10)
print('\nMean Accuracy:', np.mean(scores))

[0 1 1 0 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1
 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 1 0 0 1 1 0 1
 0 0 0 1 0 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 1 0 0 1
 0 0 1]
[0 1 1 0 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1
 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 1 0 0 1 1 0 1
 0 0 0 0 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 1 0 0 1
 0 0 1]

Accuracy: 97.36842105263158%
[[71  0]
 [ 3 40]]

Mean Accuracy: 0.9386796733212339


In [122]:
# In machine learning, it is important to test other algorithms
# to see which model results in the highest accuracy

lr = LogisticRegression()
lr.fit(train_features, train_labels)
predictions = lr.predict(test_features)

print('\nLogistic Regression Accuracy:', str(accuracy_score(test_labels, predictions) * 100) + '%')
print(confusion_matrix(test_labels, predictions))


SVM = svm.LinearSVC()
SVM.fit(train_features, train_labels)
predictions = SVM.predict(test_features)

print('\nSupport Vector Machine Accuracy:', str(accuracy_score(test_labels, predictions) * 100) + '%')
print(confusion_matrix(test_labels, predictions))


neuralnNet = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(12,12,12),max_iter=2000)
neuralnNet.fit(train_features, train_labels)
predictions = neuralnNet.predict(test_features)

print('\nNeural Network Accuracy:', str(accuracy_score(test_labels, predictions) * 100) + '%')
print(confusion_matrix(test_labels, predictions))


rf = RandomForestClassifier()
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)

print('\nRandom Forest Accuracy:', str(accuracy_score(test_labels, predictions) * 100) + '%')
print(confusion_matrix(test_labels, predictions))


knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(train_features, train_labels)
predictions = knn.predict(test_features)

print('\nKNeighbors Accuracy:', str(accuracy_score(test_labels, predictions) * 100) + '%')
print(confusion_matrix(test_labels, predictions))


lda = LinearDiscriminantAnalysis()
lda.fit(train_features, train_labels)
predictions = lda.predict(test_features)

print('\nLinear Discriminant Analysis Accuracy:', str(accuracy_score(test_labels, predictions) * 100) + '%')
print(confusion_matrix(test_labels, predictions))




Logistic Regression Accuracy: 95.6140350877193%
[[70  1]
 [ 4 39]]

Support Vector Machine Accuracy: 96.49122807017544%
[[67  4]
 [ 0 43]]

Neural Network Accuracy: 94.73684210526315%
[[70  1]
 [ 5 38]]

Random Forest Accuracy: 96.49122807017544%
[[70  1]
 [ 3 40]]

KNeighbors Accuracy: 92.98245614035088%
[[68  3]
 [ 5 38]]

Linear Discriminant Analysis Accuracy: 95.6140350877193%
[[70  1]
 [ 4 39]]


