In [13]:
# This project uses data from the UCI Machine Learning Database and 
# supervised machine learning algorithms for predictive analysis.

In [14]:
# Import libraries.

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from matplotlib import pyplot as plt
%matplotlib inline  

In [16]:
# Create list of feature labels to use when data are read in.

In [17]:
feature_labels = ["id_number", "outcome", "time", "radius_1", "texture_1",
"perimeter_1", "area_1", "smoothness_1", "compactness_1", "concavity_1",
"concavepoints_1", "symmetry_1", "fractaldimension_1", "radius_2", 
"texture_2", "perimeter_2", "area_2", "smoothness_2", "compactness_2",
"concavity_2", "concavepoints_2", "symmetry_2", "fractaldimension_2", 
"radius_3", "texture_3", "perimeter_3", "area_3", "smoothness_3", 
"compactness_3", "concavity_3", "concavepoints_3", "symmetry_3", 
"fractaldimension_3", "tumor_diameter", "number_axillary_lymph_nodes"]

In [18]:
# Read in data set with feature_labels as column names.

In [19]:
data = pd.read_csv('data/wpbc.data', names = feature_labels)

In [20]:
# Describe data & data shape.

In [21]:
data.describe()
data.shape

(198, 35)

In [22]:
# Create an indicator variable that equals one if outcome = 'R' and 
# 0 otherwise.
# Note: 'R' stands for breast cancer recurrence. 

In [23]:
data['output'] = np.where(data['outcome'] == 'R', 1, 0)

In [24]:
# Create new data frame and eliminate non-numeric observations.

In [25]:
df = data.drop('id_number', axis = 1)
df = df.drop('outcome', axis = 1)
df = df.apply(pd.to_numeric, errors = 'coerce')
df = df.dropna()

In [26]:
# Create output array for what we aim to predict (e.g. recurrence).

In [27]:
y = df.output

In [28]:
# Remove binary output variable from to help create feature training 
# and testing sets.

In [29]:
df = df.drop('output', axis = 1)

In [30]:
# Create training and testing sets (based on 80-20 split).

In [31]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)

In [32]:
# Verify data shapes.

In [33]:
X_train.shape, y_train.shape

((155, 33), (155,))

In [34]:
X_test.shape, y_test.shape

((39, 33), (39,))

In [35]:
# Everything looks good -- can start with classification algorithms.

In [36]:
#########################################################################
#########################################################################
#########################################################################
#########################################################################
#########################################################################

In [37]:
# (1) Logistic Model Algorithm; import from sklearn. 

In [38]:
from sklearn.linear_model import LogisticRegression

In [40]:
model = LogisticRegression()
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [41]:
# Predict values using test features.

In [42]:
predict1 = model.predict(X_test)

In [43]:
# Import libraries from sklearn for accuracy score and confusion matrix.

In [44]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [63]:
accuracy_score1 = accuracy_score(y_test, predict1)
print('Accuracy Score for Logistic Model:', round(accuracy_score1, 2))

Accuracy Score for Logistic Model: 0.77


In [64]:
confusion_matrix1 = confusion_matrix(y_test, predict1)
print('Confusion Matrix for Logistic Model:', confusion_matrix1)

Confusion Matrix for Logistic Model: [[24  4]
 [ 5  6]]


In [65]:
#########################################################################
#########################################################################
#########################################################################
#########################################################################
#########################################################################

In [66]:
# (2) Decision Tree Algorithm; import from sklearn. 

In [67]:
from sklearn import tree

In [68]:
dt = tree.DecisionTreeClassifier(criterion = 'gini')

In [69]:
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [70]:
# Predict values using test features. 

In [71]:
predict2 = dt.predict(X_test)

In [72]:
# Evaluate model accuracy. 

In [73]:
accuracy_score2 = accuracy_score(y_test, predict2)
print('Accuracy Score for Decision Tree:', round(accuracy_score2, 2))

Accuracy Score for Decision Tree: 0.74


In [74]:
confusion_matrix2 = confusion_matrix(y_test, predict2)
print('Confusion Matrix for Decision Tree:', confusion_matrix2)

Confusion Matrix for Decision Tree: [[24  4]
 [ 6  5]]


In [75]:
#########################################################################
#########################################################################
#########################################################################
#########################################################################
#########################################################################

In [76]:
# (3) Random Forest Algorithm.

In [77]:
# Import RandomForestClassifier from sklearn.

In [60]:
from sklearn.ensemble import RandomForestClassifier

In [79]:
rf = RandomForestClassifier(n_estimators = 100)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [80]:
# Predict values from test data. 

In [81]:
predict3 = rf.predict(X_test)

In [82]:
# Evaluate model accuracy.

In [83]:
accuracy_score3 = accuracy_score(y_test, predict3)
print('Accuracy Score of Random Forest Classifier:', 
      round(accuracy_score3, 2))

Accuracy Score of Random Forest Classifier: 0.77


In [84]:
confusion_matrix3 = confusion_matrix(y_test, predict3)
print('Confusion Matrix of Random Forest Classifier:', 
     confusion_matrix3)

Confusion Matrix of Random Forest Classifier: [[28  0]
 [ 9  2]]


In [85]:
#########################################################################
#########################################################################
#########################################################################
#########################################################################
#########################################################################

In [86]:
# (4) Support Vector Machine Algorithm.

In [87]:
# Load SVM libary from sklearn.

In [88]:
from sklearn import svm

In [89]:
svm_model = svm.SVC(gamma = 'scale')
svm_model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [90]:
# Predict values for test data.

In [91]:
predict4 = svm_model.predict(X_test)

In [92]:
# Evalute model accuracy.

In [93]:
accuracy_score4 = accuracy_score(y_test, predict4)
print('Accuracy score for SVM:', round(accuracy_score4,2))

Accuracy score for SVM: 0.72


In [94]:
confusion_matrix4 = confusion_matrix(y_test, predict4)
print('Confusion Matrix for SVM:', confusion_matrix4)

Confusion Matrix for SVM: [[28  0]
 [11  0]]


In [95]:
#########################################################################
#########################################################################
#########################################################################
#########################################################################
#########################################################################

In [96]:
# (5) Naive Bayes Algorithm.

In [97]:
# Load libraries from sklearn.

In [98]:
from sklearn.naive_bayes import GaussianNB

In [99]:
nb = GaussianNB()
nb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [100]:
# Predict values for test data.

In [101]:
predict5 = nb.predict(X_test)

In [102]:
# Evaluate accuracy of model.

In [103]:
accuracy_score5 = accuracy_score(y_test, predict5)
print('Accuracy Score of Naive Bayes:', round(accuracy_score5,2))

Accuracy Score of Naive Bayes: 0.67


In [104]:
confusion_matrix5 = confusion_matrix(y_test, predict5)
print('Confusion Matrix of Naive Bayes:', confusion_matrix5)

Confusion Matrix of Naive Bayes: [[21  7]
 [ 6  5]]


In [105]:
#########################################################################
#########################################################################
#########################################################################
#########################################################################
#########################################################################

In [106]:
# (6) k-Nearest Neighbor (kNN) Algorithm.

In [107]:
# Import kNN library from sklearn.

In [108]:
from sklearn.neighbors import KNeighborsClassifier

In [109]:
kNN = KNeighborsClassifier()
kNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [110]:
# Predict values using test data.

In [111]:
predict6 = kNN.predict(X_test)

In [112]:
# Evaluate model accuracy.

In [113]:
accuracy_score6 = accuracy_score(y_test, predict6)
print('Accuracy Score using kNN:', round(accuracy_score6,2))

Accuracy Score using kNN: 0.74


In [114]:
confusion_matrix6 = confusion_matrix(y_test, predict6)
print('Confusion matrix using kNN:', confusion_matrix6)

Confusion matrix using kNN: [[26  2]
 [ 8  3]]


In [None]:
#########################################################################
#########################################################################
#########################################################################
#########################################################################
#########################################################################

In [134]:
# Create a list of accuracy scores from each model.
accuracy1 = print('Logistic accuracy:', round(accuracy_score1, 2))
accuracy2 = print('Decision tree accuracy:', round(accuracy_score2, 2))
accuracy3 = print('Random forest accuracy:', round(accuracy_score3, 2))
accuracy4 = print('Support vector machine accuracy:', 
                  round(accuracy_score4, 2))
accuracy5 = print('Naive bayes accuracy:', round(accuracy_score5, 2))
accuracy6 = print('k-Nearest Neighbors accuracy:', round(accuracy_score6, 2))

Logistic accuracy: 0.77
Decision tree accuracy: 0.74
Random forest accuracy: 0.77
Support vector machine accuracy: 0.72
Naive bayes accuracy: 0.67
k-Nearest Neighbors accuracy: 0.74
