# Final Project - Breast Cancer Diagnosis Based on Cell Nuclei Features

Carolyn Nohejl

# Modeling

# Import Packages

In [1]:
import os
import math
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)
import statsmodels.api as sm
import statsmodels.formula.api as smf


import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')



from IPython.display import Image
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.datasets import make_classification;
from sklearn import preprocessing, neighbors, model_selection

from sklearn import tree, ensemble, metrics, model_selection, externals



In [2]:
df = pd.read_csv(os.path.join('breast_cancer_data.csv'))

In [3]:
df.replace(["M", "B"],["1","0"],inplace=True)

Change classifier to a numeric.  This is required to use some models.

In [4]:
df['diagnosis'] = pd.to_numeric(df['diagnosis'])

In [5]:
df.rename(columns = {'concave points_mean': 'concave_points_mean', 'concave points_se': 'concave_points_se', 'concave points_worst': 'concave_points_worst'}, inplace = True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
id                         569 non-null int64
diagnosis                  569 non-null int64
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave_points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non-

# Split data into train and test sets (70/30)¶

Use the same random state as I did for the EDA portion, to result in the same test and train dataframes.

In [7]:
df_train, df_test = train_test_split(df,test_size=0.3, random_state=42)

In [8]:
df_train.diagnosis.value_counts()

0    249
1    149
Name: diagnosis, dtype: int64

In [9]:
df_test.diagnosis.value_counts()

0    108
1     63
Name: diagnosis, dtype: int64

# Random Forest - All 30 Features

Create a feature matrix X with all features except diagnosis and ID, and response vector y including diagnosis.

In [304]:
X = df_train[df_train.columns.values]
X.drop(['id','diagnosis'], axis = 1, inplace = True)

y = df_train.diagnosis

Create training set 60% and test set 40% of the overall training set (df_train)

In [305]:
train_X, test_X, train_y, test_y = model_selection.train_test_split(X,y, train_size = .6, random_state=0)

In [306]:
model = ensemble.RandomForestClassifier(n_estimators = 1000,
        min_samples_leaf = 5,
        oob_score = True,
        random_state = 0).\
    fit(train_X, train_y)

OOB score:

In [307]:
model.oob_score_

0.94537815126050417

10 fold cross validation score on training set:

In [308]:
model_selection.cross_val_score(model, train_X, train_y, cv = 10).mean()

0.94545454545454555

Determine feature importance:

In [309]:
sorted(zip(model.feature_importances_, \
           X.columns.values), reverse = True)

[(0.16715931454388719, 'concave_points_worst'),
 (0.13366127363499894, 'concave_points_mean'),
 (0.1174542576640398, 'perimeter_worst'),
 (0.099679487882633755, 'radius_worst'),
 (0.093355900316311788, 'area_worst'),
 (0.081154916673909866, 'concavity_mean'),
 (0.054142055938401851, 'concavity_worst'),
 (0.049025239156765835, 'perimeter_mean'),
 (0.035210498515225508, 'radius_mean'),
 (0.032636132921040235, 'area_mean'),
 (0.022275230670363317, 'area_se'),
 (0.018244777522621172, 'compactness_worst'),
 (0.011895227652925895, 'compactness_mean'),
 (0.01176052704479969, 'radius_se'),
 (0.011495530457791623, 'texture_worst'),
 (0.011005383867656664, 'symmetry_worst'),
 (0.0074339374683463234, 'perimeter_se'),
 (0.0072703222121763503, 'smoothness_worst'),
 (0.0062425646185192123, 'texture_mean'),
 (0.0059140307436010477, 'fractal_dimension_worst'),
 (0.0040081706736959022, 'concavity_se'),
 (0.0028348740774689174, 'concave_points_se'),
 (0.0025285891563906118, 'smoothness_mean'),
 (0.00240

I hypothesized that I could make the best model using 3 variables representing shape, size and potentially texture.  From this list, I would pick concave points worst, perimeter worst, and area worst (rather than incuding concave points mean, as it would be a second concave points variable).  I would rule out texture as all three texture values are lower in the list.  

Confirm model on the entire dataset:

In [310]:
X_final = df[df.columns.values]
X_final.drop(['id','diagnosis'], axis = 1, inplace = True)

y_final = df.diagnosis

In [311]:
train_X_final, test_X_final, train_y_final, test_y_final = model_selection.train_test_split(X_final,y_final, train_size = .6, random_state=0)

In [312]:
# model_rf_final = tree.DecisionTreeClassifier(max_depth=None, min_samples_leaf=5, random_state=0).fit(train_X, train_y)

In [313]:
# model_rf_final = ensemble.RandomForestClassifier(n_estimators = 1000,
#         min_samples_leaf = 5,
#         oob_score = True,
#         random_state = 0).\
#     fit(train_X, train_y)

10 fold CV score:

In [314]:
# model_selection.cross_val_score(model_rf_final, train_X_final, train_y_final, cv = 10).mean()

0.94133944486885679

In [317]:
model_selection.cross_val_score(model, train_X_final, train_y_final, cv = 10).mean()

0.94133944486885679

In [315]:
# model_selection.cross_val_score(model_rf_final, X, y, cv = 10).mean()

0.94460526315789473

In [318]:
model_selection.cross_val_score(model, X, y, cv = 10).mean()

0.94460526315789473

94% accuracy across the entire dataset

In [316]:
sorted(zip(model_rf_final.feature_importances_, \
           X_final.columns.values), reverse = True)

[(0.16715931454388719, 'concave_points_worst'),
 (0.13366127363499894, 'concave_points_mean'),
 (0.1174542576640398, 'perimeter_worst'),
 (0.099679487882633755, 'radius_worst'),
 (0.093355900316311788, 'area_worst'),
 (0.081154916673909866, 'concavity_mean'),
 (0.054142055938401851, 'concavity_worst'),
 (0.049025239156765835, 'perimeter_mean'),
 (0.035210498515225508, 'radius_mean'),
 (0.032636132921040235, 'area_mean'),
 (0.022275230670363317, 'area_se'),
 (0.018244777522621172, 'compactness_worst'),
 (0.011895227652925895, 'compactness_mean'),
 (0.01176052704479969, 'radius_se'),
 (0.011495530457791623, 'texture_worst'),
 (0.011005383867656664, 'symmetry_worst'),
 (0.0074339374683463234, 'perimeter_se'),
 (0.0072703222121763503, 'smoothness_worst'),
 (0.0062425646185192123, 'texture_mean'),
 (0.0059140307436010477, 'fractal_dimension_worst'),
 (0.0040081706736959022, 'concavity_se'),
 (0.0028348740774689174, 'concave_points_se'),
 (0.0025285891563906118, 'smoothness_mean'),
 (0.00240

I don't understand why the top feature weighting is so different when the entire dataset is used rather than the training set.  I will leverage the top features from the training set for KNN.

# Logistic Regression - Backward Selection

Next I will perform a logistic regression using the top variables (contributing at least 5%) from my random forest model as a starting point and systematically remove variables. I will first leverage statsmodels, then scikit learn for the 10 fold correlation once I have my final features.

In [46]:
train_X_LR = df_train[['concave_points_worst','concave_points_mean','perimeter_worst','radius_worst','area_worst','concavity_mean','concavity_worst']]
train_X_LR = sm.add_constant(train_X_LR)
train_c_LR = df_train['diagnosis']
logit=sm.Logit(train_c_LR, train_X_LR)
result=logit.fit()

Optimization terminated successfully.
         Current function value: 0.121928
         Iterations 11


In [47]:
result.summary()

0,1,2,3
Dep. Variable:,diagnosis,No. Observations:,398.0
Model:,Logit,Df Residuals:,390.0
Method:,MLE,Df Model:,7.0
Date:,"Sat, 16 Sep 2017",Pseudo R-squ.:,0.8156
Time:,12:45:01,Log-Likelihood:,-48.527
converged:,True,LL-Null:,-263.17
,,LLR p-value:,1.2379999999999998e-88

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.2288,8.209,0.028,0.978,-15.860 16.317
concave_points_worst,11.0359,19.546,0.565,0.572,-27.273 49.345
concave_points_mean,99.6932,41.421,2.407,0.016,18.510 180.877
perimeter_worst,-0.1910,0.099,-1.929,0.054,-0.385 0.003
radius_worst,-0.5396,1.292,-0.418,0.676,-3.072 1.993
area_worst,0.0258,0.011,2.262,0.024,0.003 0.048
concavity_mean,-34.3794,14.984,-2.294,0.022,-63.747 -5.012
concavity_worst,14.5735,5.516,2.642,0.008,3.762 25.384


Remove radius_worst and re-run logistic regression:

In [50]:
train_X_LR = df_train[['concave_points_worst','concave_points_mean','perimeter_worst','area_worst','concavity_mean','concavity_worst']]
train_X_LR = sm.add_constant(train_X_LR)
train_c_LR = df_train['diagnosis']
logit=sm.Logit(train_c_LR, train_X_LR)
result=logit.fit()

Optimization terminated successfully.
         Current function value: 0.122144
         Iterations 10


In [51]:
result.summary()

0,1,2,3
Dep. Variable:,diagnosis,No. Observations:,398.0
Model:,Logit,Df Residuals:,391.0
Method:,MLE,Df Model:,6.0
Date:,"Sat, 16 Sep 2017",Pseudo R-squ.:,0.8153
Time:,12:45:44,Log-Likelihood:,-48.613
converged:,True,LL-Null:,-263.17
,,LLR p-value:,1.5259999999999999e-89

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-2.7960,4.013,-0.697,0.486,-10.660 5.068
concave_points_worst,8.8676,18.905,0.469,0.639,-28.185 45.920
concave_points_mean,100.0847,41.686,2.401,0.016,18.382 181.787
perimeter_worst,-0.2131,0.084,-2.539,0.011,-0.378 -0.049
area_worst,0.0218,0.006,3.693,0.000,0.010 0.033
concavity_mean,-34.1054,14.934,-2.284,0.022,-63.375 -4.836
concavity_worst,15.1249,5.394,2.804,0.005,4.554 25.696


Remove concave_points_worst:

In [52]:
train_X_LR = df_train[['concave_points_mean','perimeter_worst','area_worst','concavity_mean','concavity_worst']]
train_X_LR = sm.add_constant(train_X_LR)
train_c_LR = df_train['diagnosis']
logit=sm.Logit(train_c_LR, train_X_LR)
result=logit.fit()

Optimization terminated successfully.
         Current function value: 0.122421
         Iterations 10


In [53]:
result.summary()

0,1,2,3
Dep. Variable:,diagnosis,No. Observations:,398.0
Model:,Logit,Df Residuals:,392.0
Method:,MLE,Df Model:,5.0
Date:,"Sat, 16 Sep 2017",Pseudo R-squ.:,0.8149
Time:,12:46:18,Log-Likelihood:,-48.724
converged:,True,LL-Null:,-263.17
,,LLR p-value:,1.7449999999999999e-90

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-2.5486,3.930,-0.648,0.517,-10.251 5.154
concave_points_mean,115.5608,26.044,4.437,0.000,64.516 166.606
perimeter_worst,-0.2142,0.084,-2.562,0.010,-0.378 -0.050
area_worst,0.0219,0.006,3.695,0.000,0.010 0.033
concavity_mean,-38.2359,12.304,-3.108,0.002,-62.351 -14.121
concavity_worst,16.8824,3.911,4.317,0.000,9.218 24.547


All remaining variables are significant: concave_points_mean, perimeter_worst, area_worst, concavity_mean, concavity_worst

# Logistic Regression - Forward Selection

In [80]:
train_X_LR2 = df_train[['concave_points_worst']]
train_X_LR2 = sm.add_constant(train_X_LR2)
train_c_LR2 = df_train['diagnosis']
logit=sm.Logit(train_c_LR2, train_X_LR2)
result=logit.fit()

Optimization terminated successfully.
         Current function value: 0.224390
         Iterations 8


In [82]:
result.summary()

0,1,2,3
Dep. Variable:,diagnosis,No. Observations:,398.0
Model:,Logit,Df Residuals:,396.0
Method:,MLE,Df Model:,1.0
Date:,"Sat, 16 Sep 2017",Pseudo R-squ.:,0.6607
Time:,13:28:22,Log-Likelihood:,-89.307
converged:,True,LL-Null:,-263.17
,,LLR p-value:,1.32e-77

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-8.3471,0.859,-9.717,0.000,-10.031 -6.664
concave_points_worst,63.4395,6.603,9.608,0.000,50.499 76.380


Added perimeter_worst, as the second most important feature in the random forest model (skipped concave points mean because I assumed it reflects a very similar feature to concave points worst):

In [112]:
train_X_LR2 = df_train[['concave_points_worst','perimeter_worst']]
train_X_LR2 = sm.add_constant(train_X_LR2)
train_c_LR2 = df_train['diagnosis']
logit=sm.Logit(train_c_LR2, train_X_LR2)
result=logit.fit()

Optimization terminated successfully.
         Current function value: 0.148541
         Iterations 9


In [113]:
result.summary()

0,1,2,3
Dep. Variable:,diagnosis,No. Observations:,398.0
Model:,Logit,Df Residuals:,395.0
Method:,MLE,Df Model:,2.0
Date:,"Sat, 16 Sep 2017",Pseudo R-squ.:,0.7754
Time:,13:54:06,Log-Likelihood:,-59.119
converged:,True,LL-Null:,-263.17
,,LLR p-value:,2.3999999999999998e-89

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-18.1385,2.430,-7.466,0.000,-22.900 -13.377
concave_points_worst,41.2781,7.590,5.438,0.000,26.401 56.155
perimeter_worst,0.1160,0.020,5.693,0.000,0.076 0.156


Added radius worst, as the 3rd most important feature in the random forest model:

In [93]:
train_X_LR2 = df_train[['concave_points_worst','perimeter_worst','radius_worst']]
train_X_LR2 = sm.add_constant(train_X_LR2)
train_c_LR2 = df_train['diagnosis']
logit=sm.Logit(train_c_LR2, train_X_LR2)
result=logit.fit()

Optimization terminated successfully.
         Current function value: 0.140632
         Iterations 9


In [94]:
result.summary()

0,1,2,3
Dep. Variable:,diagnosis,No. Observations:,398.0
Model:,Logit,Df Residuals:,394.0
Method:,MLE,Df Model:,3.0
Date:,"Sat, 16 Sep 2017",Pseudo R-squ.:,0.7873
Time:,13:43:06,Log-Likelihood:,-55.972
converged:,True,LL-Null:,-263.17
,,LLR p-value:,1.678e-89

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-19.4234,2.655,-7.314,0.000,-24.628 -14.219
concave_points_worst,50.0697,8.979,5.576,0.000,32.471 67.669
perimeter_worst,-0.0703,0.079,-0.894,0.371,-0.224 0.084
radius_worst,1.2469,0.518,2.405,0.016,0.231 2.263


Stopped here, given that perimeter worst is no longer statistically significant.  Will leverage scikit learn with concave points worst and perimeter worst.

# Logistic Regression - SciKit Learn for 10 fold cross validation

## Backward selection results

First apply SciKit Learn Logistic Regression to the training set, leveraging the features selected via statsmodels above: 

In [55]:
W = df_train[df_train.columns.values]
W.drop(['id','diagnosis','radius_mean','texture_mean','perimeter_mean','area_mean', 'smoothness_mean', 'compactness_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se','area_se', 'smoothness_se', 'compactness_se', 'concavity_se','concave_points_se', 'symmetry_se', 'fractal_dimension_se','radius_worst', 'texture_worst','smoothness_worst', 'compactness_worst','concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'], axis = 1, inplace = True)

c = df_train.diagnosis

In [180]:
W

Unnamed: 0,concavity_mean,concave_points_mean,perimeter_worst,area_worst,concavity_worst
149,0.02881,0.01329,97.19,725.9,0.15640
124,0.08092,0.02800,91.99,632.1,0.33080
421,0.14500,0.06300,114.10,809.2,0.32190
195,0.03873,0.02377,90.81,600.6,0.17640
545,0.02974,0.02443,97.58,729.8,0.10490
...,...,...,...,...,...
71,0.08606,0.02872,62.56,284.4,0.14340
106,0.07070,0.03485,85.51,521.7,0.28730
270,0.00725,0.00625,94.44,684.6,0.03866
435,0.11260,0.06463,113.90,869.3,0.40690


In [56]:
W_train, W_test, c_train, c_test = train_test_split(W, c, 
                                                   test_size=0.4,
                                                   random_state=42)
#Step 2
lr = LogisticRegression()
lr.fit(W_train, c_train)

#Step 3
preds = lr.predict(W_test)

#Step 4
testing_score = accuracy_score(c_test, preds)

print ("The model accurately classified {:.2f} percent of the testing data".format(testing_score*100))

The model accurately classified 88.75 percent of the testing data


In [59]:
#Intialize, fit, and score the model
lr = LogisticRegression()

lr.fit(W,c)

score = lr.score(W,c)

print ("The model produces an accuracy score of {:.2f} percent".format(score*100))

The model produces an accuracy score of 91.96 percent


10-fold cross validation on the training set:

In [63]:
#Use cross_val_score method to generate the average accuracy score for 10 CVs
mean_cv_score = cross_val_score(LogisticRegression(), W,c, cv=10, scoring="accuracy").mean()

print ("The cross validated accuracy score is {:.2f} percent").format(mean_cv_score*100)

The cross validated accuracy score is 91.41 percent


10-fold cross validation on the entire data set:

In [67]:
W_final = df[df.columns.values]
W_final.drop(['id','diagnosis','radius_mean','texture_mean','perimeter_mean','area_mean', 'smoothness_mean', 'compactness_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se','area_se', 'smoothness_se', 'compactness_se', 'concavity_se','concave_points_se', 'symmetry_se', 'fractal_dimension_se','radius_worst', 'texture_worst','smoothness_worst', 'compactness_worst','concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'], axis = 1, inplace = True)

c_final = df.diagnosis

In [68]:
mean_cv_score = cross_val_score(LogisticRegression(), W_final,c_final, cv=10, scoring="accuracy").mean()

print ("The cross validated accuracy score is {:.2f} percent").format(mean_cv_score*100)

The cross validated accuracy score is 92.46 percent


This is for the top 5 features from the statsmodels backward selection method.

## Forward selection results

Leveraged concave points worst and perimeter worst features

In [98]:
P = df_train[df_train.columns.values]
P.drop(['id','diagnosis','radius_mean','texture_mean','perimeter_mean','area_mean', 'smoothness_mean', 'compactness_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se','area_se', 'smoothness_se', 'compactness_se', 'concavity_se','concave_points_se', 'symmetry_se', 'fractal_dimension_se','radius_worst', 'texture_worst','smoothness_worst', 'compactness_worst','concave_points_mean', 'symmetry_worst', 'fractal_dimension_worst','area_worst','concavity_mean','concavity_worst'], axis = 1, inplace = True)

r = df_train.diagnosis

In [102]:
#Intialize, fit, and score the model
lr = LogisticRegression()

lr.fit(P,r)

score = lr.score(P,r)

print ("The model produces an accuracy score of {:.2f} percent".format(score*100))

The model produces an accuracy score of 90.95 percent


In [None]:
10-fold cross validation on training set:

In [103]:
#Use cross_val_score method to generate the average accuracy score for 10 CVs
mean_cv_score = cross_val_score(LogisticRegression(), P,r, cv=10, scoring="accuracy").mean()

print ("The cross validated accuracy score is {:.2f} percent").format(mean_cv_score*100)

The cross validated accuracy score is 90.93 percent


10-fold cross validation on entire dataset:

In [105]:
P_final = df[df.columns.values]
P_final.drop(['id','diagnosis','radius_mean','texture_mean','perimeter_mean','area_mean', 'smoothness_mean', 'compactness_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se','area_se', 'smoothness_se', 'compactness_se', 'concavity_se','concave_points_se', 'symmetry_se', 'fractal_dimension_se','radius_worst', 'texture_worst','smoothness_worst', 'compactness_worst','concave_points_mean', 'symmetry_worst', 'fractal_dimension_worst','area_worst','concavity_mean','concavity_worst'], axis = 1, inplace = True)

r_final = df.diagnosis

In [106]:
mean_cv_score = cross_val_score(LogisticRegression(), P_final,r_final, cv=10, scoring="accuracy").mean()

print ("The cross validated accuracy score is {:.2f} percent").format(mean_cv_score*100)

The cross validated accuracy score is 91.59 percent


Then I re-ran the 10-fold cross validation on the training set, then entire dataset leveraging the entire feature set (30 features). X_final and y_final were defined for the random forest model.

10-fold cross validation on the training set - all 30 variables

In [70]:
mean_cv_score = cross_val_score(LogisticRegression(), X,y, cv=10, scoring="accuracy").mean()

print ("The cross validated accuracy score is {:.2f} percent").format(mean_cv_score*100)

The cross validated accuracy score is 94.68 percent


10-fold cross validation on the entire dataset - all 30 variables

In [66]:
mean_cv_score = cross_val_score(LogisticRegression(), X_final,y_final, cv=10, scoring="accuracy").mean()

print ("The cross validated accuracy score is {:.2f} percent").format(mean_cv_score*100)

The cross validated accuracy score is 95.09 percent


# KNN

### Performed with the top 3 features from random forest (concave points worst, perimeter worst, and radius worst)

In [189]:
# df_knn_postrf = post random forest
df_knn_postrf = df_train[['diagnosis','concave_points_worst', 'perimeter_worst', 'radius_worst']] 

Scale the features

In [190]:
columns_XX = ['concave_points_worst', 'perimeter_worst', 'radius_worst']
XX = df_postrf[columns_XX]

scaler = preprocessing.MinMaxScaler().fit(XX)

XX = scaler.transform(XX)

In [191]:
cc = df_knn_postrf.diagnosis

In [194]:
model = neighbors.KNeighborsClassifier(n_neighbors = 1).\
    fit(XX, cc)

In [195]:
model.score(XX,cc)

1.0

In [196]:
cc_hat = model.predict(XX)

In [197]:
(cc_hat == cc).mean()

1.0

In [198]:
pd.crosstab(cc_hat,cc,rownames = ['Hypotehsized Class'], colnames=['True Class'])

True Class,0,1
Hypotehsized Class,Unnamed: 1_level_1,Unnamed: 2_level_1
0,249,0
1,0,149


Train and test sets (50/50) within the training subset of the overall dataset 

In [213]:
train_knn_postrf = df_knn_postrf.sample(frac = .5, random_state = 0).sort_index()

In [214]:
test_knn_postrf = df_knn_postrf.drop(train_knn_postrf.index)

Feature matrix train_XX and response vector train_cc

In [215]:
train_XX = train_knn_postrf[columns_XX]

scaler = preprocessing.MinMaxScaler().fit(train_XX)
train_XX = scaler.transform(train_XX)

train_cc = train_knn_postrf.diagnosis

Feature matrix test_XX and response vector test_cc

In [216]:
test_XX = test_knn_postrf[columns_XX]

test_XX = scaler.transform(test_XX)

test_cc = test_knn_postrf.diagnosis

10 fold CV on training data

In [217]:
k_cv = 10 # 10-fold CV
k_nn = range(1, train_df.shape[0] * (k_cv - 1) / k_cv) # k-NN

gs = model_selection.GridSearchCV(
    estimator = neighbors.KNeighborsClassifier(),
    param_grid = {'n_neighbors': k_nn, 'weights': ['uniform', 'distance']},
    cv = model_selection.KFold(n_splits = k_cv, shuffle = True, random_state = 0)
)

gs.fit(train_XX, train_cc)

GridSearchCV(cv=KFold(n_splits=10, random_state=0, shuffle=True),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 7...66, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [218]:
gs.best_score_

0.94472361809045224

In [219]:
gs.best_params_

{'n_neighbors': 6, 'weights': 'uniform'}

In [220]:
gs.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='uniform')

In [221]:
gs.score(train_XX, train_cc)

0.94472361809045224

In [223]:
gs.score(test_XX, test_cc)

0.94974874371859297

95% accuracy on the test set within the overall training set, using 6 nearest neighbors.

Now we will move to 10 fold cross validation on the entire dataset.

In [320]:
X = df[columns_XX]
scaler = preprocessing.MinMaxScaler().fit(X)
X = scaler.transform(X)

c = df.diagnosis
#top part makes sense
#redoing grid search
k_cv = 10 # 10-fold CV
k_nn = range(1, train_df.shape[0] * (k_cv - 1) / k_cv) # k-NN

gs = model_selection.GridSearchCV(
    estimator = neighbors.KNeighborsClassifier(),
    param_grid = {'n_neighbors': k_nn, 'weights': ['uniform', 'distance']},
    cv = model_selection.KFold(n_splits = k_cv, shuffle = True, random_state = 0)
)

gs.fit(X, c)

GridSearchCV(cv=KFold(n_splits=10, random_state=0, shuffle=True),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 7...66, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [321]:
gs.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [325]:
model = neighbors.KNeighborsClassifier(n_neighbors = 5, weights = 'uniform').\
    fit(train_XX, train_cc) # what should I be fitting to here?

model.score(X, c)

0.9472759226713533

##### KNN model leveraging the top 3 features from random forest has a 95% accuracy rate for the entire dataset

### KNN Performed with the top 2 features from logistic regression forward selection (concave points worst, perimeter worst)

In [252]:
# df_knn_LR= KNN on logistic regression features
df_knn_LR = df_train[['diagnosis','concave_points_worst', 'perimeter_worst']] 

In [253]:
columns_LR = ['concave_points_worst', 'perimeter_worst']
LR = df_postrf[columns_LR]

scaler = preprocessing.MinMaxScaler().fit(LR)

LR = scaler.transform(LR)
l = df_knn_postrf.diagnosis

In [254]:
model = neighbors.KNeighborsClassifier(n_neighbors = 1).\
    fit(LR, l)

In [255]:
model.score(LR, l)

1.0

Train and test sets (50/50) within the training subset of the overall dataset

In [256]:
train_knn_LR = df_knn_LR.sample(frac = .5, random_state = 0).sort_index()

In [257]:
test_knn_LR = df_knn_LR.drop(train_knn_LR.index)

Feature matrix train_LR and response vector train_l

In [259]:
train_LR = train_knn_LR[columns_LR]

scaler = preprocessing.MinMaxScaler().fit(train_LR)
train_LR = scaler.transform(train_LR)

train_l = train_knn_LR.diagnosis

Feature matrix test_LR and response vector test_l:

In [260]:
test_LR = test_knn_LR[columns_LR]

test_LR = scaler.transform(test_LR)

test_l = test_knn_LR.diagnosis

10-fold CV on training data:

In [262]:
k_cv = 10 # 10-fold CV
k_nn = range(1, train_df.shape[0] * (k_cv - 1) / k_cv) # k-NN

gs = model_selection.GridSearchCV(
    estimator = neighbors.KNeighborsClassifier(),
    param_grid = {'n_neighbors': k_nn, 'weights': ['uniform', 'distance']},
    cv = model_selection.KFold(n_splits = k_cv, shuffle = True, random_state = 0)
)

gs.fit(train_LR, train_l)

GridSearchCV(cv=KFold(n_splits=10, random_state=0, shuffle=True),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 7...66, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [263]:
gs.best_score_

0.92462311557788945

In [264]:
gs.best_params_

{'n_neighbors': 19, 'weights': 'uniform'}

In [265]:
gs.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=19, p=2,
           weights='uniform')

In [266]:
gs.score(train_LR, train_l)

0.9346733668341709

In [267]:
gs.score(test_LR, test_l)

0.95477386934673369

Accuracy: 95%

Now we will move to 10 fold cross validation on the entire dataset.

In [268]:
R = df[columns_LR]
scaler = preprocessing.MinMaxScaler().fit(R)
R = scaler.transform(R)

q = df.diagnosis
#top part makes sense
#redoing grid search
k_cv = 10 # 10-fold CV
k_nn = range(1, train_df.shape[0] * (k_cv - 1) / k_cv) # k-NN

gs = model_selection.GridSearchCV(
    estimator = neighbors.KNeighborsClassifier(),
    param_grid = {'n_neighbors': k_nn, 'weights': ['uniform', 'distance']},
    cv = model_selection.KFold(n_splits = k_cv, shuffle = True, random_state = 0)
)

gs.fit(R, q)

GridSearchCV(cv=KFold(n_splits=10, random_state=0, shuffle=True),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 7...66, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [269]:
gs.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=14, p=2,
           weights='uniform')

In [278]:
model = neighbors.KNeighborsClassifier(n_neighbors = 19, weights = 'uniform').\
    fit(train_LR, train_l) 

model.score(R, q)

0.93848857644991213

Accuracy: 94%, using 19 neighbors.