# Project Cancer Detection

# Breast Cancer Wisconsin (Disgnostic) Data Set

[Source: UCI](http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29)

[Data Set info](http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names)

In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
col = ['id', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Blend Chromatin', 'Normal Nucleoli', 'Mitosis', 'Class']

In [4]:
df = pd.read_csv('breast-cancer-wisconsin.data.csv', header=None,names=col)
df

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Blend Chromatin,Normal Nucleoli,Mitosis,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


# Data Pre-processing

In [5]:
np.where(df.isnull())

(array([], dtype=int64), array([], dtype=int64))

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   id                           699 non-null    int64 
 1   Clump Thickness              699 non-null    int64 
 2   Uniformity of Cell Size      699 non-null    int64 
 3   Uniformity of Cell Shape     699 non-null    int64 
 4   Marginal Adhesion            699 non-null    int64 
 5   Single Epithelial Cell Size  699 non-null    int64 
 6   Bare Nuclei                  699 non-null    object
 7   Blend Chromatin              699 non-null    int64 
 8   Normal Nucleoli              699 non-null    int64 
 9   Mitosis                      699 non-null    int64 
 10  Class                        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [7]:
df['Bare Nuclei'].describe()

count     699
unique     11
top         1
freq      402
Name: Bare Nuclei, dtype: object

In [8]:
df['Bare Nuclei'].value_counts()

Bare Nuclei
1     402
10    132
2      30
5      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: count, dtype: int64

In [9]:
df[df['Bare Nuclei'] == '?']

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Blend Chromatin,Normal Nucleoli,Mitosis,Class
23,1057013,8,4,5,1,2,?,7,3,1,4
40,1096800,6,6,6,9,6,?,7,8,1,2
139,1183246,1,1,1,1,1,?,2,1,1,2
145,1184840,1,1,3,1,2,?,2,1,1,2
158,1193683,1,1,2,1,3,?,1,1,1,2
164,1197510,5,1,1,1,2,?,3,1,1,2
235,1241232,3,1,4,1,2,?,3,1,1,2
249,169356,3,1,1,1,2,?,3,1,1,2
275,432809,3,1,3,1,2,?,2,1,1,2
292,563649,8,8,8,1,2,?,6,10,1,4


In [10]:
df['Class'].value_counts()

Class
2    458
4    241
Name: count, dtype: int64

In [11]:
df['Bare Nuclei'].replace('?', np.NAN, inplace =True)
df = df.dropna()

In [12]:
df

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Blend Chromatin,Normal Nucleoli,Mitosis,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [13]:
df['Class']= df['Class']/2 - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Class']= df['Class']/2 - 1


In [14]:
df['Class'].value_counts()

Class
0.0    444
1.0    239
Name: count, dtype: int64

In [15]:
df.columns

Index(['id', 'Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Blend Chromatin',
       'Normal Nucleoli', 'Mitosis', 'Class'],
      dtype='object')

In [16]:
X =df.drop(['id','Class'],axis=1)
X_col = X.columns

In [17]:
y = df['Class']

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
X = StandardScaler().fit_transform(X.values)

In [20]:
X

array([[ 0.19790469, -0.70221201, -0.74177362, ..., -0.18182716,
        -0.61292736, -0.34839971],
       [ 0.19790469,  0.27725185,  0.26278299, ..., -0.18182716,
        -0.28510482, -0.34839971],
       [-0.51164337, -0.70221201, -0.74177362, ..., -0.18182716,
        -0.61292736, -0.34839971],
       ...,
       [ 0.19790469,  2.23617957,  2.2718962 , ...,  1.86073779,
         2.33747554,  0.22916583],
       [-0.15686934,  1.58320366,  0.93248739, ...,  2.67776377,
         1.02618536, -0.34839971],
       [-0.15686934,  1.58320366,  1.6021918 , ...,  2.67776377,
         0.37054027, -0.34839971]])

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
df1 = pd.DataFrame(X,columns= X_col)
df1

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Blend Chromatin,Normal Nucleoli,Mitosis
0,0.197905,-0.702212,-0.741774,-0.639366,-0.555608,-0.698853,-0.181827,-0.612927,-0.348400
1,0.197905,0.277252,0.262783,0.758032,1.695166,1.772867,-0.181827,-0.285105,-0.348400
2,-0.511643,-0.702212,-0.741774,-0.639366,-0.555608,-0.424217,-0.181827,-0.612927,-0.348400
3,0.552679,1.583204,1.602192,-0.639366,-0.105454,0.125054,-0.181827,1.354008,-0.348400
4,-0.156869,-0.702212,-0.741774,0.059333,-0.555608,-0.698853,-0.181827,-0.612927,-0.348400
...,...,...,...,...,...,...,...,...,...
678,-0.511643,-0.702212,-0.741774,-0.639366,-0.105454,-0.424217,-0.998853,-0.612927,-0.348400
679,-0.866417,-0.702212,-0.741774,-0.639366,-0.555608,-0.698853,-0.998853,-0.612927,-0.348400
680,0.197905,2.236180,2.271896,0.059333,1.695166,-0.149582,1.860738,2.337476,0.229166
681,-0.156869,1.583204,0.932487,0.408682,-0.105454,0.125054,2.677764,1.026185,-0.348400


In [23]:
 X_train, X_test, y_train, y_test = train_test_split(df1, y, test_size=0.2, random_state=42)

In [25]:
from sklearn.preprocessing import MinMaxScaler
pd.DataFrame(MinMaxScaler().fit_transform(df.drop(['id','Class'],axis=1).values), columns = X_col).head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Blend Chromatin,Normal Nucleoli,Mitosis
0,0.444444,0.0,0.0,0.0,0.111111,0.0,0.222222,0.0,0.0
1,0.444444,0.333333,0.333333,0.444444,0.666667,1.0,0.222222,0.111111,0.0
2,0.222222,0.0,0.0,0.0,0.111111,0.111111,0.222222,0.0,0.0
3,0.555556,0.777778,0.777778,0.0,0.222222,0.333333,0.222222,0.666667,0.0
4,0.333333,0.0,0.0,0.222222,0.111111,0.0,0.222222,0.0,0.0


In [26]:
from sklearn.preprocessing import Normalizer
pd.DataFrame(Normalizer().fit_transform(df.drop(['id','Class'],axis=1).values), columns = X_col).head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Blend Chromatin,Normal Nucleoli,Mitosis
0,0.753778,0.150756,0.150756,0.150756,0.301511,0.150756,0.452267,0.150756,0.150756
1,0.319438,0.255551,0.255551,0.319438,0.447214,0.638877,0.191663,0.127775,0.063888
2,0.538816,0.179605,0.179605,0.179605,0.359211,0.359211,0.538816,0.179605,0.179605
3,0.380235,0.506979,0.506979,0.063372,0.190117,0.25349,0.190117,0.443607,0.063372
4,0.609994,0.152499,0.152499,0.457496,0.304997,0.152499,0.457496,0.152499,0.152499


In [27]:
from sklearn.neighbors import KNeighborsClassifier

In [30]:
knn = KNeighborsClassifier(n_neighbors=5,p=2, metric='minkowski')

In [31]:
knn.fit(X_train,y_train)

In [36]:
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [37]:
def print_score(clf, X_train,y_train,X_test, y_test, train = True):
    if train:
        print('Train Result:\n')
        print('accuracy score: {0:4f}\n'.format(accuracy_score(y_train,clf.predict(X_train))))
        print('Classification Report: \n {}\n'.format(classification_report(y_train,clf.predict(X_train))))
        print('Confusion Matrix: \n {}\n'.format(confusion_matrix(y_train,clf.predict(X_train))))

        res = cross_val_score(clf,X_train,y_train,cv =10,scoring='accuracy')
        print('Average Accuracy: \t {0:.4f}'.format(np.mean(res)))
        print('Accuracy SD: \t\t {0:.4f}'.format(np.std(res)))

    elif train == False:
        print('Test Result:\n')
        print('accuracy score: {0:4f}\n'.format(accuracy_score(y_test,clf.predict(X_test))))
        print('Classification Report: \n {}\n'.format(classification_report(y_test,clf.predict(X_test))))
        print('Confusion Matrix: \n {}\n'.format(confusion_matrix(y_test,clf.predict(X_test))))

In [38]:
print_score(knn, X_train, y_train,X_test,y_test,train=True)

Train Result:

accuracy score: 0.972527

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       365
         1.0       0.96      0.96      0.96       181

    accuracy                           0.97       546
   macro avg       0.97      0.97      0.97       546
weighted avg       0.97      0.97      0.97       546


Confusion Matrix: 
 [[358   7]
 [  8 173]]

Average Accuracy: 	 0.9634
Accuracy SD: 		 0.0200


In [39]:
print_score(knn, X_train, y_train,X_test,y_test,train=False)

Test Result:

accuracy score: 0.956204

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.94      0.99      0.96        79
         1.0       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137


Confusion Matrix: 
 [[78  1]
 [ 5 53]]



# Grid Search

In [40]:
from sklearn .model_selection import GridSearchCV

In [41]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [43]:
params = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10]}

In [44]:
grid_search_cv = GridSearchCV(KNeighborsClassifier(),params,n_jobs=1,verbose=1)

In [45]:
grid_search_cv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [47]:
grid_search_cv.best_estimator_

In [48]:
print_score(grid_search_cv, X_train, y_train,X_test,y_test,train=True)

Train Result:

accuracy score: 0.970696

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       365
         1.0       0.96      0.95      0.96       181

    accuracy                           0.97       546
   macro avg       0.97      0.97      0.97       546
weighted avg       0.97      0.97      0.97       546


Confusion Matrix: 
 [[358   7]
 [  9 172]]

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 fol

In [50]:
print_score(grid_search_cv, X_train, y_train,X_test,y_test,train=False)

Test Result:

accuracy score: 0.948905

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.93      0.99      0.96        79
         1.0       0.98      0.90      0.94        58

    accuracy                           0.95       137
   macro avg       0.95      0.94      0.95       137
weighted avg       0.95      0.95      0.95       137


Confusion Matrix: 
 [[78  1]
 [ 6 52]]



In [51]:
grid_search_cv.best_estimator_

In [55]:
grid_search_cv.cv_results_

{'mean_fit_time': array([0.00679822, 0.00299926, 0.00419111, 0.00319958, 0.00320096,
        0.00320005, 0.00339518, 0.00340223, 0.00320349, 0.00300016]),
 'std_fit_time': array([2.77820595e-03, 1.21007705e-06, 7.45660415e-04, 3.98278361e-04,
        4.00093581e-04, 4.00185624e-04, 4.85327886e-04, 4.95070424e-04,
        4.03718105e-04, 7.16843432e-07]),
 'mean_score_time': array([0.01860237, 0.01199827, 0.01280913, 0.01200423, 0.01140108,
        0.0111969 , 0.01160231, 0.01180105, 0.01120276, 0.01219578]),
 'std_score_time': array([0.00653585, 0.0006316 , 0.00194365, 0.00090008, 0.00048909,
        0.00074394, 0.00048847, 0.0017119 , 0.00040058, 0.00098048]),
 'param_n_neighbors': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 1},
  {'n_neighbors': 2},
  {'n_neighbors': 3},
  {'n_neighbors': 

## Cross checking other models such as SVM,Random Forest , XGBoost

In [56]:
from sklearn import svm
clf = svm.SVC(kernel='rbf')
clf.fit(X_train,y_train)
print_score(clf, X_train, y_train,X_test,y_test,train=True)
print_score(clf, X_train, y_train,X_test,y_test,train=False)

Train Result:

accuracy score: 0.979853

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.99      0.98      0.98       365
         1.0       0.96      0.98      0.97       181

    accuracy                           0.98       546
   macro avg       0.98      0.98      0.98       546
weighted avg       0.98      0.98      0.98       546


Confusion Matrix: 
 [[358   7]
 [  4 177]]

Average Accuracy: 	 0.9634
Accuracy SD: 		 0.0244
Test Result:

accuracy score: 0.963504

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.96      0.97      0.97        79
         1.0       0.96      0.95      0.96        58

    accuracy                           0.96       137
   macro avg       0.96      0.96      0.96       137
weighted avg       0.96      0.96      0.96       137


Confusion Matrix: 
 [[77  2]
 [ 3 55]]



In [57]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train,y_train)
print_score(clf, X_train, y_train,X_test,y_test,train=True)
print_score(clf, X_train, y_train,X_test,y_test,train=False)

Train Result:

accuracy score: 1.000000

Classification Report: 
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       365
         1.0       1.00      1.00      1.00       181

    accuracy                           1.00       546
   macro avg       1.00      1.00      1.00       546
weighted avg       1.00      1.00      1.00       546


Confusion Matrix: 
 [[365   0]
 [  0 181]]

Average Accuracy: 	 0.9652
Accuracy SD: 		 0.0223
Test Result:

accuracy score: 0.948905

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.93      0.99      0.96        79
         1.0       0.98      0.90      0.94        58

    accuracy                           0.95       137
   macro avg       0.95      0.94      0.95       137
weighted avg       0.95      0.95      0.95       137


Confusion Matrix: 
 [[78  1]
 [ 6 52]]

