# Project Cancer Detection

# Breast Cancer Wisconsin (Disgnostic) Data Set

[Source: UCI](http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29)

[Data Set info](http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names)

7. Attribute Information: (class attribute has been moved to last column)
   #  Attribute                     Domain
   -- -----------------------------------------
   1. Sample code number            id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class:                        (2 for benign, 4 for malignant)

In [1]:
import numpy as np
import pandas as pd

In [2]:
col = ['id', 'Clump Thickness', 'Uniformity of Cell Size', 
       'Uniformity of Cell Shape', 'Marginal Adhesion', 
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitoses', 'Class']
df = pd.read_csv("breast-cancer-wisconsin.data.csv", names=col,
                 header=None)
df.head()

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


# Data Pre-processing

In [3]:
np.where(df.isnull())

(array([], dtype=int64), array([], dtype=int64))

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id                             699 non-null int64
Clump Thickness                699 non-null int64
Uniformity of Cell Size        699 non-null int64
Uniformity of Cell Shape       699 non-null int64
Marginal Adhesion              699 non-null int64
Single Epithelial Cell Size    699 non-null int64
Bare Nuclei                    699 non-null object
Bland Chromatin                699 non-null int64
Normal Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class                          699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [5]:
df['Bare Nuclei'].describe()

count     699
unique     11
top         1
freq      402
Name: Bare Nuclei, dtype: object

In [6]:
df['Bare Nuclei'].value_counts()

1     402
10    132
2      30
5      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: Bare Nuclei, dtype: int64

How do we drop the `?`

In [7]:
df[df['Bare Nuclei'] == "?"]

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
23,1057013,8,4,5,1,2,?,7,3,1,4
40,1096800,6,6,6,9,6,?,7,8,1,2
139,1183246,1,1,1,1,1,?,2,1,1,2
145,1184840,1,1,3,1,2,?,2,1,1,2
158,1193683,1,1,2,1,3,?,1,1,1,2
164,1197510,5,1,1,1,2,?,3,1,1,2
235,1241232,3,1,4,1,2,?,3,1,1,2
249,169356,3,1,1,1,2,?,3,1,1,2
275,432809,3,1,3,1,2,?,2,1,1,2
292,563649,8,8,8,1,2,?,6,10,1,4


In [8]:
df['Class'].value_counts()

2    458
4    241
Name: Class, dtype: int64

In [9]:
df['Bare Nuclei'].replace("?", np.NAN, inplace=True)
df = df.dropna()

Note that for class: 2 is benign, 4 is for malignant

$$\frac{\text{df["Class"]}}{2} - 1$$

In [10]:
df['Bare Nuclei'].value_counts()

1     402
10    132
2      30
5      30
3      28
8      21
4      19
9       9
7       8
6       4
Name: Bare Nuclei, dtype: int64

In [11]:
df['Class'] = df['Class'] / 2 - 1

In [12]:
df['Class'].value_counts()

0.0    444
1.0    239
Name: Class, dtype: int64

In [13]:
df.columns

Index(['id', 'Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitoses', 'Class'],
      dtype='object')

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
id                             683 non-null int64
Clump Thickness                683 non-null int64
Uniformity of Cell Size        683 non-null int64
Uniformity of Cell Shape       683 non-null int64
Marginal Adhesion              683 non-null int64
Single Epithelial Cell Size    683 non-null int64
Bare Nuclei                    683 non-null object
Bland Chromatin                683 non-null int64
Normal Nucleoli                683 non-null int64
Mitoses                        683 non-null int64
Class                          683 non-null float64
dtypes: float64(1), int64(9), object(1)
memory usage: 64.0+ KB


In [15]:
X = df.drop(['id', 'Class'], axis=1)
X_col = X.columns

In [16]:
y = df['Class']

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
X = StandardScaler().fit_transform(X.values)



Training

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
df1 = pd.DataFrame(X, columns=X_col)

In [21]:
df1.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,0.197905,-0.702212,-0.741774,-0.639366,-0.555608,-0.698853,-0.181827,-0.612927,-0.3484
1,0.197905,0.277252,0.262783,0.758032,1.695166,1.772867,-0.181827,-0.285105,-0.3484
2,-0.511643,-0.702212,-0.741774,-0.639366,-0.555608,-0.424217,-0.181827,-0.612927,-0.3484
3,0.552679,1.583204,1.602192,-0.639366,-0.105454,0.125054,-0.181827,1.354008,-0.3484
4,-0.156869,-0.702212,-0.741774,0.059333,-0.555608,-0.698853,-0.181827,-0.612927,-0.3484


In [22]:
X_train, X_test, y_train, y_test = train_test_split(df1, y,
                                                    train_size=0.8,
                                                    random_state=42)



In [23]:
from sklearn.preprocessing import MinMaxScaler
pd.DataFrame(MinMaxScaler().fit_transform(df.drop(['id', 'Class'], axis=1).values), columns=X_col).head()



Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,0.444444,0.0,0.0,0.0,0.111111,0.0,0.222222,0.0,0.0
1,0.444444,0.333333,0.333333,0.444444,0.666667,1.0,0.222222,0.111111,0.0
2,0.222222,0.0,0.0,0.0,0.111111,0.111111,0.222222,0.0,0.0
3,0.555556,0.777778,0.777778,0.0,0.222222,0.333333,0.222222,0.666667,0.0
4,0.333333,0.0,0.0,0.222222,0.111111,0.0,0.222222,0.0,0.0


In [24]:
from sklearn.neighbors import KNeighborsClassifier

In [25]:
knn = KNeighborsClassifier(n_neighbors=5,
                           p=2, metric='minkowski')

In [26]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [27]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [28]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report: \n {}\n".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, clf.predict(X_train))))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report: \n {}\n".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, clf.predict(X_test))))        

In [29]:
print_score(knn, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 0.9725

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.98      0.98      0.98       365
        1.0       0.96      0.96      0.96       181

avg / total       0.97      0.97      0.97       546


Confusion Matrix: 
 [[358   7]
 [  8 173]]

Average Accuracy: 	 0.9635
Accuracy SD: 		 0.0162


In [30]:
print_score(knn, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.9562

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.94      0.99      0.96        79
        1.0       0.98      0.91      0.95        58

avg / total       0.96      0.96      0.96       137


Confusion Matrix: 
 [[78  1]
 [ 5 53]]



In [31]:
accuracy = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
precision = cross_val_score(knn, X_train, y_train, cv=5, scoring='precision')
recall = cross_val_score(knn, X_train, y_train, cv=5, scoring='recall')
f1 = cross_val_score(knn, X_train, y_train, cv=5, scoring='f1')
print(accuracy.mean())
print(precision.mean())
print(recall.mean())
print(f1.mean())

0.9597164303586322
0.9391720291720291
0.9597164303586322
0.9391118399897099


# Grid Search

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': 1,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [34]:
params = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

In [35]:
grid_search_cv = GridSearchCV(KNeighborsClassifier(),
                              params, 
                              n_jobs=-1,
                              verbose=1)

In [36]:
grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   16.0s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [37]:
grid_search_cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

In [38]:
print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 0.9725

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.98      0.98      0.98       365
        1.0       0.96      0.96      0.96       181

avg / total       0.97      0.97      0.97       546


Confusion Matrix: 
 [[358   7]
 [  8 173]]

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   25.6s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   20.0s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   23.9s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   21.4s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   19.7s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    6.9s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    6.8s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    8.1s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   17.5s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Average Accuracy: 	 0.9635
Accuracy SD: 		 0.0181


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.2s finished


In [39]:
print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.9562

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.94      0.99      0.96        79
        1.0       0.98      0.91      0.95        58

avg / total       0.96      0.96      0.96       137


Confusion Matrix: 
 [[78  1]
 [ 5 53]]



In [40]:
accuracy = cross_val_score(grid_search_cv, X_train, y_train, cv=5, scoring='accuracy')
precision = cross_val_score(grid_search_cv, X_train, y_train, cv=5, scoring='precision')
recall = cross_val_score(grid_search_cv, X_train, y_train, cv=5, scoring='recall')
f1 = cross_val_score(grid_search_cv, X_train, y_train, cv=5, scoring='f1')
print(accuracy.mean())
print(precision.mean())
print(recall.mean())
print(f1.mean())

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    6.9s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.1s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.3s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    6.8s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.9s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.1s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.4s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.2s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.3s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    6.8s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.0s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    6.9s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.6s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    6.9s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    6.9s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    6.9s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    6.9s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.0s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    6.8s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits
0.9652210175145957
0.9550614480026244
0.9652210175145957
0.9469374542536804


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.1s finished


In [41]:
grid_search_cv.best_params_

{'n_neighbors': 7}

In [42]:
grid_search_cv.cv_results_['mean_train_score']



array([1.        , 0.96703277, 0.97528466, 0.97161912, 0.9725374 ,
       0.97070337, 0.97161158, 0.96977754, 0.96978258, 0.97070085])

In [43]:
grid_search_cv.cv_results_



{'mean_fit_time': array([0.00366807, 0.00366918, 0.0061721 , 0.0041705 , 0.00433548,
        0.00316882, 0.00416899, 0.01017404, 0.00333571, 0.00300177]),
 'mean_score_time': array([0.00400297, 0.0066727 , 0.00533549, 0.00516947, 0.00633923,
        0.00483632, 0.00683808, 0.00817235, 0.00633764, 0.00517583]),
 'mean_test_score': array([0.94871795, 0.94505495, 0.96520147, 0.96520147, 0.96703297,
        0.96336996, 0.96886447, 0.96703297, 0.96886447, 0.96703297]),
 'mean_train_score': array([1.        , 0.96703277, 0.97528466, 0.97161912, 0.9725374 ,
        0.97070337, 0.97161158, 0.96977754, 0.96978258, 0.97070085]),
 'param_n_neighbors': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 1},
  {'n_neighbors': 2},
  {'n_neighbors': 3},
  {'n_neighbors': 4},
  {'n_neighbors': 5},
  {'n_neighbors':

SVM, Random Forest, XGBoost

In [44]:
from sklearn import svm
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)
print_score(clf, X_train, y_train, X_test, y_test, train=True)
print_score(clf, X_train, y_train, X_test, y_test, train=False)

Train Result:

accuracy score: 0.9799

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.99      0.98      0.98       365
        1.0       0.96      0.98      0.97       181

avg / total       0.98      0.98      0.98       546


Confusion Matrix: 
 [[358   7]
 [  4 177]]

Average Accuracy: 	 0.9635
Accuracy SD: 		 0.0199
Test Result:

accuracy score: 0.9635

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.96      0.97      0.97        79
        1.0       0.96      0.95      0.96        58

avg / total       0.96      0.96      0.96       137


Confusion Matrix: 
 [[77  2]
 [ 3 55]]



In [45]:
accuracy = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
precision = cross_val_score(clf, X_train, y_train, cv=5, scoring='precision')
recall = cross_val_score(clf, X_train, y_train, cv=5, scoring='recall')
f1 = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1')
print(accuracy.mean())
print(precision.mean())
print(recall.mean())
print(f1.mean())

0.9615346121768141
0.9312781954887217
0.9615346121768141
0.94265644443554


In [46]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
print_score(clf, X_train, y_train, X_test, y_test, train=True)
print_score(clf, X_train, y_train, X_test, y_test, train=False)

Train Result:

accuracy score: 0.9982

Classification Report: 
              precision    recall  f1-score   support

        0.0       1.00      1.00      1.00       365
        1.0       1.00      0.99      1.00       181

avg / total       1.00      1.00      1.00       546


Confusion Matrix: 
 [[365   0]
 [  1 180]]

Average Accuracy: 	 0.9579
Accuracy SD: 		 0.0164
Test Result:

accuracy score: 0.9416

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.92      0.99      0.95        79
        1.0       0.98      0.88      0.93        58

avg / total       0.94      0.94      0.94       137


Confusion Matrix: 
 [[78  1]
 [ 7 51]]



In [47]:
accuracy = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
precision = cross_val_score(clf, X_train, y_train, cv=5, scoring='precision')
recall = cross_val_score(clf, X_train, y_train, cv=5, scoring='recall')
f1 = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1')
print(accuracy.mean())
print(precision.mean())
print(recall.mean())
print(f1.mean())

0.9597164303586322
0.9449697773846382
0.9597164303586322
0.9390165196204169


In [48]:
import xgboost as xgb
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train)
print_score(clf, X_train, y_train, X_test, y_test, train=True)
print_score(clf, X_train, y_train, X_test, y_test, train=False)

Train Result:

accuracy score: 0.9927

Classification Report: 
              precision    recall  f1-score   support

        0.0       1.00      0.99      0.99       365
        1.0       0.98      0.99      0.99       181

avg / total       0.99      0.99      0.99       546


Confusion Matrix: 
 [[362   3]
 [  1 180]]



  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Average Accuracy: 	 0.9616
Accuracy SD: 		 0.0190
Test Result:

accuracy score: 0.9489

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.93      0.99      0.96        79
        1.0       0.98      0.90      0.94        58

avg / total       0.95      0.95      0.95       137


Confusion Matrix: 
 [[78  1]
 [ 6 52]]



  if diff:
  if diff:
  if diff:
  if diff:


In [49]:
accuracy = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
precision = cross_val_score(clf, X_train, y_train, cv=5, scoring='precision')
recall = cross_val_score(clf, X_train, y_train, cv=5, scoring='recall')
f1 = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1')
print(accuracy.mean())
print(precision.mean())
print(recall.mean())
print(f1.mean())

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.9615846538782318
0.9402902061725591
0.9615846538782318
0.9418200301706096


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


***

In [51]:
from sklearn import svm
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [53]:
accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
precision = cross_val_score(clf, X_train, y_train, cv=10, scoring='precision')
recall = cross_val_score(clf, X_train, y_train, cv=10, scoring='recall')
f1 = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1')
print(accuracy.mean())
print(precision.mean())
print(recall.mean())
print(f1.mean())

0.9633970658970659
0.941812865497076
0.9633970658970659
0.9453145582557347


In [24]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state = 0)
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
y_pred = clf.predict(X_test)
X_test.shape

(137, 9)

In [55]:
accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
precision = cross_val_score(clf, X_train, y_train, cv=10, scoring='precision')
recall = cross_val_score(clf, X_train, y_train, cv=10, scoring='recall')
f1 = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1')
print(accuracy.mean())
print(precision.mean())
print(recall.mean())
print(f1.mean())

0.9688852813852813
0.9528654970760234
0.9688852813852813
0.9532424064931806


In [56]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None)

In [57]:
y_pred = classifier.predict(X_test)

In [58]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[76,  3],
       [ 3, 55]], dtype=int64)

In [59]:
accuracy = cross_val_score(classifier, X_train, y_train, cv=10, scoring='accuracy')
precision = cross_val_score(classifier, X_train, y_train, cv=10, scoring='precision')
recall = cross_val_score(classifier, X_train, y_train, cv=10, scoring='recall')
f1 = cross_val_score(classifier, X_train, y_train, cv=10, scoring='f1')
print(accuracy.mean())
print(precision.mean())
print(recall.mean())
print(f1.mean())

0.9632647907647908
0.9160609857978279
0.9632647907647908
0.9473118520486942


In [62]:
from sklearn.neural_network import MLPClassifier 
clf = MLPClassifier(alpha=1)
clf.fit(X_train, y_train)



MLPClassifier(activation='relu', alpha=1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [65]:
y_pred = clf.predict(X_test)

In [66]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[78,  1],
       [ 4, 54]], dtype=int64)

In [67]:
accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
precision = cross_val_score(clf, X_train, y_train, cv=10, scoring='precision')
recall = cross_val_score(clf, X_train, y_train, cv=10, scoring='recall')
f1 = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1')
print(accuracy.mean())
print(precision.mean())
print(recall.mean())
print(f1.mean())



0.9688852813852813
0.9481286549707603
0.9688852813852813
0.9537071613851801




In [68]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf = QuadraticDiscriminantAnalysis()
clf.fit(X_train, y_train)

QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=None, tol=0.0001)

In [69]:
y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[75,  4],
       [ 3, 55]], dtype=int64)

In [70]:
accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
precision = cross_val_score(clf, X_train, y_train, cv=10, scoring='precision')
recall = cross_val_score(clf, X_train, y_train, cv=10, scoring='recall')
f1 = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1')
print(accuracy.mean())
print(precision.mean())
print(recall.mean())
print(f1.mean())

0.9559247234247236
0.8973528518265361
0.9559247234247236
0.9373240446924658


In [72]:
from sklearn.gaussian_process import GaussianProcessClassifier
clf = GaussianProcessClassifier()
clf.fit(X_train, y_train)

GaussianProcessClassifier(copy_X_train=True, kernel=None,
             max_iter_predict=100, multi_class='one_vs_rest', n_jobs=1,
             n_restarts_optimizer=0, optimizer='fmin_l_bfgs_b',
             random_state=None, warm_start=False)

In [73]:
y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
precision = cross_val_score(clf, X_train, y_train, cv=10, scoring='precision')
recall = cross_val_score(clf, X_train, y_train, cv=10, scoring='recall')
f1 = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1')
print(accuracy.mean())
print(precision.mean())
print(recall.mean())
print(f1.mean())

0.9670670995670996
0.9525730994152047
0.9670670995670996
0.9505576208672185


In [75]:
from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier()
clf.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [76]:
y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
precision = cross_val_score(clf, X_train, y_train, cv=10, scoring='precision')
recall = cross_val_score(clf, X_train, y_train, cv=10, scoring='recall')
f1 = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1')
print(accuracy.mean())
print(precision.mean())
print(recall.mean())
print(f1.mean())

0.9597943722943724
0.9473099415204679
0.967034632034632
0.9359011417834948


In [77]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [78]:
y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
precision = cross_val_score(clf, X_train, y_train, cv=10, scoring='precision')
recall = cross_val_score(clf, X_train, y_train, cv=10, scoring='recall')
f1 = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1')
print(accuracy.mean())
print(precision.mean())
print(recall.mean())
print(f1.mean())

0.9560906685906685
0.9312280701754385
0.9560906685906685
0.9343456946552923


In [94]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
clf4 = SVC(kernel='rbf')
clf5 = MLPClassifier(alpha=1)
clf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3),
                                   ('svc', clf4),
                                   ('mlp', clf5),
                                  ], 
                       voting='hard')
clf.fit(X_train, y_train)



VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomFore...=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [95]:
y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
precision = cross_val_score(clf, X_train, y_train, cv=10, scoring='precision')
recall = cross_val_score(clf, X_train, y_train, cv=10, scoring='recall')
f1 = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1')
print(accuracy.mean())
print(precision.mean())
print(recall.mean())
print(f1.mean())

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:


0.9688852813852813
0.9481286549707603
0.9688852813852813
0.9537071613851801


  if diff:
