In [1]:
import numpy as np
import pandas as pd
import seaborn as sns


In [2]:
# Load scikit-learn's bundled breast-cancer dataset; `data` exposes the
# .data, .feature_names, .target and .target_names attributes inspected below.
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

# Feature matrix: one row per sample, one column per feature.
data.data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [3]:
# Column names for data.data, in order (used below as the DataFrame header).
data.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [4]:
# Integer class label (0 or 1) per sample; data.target_names gives the meaning of each value.
data.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [5]:
# Label meaning by index: target 0 -> 'malignant', target 1 -> 'benign'.
data.target_names

array(['malignant', 'benign'], dtype='<U9')

## Create dataframe

In [6]:
# Assemble features and label into a single frame: the 30 feature columns
# followed by a trailing 'target' column.
# The column labels must be a flat list. Wrapping them in an outer list
# (as `[[...]]`) makes pandas build a one-level MultiIndex, so that
# df['target'] returns a DataFrame instead of a Series.
df = pd.DataFrame(
    np.c_[data.data, data.target],
    columns=list(data.feature_names) + ['target'],
)

# Show every column when displaying the frame (31 columns would otherwise be elided).
pd.set_option('display.max_columns', None)

df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [7]:
# Last rows of the frame, as a sanity check alongside df.head().
df.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,1.176,1.256,7.673,158.7,0.0103,0.02891,0.05198,0.02454,0.01114,0.004239,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,0.0
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,0.7655,2.463,5.203,99.04,0.005769,0.02423,0.0395,0.01678,0.01898,0.002498,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,0.0
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,0.4564,1.075,3.425,48.55,0.005903,0.03731,0.0473,0.01557,0.01318,0.003892,18.98,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,0.0
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,0.726,1.595,5.772,86.22,0.006522,0.06158,0.07117,0.01664,0.02324,0.006185,25.74,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,0.0
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,0.3857,1.428,2.548,19.15,0.007189,0.00466,0.0,0.0,0.02676,0.002783,9.456,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,1.0


In [8]:
# (rows, columns): expect 30 feature columns plus the 'target' column.
df.shape

(569, 31)

## Split data

In [9]:
# Features are every column except the last; the label is the last ('target') column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

X.shape, y.shape

((569, 30), (569,))

In [10]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2020,
)

tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((455, 30), (114, 30), (455,), (114,))

## Train Model 1 - support vector machine classification

In [11]:
from sklearn.svm import SVC

# Baseline: RBF-kernel SVM fitted on the raw (unscaled) features.
classification_rbf = SVC(kernel='rbf').fit(X_train, y_train)

# Mean accuracy on the held-out split.
classification_rbf.score(X_test, y_test)

0.9122807017543859

## Feature Scaling

In [12]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, so no test-set statistics
# leak into the transformation applied later.
sc = StandardScaler()
sc.fit(X_train)

StandardScaler()

In [13]:
# Apply the scaler (fitted on X_train) to both splits.
X_train_sc, X_test_sc = map(sc.transform, (X_train, X_test))

## ML Model 2 Training after scaling

In [14]:
# Same RBF-kernel SVM as the baseline, but fitted on standardized features.
classification_rbf_2 = SVC(kernel='rbf').fit(X_train_sc, y_train)

classification_rbf_2.score(X_test_sc, y_test)

0.956140350877193

## Model 3 Training after scaling 

In [15]:
# Degree-3 polynomial-kernel SVM on standardized features.
classification_poly = SVC(kernel='poly', degree=3).fit(X_train_sc, y_train)

classification_poly.score(X_test_sc, y_test)

0.9210526315789473

## Model 4 Training after scaling

In [16]:
# Linear-kernel SVM on standardized features.
classification_linear = SVC(kernel='linear').fit(X_train_sc, y_train)

classification_linear.score(X_test_sc, y_test)

0.9649122807017544

## Prediction

In [17]:
# Hand-entered feature vector for one hypothetical patient: 30 values, which
# must follow the column order of data.feature_names.
# NOTE(review): several values look misaligned with that order. Position 15
# ('smoothness error') is 74.0 and position 25 ('worst smoothness') is 1956,
# which standardize to z-scores of roughly 2.6e4 and 8.6e4. Verify the ordering
# before trusting any prediction made from this vector.
patient1_data = [ 11.42,17.0,130,1297,0.14,0.13,0.30,0.1,0.2,0.05,0.74,0.7,1.15,3.4,74.0,0.01,0.04,0.05,0.01,0.017,0.005,23,25,98,1956,0.13,0.20,0.8,0.4,0.1]
# Wrap in an outer list so the array is 2-D with shape (1, 30): one sample row.
patient1 = np.array([patient1_data])
# Standardize with the scaler fitted on X_train, as required by the models trained on scaled data.
patient1_sc = sc.transform(patient1)

patient1_sc

array([[-7.59876550e-01, -5.18049415e-01,  1.62029380e+00,
         1.88308532e+00,  3.18325437e+00,  5.40319733e-01,
         2.74411838e+00,  1.39458274e+00,  7.68679036e-01,
        -1.83636768e+00,  1.21829855e+00, -9.49967578e-01,
        -8.00721897e-01, -7.74214097e-01,  2.58833061e+04,
        -8.44232687e-01,  2.51418088e-01,  6.02139184e+00,
        -1.26339079e+00,  5.02703737e+00, -3.40378581e+00,
        -4.31193406e-01, -2.46842437e+00, -1.37653631e+00,
         8.63352073e+04, -7.99315990e-01, -3.38704259e-01,
         1.07483201e+01,  1.86648248e+00,  9.86010891e-01]])

In [18]:
# Model 4 (linear-kernel SVC, test accuracy ~0.965) was trained on scaled
# features, so it is given the scaled patient vector.
# Result encoding follows data.target_names: 0 = malignant, 1 = benign.

classification_linear.predict(patient1_sc)

array([0.])

In [19]:
# Translate the linear SVM's class (0 = malignant, 1 = benign) into a readable message.
pred = classification_linear.predict(patient1_sc)
print(
    'Patient has cancer (malignant Tumor)'
    if pred[0] == 0
    else 'Patient has no cancer (benign)'
)

Patient has cancer (malignant Tumor)


In [20]:
# Single-row, features-only slices of df, standardized with the fitted scaler.

# Row 48 is labelled benign (target 1): patient2 has no cancer.
patient2 = df.iloc[48:49, :-1]
patient2_sc = sc.transform(patient2)

# Row 2 is labelled malignant (target 0): patient3 has cancer.
patient3 = df.iloc[2:3, :-1]
patient3_sc = sc.transform(patient3)

## Model 5 Training - Decision Tree

In [21]:
from sklearn.tree import DecisionTreeClassifier

# Gini-impurity decision tree on the raw features (trees do not need scaling).
# random_state is pinned (same seed as the train/test split) because tree
# construction is randomized; without it the reported score can change
# between runs of the notebook.
classifier_gini = DecisionTreeClassifier(criterion='gini', random_state=2020)
classifier_gini.fit(X_train, y_train)

classifier_gini.score(X_test, y_test)

0.9385964912280702

## Model 6 Training - Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree random forest (Gini criterion) on the raw features.
# random_state is pinned (same seed as the train/test split) because the
# forest's bootstrap sampling and feature sub-sampling are random; without
# it the reported score is not reproducible across runs.
classifier_gini_rf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    random_state=2020,
)
classifier_gini_rf.fit(X_train, y_train)

classifier_gini_rf.score(X_test, y_test)

0.9473684210526315

## Model 7 Training - K Nearest Neighbor Classification

In [23]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbour classifier fitted on the raw (unscaled) features.
classifier_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

classifier_knn.score(X_test, y_test)

0.9298245614035088

## Model 8 Training - Naive Bayes Classifier

In [24]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes fitted on the raw (unscaled) features.
classifier_gNB = GaussianNB().fit(X_train, y_train)

classifier_gNB.score(X_test, y_test)

0.9736842105263158

In [25]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with default settings, fitted on the raw features.
classifier_bNB = BernoulliNB().fit(X_train, y_train)

classifier_bNB.score(X_test, y_test)

0.5789473684210527

In [26]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings, fitted on the raw features.
classifier_mNB = MultinomialNB().fit(X_train, y_train)

classifier_mNB.score(X_test, y_test)

0.8947368421052632

## Prediction - using naive bayes classifier : GaussianNB (Model 8)

In [27]:
# GaussianNB was trained on raw features, so it receives the raw patient vector.
# Class 0 = malignant, class 1 = benign.
pred = classifier_gNB.predict(patient1)
print(
    'Patient has cancer (malignant Tumor)'
    if pred[0] == 0
    else 'Patient has no cancer (benign)'
)

Patient has cancer (malignant Tumor)


## Model 9 Training - Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on standardized features.
logisticR = LogisticRegression().fit(X_train_sc, y_train)

logisticR.score(X_test_sc, y_test)

0.9736842105263158

In [29]:
# logisticR was fitted on standardized features (X_train_sc), so the patient
# vector must go through the same scaler. Passing the raw `patient1` would
# feed values on a completely different scale and make the prediction meaningless.
# Class 0 = malignant, class 1 = benign.
pred = logisticR.predict(patient1_sc)
if pred[0] == 0:
    print('Patient has cancer (malignant Tumor)')
else:
    print('Patient has no cancer (benign)')

Patient has cancer (malignant Tumor)
