## Import the Dependencies

In [54]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

## Data collection and processing

In [2]:
# Read the breast cancer CSV (path is relative to the working directory) into a DataFrame
cancer_data = pd.read_csv(filepath_or_buffer="Breast_cancer_data.csv")

In [3]:
# Preview the first rows of the freshly loaded data
cancer_data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
# (number of rows, number of columns) of the loaded frame
cancer_data.shape

(569, 33)

In [5]:
# Per-column non-null counts and dtypes
cancer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [6]:
# Count missing values in each column
cancer_data.isna().sum()

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
cancer_data.describe()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,0.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,


In [18]:
# Class balance of the target column
# NOTE(review): the saved output lists 0/1 rather than B/M, so this cell appears to
# have been last run after the diagnosis labels were encoded -- confirm with a
# fresh Restart & Run All.
cancer_data['diagnosis'].value_counts()

0    357
1    212
Name: diagnosis, dtype: int64

In [21]:
# Convert the categorical diagnosis labels into numeric codes (B -> 0, M -> 1).
# Uses map + astype instead of replace(..., inplace=True):
#   * the frame is not mutated in place; a new frame is bound to cancer_data
#   * the int64 dtype is requested explicitly instead of relying on replace()
#     downcasting the object column (deprecated since pandas 2.2)
#   * values that are already 0/1 pass through unchanged, so the cell can be re-run
# Any label other than B/M/0/1 makes astype raise instead of slipping through.
diagnosis_codes = {'B': 0, 'M': 1}
cancer_data = cancer_data.assign(
    diagnosis=cancer_data['diagnosis']
    .map(lambda label: diagnosis_codes.get(label, label))
    .astype('int64')
)

In [22]:
# Preview the data again to check the diagnosis column now holds numeric codes
cancer_data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [23]:
# Features: everything except the identifier, the target and the 'Unnamed: 32' column
X = cancer_data.drop(columns=['id', 'diagnosis', 'Unnamed: 32'])
# Target: the diagnosis column
Y = cancer_data.loc[:, 'diagnosis']

In [24]:
# Print the feature frame as text, then show the target Series as the cell output
print(X)
Y

     radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     compactness_mean  concavity_mean  concave points_mean  sym

0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: diagnosis, Length: 569, dtype: int64

In [25]:
# Turn the features and the target into NumPy arrays
X, Y = np.asarray(X), np.asarray(Y)

In [26]:
# Print the feature array followed by the label array, one after the other
print(X, Y, sep="\n")

[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
 ...
 [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
 [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
 [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 1 1
 0 1 0 1 1 0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0
 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 1
 0 1 0 0 0 1 0 0 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 0 0 1 0 1 1 1 1 0 0 1 1 0 0
 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 

## GridSearchCV
- Used to find the best hyperparameters for the model by evaluating every combination in the grid

In [27]:
# Create an SVC with default settings; it is the estimator handed to the search below
model = SVC()

In [28]:
# Hyperparameter grid: candidate values for each SVC argument
parameters = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    C=[1, 5, 10, 20],
)

In [29]:
# Grid search over every combination in `parameters`, scored with cv=5 cross-validation
classifier = GridSearchCV(estimator=model, param_grid=parameters, cv=5)

In [30]:
# Run the cross-validated search on the data.
# NOTE(review): the search is fit on all of X and Y, including the rows that
# train_test_split later holds out for testing, so the final test accuracy is not
# measured on data that was unseen during hyperparameter selection.
classifier.fit(X, Y)

In [31]:
# Raw dictionary of per-candidate timings and scores collected by the search
classifier.cv_results_

{'mean_fit_time': array([1.18236341e+00, 3.82084846e-03, 4.54311371e-03, 8.90908241e-03,
        3.06247606e+00, 3.45730782e-03, 3.44896317e-03, 9.54837799e-03,
        2.16220369e+00, 5.27853966e-03, 3.51886749e-03, 9.12189484e-03,
        3.52217941e+00, 6.07924461e-03, 2.49667168e-03, 9.08536911e-03]),
 'std_fit_time': array([2.95584067e-01, 5.80070716e-04, 9.58916732e-04, 3.63880387e-04,
        1.04017004e+00, 5.63337142e-04, 3.89631955e-04, 2.44312112e-03,
        2.68393853e-01, 2.23418560e-03, 6.94355518e-04, 1.45734880e-03,
        6.87750942e-01, 5.02462325e-03, 4.70940764e-04, 4.73364209e-04]),
 'mean_score_time': array([0.00447516, 0.00143204, 0.00299253, 0.00263724, 0.00310593,
        0.00135503, 0.00203552, 0.00272312, 0.00069962, 0.00138936,
        0.00224824, 0.00219774, 0.00085959, 0.00099397, 0.00192046,
        0.00240054]),
 'std_score_time': array([0.00407222, 0.00055081, 0.00121179, 0.00069558, 0.00421233,
        0.0004584 , 0.0005902 , 0.00090219, 0.00058067, 

In [32]:
# Hyperparameter combination the grid search ranked first

best_parameters = classifier.best_params_
print(best_parameters)

{'C': 5, 'kernel': 'linear'}


In [34]:
# Best score found by the search, printed as a percentage with two decimals
highest_accuracy = classifier.best_score_
print(round(highest_accuracy * 100, 2))

95.26


In [35]:
# Put the search results into a DataFrame (one row per candidate) for easier inspection
result = pd.DataFrame(data=classifier.cv_results_)

In [36]:
# Preview the first rows of the search results
result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.182363,0.295584,0.004475,0.004072,1,linear,"{'C': 1, 'kernel': 'linear'}",0.947368,0.929825,0.973684,0.921053,0.955752,0.945536,0.018689,4
1,0.003821,0.00058,0.001432,0.000551,1,poly,"{'C': 1, 'kernel': 'poly'}",0.842105,0.885965,0.929825,0.947368,0.938053,0.908663,0.039382,12
2,0.004543,0.000959,0.002993,0.001212,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.850877,0.894737,0.929825,0.947368,0.938053,0.912172,0.035444,11
3,0.008909,0.000364,0.002637,0.000696,1,sigmoid,"{'C': 1, 'kernel': 'sigmoid'}",0.54386,0.45614,0.464912,0.385965,0.451327,0.460441,0.050253,13
4,3.062476,1.04017,0.003106,0.004212,5,linear,"{'C': 5, 'kernel': 'linear'}",0.947368,0.938596,0.973684,0.929825,0.973451,0.952585,0.018008,1


In [39]:
# Keep only the hyperparameter values and the mean test score of each candidate
grid_search_result = result.loc[:, ['param_C', 'param_kernel', 'mean_test_score']]

In [40]:
# Show the trimmed grid-search results table
grid_search_result

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.945536
1,1,poly,0.908663
2,1,rbf,0.912172
3,1,sigmoid,0.460441
4,5,linear,0.952585
5,5,poly,0.922729
6,5,rbf,0.931501
7,5,sigmoid,0.411178
8,10,linear,0.950815
9,10,poly,0.920975


- Highest Accuracy = 95.26%
- Best Parameters = {'C':5,'kernel':'linear'}

## RandomizedSearchCV

In [41]:
# Create a fresh SVC with default settings; it is the estimator handed to the search below
model = SVC()

In [42]:
# Hyperparameter candidates: values the search may pick for each SVC argument
parameters = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    C=[1, 5, 10, 20],
)

In [43]:
# Randomized search: evaluates n_iter candidates sampled from `parameters`
# instead of every combination, each scored with cv=5 cross-validation.
# random_state pins which candidates are drawn, so best_params_ / best_score_
# are the same on every run (without it they change between runs). The seed
# matches the one used for train_test_split in this notebook.
classifier = RandomizedSearchCV(
    model,
    parameters,
    n_iter=10,
    cv=5,
    random_state=2,
)

In [44]:
# Run the cross-validated randomized search on the data.
# NOTE(review): the search is fit on all of X and Y, including the rows that
# train_test_split later holds out for testing, so the final test accuracy is not
# measured on data that was unseen during hyperparameter selection.
classifier.fit(X, Y)

In [45]:
# Raw dictionary of per-candidate timings and scores collected by the randomized search
classifier.cv_results_

{'mean_fit_time': array([1.73168182e-02, 4.47177887e-03, 4.94890213e-03, 8.42223167e-03,
        3.91831398e-03, 3.51947961e+00, 5.12962341e-03, 1.25428395e+00,
        3.33294868e-03, 2.21696076e+00]),
 'std_fit_time': array([1.82182799e-02, 9.94617516e-04, 1.07931413e-03, 8.18361034e-04,
        6.96152572e-04, 7.88963997e-01, 4.00039763e-03, 2.79624006e-01,
        7.51402478e-04, 2.53869100e-01]),
 'mean_score_time': array([0.00369201, 0.0009191 , 0.00182948, 0.00231609, 0.00121269,
        0.00089383, 0.00150619, 0.00112739, 0.00168996, 0.00115218]),
 'std_score_time': array([0.00255683, 0.0007096 , 0.00070404, 0.00045352, 0.0003575 ,
        0.00045858, 0.00029895, 0.00048446, 0.0004089 , 0.00019719]),
 'param_kernel': masked_array(data=['sigmoid', 'poly', 'rbf', 'sigmoid', 'poly', 'linear',
                    'poly', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',


In [46]:
# Hyperparameter combination the randomized search ranked first

best_parameters = classifier.best_params_
print(best_parameters)

{'kernel': 'linear', 'C': 20}


In [47]:
# Best score found by the randomized search, printed as a percentage with two decimals
highest_accuracy = classifier.best_score_
print(round(highest_accuracy * 100, 2))

95.08


In [48]:
# Put the randomized-search results into a DataFrame (one row per sampled candidate)
result = pd.DataFrame(data=classifier.cv_results_)

In [49]:
# Preview the first rows of the randomized-search results
result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.017317,0.018218,0.003692,0.002557,sigmoid,10,"{'kernel': 'sigmoid', 'C': 10}",0.482456,0.403509,0.421053,0.342105,0.362832,0.402391,0.048906,10
1,0.004472,0.000995,0.000919,0.00071,poly,5,"{'kernel': 'poly', 'C': 5}",0.885965,0.912281,0.921053,0.938596,0.955752,0.922729,0.023689,4
2,0.004949,0.001079,0.001829,0.000704,rbf,1,"{'kernel': 'rbf', 'C': 1}",0.850877,0.894737,0.929825,0.947368,0.938053,0.912172,0.035444,7
3,0.008422,0.000818,0.002316,0.000454,sigmoid,5,"{'kernel': 'sigmoid', 'C': 5}",0.491228,0.421053,0.421053,0.350877,0.371681,0.411178,0.048578,9
4,0.003918,0.000696,0.001213,0.000357,poly,10,"{'kernel': 'poly', 'C': 10}",0.885965,0.921053,0.903509,0.938596,0.955752,0.920975,0.024701,6


In [51]:
# Keep only the hyperparameter values and the mean test score of each sampled candidate
random_search_result = result.loc[:, ['param_C', 'param_kernel', 'mean_test_score']]

In [52]:
# Show the trimmed randomized-search results table
random_search_result

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,sigmoid,0.402391
1,5,poly,0.922729
2,1,rbf,0.912172
3,5,sigmoid,0.411178
4,10,poly,0.920975
5,20,linear,0.950815
6,1,poly,0.908663
7,1,linear,0.945536
8,10,rbf,0.922714
9,10,linear,0.950815


- Highest Accuracy: 95.08%
- Best Parameters: {'kernel': 'linear', 'C': 20}

## Building the model using the determined best Hyperparameters

In [55]:
# Hold out 20% of the rows for testing; stratify on Y and fix the seed for a repeatable split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [67]:
# SVC configured with the combination the grid search reported as best (C=5, linear kernel)
model = SVC(kernel='linear', C=5)

In [68]:
# Fit the SVC on the training split only
model.fit(X_train, Y_train)

In [69]:
# Evaluate on the held-out split: predict, compare with the true labels, report as a percentage
prediction = model.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=prediction)
print(
    "Accuracy score of SVC model with Best parameters: ",
    round(accuracy * 100, 2),
)

Accuracy score of SVC model with Best parameters:  93.86
