In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
# Suppress every warning for the rest of the session.
# NOTE(review): with no category/module filter this also hides deprecation and
# convergence warnings raised by the libraries used below — consider narrowing it.
filterwarnings('ignore')

In [8]:
# Load seaborn's 'iris' example dataset as a DataFrame.
df = sns.load_dataset('iris')

In [9]:
# Rich display of the whole frame; pandas elides the middle rows of a long frame.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [10]:
# (rows, columns) of the raw frame.
df.shape

(150, 5)

In [11]:
# First five rows.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [13]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [14]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
sepal_length,0
sepal_width,0
petal_length,0
petal_width,0
species,0


In [15]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

np.int64(1)

In [16]:
# Remove exact duplicate rows, keeping the first occurrence of each, then show
# the resulting (rows, columns) shape.
df = df.drop_duplicates(keep='first')
display(df.shape)

(149, 5)

In [17]:
# Column labels of the de-duplicated frame.
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [18]:
# Distinct class labels in the target column.
df['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [19]:
from sklearn.model_selection import train_test_split


In [20]:
# Features are every column except the label; the target is the species label.
x, y = df.drop(columns='species'), df['species']

In [21]:
# Inspect the feature matrix (the four measurement columns).
x

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [22]:
# Inspect the target labels.
y

Unnamed: 0,species
0,setosa
1,setosa
2,setosa
3,setosa
4,setosa
...,...
145,virginica
146,virginica
147,virginica
148,virginica


In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

                                                   Creating KNN Model

In [24]:
from sklearn.neighbors import KNeighborsClassifier

In [25]:
# Baseline k-nearest-neighbours classifier with k = 3.
model_knn = KNeighborsClassifier(n_neighbors=3)

In [26]:
# Fit on the training split only.
model_knn.fit(x_train,y_train)

In [27]:
# Accuracy on the held-out split. A perfect score here is not evidence of
# overfitting: overfitting shows up as training accuracy well above test
# accuracy, and this cell only measures the test side. The hold-out is small
# (20% of 149 rows), so 100% is plausible; the cross-validated scores further
# down are a steadier estimate. Author's note: the score stays at 100% when
# n_neighbors is changed to 5, 13, ...
model_knn.score(x_test,y_test)

1.0

                                              Creating SVM Model

In [28]:
from sklearn.svm import SVC

In [29]:
# Support-vector classifier with gamma='auto' (1 / n_features per the
# scikit-learn docs). C and kernel are left at their defaults because the
# grid/randomized searches below supply them; an earlier version of this cell
# hard-coded C=30 and kernel='linear'.
model_svm = SVC(gamma = 'auto')

In [30]:
# Fit on the training split only.
model_svm.fit(x_train,y_train)

In [31]:
# Hold-out accuracy of the SVC. Author's notes from earlier runs:
# kernel='rbf' scored 100%, kernel='linear' scored 96.67%, and leaving both
# C and kernel unset (as in the current model) scores 100%.
model_svm.score(x_test,y_test)

1.0

                          Now, let's use Grid Search CV for the SVM model

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
# Exhaustive 5-fold search over 4 values of C x 2 kernels (8 candidates).
classifier_svm = GridSearchCV(
    estimator=model_svm,
    param_grid=dict(C=[1, 10, 20, 30], kernel=['rbf', 'linear']),
    cv=5,
    return_train_score=False,
)

In [34]:
# Run the 5-fold search on the full de-duplicated data (x, y), not on the
# x_train/y_train split. NOTE(review): the x_test hold-out therefore takes part
# in model selection; fit on x_train/y_train if the tuned model will later be
# scored on x_test.
classifier_svm.fit(x,y)

In [35]:
# Raw results dict: per-candidate timings, parameters and per-split/mean scores.
classifier_svm.cv_results_

{'mean_fit_time': array([0.00421238, 0.00336938, 0.00343318, 0.00339446, 0.00339818,
        0.00318508, 0.00401802, 0.00345397]),
 'std_fit_time': array([1.00166923e-03, 2.85098038e-04, 7.16274490e-05, 3.74232228e-04,
        6.04534743e-05, 5.48431973e-05, 5.32769079e-04, 1.98135737e-04]),
 'mean_score_time': array([0.00294213, 0.00252705, 0.00268164, 0.00251474, 0.00255656,
        0.00246854, 0.00330491, 0.00266695]),
 'std_score_time': array([4.29507015e-04, 4.71327421e-05, 2.07553067e-04, 7.15768313e-05,
        3.52450082e-05, 6.04892698e-05, 7.97256448e-04, 1.82832139e-04]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20, 30, 30],
              mask=[False, False, False, False, False, False, False, False],
        fill_value=999999),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear',
                    'rbf', 'linear'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value=np.str_('?'),
        

In [36]:
# Tabulate the SVM grid-search results, one row per (C, kernel) candidate.
# NOTE(review): `results` is reassigned by the later KNN and randomized-search
# cells, so its contents depend on which of those cells ran last.
results = pd.DataFrame(classifier_svm.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004212,0.001002,0.002942,0.00043,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.003369,0.000285,0.002527,4.7e-05,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.003433,7.2e-05,0.002682,0.000208,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.003394,0.000374,0.002515,7.2e-05,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.003398,6e-05,0.002557,3.5e-05,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.003185,5.5e-05,0.002469,6e-05,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
6,0.004018,0.000533,0.003305,0.000797,30,rbf,"{'C': 30, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.933333,1.0,0.96,0.038873,7
7,0.003454,0.000198,0.002667,0.000183,30,linear,"{'C': 30, 'kernel': 'linear'}",1.0,1.0,0.9,0.9,1.0,0.96,0.04899,7


In [37]:
# Keep only the searched parameters and the mean cross-validated accuracy.
results[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667
6,30,rbf,0.96
7,30,linear,0.96


                                        Now, let's use Grid Search CV for the KNN model

In [38]:
# Exhaustive 5-fold search over k = 1..10, two weightings, two distance metrics
# and four neighbour-search algorithms (160 candidates).
# NOTE(review): `algorithm` selects how neighbours are looked up, so it is
# expected to change timing rather than accuracy — confirm before keeping it in
# the grid, since it quadruples the number of fits.
classifier_knn = GridSearchCV(
    estimator=model_knn,
    param_grid={
        'n_neighbors': list(range(1, 11)),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    },
    cv=5,
    return_train_score=False,
)

In [39]:
# Run the 5-fold search on the full de-duplicated data (x, y).
# NOTE(review): this includes the rows held out as x_test; use x_train/y_train
# if the tuned model will later be scored on that hold-out.
classifier_knn.fit(x,y)

In [40]:
# Raw results dict for the KNN search; the output is long, one entry per candidate in every array.
classifier_knn.cv_results_

{'mean_fit_time': array([0.00337648, 0.00301971, 0.00299082, 0.00397868, 0.00305591,
        0.00308943, 0.00288105, 0.0028223 , 0.00326109, 0.00299101,
        0.00316653, 0.00300217, 0.00292249, 0.0028604 , 0.00290284,
        0.00365863, 0.00284705, 0.00289874, 0.00281491, 0.0029295 ,
        0.00314951, 0.0031939 , 0.00340028, 0.00304952, 0.00316048,
        0.00381861, 0.00305839, 0.00335021, 0.00288267, 0.00293951,
        0.00343142, 0.00348773, 0.00369229, 0.00382185, 0.00298123,
        0.00301766, 0.00335946, 0.00374517, 0.00335441, 0.00340843,
        0.00332284, 0.00334506, 0.00439878, 0.00210791, 0.00207539,
        0.00203514, 0.00204444, 0.00237069, 0.00204849, 0.00202937,
        0.00210834, 0.0020555 , 0.00202265, 0.00235062, 0.00223393,
        0.00199785, 0.00204296, 0.00219779, 0.00220013, 0.00210719,
        0.0020895 , 0.0021174 , 0.0020999 , 0.00206823, 0.00205903,
        0.0020648 , 0.00209413, 0.00208092, 0.00204067, 0.00207119,
        0.00210042, 0.00210295,

In [41]:
# Tabulate the KNN grid-search results, one row per parameter combination.
# NOTE(review): this overwrites the `results` frame built from the SVM search.
results = pd.DataFrame(classifier_knn.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_metric,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003376,0.000452,0.005178,0.001079,auto,euclidean,1,uniform,"{'algorithm': 'auto', 'metric': 'euclidean', '...",0.966667,0.966667,0.933333,0.933333,1.0,0.960000,0.024944,99
1,0.003020,0.000172,0.003624,0.000637,auto,euclidean,1,distance,"{'algorithm': 'auto', 'metric': 'euclidean', '...",0.966667,0.966667,0.933333,0.933333,1.0,0.960000,0.024944,99
2,0.002991,0.000147,0.004371,0.000308,auto,euclidean,2,uniform,"{'algorithm': 'auto', 'metric': 'euclidean', '...",0.966667,0.933333,0.933333,0.900000,1.0,0.946667,0.033993,153
3,0.003979,0.001504,0.004226,0.000945,auto,euclidean,2,distance,"{'algorithm': 'auto', 'metric': 'euclidean', '...",0.966667,0.966667,0.933333,0.933333,1.0,0.960000,0.024944,99
4,0.003056,0.000173,0.004447,0.000503,auto,euclidean,3,uniform,"{'algorithm': 'auto', 'metric': 'euclidean', '...",0.966667,0.966667,0.933333,0.966667,1.0,0.966667,0.021082,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,0.002035,0.000168,0.002581,0.000601,brute,manhattan,8,distance,"{'algorithm': 'brute', 'metric': 'manhattan', ...",0.966667,0.966667,0.933333,0.933333,1.0,0.960000,0.024944,99
156,0.002145,0.000417,0.003168,0.000408,brute,manhattan,9,uniform,"{'algorithm': 'brute', 'metric': 'manhattan', ...",0.966667,1.000000,0.966667,0.933333,1.0,0.973333,0.024944,30
157,0.002077,0.000239,0.002295,0.000108,brute,manhattan,9,distance,"{'algorithm': 'brute', 'metric': 'manhattan', ...",0.966667,0.966667,0.900000,0.966667,1.0,0.960000,0.032660,82
158,0.001896,0.000014,0.002865,0.000047,brute,manhattan,10,uniform,"{'algorithm': 'brute', 'metric': 'manhattan', ...",0.966667,0.966667,0.966667,0.933333,1.0,0.966667,0.021082,78


In [42]:
# First ten candidates: parameter settings alongside mean cross-validated accuracy.
results[['params','param_weights','param_n_neighbors','param_algorithm','mean_test_score']].head(10)

Unnamed: 0,params,param_weights,param_n_neighbors,param_algorithm,mean_test_score
0,"{'algorithm': 'auto', 'metric': 'euclidean', '...",uniform,1,auto,0.96
1,"{'algorithm': 'auto', 'metric': 'euclidean', '...",distance,1,auto,0.96
2,"{'algorithm': 'auto', 'metric': 'euclidean', '...",uniform,2,auto,0.946667
3,"{'algorithm': 'auto', 'metric': 'euclidean', '...",distance,2,auto,0.96
4,"{'algorithm': 'auto', 'metric': 'euclidean', '...",uniform,3,auto,0.966667
5,"{'algorithm': 'auto', 'metric': 'euclidean', '...",distance,3,auto,0.966667
6,"{'algorithm': 'auto', 'metric': 'euclidean', '...",uniform,4,auto,0.973333
7,"{'algorithm': 'auto', 'metric': 'euclidean', '...",distance,4,auto,0.966667
8,"{'algorithm': 'auto', 'metric': 'euclidean', '...",uniform,5,auto,0.973333
9,"{'algorithm': 'auto', 'metric': 'euclidean', '...",distance,5,auto,0.966667


             Using Randomized Search CV

In [43]:
from sklearn.model_selection import RandomizedSearchCV

In [50]:
# Randomized 5-fold search: evaluate n_iter=4 of the 8 (C, kernel) combinations
# instead of all of them.
classifier_svm_random = RandomizedSearchCV(
    model_svm,
    {
        'C': [1, 10, 20, 30],
        'kernel': ['rbf', 'linear'],
    },
    n_iter=4,
    cv=5,
    return_train_score=False,
    # Fixed seed so the same four candidates are sampled on every run; without
    # it the sampled set, and therefore the reported best score, changes
    # between executions. 42 matches the seed used for the train/test split.
    random_state=42,
)

In [51]:
# Run the randomized search on the full de-duplicated data (x, y).
# NOTE(review): this includes the rows held out as x_test; use x_train/y_train
# if the tuned model will later be scored on that hold-out.
classifier_svm_random.fit(x,y)

In [52]:
# Raw results dict for the sampled candidates (timings, parameters, per-split scores).
classifier_svm_random.cv_results_

{'mean_fit_time': array([0.00367875, 0.00226531, 0.00240521, 0.00236897]),
 'std_fit_time': array([1.91758488e-03, 6.17504304e-05, 1.40610004e-04, 6.15972106e-05]),
 'mean_score_time': array([0.00233855, 0.00172272, 0.00172343, 0.00182805]),
 'std_score_time': array([6.28303529e-04, 8.53486033e-05, 3.45342049e-05, 2.18937204e-04]),
 'param_kernel': masked_array(data=['linear', 'linear', 'rbf', 'rbf'],
              mask=[False, False, False, False],
        fill_value=np.str_('?'),
             dtype=object),
 'param_C': masked_array(data=[30, 20, 10, 20],
              mask=[False, False, False, False],
        fill_value=999999),
 'params': [{'kernel': 'linear', 'C': 30},
  {'kernel': 'linear', 'C': 20},
  {'kernel': 'rbf', 'C': 10},
  {'kernel': 'rbf', 'C': 20}],
 'split0_test_score': array([1.        , 1.        , 0.96666667, 0.96666667]),
 'split1_test_score': array([1., 1., 1., 1.]),
 'split2_test_score': array([0.9       , 0.9       , 0.96666667, 0.9       ]),
 'split3_test_scor

In [53]:
# Tabulate the randomized-search results, one row per sampled candidate.
# NOTE(review): this overwrites the `results` frame built from the KNN search.
results = pd.DataFrame(classifier_svm_random.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003679,0.001918,0.002339,0.000628,linear,30,"{'kernel': 'linear', 'C': 30}",1.0,1.0,0.9,0.9,1.0,0.96,0.04899,4
1,0.002265,6.2e-05,0.001723,8.5e-05,linear,20,"{'kernel': 'linear', 'C': 20}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,3
2,0.002405,0.000141,0.001723,3.5e-05,rbf,10,"{'kernel': 'rbf', 'C': 10}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.002369,6.2e-05,0.001828,0.000219,rbf,20,"{'kernel': 'rbf', 'C': 20}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,2


In [54]:
# Keep only the sampled parameters and the mean cross-validated accuracy.
results[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,30,linear,0.96
1,20,linear,0.966667
2,10,rbf,0.98
3,20,rbf,0.966667
