In [117]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [118]:
df=pd.read_csv('mushroom_cleaned.csv')

In [119]:
df.head()

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
0,1372,2,2,10,3.807467,1545,11,1.804273,1
1,1461,2,2,10,3.807467,1557,11,1.804273,1
2,1371,2,2,10,3.612496,1566,11,1.804273,1
3,1261,6,2,10,3.787572,1566,11,1.804273,1
4,1305,6,2,10,3.711971,1464,11,0.943195,1


In [120]:

df.tail(50)

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
53985,78,6,3,2,1.225954,477,12,0.943195,1
53986,86,2,3,2,0.80816,544,12,0.943195,1
53987,75,2,3,2,1.015068,472,12,0.943195,1
53988,74,2,3,2,1.249828,420,12,0.88845,1
53989,72,5,3,2,1.552231,388,12,0.88845,1
53990,90,6,3,2,0.589316,564,12,0.943195,1
53991,69,6,3,2,1.269723,426,12,0.88845,1
53992,83,6,3,2,1.098627,496,12,0.88845,1
53993,84,2,3,2,0.963341,500,12,0.943195,1
53994,89,2,3,2,0.891719,493,12,0.88845,1


In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54035 entries, 0 to 54034
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   cap-diameter     54035 non-null  int64  
 1   cap-shape        54035 non-null  int64  
 2   gill-attachment  54035 non-null  int64  
 3   gill-color       54035 non-null  int64  
 4   stem-height      54035 non-null  float64
 5   stem-width       54035 non-null  int64  
 6   stem-color       54035 non-null  int64  
 7   season           54035 non-null  float64
 8   class            54035 non-null  int64  
dtypes: float64(2), int64(7)
memory usage: 3.7 MB


In [122]:
df.isnull().sum()

cap-diameter       0
cap-shape          0
gill-attachment    0
gill-color         0
stem-height        0
stem-width         0
stem-color         0
season             0
class              0
dtype: int64

In [123]:
df.columns

Index(['cap-diameter', 'cap-shape', 'gill-attachment', 'gill-color',
       'stem-height', 'stem-width', 'stem-color', 'season', 'class'],
      dtype='object')

In [124]:
df['class'].value_counts()

class
1    29675
0    24360
Name: count, dtype: int64

In [125]:
df['class']=np.where((df['class']==1),'edible','non_edible')

In [126]:
X=df[['cap-diameter','cap-shape','gill-attachment','gill-color','stem-height','stem-width','stem-color','season']]
y=df['class']

In [127]:
X,y

(       cap-diameter  cap-shape  gill-attachment  gill-color  stem-height  \
 0              1372          2                2          10     3.807467   
 1              1461          2                2          10     3.807467   
 2              1371          2                2          10     3.612496   
 3              1261          6                2          10     3.787572   
 4              1305          6                2          10     3.711971   
 ...             ...        ...              ...         ...          ...   
 54030            73          5                3           2     0.887740   
 54031            82          2                3           2     1.186164   
 54032            82          5                3           2     0.915593   
 54033            79          2                3           2     1.034963   
 54034            72          5                3           2     1.158311   
 
        stem-width  stem-color    season  
 0            1545          11 

In [128]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)


In [129]:
x_train.shape

(43228, 8)

In [130]:
from sklearn.neighbors import KNeighborsClassifier
knn3=KNeighborsClassifier(n_neighbors=3,metric='euclidean') 
knn3.fit(x_train,y_train)

In [131]:
prediction_train=knn3.predict(x_train)

In [132]:
prediction_train

array(['edible', 'non_edible', 'non_edible', ..., 'edible', 'edible',
       'non_edible'], dtype=object)

In [133]:
from sklearn.metrics import accuracy_score

In [134]:
train_accuracy=accuracy_score(y_train,prediction_train)

In [135]:
train_accuracy

0.8627509947256408

In [136]:
from sklearn.model_selection import GridSearchCV
knn=KNeighborsClassifier(n_neighbors=5,metric='euclidean')
param_grid={'n_neighbors':np.arange(1,16,2)}
knn_with_gs=GridSearchCV(knn,param_grid,return_train_score=True,verbose=True,scoring='accuracy')
knn_with_gs.fit(x_train,y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [137]:
knn_with_gs.cv_results_

{'mean_fit_time': array([0.23466115, 0.21561861, 0.20920172, 0.216014  , 0.22864571,
        0.20733862, 0.23454056, 0.20889115]),
 'std_fit_time': array([0.06342747, 0.01856119, 0.00501637, 0.01399045, 0.02645998,
        0.04532783, 0.05454424, 0.00465765]),
 'mean_score_time': array([0.62269382, 0.56675735, 0.57703428, 0.57803955, 0.58972416,
        0.60046544, 0.65010796, 0.61914258]),
 'std_score_time': array([0.26297757, 0.01580148, 0.02632094, 0.0302487 , 0.01923861,
        0.15756893, 0.1829303 , 0.00858271]),
 'param_n_neighbors': masked_array(data=[1, 3, 5, 7, 9, 11, 13, 15],
              mask=[False, False, False, False, False, False, False, False],
        fill_value=999999),
 'params': [{'n_neighbors': np.int64(1)},
  {'n_neighbors': np.int64(3)},
  {'n_neighbors': np.int64(5)},
  {'n_neighbors': np.int64(7)},
  {'n_neighbors': np.int64(9)},
  {'n_neighbors': np.int64(11)},
  {'n_neighbors': np.int64(13)},
  {'n_neighbors': np.int64(15)}],
 'split0_test_score': array([0

In [138]:
knn_with_gs.best_estimator_

In [139]:
tunned_df=pd.DataFrame(knn_with_gs.cv_results_)

In [140]:
tunned_df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_n_neighbors', 'params', 'split0_test_score', 'split1_test_score',
       'split2_test_score', 'split3_test_score', 'split4_test_score',
       'mean_test_score', 'std_test_score', 'rank_test_score',
       'split0_train_score', 'split1_train_score', 'split2_train_score',
       'split3_train_score', 'split4_train_score', 'mean_train_score',
       'std_train_score'],
      dtype='object')

In [141]:
tunned_df=tunned_df[['param_n_neighbors','mean_test_score','mean_train_score']]

In [142]:
tunned_df

Unnamed: 0,param_n_neighbors,mean_test_score,mean_train_score
0,1,0.716758,1.0
1,3,0.715231,0.85728
2,5,0.712594,0.814461
3,7,0.711182,0.792958
4,9,0.70799,0.777604
5,11,0.706047,0.766777
6,13,0.703965,0.757888
7,15,0.702577,0.751162


In [143]:
knn3=KNeighborsClassifier(n_neighbors=1,metric='euclidean')
knn3.fit(x_train,y_train) 
prediction_test=knn3.predict(x_test)
prediction_train=knn3.predict(x_train) 
test_accuracy=accuracy_score(y_test,prediction_test)
print('test accuracy=',test_accuracy)
train_accuracy=accuracy_score(y_train,prediction_train)
print('train accuracy=',train_accuracy)

test accuracy= 0.7265661145553808
train accuracy= 1.0


In [144]:
## using oversampling

In [145]:
from imblearn.over_sampling import RandomOverSampler
over_sampler=RandomOverSampler(random_state=42)
x_train_over,y_train_over=over_sampler.fit_resample(x_train,y_train)
print("after oversampling:\n",y_train_over.count())

after oversampling:
 47554


In [146]:
knn3=KNeighborsClassifier(n_neighbors=3,metric='euclidean')
knn3.fit(x_train_over,y_train_over) 
prediction_test=knn3.predict(x_test)
prediction_train=knn3.predict(x_train_over) 
test_accuracy=accuracy_score(y_test,prediction_test)
print('test accuracy=',test_accuracy)
train_accuracy=accuracy_score(y_train_over,prediction_train)
print('train accuracy=',train_accuracy)

test accuracy= 0.7143518090126769
train accuracy= 0.8751524582579804


In [147]:
#using undersampling

In [148]:
from imblearn.under_sampling import RandomUnderSampler
under_sampler=RandomUnderSampler(random_state=42,replacement=False)
x_train_under,y_train_under=under_sampler.fit_resample(x_train,y_train)
print("after undersampling:\n",y_train_under.count())

after undersampling:
 38902


In [149]:
knn3=KNeighborsClassifier(n_neighbors=3,metric='euclidean')
knn3.fit(x_train_under,y_train_under) 
prediction_test=knn3.predict(x_test)
prediction_train=knn3.predict(x_train_under) 
test_accuracy=accuracy_score(y_test,prediction_test)
print('test accuracy=',test_accuracy)
train_accuracy=accuracy_score(y_train_under,prediction_train)
print('train accuracy=',train_accuracy)

test accuracy= 0.7159248635143888
train accuracy= 0.8603670762428667


In [150]:
### using smote

In [151]:
from imblearn.over_sampling import SMOTE
smote=SMOTE()
x_train_smote,y_train_smote=smote.fit_resample(x_train,y_train)
print("after re_sampling:\n",y_train_smote.count())

after re_sampling:
 47554


In [152]:
knn3=KNeighborsClassifier(n_neighbors=3,metric='euclidean')
knn3.fit(x_train_smote,y_train_smote) 
prediction_test=knn3.predict(x_test)
prediction_train=knn3.predict(x_train_smote) 
test_accuracy=accuracy_score(y_test,prediction_test)
print('test accuracy=',test_accuracy)
train_accuracy=accuracy_score(y_train_smote,prediction_train)
print('train accuracy=',train_accuracy)

test accuracy= 0.7158323308966411
train accuracy= 0.877192244606132
