# Goal
Given the dataset, predict whether a mushroom is poisonous or edible.

# Step
1. Preprocess with dataset, ex. dealing with missing value denoted by '?'.
2. Train models using KNN and/or SVM. Note that you need to use train_test_split and set test_size = 0.2. It is up to you which features to use – you can either use all or select a few depending on how you see fit.
3. Show the accuracy scores of the models.
4. Among the models that you tried, choose the best model and show its accuracy score.

## import data

In [3]:
import pandas as pd

df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/'
                 'mushroom/agaricus-lepiota.data', header=None, engine='python')
# put the classes to the end
column_name = ['cap-shape', 'cap-surface','cap-color','bruises?','odor',
               'gill-attachment','gill-spacing','gill-size','gill-color',
               'stalk-shape','stalk-root','stalk-surface-above-ring',
               'stalk-surface-below-ring','stalk-color-above-ring',
               'stalk-color-below-ring','veil-type','veil-color','ring-number',
               'ring-type','spore-print-color','population','habitat', 'classes']

df.columns = column_name
df.head()


Unnamed: 0,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,classes
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


## Try with Label Encoder

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
import numpy as np

# encode label first
label_le = LabelEncoder()
df['classes'] = label_le.fit_transform(df['classes'].values)

# encode categorical features
catego_features = df.columns[:-2]

catego_le = LabelEncoder()

# transform categorical values into numerical values
# be careful that '?' will also be encoded
# we have to replace it to NaN in numerical
categories = []
for i in catego_features:
    df[i] = catego_le.fit_transform(df[i].values)
    classes_list = catego_le.classes_.tolist()
    
    # replace '?' with 'NaN'
    if '?' in classes_list:
        idx = classes_list.index('?')
        df[i] = df[i].replace(idx, np.nan)
    
    # store the total number of values
    categories.append(np.arange(len(classes_list)))

display(df.head(15))

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,classes
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,s,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,n,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,n,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,s,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,a,1
5,0,5,3,9,1,0,1,0,0,5,...,2,7,7,0,2,1,4,2,n,1
6,0,0,2,8,1,0,1,0,0,2,...,2,7,7,0,2,1,4,2,n,3
7,0,0,3,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,s,3
8,1,5,3,8,1,6,1,0,1,7,...,2,7,7,0,2,1,4,2,v,1
9,0,0,2,9,1,0,1,0,0,2,...,2,7,7,0,2,1,4,2,s,3


In [5]:
df.isnull().sum()

cap-shape                      0
cap-surface                    0
cap-color                      0
bruises?                       0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                     0
stalk-surface-above-ring    2480
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
classes                        0
dtype: int64

In [4]:
df.shape

(8124, 23)

# Deal with missing value: SimpleImputer

In [6]:
from sklearn.impute import SimpleImputer 

imr = SimpleImputer(missing_values = np.NaN, strategy = 'most_frequent')
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)

df_impute = pd.DataFrame(imputed_data)
df_impute.columns = df.columns

display(df.head(5))
display(df_impute.head(5))

# check if there are still missing values
display(df_impute.isnull().sum())


Unnamed: 0,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,classes
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,s,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,n,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,n,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,s,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,a,1


Unnamed: 0,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,classes
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,s,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,n,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,n,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,s,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,a,1


cap-shape                   0
cap-surface                 0
cap-color                   0
bruises?                    0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
classes                     0
dtype: int64

In [18]:
catego_features = ['cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']

In [26]:
catego_features_idx = []
for fea in catego_features:
    catego_features_idx.append(df.columns.tolist().index(fea))

In [28]:
categories

[array([0, 1]),
 array([0, 1, 2, 3, 4, 5]),
 array([0, 1, 2, 3]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([0, 1]),
 array([0, 1]),
 array([0, 1]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
 array([0, 1]),
 array([0, 1, 2, 3, 4]),
 array([0, 1, 2, 3]),
 array([0, 1, 2, 3]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([0]),
 array([0, 1, 2, 3]),
 array([0, 1, 2]),
 array([0, 1, 2, 3, 4]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8])]

## Try with One hot encoding

In [14]:
from sklearn.compose import ColumnTransformer

# give the column index you want to do one-hot encoding
ohe = ColumnTransformer(
    [
        ("one_hot_encoder", OneHotEncoder(sparse = False, categories = "auto"), df.columns[:-2])
    ],
    remainder = "passthrough"
)

df_onehot_data = ohe.fit_transform(df_impute)
print('Impute: {}'.format(df_impute.shape))
print('Impute one-hot: {}'.format(df_onehot_data.shape))



Impute: (8124, 23)
Impute one-hot: (8124, 107)


## Split the Train and Test

In [33]:
from sklearn.model_selection import train_test_split


y = df_impute['classes'].astype(int)
X = df_impute.drop(columns=['classes'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

ohe = ColumnTransformer(
    [
        ('ohe', OneHotEncoder(sparse = False), catego_features_idx)
    ],
    remainder = "passthrough"
)

tmp = ohe.fit_transform(X_train)
tmp.shape


(6499, 111)

## Model: KNN and SVM 
SVM performance is better than the KNN.

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


pipe_knn = Pipeline([
    ('imr', SimpleImputer(missing_values = np.NaN, strategy = 'most_frequent')),
    ('ohe', ohe),
    ('scl', StandardScaler()),
    ('clf', KNeighborsClassifier(n_neighbors = 10, p = 2, metric = "minkowski"))
])

pipe_svm = Pipeline([
    ('imr', SimpleImputer(missing_values = np.NaN, strategy = 'most_frequent')),
    ('ohe', ohe),
    ('scl', StandardScaler()),
    ('clf', SVC(kernel = "rbf", random_state = 0 , gamma = 0.001, C= 100.0))
])

pipe_knn.fit(X_train, y_train)
y_pred = pipe_knn.predict(X_test)
print('[KNN]')
print('Misclassified samples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))

pipe_svm.fit(X_train, y_train)
y_pred = pipe_svm.predict(X_test)
print('\n[SVC]')
print('Misclassified samples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))

[KNN]
Misclassified samples: 712
Accuracy: 0.5618

[SVC]
Misclassified samples: 637
Accuracy: 0.6080


## Tuning the parameter with SVM.

In [36]:
from sklearn.model_selection import GridSearchCV

param_gamma = [0.0001, 0.001, 0.01, 0.1, 1.0]
param_C = [0.1, 1.0, 10.0, 100.0]

# here you can set parameter for different steps 
# by adding two underlines (__) between step name and parameter name
param_grid = [{'clf__C': param_C, 
               'clf__kernel': ['linear']},
              {'clf__C': param_C, 
               'clf__gamma': param_gamma, 
               'clf__kernel': ['rbf']}]

# set pipe_svm as the estimator
gs = GridSearchCV(
    estimator = pipe_svm, 
    param_grid = param_grid, 
    scoring = "accuracy",
    cv = 3
)

gs = gs.fit(X_train, y_train)
print('[SVC: grid search]')
print('Validation accuracy: %.3f' % gs.best_score_)
print(gs.best_params_)

clf = gs.best_estimator_
clf.fit(X_train, y_train)
print('Test accuracy: %.3f' % clf.score(X_test, y_test))

[SVC: grid search]
Validation accuracy: 0.662
{'clf__C': 10.0, 'clf__gamma': 0.0001, 'clf__kernel': 'rbf'}
Test accuracy: 0.653
