In [127]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer
# Location of the raw UCI mushroom data; the file ships without a header row.
DATA_URL = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
            'mushroom/agaricus-lepiota.data')

# Target column ('classes') followed by the 22 attribute columns.
column_name = [
    'classes',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

df = pd.read_csv(DATA_URL, header=None, engine='python')
df.columns = column_name

# Row count, then the per-column count of values pandas itself treats as null.
print(len(df))
display(df.isnull().sum())
df.head()

8124


classes                     0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises?                    0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


## Handle missing values denoted by '?'

In [128]:
for i in df.columns:
    if '?' in classes_list:
        idx = classes_list.index('?')
        df[i] = df[i].replace(idx, np.nan)

## Encode label, feature, split dataset

In [129]:
label_le = LabelEncoder()
df['classes'] = label_le.fit_transform(df['classes'].values)
catego_le = LabelEncoder()
num_values = []
for i in df.columns:
    df[i] = catego_le.fit_transform(df[i].values)
    classes_list = catego_le.classes_.tolist()
    num_values.append(len(classes_list))
X =df.drop(['classes'], axis=1)
y = df['classes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## KNN

In [130]:
# Steps of the KNN model, applied in order: fill NaN with the most frequent
# value of each column, one-hot encode, standardize, then classify.
knn_steps = [
    ('imr', Imputer(missing_values='NaN', strategy='most_frequent', axis=0)),
    ('ohe', OneHotEncoder(n_values=num_values[1:], sparse=False)),
    ('scl', StandardScaler()),
    ('clf', KNeighborsClassifier(n_neighbors=10, p=2, metric='minkowski')),
]
pipe_knn = Pipeline(knn_steps)

# Fit on the training split, then evaluate on the held-out test split.
pipe_knn.fit(X_train, y_train)
y_pred = pipe_knn.predict(X_test)

knn_errors = (y_test != y_pred).sum()
knn_accuracy = accuracy_score(y_test, y_pred)
print('[KNN]')
print('Misclassified samples: %d' % knn_errors)
print('Accuracy: %.4f' % knn_accuracy)

[KNN]
Misclassified samples: 0
Accuracy: 1.0000


## SVM

In [131]:
# Steps of the SVM model, applied in order: fill NaN with the most frequent
# value of each column, one-hot encode, standardize, then an RBF-kernel SVC.
svm_steps = [
    ('imr', Imputer(missing_values='NaN', strategy='most_frequent', axis=0)),
    ('ohe', OneHotEncoder(n_values=num_values[1:], sparse=False)),
    ('scl', StandardScaler()),
    ('clf', SVC(kernel='rbf', random_state=0, gamma=0.001, C=100.0)),
]
pipe_svm = Pipeline(svm_steps)

# Fit on the training split, then evaluate on the held-out test split.
pipe_svm.fit(X_train, y_train)
y_pred = pipe_svm.predict(X_test)

svm_errors = (y_test != y_pred).sum()
svm_accuracy = accuracy_score(y_test, y_pred)
print('\n[SVC]')
print('Misclassified samples: %d' % svm_errors)
print('Accuracy: %.4f' % svm_accuracy)


[SVC]
Misclassified samples: 0
Accuracy: 1.0000


### Grid Search

In [133]:
# Candidate hyper-parameters for the SVC step of pipe_svm. The 'clf__' prefix
# routes each parameter to the pipeline step named 'clf'.
param_gamma = [0.0001, 0.001, 0.01, 0.1, 1.0]
param_C = [0.1, 1.0, 10.0, 100.0]
param_grid = [
    # Linear kernel: only C is tuned.
    {'clf__C': param_C, 'clf__kernel': ['linear']},
    # RBF kernel: every combination of C and gamma.
    {'clf__C': param_C, 'clf__gamma': param_gamma, 'clf__kernel': ['rbf']},
]

gs = GridSearchCV(estimator=pipe_svm, param_grid=param_grid, scoring='accuracy')
gs = gs.fit(X_train, y_train)
print(gs.best_params_)

# With the default refit=True, GridSearchCV has already refitted the best
# parameter combination on the whole training set, so best_estimator_ can be
# scored directly; fitting it a second time would only repeat that work.
clf = gs.best_estimator_
print('Test accuracy: %.3f' % clf.score(X_test, y_test))

{'clf__C': 0.1, 'clf__kernel': 'linear'}
Test accuracy: 1.000


# Report

在這個作業中，我大概做的步驟:
    1. Read data，在這裡我先用了isnull()來確認是否有missing data。
    2. 處理資料中'?'的問題，將其轉成NaN
    3. 用LabelEncoder()分別encode label(classes)和data
    4. 將資料分為training data和testing data(test size=0.2)
    5. 使用Pipeline()建立KNN的model，最終的accuracy為1.0000，沒有misclassified samples。
    6. 使用Pipeline()建立SVM的model，最終的accuracy也是1.0000，沒有misclassified samples。
    7. 雖然以上兩個model的預測準確度已經非常完美，我還是練習了grid search，結果顯示kernel='linear'、C=0.1的參數組合表現最好。
    8. 因為以上做出來的準確度就非常好了，因此我沒有使用feature selection來進行處理。