In [1]:
from IPython import get_ipython

# Switch matplotlib to the inline backend, but only when an interactive
# shell is actually available: the truthiness guard skips the magic when
# get_ipython() hands back nothing usable (e.g. outside IPython).
ipy = get_ipython()
if ipy:
    ipy.run_line_magic(magic_name="matplotlib", line="inline")

In [2]:
import pandas as pd

# Column labels for the mushroom table. The raw file is read without a
# header row, so these are assigned positionally: the first label is the
# class column and the remaining 22 are the attribute columns.
column_name = [
    "classes",
    # cap
    "cap-shape", "cap-surface", "cap-color",
    "bruises?", "odor",
    # gill
    "gill-attachment", "gill-spacing", "gill-size", "gill-color",
    # stalk
    "stalk-shape", "stalk-root",
    "stalk-surface-above-ring", "stalk-surface-below-ring",
    "stalk-color-above-ring", "stalk-color-below-ring",
    # veil and ring
    "veil-type", "veil-color",
    "ring-number", "ring-type",
    # remaining attributes
    "spore-print-color", "population", "habitat",
]

# Download the raw data; header=None stops pandas from consuming the first
# record as column names.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "mushroom/agaricus-lepiota.data",
    engine="python",
    header=None,
)
df.columns = column_name
df.head()

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer


def impute_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing entries with the most frequent value of their column.

    Cells holding the literal string ``"?"`` are converted to NaN first, so
    they are imputed together with any NaN values already present.

    Parameters
    ----------
    df : pd.DataFrame
        Input table. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the same column labels and the same row index as
        ``df``, in which every missing entry has been replaced.
    """
    imputer = SimpleImputer(strategy="most_frequent")
    # replace() already returns a new frame, so no explicit copy is needed.
    with_nan = df.replace("?", np.nan)
    filled = imputer.fit_transform(with_nan)
    # The imputer output carries no labels; re-attach both the columns and
    # the original index so callers with a non-default index keep their row
    # labels instead of silently getting a fresh RangeIndex.
    return pd.DataFrame(filled, columns=df.columns, index=df.index)


# Impute into a new variable so the raw `df` stays available unchanged.
df_imputed = impute_missing(df=df)

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def encode_label(df: pd.DataFrame):
    """Encode every column of ``df`` with its own ``LabelEncoder``.

    Returns a ``(df_encoded, encoders)`` tuple: ``df_encoded`` is a copy of
    ``df`` whose columns hold the encoded values, and ``encoders`` maps each
    column name to the fitted encoder that produced that column. ``df``
    itself is left unmodified.
    """
    # One independent encoder per column, keyed by column name.
    encoders = {name: LabelEncoder() for name in df.columns}

    df_encoded = df.copy()
    for name, enc in encoders.items():
        # Fit on the original column and overwrite it in the copy.
        df_encoded[name] = enc.fit_transform(df[name])
    return df_encoded, encoders


# Keep the fitted encoders around so the encoded values can be mapped back.
df_encoded, encoders = encode_label(df=df_imputed)

In [5]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
X = df_encoded.drop(columns=["classes"])
y = df_encoded["classes"].values

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

# Report the shape of each partition.
for template, part in (
    ("X_train {}", X_train),
    ("X_test {}", X_test),
    ("y_train {}", y_train),
    ("y_test {}", y_test),
):
    print(template.format(part.shape))

X_train (6499, 22)
X_test (1625, 22)
y_train (6499,)
y_test (1625,)


In [6]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Both classifiers get the same treatment: standardise the features, fit on
# the training split, then score on the held-out split.
pipe_svc = Pipeline([("scl", StandardScaler()), ("clf", SVC(random_state=1))])
pipe_knn = Pipeline(
    [("scl", StandardScaler()), ("clf", KNeighborsClassifier())]
)

for title, pipe in (("SVC", pipe_svc), ("KNeighborsClassifier", pipe_knn)):
    pipe.fit(X_train, y_train)
    # y_pred is overwritten on each pass, so after the loop it holds the
    # predictions of the last pipeline (KNN).
    y_pred = pipe.predict(X_test)

    print(title)
    print("Misclassified samples: %d" % (y_test != y_pred).sum())
    print("Accuracy: %.4f" % accuracy_score(y_test, y_pred))

SVC
Misclassified samples: 0
Accuracy: 1.0000
KNeighborsClassifier
Misclassified samples: 0
Accuracy: 1.0000


# Report

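Without any hyperparameter tuning, both the SVC and the KNN classifier achieve a perfect score on the test set (accuracy 1.0000, with 0 of the 1,625 test samples misclassified). Note that the imputation and label encoding were fit on the full dataset before the train/test split, so the preprocessing step has already seen the test rows.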