In [1]:
import altair as alt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix

In [69]:
# importing csv and adding new columns

column_headers = ['Edibility', 'Cap Shape', 'Cap Surface', 'Cap Color', 'Bruises', 'Odor', 'Gill Attachment', 'Gill Spacing',
                  'Gill Size', 'Gill Color', 'Stalk Shape', 'Stalk Root', 'Stalk Surface Above Ring', 'Stalk Surface Below Ring',
                  'Stalk Color Above Ring', 'Stalk Color Below Ring', 'Veil Type', 'Veil Color', 'Ring Number', 'Ring Type',
                  'Spore Print Color', 'Population', 'Habitat']

mushrooms_df = pd.read_csv("mushroom_data.csv", names=column_headers, header=None)

mushrooms_df.dropna(inplace=False)
mushrooms_df

unique_vals = mushrooms_df['Cap Color'].unique()

print(unique_vals)
mushrooms_df

['n' 'y' 'w' 'g' 'e' 'p' 'b' 'u' 'c' 'r']


Unnamed: 0,Edibility,Cap Shape,Cap Surface,Cap Color,Bruises,Odor,Gill Attachment,Gill Spacing,Gill Size,Gill Color,...,Stalk Surface Below Ring,Stalk Color Above Ring,Stalk Color Below Ring,Veil Type,Veil Color,Ring Number,Ring Type,Spore Print Color,Population,Habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [70]:
# encodes the labels to make them readable in the plot as numerical values
label_encoder = LabelEncoder()
mushrooms_df_encoded = mushrooms_df.apply(label_encoder.fit_transform)

In [71]:
unique_vals = mushrooms_df['Edibility'].value_counts()
unique_vals

e    4208
p    3916
Name: Edibility, dtype: int64

In [106]:
# makes the letters into numbers
label_encoder = LabelEncoder()
mushrooms_df_encoded = mushrooms_df.apply(label_encoder.fit_transform)

X = mushrooms_df_encoded[['Cap Shape', 'Cap Surface', 'Cap Color', "Gill Size", "Veil Color"]]
y = mushrooms_df_encoded[['Edibility']]

X_mush_train, X_mush_test, y_mush_train, y_mush_test = train_test_split(X, y, test_size=0.2, random_state=42)


X_mush_train = X_mush_train[['Cap Shape', 'Cap Surface', 'Cap Color', "Gill Size", "Veil Color"]]
y_mush_train = y_mush_train[['Edibility']]

X_mush_test = X_mush_test[['Cap Shape', 'Cap Surface', 'Cap Color', "Gill Size", "Veil Color"]]
y_mush_test = y_mush_test[['Edibility']]


# make dummy variables w onehot
categorical_columns = ['Cap Shape', 'Cap Surface', 'Cap Color', "Veil Color"]
numerical_columns = ["Gill Size"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
mush_preprocessor = make_column_transformer((categorical_transformer, categorical_columns),)


X_train_preprocessed = mush_preprocessor.fit_transform(X_mush_train)
X_test_preprocessed = mush_preprocessor.transform(X_mush_test)

#### KNeighbors
knn_spec = KNeighborsClassifier()
#classifier.fit(X_train_preprocessed, y_train)

mush_fit = make_pipeline(mush_preprocessor, KNeighborsClassifier()).fit(X_mush_train, y_mush_train)
mush_fit


mush_preprocessor
mush_fit
X_mush_test

  return self._fit(X, y)


Unnamed: 0,Cap Shape,Cap Surface,Cap Color,Gill Size,Veil Color
1971,2,0,4,0,2
6654,2,2,2,1,2
5606,5,3,4,1,2
3332,2,3,3,0,2
6988,2,2,2,1,2
...,...,...,...,...,...
7374,3,2,2,1,2
1149,2,2,4,0,2
4999,5,3,4,1,2
7497,3,3,4,1,2


In [110]:
# new predicted column
mush_test_predictions = mush_fit.predict(X_mush_test[['Cap Shape', 'Cap Surface', 'Cap Color', "Gill Size", "Veil Color"]])
mush_test_predictions = pd.concat(
    [
        X_mush_test.reset_index(drop=True), y_mush_test,
        pd.DataFrame(mush_test_predictions, columns=["Predicted"]),
    ],
    axis=1,
)

mush_test_predictions.iloc[1624,:]
mush_test_predictions
y_mush_test

Unnamed: 0,Edibility
1971,0
6654,1
5606,1
3332,0
6988,1
...,...
7374,1
1149,0
4999,1
7497,1


In [74]:
#X_test = mush_test_predictions[['Cap Shape', 'Cap Surface', 'Cap Color', 'Gill Size', 'Veil Color']]
#y_test = mush_test_predictions["Edibility"]

mush_prediction_accuracy = mush_fit.score(X_mush_test, y_mush_test)
mush_prediction_accuracy

0.68

In [96]:
mush_test_predictions.isna().sum()

Cap Shape      6499
Cap Surface    6499
Cap Color      6499
Gill Size      6499
Veil Color     6499
Edibility         0
Predicted      6499
dtype: int64

In [97]:
X_mush_test.isna().sum()

Cap Shape      0
Cap Surface    0
Cap Color      0
Gill Size      0
Veil Color     0
dtype: int64

In [98]:
X_mush_train.isna().sum()

Cap Shape      0
Cap Surface    0
Cap Color      0
Gill Size      0
Veil Color     0
dtype: int64

In [19]:
mush_matrix = confusion_matrix(
    mush_test_predictions["Edibility"],  # true labels
    mush_test_predictions["Predicted"],  # predicted labels
    labels=mush_fit.classes_,           # specify the label for each class
)
mush_matrix

ValueError: Input y_pred contains NaN.

In [14]:
#mush_test_predictions["Edibility"]
X_mush_test.isna().sum()

Cap Shape      0
Cap Surface    0
Cap Color      0
Gill Size      0
Veil Color     0
dtype: int64