In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import mean_squared_error, accuracy_score
from ucimlrepo import fetch_ucirepo 

In [2]:
# Fetch dataset 
mushroom = fetch_ucirepo(id=73) 
  
# Save data as X and y variables
X = mushroom.data.features 
y = np.ravel(mushroom.data.targets)

# Expand dataframe columns and look at view dataframe
pd.set_option('display.max_columns', None)
X.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g


Encoding data

In [5]:
# factorize all columns
for col in X.columns:
    X.loc[:,col] = pd.factorize(X[col], sort = True)[0]

X.iloc[0:5, 0:5]

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor
0,5,2,4,1,6
1,5,2,9,1,0
2,0,2,8,1,3
3,5,3,8,1,6
4,5,2,3,0,5


does our dataset have missing values?

In [7]:
X.isna().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [8]:
# create copy of X variables
X_Na = X.copy()

create NAs

In [10]:
# assign 10% of new data with NA values

for col in X_Na.columns:
    X_Na.loc[X_Na.sample(frac = 0.1).index, col] = np.nan


In [11]:
# check to make suer there are missing values

X_Na.isna().sum()

cap-shape                   812
cap-surface                 812
cap-color                   812
bruises                     812
odor                        812
gill-attachment             812
gill-spacing                812
gill-size                   812
gill-color                  812
stalk-shape                 812
stalk-root                  812
stalk-surface-above-ring    812
stalk-surface-below-ring    812
stalk-color-above-ring      812
stalk-color-below-ring      812
veil-type                   812
veil-color                  812
ring-number                 812
ring-type                   812
spore-print-color           812
population                  812
habitat                     812
dtype: int64

In [12]:
# NAs randomly assigned
X_Na

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,2,4,,6,1,0,1,4,0,2,2,2,7,,0,2,1,4,2,3,5
1,5,2,9,1,0,1,0,,4,,1,2,2,,7,0,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,1,2,2,7,7,0,2,1,4,3,2,3
3,5,,8,1,6,1,0,1,,0,2,2,2,7,7,0,2,1,4,2,3,5
4,5,2,,0,,1,1,,4,1,2,2,2,7,,0,2,1,0,3,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,,2,,0,5,0,0,0,11,0,-1,2,2,5,5,0,1,1,4,0,1,2
8120,5,2,4,0,,,0,,11,0,-1,2,2,5,5,0,,,4,,4,2
8121,2,2,4,0,5,0,0,0,5,,-1,2,2,5,5,0,1,,4,0,1,2
8122,3,3,4,0,,,0,1,0,1,-1,2,1,7,7,0,2,1,0,7,4,2


Imputing: 3 methods

In [13]:
# impute with mode
X_mode_impute = X_Na.fillna(X_Na.mode().iloc[0])

# check there are no NAs

X_mode_impute.isna().sum()

# filled the NAs with the mode of each column

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [14]:
# impute with median (using SimpleImputer)
median_impute = SimpleImputer(strategy = 'median')
X_median_impute = median_impute.fit_transform(X_Na)
# make into a dataframe

X_median_impute = pd.DataFrame(X_median_impute, columns = X.columns)
# check make sure there are no NAs
X_median_impute.isna().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [15]:
# impute with knn imputer
knn_impute = KNNImputer(n_neighbors = 5)
X_knn_impute = knn_impute.fit_transform(X_Na)
# make into dataframe
X_knn_impute = pd.DataFrame(X_knn_impute, columns = X.columns)
# check
X_knn_impute.isna().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

Which one performed best?

In [17]:
# calculate imputation accuracy using mean squared error
mse_mode = mean_squared_error(X, X_mode_impute)
mse_median = mean_squared_error(X, X_median_impute)
mse_knn = mean_squared_error(X, X_knn_impute)

# report results
print(f"Mode Imputation Performance: {mse_mode}")
print(f"Median Imputation Performance: {mse_median}")
print(f"KNN Imputation Performance: {mse_knn}")

Mode Imputation Performance: 0.45235777270489225
Median Imputation Performance: 0.2548509466899423
KNN Imputation Performance: 0.12204019515688641


KNN performed best because mse is smallest

Random Forest classifier with original data

In [18]:
# split actual data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 43, random_state = 42) 

In [23]:
# number of features to include for tuning

num_features = [1, 4, 7, 10, 13, 16, 19, 22]
accuracy = [ ]

for feature in num_features:
    rf_classifier = RandomForestClassifier(
    n_estimators = 50,
    max_depth = 3,
    random_state = 42,
    max_features = feature)
    
    rf_classifier.fit(X_train, y_train)
    
    # predict and evaluate results
    
    y_pred = rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred)
    accuracy.append(rf_accuracy)
    
    print(f"Number of features: {feature}; Random Forest accuracy: {rf_accuracy}")


Number of features: 1; Random Forest accuracy: 0.9302325581395349
Number of features: 4; Random Forest accuracy: 0.9767441860465116
Number of features: 7; Random Forest accuracy: 0.9767441860465116
Number of features: 10; Random Forest accuracy: 0.9767441860465116
Number of features: 13; Random Forest accuracy: 0.9767441860465116
Number of features: 16; Random Forest accuracy: 0.9767441860465116
Number of features: 19; Random Forest accuracy: 0.9767441860465116
Number of features: 22; Random Forest accuracy: 0.9302325581395349


Again with KNN imputed data

In [25]:
# split imputed data
X_train, X_test, y_train, y_test = train_test_split(X_knn_impute,y, test_size = 0.3, random_state = 42)

In [26]:
# number of features to include for tuning

num_features = [1, 4, 7, 10, 13, 16, 19, 22]
accuracy = [ ]

for feature in num_features:
    rf_classifier = RandomForestClassifier(
    n_estimators = 50,
    max_depth = 3,
    random_state = 42,
    max_features = feature)
    
    rf_classifier.fit(X_train, y_train)
    
    # predict and evaluate results
    
    y_pred = rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred)
    accuracy.append(rf_accuracy)
    
    print(f"Number of features: {feature}; Random Forest accuracy: {rf_accuracy}")


Number of features: 1; Random Forest accuracy: 0.9155045118949959
Number of features: 4; Random Forest accuracy: 0.9823625922887613
Number of features: 7; Random Forest accuracy: 0.9864643150123051
Number of features: 10; Random Forest accuracy: 0.985233798195242
Number of features: 13; Random Forest accuracy: 0.9840032813781788
Number of features: 16; Random Forest accuracy: 0.9749794913863823
Number of features: 19; Random Forest accuracy: 0.9626743232157506
Number of features: 22; Random Forest accuracy: 0.9569319114027892
