## Week 6 - Discussion 6

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import mean_squared_error, accuracy_score
from ucimlrepo import fetch_ucirepo 

In [3]:
# Fetch dataset 
mushroom = fetch_ucirepo(id=73) 
  
# Save data as X and y variables
X = mushroom.data.features 
y = np.ravel(mushroom.data.targets)

# Expand dataframe columns and look at view dataframe
pd.set_option('display.max_columns', None)
X.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g


In [4]:
X['cap-shape']

0       x
1       x
2       b
3       x
4       x
       ..
8119    k
8120    x
8121    f
8122    k
8123    x
Name: cap-shape, Length: 8124, dtype: object

In [7]:
X.loc[:, "cap-shape"]

0       x
1       x
2       b
3       x
4       x
       ..
8119    k
8120    x
8121    f
8122    k
8123    x
Name: cap-shape, Length: 8124, dtype: object

In [None]:
# factorize all columns
for col in X.columns:
    X.loc[:, col] = pd.factorize...

In [8]:
# Check for NAs
X.isna().sum()

cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64

In [10]:
# Create copy of X variables
X_NA = X.copy()

In [12]:
# Assign 10% of new df
for col in X_NA.columns:
    X_NA.loc[X_NA.sample(frac = 0.1).index, col] = np.nan

In [13]:
# check to make sure there are missing values
X_NA.isna().sum()

cap-shape                    812
cap-surface                  812
cap-color                    812
bruises                      812
odor                         812
gill-attachment              812
gill-spacing                 812
gill-size                    812
gill-color                   812
stalk-shape                  812
stalk-root                  3049
stalk-surface-above-ring     812
stalk-surface-below-ring     812
stalk-color-above-ring       812
stalk-color-below-ring       812
veil-type                    812
veil-color                   812
ring-number                  812
ring-type                    812
spore-print-color            812
population                   812
habitat                      812
dtype: int64

In [14]:
# Impute with mode
X_mode_impute = X_NA.fillna(X_NA.mode().iloc[0])

# Check to make sure there are no NAs
X_mode_impute.isna().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [21]:
# Impute with median (using SimpleImputer)
median_impute = SimpleImputer(strategy = 'median')
X_median_impute = median_impute.fit_transform(X_NA)
X_median_impute = pd.DataFrame(X_median_impute, columns = X.columns)

# Check to make sure there are no NAs
X_median_impute.isna().sum()

ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'x'

In [20]:
# Impute with KNN imputer
knn_impute  = KNNImputer(n_neighbors = 5)
X_knn_impute = knn_impute.fit_transform(X_NA)
X_knn_impute= pd.DataFrame(X_knn_impute, columns = X.columns)

X_knn_impute.isna().sum()

ValueError: could not convert string to float: 'x'

In [22]:
# Use MSE to calculate imputation accuracy
mse_mode = mean_squared_error(X, X_mode_impute)
mse_median = mean_squared_error(X, X_median_impute)
mse_knn = mean_squared_error(X, X_knn_impute)

# Print results
print(F"Mode Imputation Performance: {mse_mode}")
print(F"Median Imputation Performance: {mse_median}")
print(F"KNN Imputation Performance: {mse_knn}")

ValueError: could not convert string to float: 'x'

### Bagging Classifier with og data

In [28]:
#split actual data
X_Train, X_tet, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state=42)

In [31]:
# number of features to include for tuning

num_features = [1,4,7,10,13,16,19,22]
accuracy = []

for feature in num_features:
    rf_classifier = RandomForestClassifier(n_estimator = 50, max_depth = 3, ramdom_state=42,max_features = feature)
    
    rf_classifier.fit(X_train, y_train)
    
    # predict and evaluate results
    y_pred = rf.classifier.predict(X_test)
    ... 

TypeError: RandomForestClassifier.__init__() got an unexpected keyword argument 'n_estimator'