In [1]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

np.random.seed(19)

## Load data

In [35]:
data_folder = ""
data = pd.read_csv(os.path.join(data_folder, "mushrooms.csv"))

In [36]:
data.head().transpose()

Unnamed: 0,0,1,2,3,4
class,p,e,e,p,e
cap-shape,x,x,b,x,x
cap-surface,s,s,s,y,s
cap-color,n,y,w,w,g
bruises,t,t,t,t,f
odor,p,a,l,p,n
gill-attachment,f,f,f,f,f
gill-spacing,c,c,c,c,w
gill-size,n,b,b,n,b
gill-color,k,k,n,n,k


In [37]:
data.describe().transpose()

Unnamed: 0,count,unique,top,freq
class,8124,2,e,4208
cap-shape,8124,6,x,3656
cap-surface,8124,4,y,3244
cap-color,8124,10,n,2284
bruises,8124,2,f,4748
odor,8124,9,n,3528
gill-attachment,8124,2,f,7914
gill-spacing,8124,2,c,6812
gill-size,8124,2,b,5612
gill-color,8124,12,b,1728


In [38]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

## Label

In [39]:
data[0] = data.apply(lambda row: 0 if row[0] == 'e' else 1, axis=1)

In [40]:
data

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,0
0,p,x,s,n,t,p,f,c,n,k,...,w,w,p,w,o,p,k,s,u,1
1,e,x,s,y,t,a,f,c,b,k,...,w,w,p,w,o,p,n,n,g,0
2,e,b,s,w,t,l,f,c,b,n,...,w,w,p,w,o,p,n,n,m,0
3,p,x,y,w,t,p,f,c,n,n,...,w,w,p,w,o,p,k,s,u,1
4,e,x,s,g,f,n,f,w,b,k,...,w,w,p,w,o,e,n,a,g,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,o,o,p,o,o,p,b,c,l,0
8120,e,x,s,n,f,n,a,c,b,y,...,o,o,p,n,o,p,b,v,l,0
8121,e,f,s,n,f,n,a,c,b,n,...,o,o,p,o,o,p,b,c,l,0
8122,p,k,y,n,f,y,f,c,n,b,...,w,w,p,w,o,e,w,v,l,1


In [41]:
cols = np.arange(1,23)
for col in cols:
    if np.any(data.iloc[:,col].isnull()):
        data.loc[data.iloc[:,col].isnull(), col] = 'missing'

In [42]:
labelEncoders = dict()

#one hot encoding for each column
for col in cols:
    encoder = LabelEncoder()
    values = data.iloc[:,col].tolist()
    values.append('missing')  #add missing
    encoder.fit(values)
    labelEncoders[col] = encoder

# count total columns one hot encoding
dimensionality = 0
for col, encoder in labelEncoders.items():
    dimensionality += len(encoder.classes_)
print("dimensionality:  %d" % (dimensionality))

dimensionality:  139


In [44]:
# transform data
def transform(df):
    N, _ = df.shape
    X = np.zeros((N, dimensionality))
    i = 0
    for col ,encoder in labelEncoders.items():
        k = len(encoder.classes_)
        X[np.arange(N), encoder.transform(df.iloc[:,col]) + i] = 1
        i += k
    return X

In [45]:
X = transform(data)
Y = data[0].to_numpy()

## Logistic Regression

In [47]:
logistic_model = LogisticRegression()
print("logistic Regression performance: %f" % (cross_val_score(logistic_model, X, Y, cv=8).mean()))

logistic Regression performance: 0.926003


## Decision Tree

In [48]:
tree_model = DecisionTreeClassifier()
print("Decision Tree performance: %f" % (cross_val_score(tree_model, X, Y, cv=8).mean()))

Decision Tree performance: 0.933775


## Random Forest

In [49]:
forest = RandomForestClassifier(n_estimators=20)
print("Random Forest performance: %f" % (cross_val_score(tree_model, X, Y, cv=8).mean()))

Random Forest performance: 0.950524


## Implement Sudo Random Forest

In [50]:
from sklearn.base import BaseEstimator

class SudoRandomForest(BaseEstimator):
    def __init__(self, M):
        self.M = M
    def fit(self, X, Y, n_features=None):
        N,D = X.shape
        if n_features is None:
            n_features = int(np.sqrt(D))
        
        self.models = []
        self.features = []
        
        for m in range(self.M):
            tree = DecisionTreeClassifier()
            
            idx = np.random.choice(N, size=N, replace=True) # bagging
            X_current = X[idx]
            Y_current = Y[idx]
            
            features = np.random.choice(D, size=n_features, replace=False) # feature
            tree.fit(X_current[:,features], Y_current)
            self.features.append(features)
            self.models.append(tree)
            
    def predict(self, X):
        N = len(X)
        results = np.zeros(N)
        for features, tree in zip(self.features, self.models):
            results += tree.predict(X[:, features])
        return np.round(results/self.M)
    
    def score(self, X, Y):
        prediction = self.predict(X)
        return np.mean(prediction == Y)