In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import tensorflow as tf
from IPython import display
from sklearn.model_selection import train_test_split
%matplotlib inline

In [28]:
# Load the mushroom dataset; the 'class' column is the target, everything else is a feature
data = pd.read_csv(os.path.join('data','mushrooms.csv'))
labels = data['class']

# Dummy-encode the feature columns with pd.get_dummies and keep the result as a plain array
X = pd.get_dummies(data.drop(columns=['class'])).values

# Binary target: 1 where the class is 'e' (edible), 0 otherwise
y = labels.eq('e') * 1

# First carve off 25% of the rows as a hold-out set; the remaining 75% is the training split
X_tr, X_hold, y_tr, y_hold = train_test_split(X, y, test_size=0.25, random_state=1)

# Then halve the hold-out set into a test part and a validation (cv) part
X_te, X_cv, y_te, y_cv = train_test_split(X_hold, y_hold, test_size=0.5, random_state=1)

# Report the shape of every split
for split_name, split in [
    ("X_tr", X_tr), ("X_cv", X_cv), ("X_te", X_te),
    ("y_tr", y_tr), ("y_cv", y_cv), ("y_te", y_te),
]:
    print(split_name, split.shape)

X_tr (6093, 117)
X_cv (1016, 117)
X_te (1015, 117)
y_tr (6093,)
y_cv (1016,)
y_te (1015,)


In [30]:
# Base rate of each target value (fraction of rows per label) over the full dataset.
# Uses the Series method: the top-level pd.value_counts function is deprecated in recent pandas.
y.value_counts(normalize=True)

1    0.517971
0    0.482029
Name: class, dtype: float64

In [36]:
# Baseline model: grid-search a random forest over forest size and tree depth.
# NOTE(review): the saved output of this cell lists a larger param_grid (values of 1
# included) than the grid defined here -- re-run the notebook to refresh the outputs.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Single-step pipeline; the 'rf' step name is what gives the grid keys their 'rf__' prefix
pipe = Pipeline([
    # Fixed seed (same value as the data splits) so the forest, and therefore the
    # cross-validation scores, are reproducible across runs
    ('rf', RandomForestClassifier(random_state=1))
])

grid = {
    'rf__n_estimators': [5, 10, 50],
    'rf__max_depth': [5, 10, 20, 50],
}

# 5-fold cross-validation over every combination in the grid
grid_cv = GridSearchCV(pipe, grid, cv=5)

# Fit on the full training split; as the last expression this also displays the search object
grid_cv.fit(X_tr, y_tr)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'rf__n_estimators': [1, 5, 10, 50], 'rf__max_depth': [1, 5, 10, 20, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [37]:
# Collect the per-combination cross-validation results into a table and show the top 10.
# Built with the plain dict constructor (insertion-ordered, so column order is kept)
# because pd.DataFrame.from_items was deprecated and later removed from pandas.
cv_results = grid_cv.cv_results_
grid_results = pd.DataFrame({
    'estimators': cv_results['param_rf__n_estimators'],
    'max_depth': cv_results['param_rf__max_depth'],
    # 'mean_test_score' comes from the grid search's CV results, not from X_te
    'mean_te': cv_results['mean_test_score'],
})

grid_results.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,estimators,max_depth,mean_te
19,50,50,1.0
11,50,10,1.0
18,10,50,1.0
17,5,50,1.0
15,50,20,1.0
13,5,20,1.0
14,10,20,0.999836
10,10,10,0.999672
9,5,10,0.999508
12,1,20,0.999179


In [38]:
# Evaluate the fitted grid search on the held-out test split
grid_cv.score(X=X_te, y=y_te)

1.0