In [28]:
import pandas as pd

# Load the mapped mushroom data and give the 'Unnamed: 0' column (the name
# pandas assigns to a header-less column) the explicit name 'id'.
df = pd.read_csv('mushrooms_mapped.csv').rename(columns={'Unnamed: 0': 'id'})

### Split the data into train, validation and test sets

In [29]:
# Separate the label column from the remaining (predictor) columns.
target = 'class'
y = df[target]
X = df.drop(columns=[target])

In [38]:
from sklearn.model_selection import train_test_split

# Step 1: hold out a test set. Step 2: split what remains into train and
# validation. Both splits are stratified on the label and share one seed;
# split sizes are left at train_test_split's defaults.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, random_state=42, stratify=y)

X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, random_state=42, stratify=y_trainval)

# Report the shape of every resulting piece.
for label, part in (
    ('X_train shape', X_train),
    ('y_train shape', y_train),
    ('X_val shape', X_val),
    ('y_val shape', y_val),
    ('X_test shape', X_test),
    ('y_test shape', y_test),
):
    print(label, part.shape)

X_train shape (4569, 23)
y_train shape (4569,)
X_val shape (1524, 23)
y_val shape (1524,)
X_test shape (2031, 23)
y_test shape (2031,)


In [39]:
# Keep each split's 'id' column before the feature selection below drops it,
# so predicted results can later be compared with the actual rows.
train_id, val_id, test_id = (
    split['id'] for split in (X_train, X_val, X_test)
)

In [40]:
target = 'class'
features = [
    'spore_print_color',
    'gill_size',
    'gill_attachment',
    'population',
    'habitat',
    'gill_spacing',
    'gill_color',
]

# Narrow every split to the modelling features only (this drops 'id' and
# all other columns not listed above).
X_train, X_val, X_test = (
    split[features] for split in (X_train, X_val, X_test)
)

In [41]:
import category_encoders as ce
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

# Two-step model: ordinal-encode the categorical features, then classify
# with a 300-tree random forest (seeded for reproducibility).
encoder = ce.OrdinalEncoder()
forest = RandomForestClassifier(n_estimators=300, random_state=42)
pipeline = make_pipeline(encoder, forest)

# 5-fold cross-validated ROC AUC on the training split only.
cross_val_score(estimator=pipeline, X=X_train, y=y_train, scoring='roc_auc', cv=5)

array([0.99988997, 0.99981104, 0.99995195, 0.99998559, 0.999803  ])

In [43]:
# Fit the encoder and the forest on the training split.
pipeline.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['spore_print_color', 'gill_size',
                                      'gill_attachment', 'population',
                                      'habitat', 'gill_spacing', 'gill_color'],
                                drop_invariant=False, handle_missing='value',
                                handle_unknown='value',
                                mapping=[{'col': 'spore_print_color',
                                          'data_type': dtype('O'),
                                          'mapping': brown        1
chocolate    2
white        3
black        4
green        5
orange       6
purp...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                           

In [44]:
# Score of the fitted pipeline on the validation split.
pipeline.score(X=X_val, y=y_val)


0.989501312335958

In [66]:
from sklearn.metrics import roc_auc_score
# ROC AUC on the *validation* split.
# Column `class_index` of predict_proba holds the predicted probability for
# pipeline.classes_[class_index].
class_index = 1
y_pred_proba = pipeline.predict_proba(X_val)[:, class_index]
# The data scored here is X_val / y_val, so the header says "Validation";
# the held-out test split is reported separately.
print(f'Validation ROC AUC for class {class_index}:')
print(roc_auc_score(y_val, y_pred_proba)) # Ranges from 0-1, higher is better

Test ROC AUC for class 1:
0.99983532069355


In [45]:
# Score of the fitted pipeline on the held-out test split.
pipeline.score(X=X_test, y=y_test)

0.9940915805022157

In [67]:
# ROC AUC on the held-out test split, using the predicted probability in
# column `class_index` of predict_proba.
class_index = 1
y_pred_proba = pipeline.predict_proba(X_test)[:, class_index]
print(f'Test ROC AUC for class {class_index}:')
# Ranges from 0-1, higher is better
print(roc_auc_score(y_true=y_test, y_score=y_pred_proba))

Test ROC AUC for class 1:
0.9999179538366534


In [71]:
def predict(
    spore_print_color,
    gill_size,
    gill_attachment,
    population,
    habitat,
    gill_spacing,
    gill_color
):
    """Classify a single mushroom and describe the result in one sentence.

    Each argument is the value for the feature column of the same name.
    The values are placed in a one-row DataFrame and passed to the
    module-level ``pipeline``.

    Returns:
        A string containing the predicted class and, formatted to two
        decimals, the probability taken from column 1 of ``predict_proba``,
        which the message reports as the probability of being poisonous.
    """
    # Feature name -> supplied value, in the column order the frame will use.
    inputs = {
        'spore_print_color': spore_print_color,
        'gill_size': gill_size,
        'gill_attachment': gill_attachment,
        'population': population,
        'habitat': habitat,
        'gill_spacing': gill_spacing,
        'gill_color': gill_color,
    }
    row = pd.DataFrame([list(inputs.values())], columns=list(inputs))

    # Predicted class label for the single row.
    y_pred = pipeline.predict(row)[0]

    # NOTE(review): column 1 is assumed to be the "poisonous" class --
    # confirm against pipeline.classes_.
    class_index = 1
    y_pred_proba = pipeline.predict_proba(row)[0, class_index]

    return f'Your mushroom is {y_pred}. There is a {y_pred_proba:.2f} probability that the mushroom is poisonous'

In [72]:
# Example call; keyword arguments make the feature/value pairing explicit.
predict(
    spore_print_color='chocolate',
    gill_size='narrow',
    gill_attachment='descending',
    population='abundant',
    habitat='grasses',
    gill_spacing='crowded',
    gill_color='brown',
)

'Your mushroom is edible. There is a 0.30 probability that the mushroom is poisonous'

In [68]:
from joblib import dump
# Persist the fitted pipeline to disk (compressed).
dump(value=pipeline, filename='pipeline.joblib', compress=True)

['pipeline.joblib']

In [24]:
import joblib
import sklearn
import category_encoders as ce
# Print the library versions in requirements-file ("name==version") form,
# one per line.
print('\n'.join([
    f'joblib=={joblib.__version__}',
    f'scikit-learn=={sklearn.__version__}',
    f'category_encoders=={ce.__version__}',
]))

joblib==0.13.2
scikit-learn==0.21.3
category_encoders==2.1.0
