In this notebook, I am building a model to successfully classify mushrooms as either poisonous or edible. 

In [58]:
%matplotlib notebook
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing, cross_validation
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from matplotlib import style
import warnings
warnings.filterwarnings('ignore')

In [2]:
style.use('fivethirtyeight')

Class is the variable we are trying to predict. In machine learning speak, class would be our label and the other variables would be our features. Or in other words class is a class and we are trying to create a model to determine the common attributes of each class.

In [3]:
fungi=pd.read_csv('mushrooms.csv')

fungi.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [4]:
fungi.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [7]:
fungi['veil-type'].unique()

array(['p'], dtype=object)

In [10]:
fungi.drop(['veil-type'],axis=1, inplace=True)
df=pd.get_dummies(fungi, drop_first=False)


In [11]:
df.describe()

Unnamed: 0,class_e,class_p,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
count,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,...,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0
mean,0.517971,0.482029,0.055638,0.000492,0.387986,0.10192,0.003939,0.450025,0.285574,0.000492,...,0.153619,0.497292,0.210734,0.387494,0.264402,0.102413,0.035943,0.140817,0.045298,0.023634
std,0.499708,0.499708,0.229235,0.022185,0.487321,0.302562,0.062641,0.497527,0.451715,0.022185,...,0.360605,0.500023,0.407855,0.487208,0.441041,0.303209,0.186159,0.347854,0.207969,0.151914
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


We are trying to predict class. In order to set up out model we have to convert the data into some form of numerical data. We could convert each variable into a factor but this presents a new problem. If we do so then there will be a false scale. Gills for instance, have 12 different colors in the data. If we labelled these colors 1 through 12 then our model would assign a scale to color where one color has twelve times the magnitude of another, and it is inaccurate to say that color has any magnitude at all. Our model would ultimately still have some predictive power but it is best to avoid injecting inaccuracy into our data when possible. 

In order to convert the data into numeric data, a dummy variable is created for each attribute in the data. This is a binary variable that has a value of 1 if the attribute is present in the observation and 0 if it is not. This creates another problem for the model; there is now a very large number of variables. This will influence model selection but hopefully not bea serious problem.

We are also removing veil-type from the data because all mushrooms have the same veil type. Therefore, veil type has no predictive value. In addition, all of our data is categorical and currently defined by strings. In order to run a model on the data we have to transform it into numerical data. Here we are using the label encoder function to create numerical classes for each variable.

In [29]:
df=df.rename(columns={'class_p': 'poisonous', 'class_e': 'edible'})
df.head()

Unnamed: 0,edible,poisonous,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [44]:
dfgby=df.groupby('edible').sum()
dfgby.head()

Unnamed: 0_level_0,poisonous,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
edible,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3916.0,48.0,4.0,1556.0,600.0,0.0,1708.0,760.0,4.0,1412.0,...,368.0,2848.0,648.0,1268.0,740.0,592.0,36.0,1008.0,272.0,0.0
1,0.0,404.0,0.0,1596.0,228.0,32.0,1948.0,1560.0,0.0,1144.0,...,880.0,1192.0,1064.0,1880.0,1408.0,240.0,256.0,136.0,96.0,192.0


In [25]:
#relative frequency of poisonous to edible for each attribute
colnames=list(df)
poisonous=[]
edible=[]
for i in range(1,len(colnames)):
    poisonous.append(df.loc[df['poisonous']==1,  [colnames[i]]].sum())
    edible.append(df.loc[df['edible']==1,  [colnames[i]]].sum())


In [54]:
colsum=[]
for i in range(1,len(colnames)):
   colsum[i]=sum(df.iloc[:,[i]])


TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [52]:
df.drop('poisonous', axis=1, inplace=True)

In [54]:
df.label=df[['edible']]
df.features=df.drop('edible', axis=1)
df.shape,df.label.shape, df.features.shape

((8124, 117), (8124, 1), (8124, 116))

In [60]:
X_train, X_test, y_train, y_test= train_test_split(df.features, df.label, test_size=.5, random_state=0)

In [61]:
clf=RandomForestClassifier(min_samples_leaf=5, min_samples_split=5)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [62]:
confusion_matrix(y_true=y_test.values, y_pred=clf.predict(X_test))


array([[1949,    3],
       [   0, 2110]])