In [12]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression


eat = {'number': {'e': 0, 'p':1}, 'name': {'e': 'Edible', 'p': 'Poisonous'}}

smell = {'number': {'a' : 0,'l' : 1,'c' : 2,'y' : 3,'f' : 4,'m' : 5,'n' : 6,'p' : 7,'s' : 8},
             'name': {'a' : 'Almond','l' : 'Anise','c' : 'Creosote','y' : 'Fishy',
                      'f' : 'Foul','m' : 'Musty','n' : 'None','p' : 'Pungent','s' : 'Spicy'}}

color = {'number': {'k':0,'n':1,'b':2,'h':3,'g':4,'r':5,
         'o':6,'p':7,'u':8,'e':9,'w':10,'y':11},
             'name': {'k':'Black','n':'Brown','b':'Buff','h':'Chocolate','g':'Gray','r':'Green',
         'o':'Orange','p':'Pink','u':'Purple','e':'Red','w':'White','y':'Yellow'}}

cols = ['Class_Type', 'Odor', 'Gill_Color']
colname = ['Class_Name', 'Odor_Name', 'Gill_Color_Name']

#shroom = pd.read_table('/Users/Maureen/Desktop/GitHub/IS362/mushroom.txt', delimiter=',', header=None, usecols=[0,5,9])
shroom = pd.read_table('https://raw.githubusercontent.com/moshun8/IS362/master/mushroom.txt', delimiter=',', header=None, usecols=[0,5,9])
shroom.columns = cols

shroom.Class_Type = shroom.Class_Type.map(eat['number'])
shroom.Odor = shroom.Odor.map(smell['number'])
shroom.Gill_Color = shroom.Gill_Color.map(color['number'])
shroom.head()

Unnamed: 0,Class_Type,Odor,Gill_Color
0,1,7,0
1,0,0,0
2,0,1,1
3,1,7,1
4,0,6,0


Set the X and y variables.
Use get_dummies to put the categorical data into separate columns

In [13]:
features = ['Odor', 'Gill_Color']
X = pd.get_dummies(data=shroom, columns=features)
y = X['Class_Type']
X.head()

Unnamed: 0,Class_Type,Odor_0,Odor_1,Odor_2,Odor_3,Odor_4,Odor_5,Odor_6,Odor_7,Odor_8,...,Gill_Color_2,Gill_Color_3,Gill_Color_4,Gill_Color_5,Gill_Color_6,Gill_Color_7,Gill_Color_8,Gill_Color_9,Gill_Color_10,Gill_Color_11
0,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Instantiate the train, test, split model. It automatically puts 25% into the train group.

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(6093, 22)
(6093,)
(2031, 22)
(2031,)


Train the model

In [15]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)
print({linreg.intercept_: linreg.coef_})

{-1.7266966376318813: array([ 1.        ,  1.45679157,  1.45679157,  1.45679157,  1.45679157,
        1.45679157,  1.45679157,  1.45679157,  1.45679157,  1.45679157,
        0.26990507,  0.26990507,  0.26990507,  0.26990507,  0.26990507,
        0.26990507,  0.26990507,  0.26990507,  0.26990507,  0.26990507,
        0.26990507,  0.26990507])}


See how well the model can predict with the mean squared error.

In [16]:
y_pred = linreg.predict(X_test)
print(metrics.mean_squared_error(y_test, y_pred))

3.72152706035e-30


Those numbers are very reliable, but I figured I'd try SVM to see if this test is any better.

In [17]:
from sklearn import svm

clf = svm.SVC(probability=True, random_state=0)
clf.fit(X,y)
cross_val_score(clf, X, y, scoring='mean_squared_error')
#Valid options are ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 
#'f1_samples', 'f1_weighted', 'log_loss', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 
#'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 
#'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']

array([-0., -0., -0.])

These are also really reliable numbers. How I'll check if Odor or Gill Color is a better predictor. I'll change the X to the get dummies for Odor and Gill Color separately. First Odor

In [18]:
X = pd.get_dummies(data=shroom, columns=['Odor'])
y = X['Class_Type']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)

print((metrics.mean_squared_error(y_test, y_pred)))

1.95397756165e-29


In [19]:
clf = svm.SVC(probability=True, random_state=0)
clf.fit(X,y)
cross_val_score(clf, X, y, scoring='mean_squared_error')

array([-0., -0., -0.])

Now Gill Color

In [20]:
X = pd.get_dummies(data=shroom, columns=['Gill_Color'])
y = X['Class_Type']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)
print((metrics.mean_squared_error(y_test, y_pred)))

2.77748402544e-29


In [21]:
clf = svm.SVC(probability=True, random_state=0)
clf.fit(X,y)
cross_val_score(clf, X, y, scoring='mean_squared_error')

array([-0., -0., -0.])

It looks like Gill Color is slightly better than Odor at predicting Edible/Poisonous. However, statistically speaking they're both great. Using them together would probably make 99.99% correct predictions.