In [1]:
# Dan Collins - IS362 - Project 4 - Mushroom DataSet

In [5]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [6]:
# Project folder the notebook switches into before loading the dataset.
# This is an absolute path specific to one Windows machine; edit it before running elsewhere.
new_directory = r'C:\Users\dmcol\Documents\GitWork\IS362-Project4'

In [7]:
# Move into the project folder so the relative dataset path used when loading resolves.
# The folder is specific to one machine, so when it does not exist keep the current
# working directory (and say so) instead of failing before the data is even read.
if os.path.isdir(new_directory):
    os.chdir(new_directory)
else:
    print(f"Directory not found, staying in {os.getcwd()}: {new_directory}")

In [10]:
# Load the dataset file extracted from the downloaded mushroom.zip archive.
# header=None: the file's first row is read as data, so columns get integer labels (0, 1, 2, ...).
data = pd.read_csv('agaricus-lepiota.data', header=None)

In [11]:
# Keep three of the raw columns by position: 0 holds Edibility, 1 holds Cap-Shape, 5 holds Odor.
# Cap-Shape and Odor serve as the predictor columns for Project 4.
selected_positions = [0, 1, 5]
subset_data = data.iloc[:, selected_positions].copy()

In [12]:
# Rename the three selected columns (in the order they were selected) to descriptive labels
subset_data.columns = ['Edibility', 'CapShape', 'Odor']

In [13]:
# Lookup tables that translate each single-letter category code into its numeric value
edibility_map = dict(e=1, p=2, u=3)
capshape_map = dict(b=1, c=2, x=3, f=4, k=5, s=6)
odor_map = dict(a=1, l=2, c=3, y=4, f=5, m=6, n=7, p=8, s=9)

In [23]:
# NOTE: I understand the preferred way to include a multi-line comment in Python is to use triple quotes """,
# but I used only the # for better readability in Jupyter

# Below show the numeric mappings for each column's data values:
#
#Edibility has edible, poisonous, or unknown edibility
#	1 = Edible
#	2 = Poisonous
#	3 = Unknown Edibility
#    
#Cap-Shape has bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
#	1 = bell
#	2 = conical
#	3 = convex
#	4 = flat
#	5 = knobbed
#	6 = sunken
#    
#Odor has almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s
#	1 = almond
#	2 = anise
#	3 = creosote
#	4 = fishy
#	5 = foul
#	6 = musty
#	7 = none
#	8 = pungent
#	9 = spicy

In [14]:
# Do the mapping: replace each single-letter code with its numeric value.
# Plain column assignment is used instead of ``.loc[:, col] = ...`` because pandas >= 2.0
# writes ``.loc[:, col]`` values into the existing object-dtype column in place. The labels
# would then stay ``object`` dtype, and scikit-learn rejects such a target with
# "Unknown label type". Column assignment replaces the column, so it becomes numeric.
subset_data['Edibility'] = subset_data['Edibility'].map(edibility_map)
subset_data['CapShape'] = subset_data['CapShape'].map(capshape_map)
subset_data['Odor'] = subset_data['Odor'].map(odor_map)

# A code missing from its dictionary maps to NaN (this also happens if the cell is run twice);
# stop here with a clear message rather than failing later inside the model.
assert subset_data.notna().all().all(), "unmapped category code found in subset_data"

In [15]:
# One-hot encode the two predictor columns; drop_first=True omits the first level of each
# so the remaining indicator columns are not redundant
predictor_columns = ['CapShape', 'Odor']
dummy_data = pd.get_dummies(subset_data, columns=predictor_columns, drop_first=True)

In [16]:
# Separate the target variable (y) from the feature columns (X)
y = dummy_data['Edibility']
X = dummy_data.drop(columns='Edibility')

In [17]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Initialize the classifier; random_state=42 fixes the seed so repeated runs build the same forest
clf = RandomForestClassifier(random_state=42)

In [21]:
# Fit the classifier on the training data.
# The call is left as the cell's last expression, so the fitted estimator is shown as the cell output.
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [22]:
# Predict Edibility labels for the held-out test rows
y_pred = clf.predict(X_test)

In [23]:
# Score the predictions: the fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy of the model: {accuracy}")

Accuracy of the model: 0.9852307692307692


In [28]:
# The reported accuracy of the model, at approximately 98.52%, is quite high.

# This indicates that the model is highly accurate when predicting whether a mushroom is edible or poisonous based
# on the mushroom's CapShape and Odor.

# This suggests that CapShape and Odor, taken together, are strong predictors of a mushroom's Edibility.

# My recommendation would be to include additional columns from the dataset in the analysis, to see whether they improve the
# accuracy and make the model a better predictor of a mushroom's Edibility.