# Predict mushroom edibleness

Build a Machine learning model that predicts whether a mushroom is **poisonous** or **edible**.
1. Import modules
2. Explore data
3. Clean data if necessary (NaN values etc.)
4. Split data into train and test sets
5. Choose and train ML model
6. Check model accuracy, precision, recall and f1 scores
7. Perform Hyperparameter tuning if necessary, compare scores
8. Build ML Pipeline

In [20]:
# 1. Imports

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 2. Initial data exploration
df = pd.read_csv('mushrooms.csv')
#print(df.info())
#print(df.head())

# 3. Encode categorical data into numerical for use with ML models
for i in df.columns:
    df[i] = df[i].astype('category')
    df[i] = df[i].cat.codes
#print(df.info())
#print(df.head())
    
# 4.1 Split data into X, y
y = df['class']
X = df.drop('class', axis=1)

# 4.2 Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# 5. Based on data types and number of samples the SVC model is chosen (default value for hyperparameter 'C' is 1)
clf = SVC(C=1) 
clf.fit(X_train, y_train)

# 6. Check model accuracy, precision, recall and f1 scores
y_pred = clf.predict(X_test)

print("Model Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Model Precision Score: ", precision_score(y_test, y_pred))
print("Model Recall Score: ", recall_score(y_test, y_pred))
print("Model F1 Score: ", f1_score(y_test, y_pred))

Model Accuracy Score:  0.9895384615384616
Model Precision Score:  0.9987357774968394
Model Recall Score:  0.9801488833746899
Model F1 Score:  0.9893550407013149


In [21]:
# 7. Hyperparameter tuning - find best value for 'C'
#print(clf.get_params().keys())

parameters = {'C': [x for x in range(1, 50)]}

# 7.1 Create and fit a GridSearchCV model
gs = GridSearchCV(clf, parameters)
gs.fit(X_train, y_train)

# 7.2 Show which hyperparameter performed best
print("Best 'C': ", gs.best_estimator_)

# 7.3 Print the parameters and mean test score
print(gs.cv_results_['params'])
print(gs.cv_results_['mean_test_score'])

# 7.4 Create and print Pandas DataFrame
cv_table = pd.concat([pd.DataFrame(gs.cv_results_['params']), pd.DataFrame(gs.cv_results_['mean_test_score'], columns=['Accuracy'])], axis=1)
print(cv_table)

# Calculate and print accuracy, precision, recall, and f1 scores of the model on best 'C'
acc = gs.score(X_test, y_test)
print(acc)


Best 'C':  SVC(C=10)
[{'C': 1}, {'C': 2}, {'C': 3}, {'C': 4}, {'C': 5}, {'C': 6}, {'C': 7}, {'C': 8}, {'C': 9}, {'C': 10}, {'C': 11}, {'C': 12}, {'C': 13}, {'C': 14}, {'C': 15}, {'C': 16}, {'C': 17}, {'C': 18}, {'C': 19}, {'C': 20}, {'C': 21}, {'C': 22}, {'C': 23}, {'C': 24}, {'C': 25}, {'C': 26}, {'C': 27}, {'C': 28}, {'C': 29}, {'C': 30}, {'C': 31}, {'C': 32}, {'C': 33}, {'C': 34}, {'C': 35}, {'C': 36}, {'C': 37}, {'C': 38}, {'C': 39}, {'C': 40}, {'C': 41}, {'C': 42}, {'C': 43}, {'C': 44}, {'C': 45}, {'C': 46}, {'C': 47}, {'C': 48}, {'C': 49}]
[0.98738319 0.99430722 0.99769219 0.99846142 0.9990768  0.9993845
 0.99969219 0.99984604 0.99984604 1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.   

In [23]:
# 7.5 Fit a new SVC model based on best 'C' (Note: This value is slightly different on each fit of the first model)
clf2 = SVC(C=10) 
clf2.fit(X_train, y_train)

# 7.6 Check model accuracy, precision, recall and f1 scores
y_pred = clf2.predict(X_test)

print("Model Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Model Precision Score: ", precision_score(y_test, y_pred))
print("Model Recall Score: ", recall_score(y_test, y_pred))
print("Model F1 Score: ", f1_score(y_test, y_pred))

Model Accuracy Score:  1.0
Model Precision Score:  1.0
Model Recall Score:  1.0
Model F1 Score:  1.0
