# Predict mushroom edibleness

Predict whether a mushroom is **poisonous** or **edible**.
1. Import modules
2. Explore data
3. Clean data if necessary (NaN values etc.)
4. Split data into train and test sets
5. Choose and train ML model
6. Check model accuracy, precision, recall and f1 scores
7. Perform Hyperparameter tuning if necessary, compare scores
8. Build ML Pipeline

In [17]:
# 1. Imports

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 2. Initial data exploration
df = pd.read_csv('mushrooms.csv')
#print(df.info())
#print(df.head())

# 3. Encode categorical data into numerical for use with ML models
for i in df.columns:
    df[i] = df[i].astype('category')
    df[i] = df[i].cat.codes
#print(df.info())
#print(df.head())
    
# 4.1 Split data into X, y
y = df['class']
X = df.drop('class', axis=1)

# 4.2 Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# 5. Based on data types and number of samples the SVC model is chosen (default value for hyperparameter 'C' is 1)
clf = SVC(C=1) 
clf.fit(X_train, y_train)

# 6. Check model accuracy, precision, recall and f1 scores
y_pred = clf.predict(X_test)

print("Model Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Model Precision Score: ", precision_score(y_test, y_pred))
print("Model Recall Score: ", recall_score(y_test, y_pred))
print("Model F1 Score: ", f1_score(y_test, y_pred))

Model Accuracy Score:  1.0
Model Precision Score:  1.0
Model Recall Score:  1.0
Model F1 Score:  1.0


In [13]:
# 7. Hyperparameter tuning - find best value for 'C'
#print(clf.get_params().keys())

parameters = {'C': [x for x in range(1, 26)]}

# Create and fit a GridSearchCV model
gs = GridSearchCV(clf, parameters)
gs.fit(X_train, y_train)

# Show which hyperparameters performed the best
print(gs.best_estimator_)

# Print the the parameters and mean test score
print(gs.cv_results_['params'])
print(gs.cv_results_['mean_test_score'])

# Create and print Pandas DataFrame
cv_table = pd.concat([pd.DataFrame(gs.cv_results_['params']), pd.DataFrame(gs.cv_results_['mean_test_score'], columns=['Accuracy'])], axis=1)

# This table displays the information more clearly.
#cv_table = df.pivot(index = 'C', columns = 'penalty')
print(cv_table)

# Compute and print the accuracy, precision, recall and f1 scores of the model on best 'C'
acc = gs.score(X_test, y_test)
print(acc)


SVC(C=24)
[{'C': 1}, {'C': 2}, {'C': 3}, {'C': 4}, {'C': 5}, {'C': 6}, {'C': 7}, {'C': 8}, {'C': 9}, {'C': 10}, {'C': 11}, {'C': 12}, {'C': 13}, {'C': 14}, {'C': 15}, {'C': 16}, {'C': 17}, {'C': 18}, {'C': 19}, {'C': 20}, {'C': 21}, {'C': 22}, {'C': 23}, {'C': 24}, {'C': 25}]
[0.9889213  0.99369183 0.99707669 0.99830757 0.99892308 0.99892308
 0.99907692 0.99923077 0.99938462 0.99938462 0.99938462 0.99953846
 0.99953846 0.99953846 0.99953846 0.99953846 0.99953846 0.99953846
 0.99953846 0.99953846 0.99953846 0.99953846 0.99953846 0.99984615
 0.99984615]
     C  Accuracy
0    1  0.988921
1    2  0.993692
2    3  0.997077
3    4  0.998308
4    5  0.998923
5    6  0.998923
6    7  0.999077
7    8  0.999231
8    9  0.999385
9   10  0.999385
10  11  0.999385
11  12  0.999538
12  13  0.999538
13  14  0.999538
14  15  0.999538
15  16  0.999538
16  17  0.999538
17  18  0.999538
18  19  0.999538
19  20  0.999538
20  21  0.999538
21  22  0.999538
22  23  0.999538
23  24  0.999846
24  25  0.999846
