<a href="https://colab.research.google.com/github/cm-int/machine-learning-fundamentals/blob/main/module_2/Democode/Mod_2_Lesson_3_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating a Binary Classification Machine Learning Model

In this demonstration, you’ll see how to create a binary classification model to determine whether mushrooms are poisonous or edible depending on their appearance. You’ll compare the results of several classification models.

This demonstration uses a modified version of the **Mushroom Classification: Safe to eat or deadly poison?** dataset originally donated to the UCI Machine Learning repository. It is available for use under the **CC0: Public Domain** licence.

---

The dataset has the following structure:



> |Variable|Description|
> |--------|-----------|
> |class   |edible=e, poisonous=p|
> |cap-shape|bell=b, conical=c, convex=x, flat=f, knobbed=k, sunken=s|
> |cap-surface|fibrous=f, grooves=g, scaly=y, smooth=s|
> |cap-color|brown=n, buff=b, cinnamon=c, gray=g, green=r, pink=p, purple=u, red=e, white=w, yellow=y|
> |bruises|bruises=t, no=f|
> |odor|almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s|
> |gill-attachment|attached=a, descending=d, free=f, notched=n|
> |gill-spacing|close=c, crowded=w, distant=d|
> |gill-size|broad=b, narrow=n|
> |gill-color|black=k, brown=n, buff=b, chocolate=h, gray=g, green=r, orange=o, pink=p, purple=u, red=e, white=w, yellow=y|
> |stalk-shape|enlarging=e, tapering=t|
> |stalk-root|bulbous=b, club=c, cup=u, equal=e, rhizomorphs=z, rooted=r, missing=?|
> |stalk-surface-above-ring|fibrous=f, scaly=y, silky=k, smooth=s|
> |stalk-surface-below-ring|fibrous=f, scaly=y, silky=k, smooth=s|
> |stalk-color-above-ring|brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y|
> |stalk-color-below-ring|brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y|
> |veil-type|partial=p, universal=u|
> |veil-color|brown=n, orange=o, white=w, yellow=y|
> |ring-number|none=n, one=o, two=t|
> |ring-type|cobwebby=c, evanescent=e, flaring=f, large=l, none=n, pendant=p, sheathing=s, zone=z|
> |spore-print-color|black=k, brown=n, buff=b, chocolate=h, green=r, orange=o, purple=u, white=w, yellow=y|
> |population|abundant=a, clustered=c, numerous=n, scattered=s, several=v, solitary=y|
> |habitat|grasses=g, leaves=l, meadows=m, paths=p, urban=u, waste=w, woods=d|


---


# Prepare the data

In [None]:
# Upload the mushrooms.csv file

!wget 'https://raw.githubusercontent.com/cm-int/machine-learning-fundamentals/main/module_2/Democode/mushrooms.csv'

In [None]:
# Load the mushroom observations from the downloaded CSV file into a DataFrame

import numpy as np
import pandas as pd

mushrooms = pd.read_csv(filepath_or_buffer="mushrooms.csv")
mushrooms

In [None]:
# Replace every categorical column with dummy (indicator) columns,
# e.g. the class column becomes class_e and class_p

mushrooms = pd.get_dummies(data=mushrooms)
mushrooms

In [None]:
# Build the feature matrix: every dummy column except the two derived from the class label

features = mushrooms.drop(columns=['class_e', 'class_p'])
features

In [None]:
# Build the target labels from the edible indicator column:
# a truthy value marks an edible mushroom, a falsy value a poisonous one

predictions = mushrooms.loc[:, 'class_e']

# Visualize the data

In [None]:
import graphviz
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Project the dummy-encoded features onto two dimensions with t-SNE so they can be plotted.
# t-SNE is stochastic, so random_state is fixed (same seed as the train/test split)
# to make the embedding, and therefore the plot, repeatable from run to run.
visual_model = TSNE(learning_rate=100, init='pca', random_state=13)
visual_transformation = visual_model.fit_transform(features)

# Column 0 / column 1 of the embedding become the plot's x / y coordinates
x_data = visual_transformation[:, 0]
y_data = visual_transformation[:, 1]
transformed_data = pd.DataFrame({'x': x_data, 'y': y_data})

In [None]:
# Scatter-plot the 2-D projection, one colour per class:
# target value 0 (poisonous) in red, target value 1 (edible) in blue
plt.figure(figsize=(10, 10))
for class_value, colour, class_label in ((0, 'red', 'poisonous'), (1, 'blue', 'edible')):
    class_points = transformed_data.loc[predictions == class_value]
    plt.scatter(class_points['x'], class_points['y'], c=colour, label=class_label)
plt.legend(loc='lower left')
plt.show()

# Create a decision tree model to classify the data

In [None]:
# Hold back 33% of the observations as a test set; the rest is used for training

from sklearn.model_selection import train_test_split

# A fixed random_state makes the split repeatable from run to run
(features_train,
 features_test,
 predictions_train,
 predictions_test) = train_test_split(features, predictions, test_size=0.33, random_state=13)

In [None]:
# Report the dimensions of the training features and the training labels
train_shape_report = f'features_train: {features_train.shape}\npredictions_train: {predictions_train.shape}'
print(train_shape_report)

In [None]:
# Create a decision tree classifier that limits the depth to 4 levels below the root node.
# random_state is fixed (same seed as the train/test split) because scikit-learn randomly
# permutes the candidate features at each split, so ties between equally good splits
# could otherwise be resolved differently from run to run.

from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=13)

In [None]:
# Fit the decision tree to the training split.
# Assigning the result to _ keeps the fitted estimator from being displayed as the cell's output.

_ = dt_model.fit(X=features_train, y=predictions_train)

In [None]:
# View the decision tree

from sklearn.tree import export_graphviz

# class_names must be listed in ascending order of the target values. The target is the
# class_e indicator, so the first class (0 / False) is "poisonous" and the second
# (1 / True) is "edible". Listing them the other way round mislabels every node.
# feature_names is passed as a plain list, which every scikit-learn version accepts.
view_data = export_graphviz(
    dt_model,
    out_file=None,
    feature_names=list(features.columns),
    class_names=['poisonous', 'edible'],
)
graph = graphviz.Source(view_data)
graph

In [None]:
# Spot-check the model on individual observations from the test data and compare
# each prediction with the real class label of that observation

# First spot check: observation #3 of the test data (row position 2)

sample = features_test.iloc[2:3]  # single-row frame, as predict() expects 2-D input
predicted_class_label = dt_model.predict(sample)[0]
real_class_label = predictions_test.iat[2]

is_prediction_correct = (predicted_class_label == real_class_label)
print(f'Predicted value (edible): {predicted_class_label}')
print(f'Is prediction correct? {is_prediction_correct}')

print(f'Probabilities: {dt_model.predict_proba(sample)[0]}')

In [None]:
# Second spot check: observation #501 of the test data (row position 500)

sample = features_test.iloc[500:501]  # single-row frame, as predict() expects 2-D input
predicted_class_label = dt_model.predict(sample)[0]
real_class_label = predictions_test.iat[500]

is_prediction_correct = (predicted_class_label == real_class_label)
print(f'Predicted value (edible): {predicted_class_label}')
print(f'Is prediction correct? {is_prediction_correct}')

print(f'Probabilities: {dt_model.predict_proba(sample)[0]}')

In [None]:
# Create and train another decision tree classifier with 6 levels below the root node, for comparison.
# Note that this rebinds dt_model, replacing the depth-4 tree created earlier.
# random_state is fixed (same seed as the train/test split) so that ties between equally
# good splits are resolved the same way on every run and the comparison is repeatable.

dt_model = DecisionTreeClassifier(criterion='gini', max_depth=6, random_state=13)
_ = dt_model.fit(features_train, predictions_train)

In [None]:
# Re-run both spot checks against the current decision tree and compare the probabilities

# Row positions 2 and 500 are observations #3 and #501 of the test data
for report_number, position in enumerate((2, 500)):
    if report_number:
        print('\n')  # separator between the two reports

    sample = features_test.iloc[position:position + 1]
    predicted_class_label = dt_model.predict(sample)[0]
    real_class_label = predictions_test.iat[position]

    is_prediction_correct = (predicted_class_label == real_class_label)
    print(f'Predicted value (edible): {predicted_class_label}')
    print(f'Is prediction correct? {is_prediction_correct}')

    print(f'Probabilities: {dt_model.predict_proba(sample)[0]}')

# Create a random forest model to classify the data

In [None]:
# Create and train a random forest of 100 trees.
# A random forest draws bootstrap samples and random feature subsets, so random_state is
# fixed (same seed as the train/test split) to make the fitted model, and the
# probabilities printed later, repeatable from run to run.

from sklearn.ensemble import RandomForestClassifier

forest_model = RandomForestClassifier(n_estimators=100, random_state=13)
_ = forest_model.fit(features_train, predictions_train)

In [None]:
# Run the same two spot checks against the random forest and compare the
# probabilities with those of the decision tree model

# Row positions 2 and 500 are observations #3 and #501 of the test data
for report_number, position in enumerate((2, 500)):
    if report_number:
        print('\n')  # separator between the two reports

    sample = features_test.iloc[position:position + 1]
    predicted_class_label = forest_model.predict(sample)[0]
    real_class_label = predictions_test.iat[position]

    is_prediction_correct = (predicted_class_label == real_class_label)
    print(f'Predicted value (edible): {predicted_class_label}')
    print(f'Is prediction correct? {is_prediction_correct}')

    print(f'Probabilities: {forest_model.predict_proba(sample)[0]}')

# Create a gradient boosted tree model to classify the data

In [None]:
# Create and train a gradient boosted tree model: 100 boosting stages, each a tree of
# depth at most 6, with a learning rate of 0.25.
# random_state is fixed (same seed as the train/test split) because it seeds the individual
# trees and the feature permutation at each split; fixing it makes the run repeatable.

from sklearn.ensemble import GradientBoostingClassifier

gbt_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.25, max_depth=6, random_state=13)
_ = gbt_model.fit(features_train, predictions_train)

In [None]:
# Run the same two spot checks against the gradient boosted trees and compare with the earlier results

# Row positions 2 and 500 are observations #3 and #501 of the test data
for report_number, position in enumerate((2, 500)):
    if report_number:
        print('\n')  # separator between the two reports

    sample = features_test.iloc[position:position + 1]
    predicted_class_label = gbt_model.predict(sample)[0]
    real_class_label = predictions_test.iat[position]

    is_prediction_correct = (predicted_class_label == real_class_label)
    print(f'Predicted value (edible): {predicted_class_label}')
    print(f'Is prediction correct? {is_prediction_correct}')

    print(f'Probabilities: {gbt_model.predict_proba(sample)[0]}')

# Create a logistic regression model to classify the data

In [None]:
# Build a logistic regression classifier (L-BFGS solver, at most 400 iterations,
# stopping tolerance 1e-4) and fit it to the training split

from sklearn.linear_model import LogisticRegression

logit_model = LogisticRegression(
    solver="lbfgs",
    tol=1e-4,
    max_iter=400,
)
_ = logit_model.fit(X=features_train, y=predictions_train)

In [None]:
# Run the same two spot checks against the logistic regression model and compare with the earlier results

# Row positions 2 and 500 are observations #3 and #501 of the test data
for report_number, position in enumerate((2, 500)):
    if report_number:
        print('\n')  # separator between the two reports

    sample = features_test.iloc[position:position + 1]
    predicted_class_label = logit_model.predict(sample)[0]
    real_class_label = predictions_test.iat[position]

    is_prediction_correct = (predicted_class_label == real_class_label)
    print(f'Predicted value (edible): {predicted_class_label}')
    print(f'Is prediction correct? {is_prediction_correct}')

    print(f'Probabilities: {logit_model.predict_proba(sample)[0]}')

# Create a K-Nearest Neighbors model to classify the data

In [None]:
# Build a K-Nearest Neighbors classifier that votes over the 5 closest
# training observations, and fit it to the training split

from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(
    n_neighbors=5,
)
_ = knn_model.fit(X=features_train, y=predictions_train)

In [None]:
# Run the same two spot checks against the K-Nearest Neighbors model and compare with the earlier results

# Row positions 2 and 500 are observations #3 and #501 of the test data
for report_number, position in enumerate((2, 500)):
    if report_number:
        print('\n')  # separator between the two reports

    sample = features_test.iloc[position:position + 1]
    predicted_class_label = knn_model.predict(sample)[0]
    real_class_label = predictions_test.iat[position]

    is_prediction_correct = (predicted_class_label == real_class_label)
    print(f'Predicted value (edible): {predicted_class_label}')
    print(f'Is prediction correct? {is_prediction_correct}')

    print(f'Probabilities: {knn_model.predict_proba(sample)[0]}')