<a href="https://colab.research.google.com/github/cm-int/machine-learning-fundamentals/blob/main/module_2/Labs/Lab2_2_MobilePhonePrices_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 2.2: Creating a Multi-class and Multi-Label Machine Learning Model

In the first part of this lab, you'll create multi-class machine learning models to classify mobile phones into one of four different price brackets according to the features that the phones provide. You'll import, prepare, and visualize the data, and then you'll create machine learning models using the Decision Tree, Logistic Regression, and K-Nearest Neighbors algorithms. You'll compare the results of each of these models.

In the second part of this lab, you'll turn the problem around 180 degrees. You'll build a multi-label model that predicts whether a phone has 4G, WiFi, and Bluetooth capabilities based on its other characteristics, including the price bracket.

# Read the Data

In [None]:
# Upload the mobilephonedata.csv file from Github

!wget 'https://raw.githubusercontent.com/cm-int/machine-learning-fundamentals/main/module_2/Labs/mobilephonedata.csv'

In [None]:
# Read the downloaded CSV file into a Pandas DataFrame named mobilephonedata.
# The bare expression on the last line makes the notebook display the frame.

import numpy as np
import pandas as pd

mobilephonedata = pd.read_csv("mobilephonedata.csv")
mobilephonedata

# Split the Data

In [None]:
# Build phone_features: every column of mobilephonedata except the
# price_range column, which is the value the models will predict

phone_features = mobilephonedata.drop(columns=['price_range'])
phone_features

In [None]:
# Create phone_price, holding only the price_range column.
# Selecting a single column label yields a pandas Series rather than a DataFrame.

phone_price = mobilephonedata['price_range']
phone_price

In [None]:
# Visualize the data to establish whether classification looks feasible.
# t-SNE projects the feature columns down to two dimensions; the points are
# then drawn one price bracket at a time so each bracket gets its own colour.

import graphviz  # not used in this cell; needed later to render the decision tree
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Seed t-SNE so the embedding (and therefore the plot) is repeatable across runs
visual_model = TSNE(learning_rate = 100, init='pca', random_state=13)
visual_transformation = visual_model.fit_transform(phone_features)
x_data = visual_transformation[:, 0]
y_data = visual_transformation[:, 1]
transformed_data = pd.DataFrame({'x':x_data, 'y':y_data})

plt.figure(figsize=(10, 10))
# Loop over the price brackets that actually occur in the data rather than a
# hard-coded range, so no empty scatter series are drawn
for price_class in sorted(phone_price.unique()):
  # Positional boolean mask: does not depend on the two objects sharing an index
  in_class = (phone_price == price_class).to_numpy()
  plt.scatter(transformed_data.loc[in_class, 'x'], transformed_data.loc[in_class, 'y'], label=f'Price {price_class}')

plt.legend(loc="best")
plt.title("t-SNE projection of phone features by price range")
plt.show()

# Use a Decision Tree to classify the data

In [None]:
# Hold back 33% of the rows as a test set. The fixed random_state makes the
# split repeatable. Note that predictions_train / predictions_test hold the
# known price_range values for the training and test rows.

from sklearn.model_selection import train_test_split

features_train, features_test, predictions_train, predictions_test = train_test_split(
    phone_features,
    phone_price,
    test_size=0.33,
    random_state=13,
)
print(f'features_train: {features_train.shape}\npredictions_train: {predictions_train.shape}')

In [None]:
# Create and fit a Decision Tree classifier to the training data.
# random_state is fixed (same seed as the train/test split) so the fitted tree,
# and every metric derived from it, is the same on each run of the notebook.

from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier(random_state=13)
_ = tree_model.fit(features_train, predictions_train)

In [None]:
# Render the fitted decision tree: export it as Graphviz DOT text (out_file=None
# returns the text instead of writing a file) and let the notebook display it

from sklearn.tree import export_graphviz

view_data = export_graphviz(
    tree_model,
    out_file=None,
    feature_names=phone_features.columns,
    class_names=['Price 0', 'Price 1', 'Price 2', 'Price 3'],
)
graph = graphviz.Source(view_data)
graph

In [None]:
# Make predictions using the test data.
# tree_test_results holds the model's predicted price_range for each test row.

tree_test_results = tree_model.predict(features_test)

In [None]:
# Compare the model's predictions with the known values for the first 100 test rows.
# tree_test_results is the model output; predictions_test holds the true
# price_range values produced by the train/test split.

print(f'Predicted: {tree_test_results[0:100]}\n')
print(f'Actual:    {predictions_test[0:100].values}')

In [None]:
# Plot the confusion matrix: true price brackets against the tree's predictions

from sklearn.metrics import ConfusionMatrixDisplay

_ = ConfusionMatrixDisplay.from_predictions(
    y_true=predictions_test,
    y_pred=tree_test_results,
    display_labels=['Price 0', 'Price 1', 'Price 2', 'Price 3'],
)

In [None]:
# Calculate the accuracy, precision, recall, and error count for the model.
# scikit-learn metric functions take (y_true, y_pred) in that order. Accuracy is
# symmetric, but precision and recall are not: passing the arguments the other
# way round reports each class's recall as its precision and vice versa.
# average=None returns one precision/recall value per price bracket.

from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy = accuracy_score(predictions_test, tree_test_results)
precision = precision_score(predictions_test, tree_test_results, average=None)
recall = recall_score(predictions_test, tree_test_results, average=None)

# normalize=False returns the count of correct predictions, so the difference
# from the total number of test rows is the number of misclassified rows
errors = tree_test_results.size - accuracy_score(predictions_test, tree_test_results, normalize=False)

print(f'Accuracy Score: {accuracy}\n')
print(f'Precision Score: {precision}\n')
print(f'Recall Score: {recall}\n')
print(f'Number of classification errors: {errors}')

In [None]:
# Calculate the class probabilities for each test row and show the first 50 rows.
# test_probs is indexed as [row, class] by the ROC plotting cell that follows.

test_probs = tree_model.predict_proba(features_test)
test_probs[0:50]

In [None]:
# One-hot encode the true test labels: the single 0-3 column becomes four
# 0/1 columns (one per price bracket), as needed for per-class ROC curves

from sklearn.preprocessing import label_binarize

num_classes = 4
binarized_predictions_test = label_binarize(predictions_test, classes=list(range(num_classes)))
binarized_predictions_test

In [None]:
# Draw one ROC curve per price bracket (one-vs-rest): column i of the binarized
# true labels is scored against column i of the predicted probabilities

from sklearn.metrics import roc_curve

fpr, tpr = {}, {}
plt.figure(figsize=(10, 10))

for i in range(num_classes):
    class_truth = binarized_predictions_test[:, i]
    class_scores = test_probs[:, i]
    fpr[i], tpr[i], _ = roc_curve(class_truth, class_scores)
    plt.plot(fpr[i], tpr[i], lw=2, label=f'class {i}')

plt.title("ROC Curves")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="best")
plt.show()

# Use Logistic Regression to classify the data

In [None]:
# Repeat the steps as before and compare the results
# Use the same training and test DataFrames
#
# NOTE(review): the features are passed to the 'sag' solver unscaled; the
# scikit-learn documentation says sag/saga converge quickly only when features
# share roughly the same scale — confirm whether scaling was left out on purpose.
# NOTE(review): the multi_class parameter is reported as deprecated in recent
# scikit-learn releases — verify against the version installed for this lab.

from sklearn.linear_model import LogisticRegression

logit_model = LogisticRegression(max_iter=500, solver='sag', multi_class='multinomial', tol=1e-2) # Try with 'multinomial' and 'ovr' and observe any differences in accuracy, precision, and recall
_ = logit_model.fit(features_train, predictions_train)

In [None]:
# Predict the price bracket of every test row, then compare the first 100
# predictions with the known values (predictions_test holds the true labels)

logit_test_results = logit_model.predict(features_test)
print(f'Predicted: {logit_test_results[0:100]}\n')
print(f'Actual:    {predictions_test[0:100].values}')

In [None]:
# Confusion matrix for the logistic regression model: true labels first, predictions second
_ = ConfusionMatrixDisplay.from_predictions(predictions_test, logit_test_results, display_labels=['Price 0', 'Price 1', 'Price 2', 'Price 3'])

In [None]:
# Accuracy, per-class precision and recall, and error count for the logistic
# regression model. scikit-learn metrics take (y_true, y_pred): with the
# arguments reversed, precision and recall would be reported as each other.

accuracy = accuracy_score(predictions_test, logit_test_results)
precision = precision_score(predictions_test, logit_test_results, average=None)
recall = recall_score(predictions_test, logit_test_results, average=None)

# normalize=False gives the count of correct predictions rather than a fraction
errors = logit_test_results.size - accuracy_score(predictions_test, logit_test_results, normalize=False)

print(f'Accuracy Score: {accuracy}\n')
print(f'Precision Score: {precision}\n')
print(f'Recall Score: {recall}\n')
print(f'Number of classification errors: {errors}')

In [None]:
# Class probabilities from the logistic regression model; this overwrites the
# test_probs computed for the previous model. Show the first 50 rows.
test_probs = logit_model.predict_proba(features_test)
test_probs[0:50]

In [None]:
# One-vs-rest ROC curve per price bracket for the logistic regression model
fpr, tpr = {}, {}
plt.figure(figsize=(10, 10))

for i in range(num_classes):
    class_truth = binarized_predictions_test[:, i]
    class_scores = test_probs[:, i]
    fpr[i], tpr[i], _ = roc_curve(class_truth, class_scores)
    plt.plot(fpr[i], tpr[i], lw=2, label=f'class {i}')

plt.title("ROC Curves")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="best")
plt.show()

# Use the K-Nearest Neighbors algorithm to classify the data

In [None]:
# Repeat the steps as before and compare the results.
# Fits a K-Nearest Neighbors classifier (5 neighbors) on the same training split.

from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=5)
_ = knn_model.fit(features_train, predictions_train)

In [None]:
# Predict the price bracket of every test row, then compare the first 100
# predictions with the known values (predictions_test holds the true labels)

knn_test_results = knn_model.predict(features_test)
print(f'Predicted: {knn_test_results[0:100]}\n')
print(f'Actual:    {predictions_test[0:100].values}')

In [None]:
# Confusion matrix for the K-Nearest Neighbors model: true labels first, predictions second
_ = ConfusionMatrixDisplay.from_predictions(predictions_test, knn_test_results, display_labels=['Price 0', 'Price 1', 'Price 2', 'Price 3'])

In [None]:
# Accuracy, per-class precision and recall, and error count for the K-Nearest
# Neighbors model. scikit-learn metrics take (y_true, y_pred): with the
# arguments reversed, precision and recall would be reported as each other.

accuracy = accuracy_score(predictions_test, knn_test_results)
precision = precision_score(predictions_test, knn_test_results, average=None)
recall = recall_score(predictions_test, knn_test_results, average=None)

# normalize=False gives the count of correct predictions rather than a fraction
errors = knn_test_results.size - accuracy_score(predictions_test, knn_test_results, normalize=False)

print(f'Accuracy Score: {accuracy}\n')
print(f'Precision Score: {precision}\n')
print(f'Recall Score: {recall}\n')
print(f'Number of classification errors: {errors}')

In [None]:
# Class probabilities from the K-Nearest Neighbors model; this overwrites the
# test_probs computed for the previous model. Show the first 50 rows.
test_probs = knn_model.predict_proba(features_test)
test_probs[0:50]

In [None]:
# One-vs-rest ROC curve per price bracket for the K-Nearest Neighbors model
fpr, tpr = {}, {}
plt.figure(figsize=(10, 10))

for i in range(num_classes):
    class_truth = binarized_predictions_test[:, i]
    class_scores = test_probs[:, i]
    fpr[i], tpr[i], _ = roc_curve(class_truth, class_scores)
    plt.plot(fpr[i], tpr[i], lw=2, label=f'class {i}')

plt.title("ROC Curves")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="best")
plt.show()

# Perform Multi-label Classification
Predict whether a mobile phone has 4G, WiFi, and Bluetooth capabilities based on its other features and price category.

In [None]:
# Retrieve the original data again.
# Re-reading the CSV gives the multi-label section a fresh mobilephonedata frame;
# the imports repeat those from the first part of the notebook.

import numpy as np
import pandas as pd

mobilephonedata = pd.read_csv("mobilephonedata.csv")
mobilephonedata

In [None]:
# Rebuild phone_features for the multi-label task: every column except the
# three capability columns (blue, four_g, wifi) that will be predicted

phone_features = mobilephonedata.drop(columns=['blue', 'four_g', 'wifi'])
phone_features

In [None]:
# Create the phone_labels DataFrame containing the blue, four_g, and wifi columns.
# These three columns are the labels the multi-label model predicts together.

phone_labels = mobilephonedata[['blue', 'four_g', 'wifi']]
phone_labels

In [None]:
# Hold back 33% of the rows as a test set for the multi-label task; the fixed
# random_state makes the split repeatable

from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = train_test_split(
    phone_features,
    phone_labels,
    test_size=0.33,
    random_state=13,
)
print(f'features_train: {features_train.shape}\nlabels_train: {labels_train.shape}')

In [None]:
# Create a Multi Output Classifier wrapped around a Gradient Boosted Tree to classify the data.
# MultiOutputClassifier fits one copy of the wrapped estimator per label column.
# random_state is fixed (same seed as the train/test split) so the fitted
# models, and the metrics derived from them, are repeatable across runs.

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier

multi_model = MultiOutputClassifier(estimator=GradientBoostingClassifier(random_state=13))
_ = multi_model.fit(features_train, labels_train)

# Show the estimators created for the model. There should be one classifier per label
print(multi_model.estimators_)

In [None]:
# Make test predictions and show them next to the known labels.
# labels_test holds the true blue/four_g/wifi values from the split;
# results holds what the model predicts for the same rows.

print(f'Actual results:\n{labels_test}\n')

results = multi_model.predict(features_test)
print(f'Predicted results:\n{results}')

In [None]:
# Generate the confusion matrices from the predictions
# Note: One confusion matrix per label

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

# Label column -> plot title, in the column order used by results
label_titles = {'blue': 'Bluetooth', 'four_g': '4G', 'wifi': 'WiFi'}

for column_index, (column_name, title) in enumerate(label_titles.items()):
    cm = confusion_matrix(labels_test[column_name], results[:, column_index])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No', 'Yes'])
    disp.plot()
    plt.title(title)
    plt.show()

In [None]:
# Calculate the accuracy, precision, and recall for the model.
# scikit-learn metric functions take (y_true, y_pred) in that order; with the
# arguments reversed, precision and recall would be reported as each other.
# average=None returns one precision/recall value per label column.

from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy = accuracy_score(labels_test, results)
precision = precision_score(labels_test, results, average=None)
recall = recall_score(labels_test, results, average=None)

print(f'Accuracy Score: {accuracy}\n')
print(f'Precision Score: {precision}\n')
print(f'Recall Score: {recall}\n')

In [None]:
# Calculate the probabilities for each prediction.
# For a multi-output model predict_proba returns a list with one probability
# array per label, so slicing the list itself would not limit the rows shown.
# Slice each per-label array to preview its first 50 rows instead.

probabilities = multi_model.predict_proba(features_test)
[label_probabilities[0:50] for label_probabilities in probabilities]

In [None]:
# Generate the ROC curves for each label
# How good is this model?
# probabilities[i] is the probability array for label i; its column 1 is used
# as the score for that label.

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

fpr, tpr = {}, {}
plt.figure(figsize=(10, 10))
num_labels=3

for i in range(num_labels):
    label_truth = labels_test.iloc[:, i]
    positive_scores = probabilities[i][:, 1]
    fpr[i], tpr[i], _ = roc_curve(label_truth, positive_scores)
    auc = roc_auc_score(label_truth, positive_scores)
    plt.plot(fpr[i], tpr[i], lw=2, label=f'Label {phone_labels.columns[i]}: AUC {auc}')

# Diagonal reference line for comparison with the per-label curves
plt.plot((0, 1), (0,1), label="Random Guess", c='red', linewidth=5)
plt.title("ROC Curves")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="best")
plt.show()