In [None]:
import numpy as np
import pandas as pd

# Read the Data

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
mobilephonedata = pd.read_csv(filepath_or_buffer="mobilephonedata.csv")

# Trailing bare expression so the notebook renders the DataFrame.
mobilephonedata

# Split the Data
## `price_range` is the class to predict

In [None]:
# Feature matrix: every column except the target 'price_range'.
phone_features = mobilephonedata.drop(columns=['price_range'])
phone_features

In [None]:
# Target vector: the 'price_range' column on its own.
phone_price = mobilephonedata.loc[:, 'price_range']
phone_price

In [None]:
#Visualize the data

import graphviz
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Embed the feature matrix in 2-D with t-SNE. random_state is fixed so the
# embedding (and therefore the picture) is the same on every run.
visual_model = TSNE(learning_rate=100, init='pca', random_state=13)
visual_transformation = visual_model.fit_transform(phone_features)
x_data = visual_transformation[:, 0]
y_data = visual_transformation[:, 1]
transformed_data = pd.DataFrame({'x': x_data, 'y': y_data})

plt.figure(figsize=(10, 10))
# Draw one scatter per class that is actually present in the target, instead of
# a hard-coded range(0, 9) that issues empty scatter calls for absent classes.
for i in sorted(phone_price.unique()):
    # Positional boolean mask: transformed_data has a fresh RangeIndex, so this
    # avoids depending on phone_price's index labels lining up with it.
    class_mask = (phone_price == i).to_numpy()
    plt.scatter(
        transformed_data.loc[class_mask, 'x'],
        transformed_data.loc[class_mask, 'y'],
        label=f'Price {i}',
    )

plt.xlabel("t-SNE component 1")
plt.ylabel("t-SNE component 2")
plt.title("t-SNE projection of phone features by price range")
plt.legend(loc="best")
plt.show()

# Use a Decision Tree to classify the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
features_train, features_test, predictions_train, predictions_test = train_test_split(
    phone_features,
    phone_price,
    test_size=0.33,
    random_state=13,
)
print(f'features_train: {features_train.shape}\npredictions_train: {predictions_train.shape}')

In [None]:
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed so that the fitted tree, and every metric and plot
# derived from it below, is identical on each Restart & Run All.
tree_model = DecisionTreeClassifier(random_state=13)
# fit() returns the estimator; binding it to `_` keeps its repr out of the cell output.
_ = tree_model.fit(features_train, predictions_train)

In [None]:
# Examine the decision tree

from sklearn.tree import export_graphviz

view_data = export_graphviz(
    tree_model,
    out_file=None,  # return the DOT source as a string rather than writing a file
    # Pass a plain list: some scikit-learn releases reject a pandas Index here.
    feature_names=list(phone_features.columns),
    class_names=['Price 0', 'Price 1', 'Price 2', 'Price 3'],
)
# Wrap the DOT source so the notebook renders it as a graph.
graph = graphviz.Source(view_data)
graph

In [None]:
# Predict a price class for every held-out row with the fitted tree.
tree_test_results = tree_model.predict(X=features_test)

In [None]:
# Compare the model's predictions with the true labels (first 100 test rows).
# predictions_test holds the ground-truth price_range values from the split,
# so it is printed as "Actual"; the model output is printed as "Predicted".

print(f'Predicted: {tree_test_results[:100]}\n')
print(f'Actual: {predictions_test.iloc[:100].values}')

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# True labels first, decision-tree predictions second.
_ = ConfusionMatrixDisplay.from_predictions(
    y_true=predictions_test,
    y_pred=tree_test_results,
    display_labels=['Price 0', 'Price 1', 'Price 2', 'Price 3'],
)

In [None]:
from sklearn.metrics import accuracy_score

# accuracy_score is documented as (y_true, y_pred): true labels go first.
accuracy = accuracy_score(predictions_test, tree_test_results)
# normalize=False yields the number of correct predictions; subtracting it from
# the number of test samples gives the number of misclassifications.
errors = len(predictions_test) - accuracy_score(predictions_test, tree_test_results, normalize=False)

print(f'Accuracy Score: {accuracy}\n')
print(f'Number of classification errors: {errors}')

In [None]:
# Per-class probabilities from the tree for each test row.
test_probs = tree_model.predict_proba(X=features_test)
# Show only the first 50 rows.
test_probs[:50]

In [None]:
# One-hot encode the true test labels: the single column holding 0-3 becomes
# 4 indicator columns holding 0 or 1, one column per class.

from sklearn.preprocessing import label_binarize

num_classes = 4
binarized_predictions_test = label_binarize(predictions_test, classes=list(range(num_classes)))
binarized_predictions_test

In [None]:
# Plot the ROC curve for each class

from sklearn.metrics import roc_curve

fpr, tpr = {}, {}
fig, ax = plt.subplots(figsize=(10, 10))

# One curve per class: column i of the binarized true labels against
# column i of the predicted probabilities.
for i in range(num_classes):
    fpr[i], tpr[i], _ = roc_curve(binarized_predictions_test[:, i], test_probs[:, i])
    ax.plot(fpr[i], tpr[i], lw=2, label=f'class {i}')

ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="best")
ax.set_title("ROC Curves")
plt.show()

# Use Logistic Regression to classify the data

In [None]:
from sklearn.linear_model import LogisticRegression

# 'sag' is a stochastic solver, so random_state is fixed to make the fit repeatable.
# multi_class is not passed: it is deprecated since scikit-learn 1.5, and with
# this solver and a 4-class target the default already fits a multinomial model.
# NOTE(review): the features are unscaled here, so 'sag' may stop at max_iter
# with a convergence warning; presumably this is the baseline for the scaled run.
logit_model = LogisticRegression(max_iter=500, solver='sag', random_state=13)
_ = logit_model.fit(features_train, predictions_train)

In [None]:
# Predict on the unscaled test features and show the first 100 predictions
# next to the first 100 true labels. predictions_test is the ground truth, so
# it is printed as "Actual".
logit_test_results = logit_model.predict(features_test)
print(f'Predicted: {logit_test_results[:100]}\n')
print(f'Actual: {predictions_test.iloc[:100].values}')

In [None]:
# True labels first, logistic-regression predictions second.
_ = ConfusionMatrixDisplay.from_predictions(
    y_true=predictions_test,
    y_pred=logit_test_results,
    display_labels=['Price 0', 'Price 1', 'Price 2', 'Price 3'],
)

In [None]:
# accuracy_score is documented as (y_true, y_pred): true labels go first.
accuracy = accuracy_score(predictions_test, logit_test_results)
# normalize=False yields the number of correct predictions; the remainder of
# the test set is the number of misclassifications.
errors = len(predictions_test) - accuracy_score(predictions_test, logit_test_results, normalize=False)

print(f'Accuracy Score: {accuracy}\n')
print(f'Number of classification errors: {errors}')

In [None]:
# Per-class probabilities from the logistic model for each test row
# (this rebinds test_probs, which the ROC cell below reads).
test_probs = logit_model.predict_proba(X=features_test)
# Show only the first 50 rows.
test_probs[:50]

In [None]:
# ROC curves for the current contents of test_probs, one curve per class.
fpr, tpr = {}, {}
fig, ax = plt.subplots(figsize=(10, 10))

# Column i of the binarized true labels against column i of the probabilities.
for i in range(num_classes):
    fpr[i], tpr[i], _ = roc_curve(binarized_predictions_test[:, i], test_probs[:, i])
    ax.plot(fpr[i], tpr[i], lw=2, label=f'class {i}')

ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="best")
ax.set_title("ROC Curves")
plt.show()

# Use Logistic Regression with scaled data

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Learn each feature's min/max from the training data only, then apply that
# same mapping to the test data. Refitting on the test set (fit_transform)
# would scale train and test with different ranges and leak test statistics.
# Neither call modifies its input, so no defensive .copy() is needed.
scaled_features_train = scaler.fit_transform(features_train)
scaled_features_test = scaler.transform(features_test)
scaled_features_test

In [None]:
# Refit logistic regression, this time on the min-max scaled features.
# 'sag' is a stochastic solver, so random_state is fixed to make the fit repeatable.
# multi_class is not passed: it is deprecated since scikit-learn 1.5, and with
# this solver and a 4-class target the default already fits a multinomial model.
logit_model = LogisticRegression(max_iter=500, solver="sag", random_state=13)
_ = logit_model.fit(scaled_features_train, predictions_train)

In [None]:
# Predict on the scaled test features and show the first 100 predictions next
# to the first 100 true labels. predictions_test is the ground truth, so it is
# printed as "Actual".
logit_test_results = logit_model.predict(scaled_features_test)
print(f'Predicted: {logit_test_results[:100]}\n')
print(f'Actual: {predictions_test.iloc[:100].values}')

In [None]:
# True labels first, scaled-data logistic-regression predictions second.
_ = ConfusionMatrixDisplay.from_predictions(
    y_true=predictions_test,
    y_pred=logit_test_results,
    display_labels=['Price 0', 'Price 1', 'Price 2', 'Price 3'],
)

In [None]:
# accuracy_score is documented as (y_true, y_pred): true labels go first.
accuracy = accuracy_score(predictions_test, logit_test_results)
# normalize=False yields the number of correct predictions; the remainder of
# the test set is the number of misclassifications.
errors = len(predictions_test) - accuracy_score(predictions_test, logit_test_results, normalize=False)

print(f'Accuracy Score: {accuracy}\n')
print(f'Number of classification errors: {errors}')

In [None]:
# Per-class probabilities from the scaled-data logistic model
# (this rebinds test_probs, which the ROC cell below reads).
test_probs = logit_model.predict_proba(X=scaled_features_test)
# Show only the first 50 rows.
test_probs[:50]

In [None]:
# ROC curves for the current contents of test_probs, one curve per class.
fpr, tpr = {}, {}
fig, ax = plt.subplots(figsize=(10, 10))

# Column i of the binarized true labels against column i of the probabilities.
for i in range(num_classes):
    fpr[i], tpr[i], _ = roc_curve(binarized_predictions_test[:, i], test_probs[:, i])
    ax.plot(fpr[i], tpr[i], lw=2, label=f'class {i}')

ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="best")
ax.set_title("ROC Curves")
plt.show()

# Use the K-Nearest Neighbours algorithm to classify the data

In [None]:
# Baseline KNN: fit on the original, unscaled training features.

from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=5)
# Bind the returned estimator to `_` so its repr is not echoed by the cell.
_ = knn_model.fit(X=features_train, y=predictions_train)

In [None]:
# Predict on the unscaled test features and show the first 100 predictions
# next to the first 100 true labels. predictions_test is the ground truth, so
# it is printed as "Actual".
knn_test_results = knn_model.predict(features_test)
print(f'Predicted: {knn_test_results[:100]}\n')
print(f'Actual: {predictions_test.iloc[:100].values}')

In [None]:
# True labels first, KNN predictions second.
_ = ConfusionMatrixDisplay.from_predictions(
    y_true=predictions_test,
    y_pred=knn_test_results,
    display_labels=['Price 0', 'Price 1', 'Price 2', 'Price 3'],
)

In [None]:
# accuracy_score is documented as (y_true, y_pred): true labels go first.
accuracy = accuracy_score(predictions_test, knn_test_results)
# Size the error count from the test labels being scored here, not from the
# unrelated logit_test_results array left over from the logistic-regression cells.
# normalize=False yields the number of correct predictions.
errors = len(predictions_test) - accuracy_score(predictions_test, knn_test_results, normalize=False)

print(f'Accuracy Score: {accuracy}\n')
print(f'Number of classification errors: {errors}')

In [None]:
# Per-class probabilities from the unscaled-data KNN model
# (this rebinds test_probs, which the ROC cell below reads).
test_probs = knn_model.predict_proba(X=features_test)
# Show only the first 50 rows.
test_probs[:50]

In [None]:
# ROC curves for the current contents of test_probs, one curve per class.
fpr, tpr = {}, {}
fig, ax = plt.subplots(figsize=(10, 10))

# Column i of the binarized true labels against column i of the probabilities.
for i in range(num_classes):
    fpr[i], tpr[i], _ = roc_curve(binarized_predictions_test[:, i], test_probs[:, i])
    ax.plot(fpr[i], tpr[i], lw=2, label=f'class {i}')

ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="best")
ax.set_title("ROC Curves")
plt.show()

In [None]:
# Repeat KNN with the same n_neighbors, this time fitting on the scaled features.

knn_model = KNeighborsClassifier(n_neighbors=5)
# Bind the returned estimator to `_` so its repr is not echoed by the cell.
_ = knn_model.fit(X=scaled_features_train, y=predictions_train)

In [None]:
# Predict on the scaled test features and show the first 100 predictions next
# to the first 100 true labels. predictions_test is the ground truth, so it is
# printed as "Actual".
knn_test_results = knn_model.predict(scaled_features_test)
print(f'Predicted: {knn_test_results[:100]}\n')
print(f'Actual: {predictions_test.iloc[:100].values}')

In [None]:
# True labels first, scaled-data KNN predictions second.
_ = ConfusionMatrixDisplay.from_predictions(
    y_true=predictions_test,
    y_pred=knn_test_results,
    display_labels=['Price 0', 'Price 1', 'Price 2', 'Price 3'],
)

In [None]:
# accuracy_score is documented as (y_true, y_pred): true labels go first.
accuracy = accuracy_score(predictions_test, knn_test_results)
# Size the error count from the test labels being scored here, not from the
# unrelated logit_test_results array left over from the logistic-regression cells.
# normalize=False yields the number of correct predictions.
errors = len(predictions_test) - accuracy_score(predictions_test, knn_test_results, normalize=False)

print(f'Accuracy Score: {accuracy}\n')
print(f'Number of classification errors: {errors}')

In [None]:
# Per-class probabilities from the scaled-data KNN model
# (this rebinds test_probs, which the ROC cell below reads).
test_probs = knn_model.predict_proba(X=scaled_features_test)
# Show only the first 50 rows.
test_probs[:50]

In [None]:
# ROC curves for the current contents of test_probs, one curve per class.
fpr, tpr = {}, {}
fig, ax = plt.subplots(figsize=(10, 10))

# Column i of the binarized true labels against column i of the probabilities.
for i in range(num_classes):
    fpr[i], tpr[i], _ = roc_curve(binarized_predictions_test[:, i], test_probs[:, i])
    ax.plot(fpr[i], tpr[i], lw=2, label=f'class {i}')

ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="best")
ax.set_title("ROC Curves")
plt.show()