In [1]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Image
from IPython.display import display
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Decision Tree for comparison
from sklearn.tree import DecisionTreeClassifier

# Figure toggle: leave True to regenerate plots; set False to skip plotting
gen_new_plot = True

# Section 1B: Initial comparison of KNN versus the Decision Tree

In [None]:
# Read "clean_Ford.csv" (path relative to the working directory) into the
# DataFrame that every later cell derives its features and targets from.
df = pd.read_csv("clean_Ford.csv")

In [None]:
# Split the frame into predictors (everything except "model") and the class label
X = df.drop(columns="model")
y = df["model"]


<!-- ### Drop attributes or records to fine-tune the KNN model

A KNN model on the full dataset yields a high misclassification rate for certain cars such as the "Tourneo Connect" and the "Fusion" models. The cars with higher misclassification rates will be isolated into a separate KNN model to study the behavior behind those high misclassification rates.

The primary dataset to be examined is stored to `df` and is exported to the "clean1_Ford.csv" file. 

The car models with high misclassification rates are stored in `df2` and exported to the "clean2_Ford.csv" file. -->

---

# Section 2 : Scaling the Data

Scale the dataset. Additionally, multiply the "mpg scaled" attribute by a weight of `mpg_weight` to adjust how strongly this attribute influences the groupings.

In [None]:
# Multiplier applied to the min-max scaled mpg feature
mpg_weight = 7.0

# Column name that the re-weighted mpg feature will carry after adjustment
mpg_col_name = f"mpg scaled * {mpg_weight}"
print(f"mpg_col_name: '{mpg_col_name}'")

In [None]:
# Min-max scale every feature column and rebuild a DataFrame whose column
# names carry a " scaled" suffix.
# NOTE(review): the scaler is fitted on the full dataset before any
# train/test split, so held-out rows influence the scaling — confirm this is
# acceptable for the accuracy figures reported below.
scaler = MinMaxScaler()
columns = X.columns + " scaled"
X_scaled_mpg_x_7_0 = pd.DataFrame(scaler.fit_transform(X), columns=columns)

In [None]:
# Re-weight the scaled mpg column, rename it to `mpg_col_name`, and append it
# as the last column in place of the original "mpg scaled" column.
mpg_scaled = (X_scaled_mpg_x_7_0["mpg scaled"] * mpg_weight).to_frame(name=mpg_col_name)
X_scaled_mpg_x_7_0 = pd.concat(
    [X_scaled_mpg_x_7_0.drop(columns="mpg scaled"), mpg_scaled],
    axis=1,
)
display(X_scaled_mpg_x_7_0.head())

In [None]:
# X_scaled_mpg_x_7_0.to_csv("X_scaled_mpg_x_7.0.csv")

---

# Section 3: Designing the KNN Model

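The scaled features and the targets are already held in memory (the CSV export above is commented out). Convert the target to a flat NumPy array and keep the features and targets in variables; these variables will be used for train-test splitting.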

In [None]:
# Flatten the target attribute to a 1-D NumPy array.
# np.asarray accepts both the original pandas Series and an array that has
# already been converted, so re-running this cell no longer raises
# AttributeError (an ndarray has no .to_numpy()).
y = np.asarray(y).ravel()

### study interactions

In [None]:



if gen_new_plot:
    # Pairwise scatter plots for a hand-picked subset of the scaled features,
    # coloured by car model to show how the classes overlap.
    # NOTE(review): column positions 0-3 and 11 are hard-coded; 11 is
    # presumably the re-weighted mpg column that was appended last — confirm
    # against the actual column order.
    pair_columns = X_scaled_mpg_x_7_0.columns[[0, 1, 2, 3, 11]]

    # LabelEncoder comes from the public sklearn.preprocessing namespace
    # (imported at the top of the notebook) rather than the private `_label`
    # module; it maps each model name to an integer used as the point colour.
    pd.plotting.scatter_matrix(
        X_scaled_mpg_x_7_0[pair_columns],
        c=LabelEncoder().fit_transform(y),
        diagonal='hist',
        hist_kwds={'bins': 20},
        figsize=(11, 8.5),
    )
    plt.show()
else:
    # Plot generation is switched off. The previous placeholder built an
    # Image from an empty string and discarded it, which rendered nothing.
    # To show a saved figure here, call display(Image(filename=...)) with a
    # real path.
    pass

Divide data into training and test sets

In [None]:
# Hold out a test partition of the scaled data; the fixed seed keeps the
# split repeatable across runs.
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled_mpg_x_7_0, y, random_state=0)

In [None]:
# Sanity check: shapes of the training partition, then of the test partition
display(
    X_train_scaled.shape,
    y_train.shape,
    X_test_scaled.shape,
    y_test.shape,
)

--- 

# Section 4: Decision Tree Comparison

Create a decision tree and a KNN from the dataset and compare accuracies of each model

In [None]:
# Baseline classifiers: a seeded decision tree and a KNN with default settings
tree = DecisionTreeClassifier(random_state=0)
knn = KNeighborsClassifier()

# Load the exported scaled features and targets, then split them with the
# same seed used elsewhere in the notebook.
# NOTE(review): "X_scaled.csv" and "y.csv" are not written by any visible
# cell — presumably they come from an earlier run; confirm they exist.
# This assignment also replaces the in-memory split created in Section 3.
X_from_csv = pd.read_csv("X_scaled.csv")
y_from_csv = pd.read_csv("y.csv")
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_from_csv, y_from_csv, random_state=0
)

# Flatten the target frames to 1-D arrays before fitting
y_train, y_test = np.ravel(y_train), np.ravel(y_test)

tree.fit(X_train_scaled, y_train)
# The trailing semicolon stops the notebook from echoing the fitted estimator
knn.fit(X_train_scaled, y_train);

In [None]:
# Report hold-out accuracy for both baseline models
tree_accuracy = tree.score(X_test_scaled, y_test)
knn_accuracy = knn.score(X_test_scaled, y_test)
print("Accuracies on the test set")
print(f"tree accuracy: {tree_accuracy}")
print(f"knn accuracy: {knn_accuracy}")

Create a decision tree and a KNN from the secondary dataset and compare accuracies of each model

---

# Section 5: Cross Validation

Evaluate the accuracy of a 10-fold cross validation for the KNN model.

In [None]:
# Use the weighted, scaled feature frame for cross validation. This binds a
# second name to the same object (no copy is made).
features = X_scaled_mpg_x_7_0

In [None]:
# Wrap the label array in a one-column frame so rows can be dropped by index
target = pd.DataFrame({"model": y})

In [None]:
# Distinct car models, in order of first appearance
models = target["model"].unique().tolist()

In [None]:
# Remove every car model that has 50 or fewer records, from both the target
# frame and the feature frame, and report what was removed.
display(target.shape[0])

rare_row_labels = [np.empty(shape=0, dtype=np.uint16)]
for model in models:
    model_rows = target.index[target.model == model]
    if len(model_rows) <= 50:
        print(model, "- remove", len(model_rows), "record(s)")
        rare_row_labels.append(model_rows)

# One flat array of the row labels to drop (empty when no model is rare)
indexes = np.concatenate(rare_row_labels)

target = target.drop(index=indexes)
features = features.drop(index=indexes)

display(target.shape[0])

In [None]:
# Refresh the list of car models from the current target frame
models = target["model"].unique().tolist()

In [None]:
# Show the car models currently held in `models`
print(models)

In [None]:
# Flatten the one-column target frame to a 1-D NumPy array for scikit-learn.
# np.asarray also accepts an array that has already been converted, so
# re-running this cell no longer raises AttributeError.
target = np.asarray(target).ravel()

In [None]:
# 10-fold cross validation of a KNN classifier with default hyperparameters
knn = KNeighborsClassifier()
cv = cross_val_score(estimator=knn, X=features, y=target, cv=10)

In [None]:
# Mean accuracy across the 10 KNN folds
np.mean(cv)

In [None]:
# Same 10-fold evaluation for a seeded decision tree, for comparison with KNN
tree = DecisionTreeClassifier(random_state=0)
cv_tree = cross_val_score(estimator=tree, X=features, y=target, cv=10)

# Mean accuracy across the folds (shown as the cell output)
np.mean(cv_tree)

---


# Accuracy Assessment on individual targets

In [None]:
# Re-split using the filtered features/targets (rare models removed); the
# fixed seed keeps the partition repeatable.
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(features, target, random_state=0)

In [None]:
# Sweep K from 1 to 14 and record the hold-out accuracy for each value.
# NOTE(review): K is being chosen against the test partition; consider
# selecting it with cross validation on the training data instead.
k_values = range(1, 15)
y_test = y_test.squeeze()

# Collect accuracies in a list and convert once, instead of re-allocating an
# array with np.append on every iteration.
accuracies = []
for n in k_values:
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train_scaled, y_train)
    y1_predict = knn.predict(X_test_scaled)
    accuracies.append(np.mean(y1_predict == y_test))
arr = np.array(accuracies)

# Label the axes and use integer ticks so the figure stands on its own
plt.plot(k_values, arr)
plt.xticks(k_values)
plt.xlabel("K (number of neighbors)")
plt.ylabel("Test-set accuracy")
plt.title("K vs. Accuracy")
plt.show()

In [None]:
# Fit the KNN model with K=3 and score it on the held-out partition
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)
y_predict = knn.predict(X_test_scaled)
display(knn.score(X_test_scaled, y_test))

In [None]:
# True labels of every test record the KNN model predicted incorrectly
misclassified_y = y_test[y_test != y_predict]

# Car models with at least one misclassification, in order of first appearance
names = pd.Series(misclassified_y).unique()

# Per-model error counts and per-model test-set totals, built in one pass
# each rather than grown with np.append inside a loop.
error_counts = np.array(
    [np.sum(misclassified_y == name) for name in names], dtype=float
)
name_counts = np.array(
    [np.sum(y_test == name) for name in names], dtype=float
)

# Named `total_errors` (not `sum`) so the builtin sum() is not shadowed for
# the rest of the kernel session.
total_errors = int(error_counts.sum())
display(names, misclassified_y.shape, total_errors)

# Share of each model's test records that were misclassified
error_percentage = 100 * error_counts / name_counts

# Plot misclassifications
positions = range(error_counts.size)

plt.figure(figsize=(12, 4))
plt.bar(positions, height=error_counts, align='edge', width=0.2)
plt.xticks(positions, names, rotation=70)
plt.ylabel("Misclassified test records")
plt.title("KNN Final Model \nError counts per model")
plt.show()

plt.figure(figsize=(12, 4))
plt.bar(positions, height=error_percentage, align='edge', width=0.2)
plt.xticks(positions, names, rotation=70)
plt.ylabel("Misclassified test records (%)")
plt.title("KNN Final Model \nError percentage per model")
plt.show()

### Assess Accuracy of decision tree for comparison

In [None]:
# Fit a seeded decision tree on the same partition and score it on the
# held-out records for comparison with KNN.
tree = DecisionTreeClassifier(random_state=0).fit(X_train_scaled, y_train)
y_predict = tree.predict(X_test_scaled)
display(tree.score(X_test_scaled, y_test))

In [None]:
# True labels of every test record the decision tree predicted incorrectly
misclassified_y = y_test[y_test != y_predict]

# Car models with at least one misclassification, in order of first appearance
names = pd.Series(misclassified_y).unique()

# Per-model error counts and per-model test-set totals, built in one pass
# each rather than grown with np.append inside a loop.
error_counts = np.array(
    [np.sum(misclassified_y == name) for name in names], dtype=float
)
name_counts = np.array(
    [np.sum(y_test == name) for name in names], dtype=float
)

# Named `total_errors` (not `sum`) so the builtin sum() is not shadowed for
# the rest of the kernel session.
total_errors = int(error_counts.sum())
display(names, misclassified_y.shape, total_errors)

# Share of each model's test records that were misclassified
error_percentage = 100 * error_counts / name_counts

# Plot misclassifications
positions = range(error_counts.size)

plt.figure(figsize=(12, 4))
plt.bar(positions, height=error_counts, align='edge', width=0.2)
plt.xticks(positions, names, rotation=70)
plt.ylabel("Misclassified test records")
plt.title("Decision Tree \nError counts per model")
plt.show()

plt.figure(figsize=(12, 4))
plt.bar(positions, height=error_percentage, align='edge', width=0.2)
plt.xticks(positions, names, rotation=70)
plt.ylabel("Misclassified test records (%)")
plt.title("Decision Tree \nError percentage per model")
plt.show()

### Get accuracy assessment without scaling

In [None]:

# Build the unscaled partitions from exactly the rows used for the scaled
# models, by selecting raw rows with the index labels of the existing scaled
# train/test frames. Splitting `df` afresh would bring back the rare car
# models that were removed before cross validation, so the scaled and
# unscaled KNN results would be measured on different records; it would also
# overwrite the filtered `features` / `target` variables.
raw_features = df.drop(columns="model")
raw_target = df["model"]

X_train = raw_features.loc[X_train_scaled.index]
X_test = raw_features.loc[X_test_scaled.index]
y_train = raw_target.loc[X_train_scaled.index]
y_test = raw_target.loc[X_test_scaled.index]

In [None]:
# KNN with K=3 fitted on the unscaled features, scored on the unscaled test rows
knn2 = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_predict = knn2.predict(X_test)
display(knn2.score(X_test, y_test))

In [None]:
# True labels of every test record the unscaled KNN model predicted incorrectly
misclassified_y = y_test[y_test != y_predict]

# Car models with at least one misclassification, in order of first appearance
names = pd.Series(misclassified_y).unique()

# Per-model error counts and per-model test-set totals, built in one pass
# each rather than grown with np.append inside a loop.
error_counts = np.array(
    [np.sum(misclassified_y == name) for name in names], dtype=float
)
name_counts = np.array(
    [np.sum(y_test == name) for name in names], dtype=float
)

# Named `total_errors` (not `sum`) so the builtin sum() is not shadowed for
# the rest of the kernel session.
total_errors = int(error_counts.sum())
display(names, misclassified_y.shape, total_errors)

# Share of each model's test records that were misclassified
error_percentage = 100 * error_counts / name_counts

# Plot misclassifications
positions = range(error_counts.size)

plt.figure(figsize=(12, 4))
plt.bar(positions, height=error_counts, align='edge', width=0.2)
plt.xticks(positions, names, rotation=70)
plt.ylabel("Misclassified test records")
plt.title("KNN without scaling \nError counts per model")
plt.show()

plt.figure(figsize=(12, 4))
plt.bar(positions, height=error_percentage, align='edge', width=0.2)
plt.xticks(positions, names, rotation=70)
plt.ylabel("Misclassified test records (%)")
plt.title("KNN without scaling \nError percentage per model")
plt.show()

In [None]:
# Show the fitted tree's feature importances, followed by the feature names
# it was trained with (the two arrays are in matching order).
display(
    tree.feature_importances_,
    tree.feature_names_in_,
)