# Section 1: Cleaning the Data

In [1]:
# Imports
import pandas as pd
from IPython.display import display
# Import datatypes for parameter type specification
from sklearn.preprocessing import StandardScaler

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Produce knn boundary graphs
from sklearn.inspection import DecisionBoundaryDisplay
import seaborn as sns

# Import datatypes for parameter type specification
from pandas.core import frame
from sklearn.preprocessing._label import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

Load the dataset from a file to a dataframe. Drop any attributes with irrelevant data.

In [None]:
# Read the raw listings and discard the column treated as unnecessary ("tax")
df = pd.read_csv("./ford.csv").drop(columns=["tax"])

# Quick check: (rows, columns), then a preview of the first records
print(df.shape)
display(df.head())

In [None]:
# Distinct values found in each of the three categorical columns
models = df["model"].unique()
fuelTypes = df["fuelType"].unique()
transmissions = df["transmission"].unique()

# Print every list of values under its own heading
print(
    "Models\n", models,
    "\n\nFuel Types\n", fuelTypes,
    "\n\nTransmissions\n", transmissions,
)

### Convert categorical data to numeric format
The target feature is car model. Use `sklearn.preprocessing.LabelEncoder()` to ordinalize the target feature. Also encode the other categorical features as 0 or 1 for each category using `pandas.get_dummies()`. 

In [None]:
# Print the dtype of every column currently in df
print(df.dtypes)

In [None]:
# Replace the `transmission` text column with one indicator column per
# category; the new columns are named after the category values themselves.
transmissionNumeric = pd.get_dummies(df["transmission"])
df = pd.concat([df.drop(columns="transmission"), transmissionNumeric], axis=1)

# Same treatment for `fuelType`
fuelTypeNumeric = pd.get_dummies(df["fuelType"])
df = pd.concat([df.drop(columns="fuelType"), fuelTypeNumeric], axis=1)

display(df.head())

# Write the encoded table to disk without the row index
df.to_csv("clean1_Ford.csv", index=False)

### Drop attributes or records to fine-tune the KNN model

The primary dataset to be examined is stored to `df` and is exported to the "clean1_Ford.csv" file. The car models with high misclassification rates are stored in `df2` and exported to the "clean2_Ford.csv" file.

In [None]:
# Car models moved into the secondary dataset. The list is written once and
# drives both halves of the split so the two files cannot drift apart.
# The leading space is part of each name as compared against the `model` column.
high_error_models = [
    ' Grand Tourneo Connect',
    ' Tourneo Connect',
    ' Transit Tourneo',
    ' Tourneo Custom',
    ' Fusion',
    ' Ranger',
    ' Streetka',
    ' Escort',
    ' Grand C-MAX',
    ' Mondeo',
    ' S-MAX',
]

# Secondary dataset: rows of the listed models, grouped in list order
# (a single concat instead of re-concatenating once per model).
df2 = pd.concat([df[df.model == name] for name in high_error_models])
df2.to_csv("clean2_Ford.csv", index=False)

# Primary dataset: every remaining row, in its original order
df = df[~df.model.isin(high_error_models)]
df.to_csv("clean1_Ford.csv", index=False)

---

# Section 2 : Scaling the Data

Scale data for car models that have a low misclassification rate

In [None]:
# Reload the primary dataset that Section 1 wrote to disk
df = pd.read_csv("clean1_Ford.csv")

In [None]:
# Every column except the target ("model") is a feature to standardise
X = df.drop(columns="model")

# Fit the scaler on the features and transform them in a single call
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)

# Suffix the column names so scaled features are recognisable downstream
X.columns = X.columns + " scaled"
X_scaled = pd.DataFrame(scaled_values, columns=X.columns)
display(X_scaled.head())

In [None]:
# Multiply the standardised mpg column by 20 and re-attach it as the last column.
# NOTE(review): the factor presumably makes mpg weigh more in KNN distances — confirm.
# Re-running this cell without re-running the previous one multiplies again.
mpg_scaled = X_scaled[["mpg scaled"]] * 20.0
X_scaled = pd.concat([X_scaled.drop(columns="mpg scaled"), mpg_scaled], axis=1)

In [None]:
# Persist the fitted scaler's per-feature mean and variance
for stats, path in ((scaler.mean_, "mean.csv"), (scaler.var_, "variance.csv")):
    pd.DataFrame(stats).to_csv(path, index=False)

# Persist the scaled features and the target column
X_scaled.to_csv("X_scaled.csv", index=False)
df["model"].to_csv("y.csv", index=False)


Repeat the process for the second dataset

In [None]:
# Reload the secondary dataset that Section 1 wrote to disk
df2 = pd.read_csv("clean2_Ford.csv")

In [None]:
# Standardise the features of the second dataset with its own scaler
scaler2 = StandardScaler()

# The features must come from df2: taking them from df would rescale the
# first dataset and leave the rows out of step with the labels in df2.model.
X2 = df2.drop("model", axis=1)
scaler2.fit(X2)
X2_scaled = scaler2.transform(X2)

# Suffix the column names so scaled features are recognisable downstream
X2.columns = X2.columns + " scaled"
X2_scaled = pd.DataFrame(X2_scaled, columns=X2.columns)
display(X2_scaled.head())

In [None]:
# Multiply the standardised mpg column by 20 and re-attach it as the last column.
# NOTE(review): the factor presumably makes mpg weigh more in KNN distances — confirm.
# Re-running this cell without re-running the previous one multiplies again.
mpg_scaled = X2_scaled[["mpg scaled"]] * 20.0
X2_scaled = pd.concat([X2_scaled.drop(columns="mpg scaled"), mpg_scaled], axis=1)

In [None]:
# Persist the second scaler's per-feature mean and variance
for stats, path in ((scaler2.mean_, "mean2.csv"), (scaler2.var_, "variance2.csv")):
    pd.DataFrame(stats).to_csv(path, index=False)

# Persist the scaled features and the target column of the second dataset
X2_scaled.to_csv("X2_scaled.csv", index=False)
df2["model"].to_csv("y2.csv", index=False)


---

# Section 3: Designing the KNN Model

In [None]:
# Read the scaled non-target attributes saved by Section 2
X_scaled = pd.read_csv("X_scaled.csv")

# Read the target attribute; read_csv returns it as a DataFrame, not a Series
y = pd.read_csv("y.csv")

In [56]:
# encode a single column from a dataframe
def encode_y(y:frame.DataFrame) -> dict:
    """Label-encode a single-column target.

    Parameters
    ----------
    y : DataFrame with exactly one column (a Series is accepted as well)
        holding the class labels.

    Returns
    -------
    dict
        ``"y"``: one-column DataFrame of integer codes, named after the input
        column and carrying a fresh default index (the input index is not kept);
        ``"encoder"``: the fitted ``LabelEncoder``, usable to reverse the encoding.
    """
    # Collapse only the column axis: a bare squeeze() would turn a one-row
    # frame into a scalar, which LabelEncoder.fit() rejects.
    if isinstance(y, pd.DataFrame):
        y = y.squeeze(axis="columns")
    labEnc_y = LabelEncoder()
    labEnc_y = labEnc_y.fit(y)
    codes = pd.DataFrame(labEnc_y.transform(y), columns=[y.name])
    return dict(y=codes, encoder=labEnc_y)

def unencode_y(y:frame.DataFrame, encoder:LabelEncoder):
    """Map integer codes back to their original labels.

    Parameters
    ----------
    y : DataFrame with exactly one column (a Series is accepted as well)
        holding codes produced by ``encoder``.
    encoder : the fitted ``LabelEncoder`` that produced those codes.

    Returns
    -------
    DataFrame
        One column of decoded labels, named after the input column and
        carrying a fresh default index.
    """
    # Collapse only the column axis so a one-row frame stays a Series
    # instead of degenerating into a scalar.
    if isinstance(y, pd.DataFrame):
        y = y.squeeze(axis="columns")
    # LabelEncoder.inverse_transform() reverts the encoded data
    return pd.DataFrame(encoder.inverse_transform(y), columns=[y.name])

### study interactions

In [None]:
# Split features and target into training and test sets; random_state=0 fixes the shuffle
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled, y, random_state=0)

In [None]:
# Label-encode the training target; the returned dict holds the codes
# under "y" and the fitted encoder under "encoder".
encode_yItem = encode_y(y_train)
labels_train = encode_yItem["y"]

# # Use this variable to re-encode y labels
# encoder = encode_yItem.get("encoder")

In [None]:
# display(y_train.head())
# pd.plotting.scatter_matrix(X_train_scaled,
#                            c=labels_train.squeeze(),
#                            hist_kwds={'bins':20},
#                            figsize=(15,15))
# plt.show()
# print(type(y_train), type(y_test))

### Plot a 2D KNN as an example

##### Select two attributes for the example KNN model

In [None]:
# The two features that span the plane of the example model
names_2d = ["mpg scaled", "price scaled"]
X2d_train = X_train_scaled[names_2d]

# Flatten the training target into a 1-D array
y_train = np.ravel(y_train)
print(y_train)


##### Build the sample model from the training set
- show boundaries for the trained model.
- points from the training set are labeled based on the car models

In [None]:
# Distance-weighted 9-nearest-neighbour classifier on the two chosen features
knn = KNeighborsClassifier(n_neighbors=9, weights="distance")
knn.fit(X2d_train, y_train)

_, ax = plt.subplots()
x_name, y_name = names_2d

# Shade the plane by the class the fitted model predicts at each grid point
boundary_options = dict(
    response_method="predict",
    plot_method="pcolormesh",
    grid_resolution=100,
    eps=0.1,
    shading="auto",
    xlabel=x_name,
    ylabel=y_name,
    ax=ax,
)
DecisionBoundaryDisplay.from_estimator(knn, X2d_train, **boundary_options)

# Overlay the training points, coloured by their true car model
sns.scatterplot(
    x=X2d_train[x_name],
    y=X2d_train[y_name],
    hue=y_train,
    s=3,
    alpha=1.0,
    edgecolor="black",
)

# plt.gcf().set_size_inches(9,5)
# plt.gcf().align_xlabels()

plt.show()


### Assess accuracy for the sample KNN model

In [None]:
# Test-set view of the same two features the sample model was trained on
X2d_test = X_test_scaled[names_2d]

# Collapse the one-column test target so it compares element-wise with predictions
y_test = y_test.squeeze()

# Candidate neighbour counts, defined once so the loop and the x-axis always agree
k_values = range(1, 15)
accuracies = []
for n in k_values:
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X2d_train, y_train)
    y_predict = knn.predict(X2d_test)
    # Fraction of test rows whose predicted model equals the true one
    m = np.mean(y_predict == y_test)
    accuracies.append(m)
arr = np.array(accuracies)

plt.plot(k_values, arr)
plt.xlabel("n_neighbors (k)")
plt.ylabel("Test accuracy")
plt.title("Sample KNN accuracy vs. k")
plt.show()

In [None]:
# Sample data still being used.
# let's try a specific value of k
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X2d_train, y_train)
y_predict = knn.predict(X2d_test)

# Accuracy on the test set. The k-sweep results in `arr` are deliberately
# left alone, so re-running this cell does not grow that array.
m = np.mean(y_predict == y_test)
display(knn.score(X2d_test, y_test))

# True labels of the test rows that the model got wrong
misclassified = y_test[y_test != y_predict]

In [None]:
# Car models misclassified at least once, in order of first appearance
names = misclassified.unique()

# Per-model tallies aligned with `names`: how many test rows of each model
# were misclassified, and how many test rows of that model exist in total.
error_counts = misclassified.value_counts().reindex(names).to_numpy()
name_counts = y_test.value_counts().reindex(names).to_numpy()

# Named `total_errors` rather than `sum` so the builtin sum() stays usable
total_errors = int(error_counts.sum())
display(names, misclassified.shape, total_errors)

error_percentage = 100 * error_counts / name_counts

# Plot misclassifications: absolute counts first, then the share per model
positions = range(error_counts.size)
for heights, title in (
    (error_counts, "Error counts per model"),
    (error_percentage, "Error percentage per model"),
):
    plt.figure(figsize=(12, 4))
    plt.bar(positions, height=heights, align='edge', width=0.2)
    plt.xticks(positions, names, rotation=70)
    plt.title(title)
    plt.show()

--- 

# Section 4: Decision Tree Comparison

Create a decision tree and a KNN from the full dataset and compare accuracies of each model

In [None]:
tree = DecisionTreeClassifier(random_state=0)
knn = KNeighborsClassifier()

# import the scaled data and split into training and test sets
X_full = pd.read_csv("X_scaled.csv")
y_full = pd.read_csv("y.csv")
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_full, y_full, random_state=0)

# Flatten both target frames into 1-D arrays
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

tree.fit(X_train_scaled, y_train)
# The trailing semicolon keeps the fitted estimator's repr out of the cell output
knn.fit(X_train_scaled, y_train);

In [None]:
# Mean accuracy of each fitted model on the held-out test set
tree_accuracy = tree.score(X_test_scaled, y_test)
knn_accuracy = knn.score(X_test_scaled, y_test)
display(tree_accuracy, knn_accuracy)

---