# Import data
Load the preprocessed dataset from a CSV file and the food-group lookup table from an Excel file.

In [None]:
import pandas as pd
import os

# Both files live in ../data relative to the working directory. os.path.join
# uses the separator of the current platform, so no per-OS branching is needed
# and `data` / `food_groups` are defined whatever os.name reports (an os.name
# other than 'nt' or 'posix' used to leave both names undefined).
DATA_DIR = os.path.join('..', 'data')
data = pd.read_csv(os.path.join(DATA_DIR, 'preprocessed-data-classification.csv'))
food_groups = pd.read_excel(os.path.join(DATA_DIR, 'food-groups.xls'))

# keep only the rows whose 'Food Group Code' is exactly two characters long
# once converted to a string
has_two_char_code = food_groups['Food Group Code'].apply(lambda code: len(str(code)) == 2)
food_groups = food_groups[has_two_char_code]

# preview the first rows of the dataset
data.head()

---
# Prepare data
1. Determine the target variable.
2. Determine the features.
3. Split the data into training and test sets.

In [None]:
# the target is the column at position 1 (the classification column)
y_COL = data.columns[1]

# the features are every column from position 3 onwards (the nutrition columns)
X_COLS = data.columns[3:].tolist()

# show the selection so it can be checked by eye
for line in (f"X_COLS: {X_COLS}", f"y_COL: {y_COL}"):
    print(line)

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# hold out 15% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs
train, test = train_test_split(
    data,
    test_size=0.15,
    random_state=43,
)

# design matrix X (feature columns) and target y (label column) for each subset
X_train, y_train = train[X_COLS], train[y_COL]
X_test, y_test = test[X_COLS], test[y_COL]

---
# $k$-nn
We create a $k$-nn model that classifies a food into one of several food groups based on its nutritional information.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# number of nearest neighbours that vote on each prediction
# (the previous comment said 3 while the code used 4; the code's value is kept)
N_NEIGHBORS = 4

# create the k-nn classifier
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

# replace NaN values with 0 in the features and in the labels
# NOTE(review): filling a missing *label* with 0 turns it into a class "0";
# confirm that the preprocessed data contains no missing labels.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
y_train = y_train.fillna(0)
y_test = y_test.fillna(0)

# train the model using the training set
knn.fit(X_train, y_train)

In [None]:
# predict a label for every row of the test set with the fitted k-nn model;
# y_pred is compared against the true labels in y_test during evaluation
y_pred = knn.predict(X_test)

### Evaluation
Evaluate the model: compute its accuracy, count the correct predictions per food group and plot a confusion matrix.

In [None]:
# check the accuracy
from sklearn.metrics import accuracy_score

# y_test contains the true labels of the test set
# y_pred contains the predicted labels of the test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# show the first ten predictions next to the first ten true labels for a
# manual comparison
first_predictions = y_pred[:10]
first_true_labels = y_test.head(10).values
print(f"First few predictions: {first_predictions}")
print(f"First few true labels: {first_true_labels}")

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# distinct true labels, in order of first appearance in the test set; computed
# once and reused for the counts, the confusion matrix and the axis ticks
labels = y_test.unique()

# count the correct predictions per label in a single pass: keep the true
# labels of the rows that were predicted correctly and tally them
y_true = y_test.to_numpy()
correct_counts = pd.Series(y_true[y_true == y_pred]).value_counts()

print("Correct predictions per food group:")
for value in labels:
    correct = int(correct_counts.get(value, 0))
    # a label may have no row in the food-group table (e.g. the 0 used to fill
    # missing labels); fall back to a placeholder instead of raising IndexError
    names = food_groups.loc[
        food_groups['Food Group Code'] == value,
        'Food Group and Sub-Group Name',
    ]
    food_group_name = names.iloc[0].strip() if not names.empty else "unknown food group"
    print(f"{value}: {correct} ({food_group_name})")

# create confusion matrix with rows/columns ordered like `labels`
cm = confusion_matrix(y_test, y_pred, labels=labels)

# create confusion matrix display; display_labels puts the food group codes on
# both axes, so the ticks do not have to be set by hand
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)

# plot confusion matrix
disp.plot()
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")

# save the confusion matrix before showing it, so the file is written while
# the figure is still guaranteed to be open
disp.figure_.savefig('../report/figs/knn-confusion-matrix.png', bbox_inches='tight')

plt.show()