# Import data
Import data from a file containing the preprocessed dataset.

In [None]:
import pandas as pd
import os

# Build the paths with os.path.join so a single code path works on every OS;
# branching on os.name would leave `data` and `food_groups` undefined on any
# platform that is neither 'nt' nor 'posix'.
DATA_DIR = os.path.join('..', 'data')
data = pd.read_csv(os.path.join(DATA_DIR, 'generated', 'preprocessed-data-classification.csv'))
food_groups = pd.read_excel(os.path.join(DATA_DIR, 'food-groups.xls'))

# keep only the food groups whose 'Food Group Code' is exactly two characters long
food_groups = food_groups[food_groups['Food Group Code'].apply(lambda x: len(str(x)) == 2)]

# show the first rows of the dataset as the cell output
data.head()

---
# Prepare data
1. Determine the target variable.
2. Determine the features.
3. Split the data into training and test sets.

In [None]:
# features: every column from the fourth one onwards (the nutrition columns)
X_COLS = data.columns[3:].tolist()

# target: the second column (the classification column)
y_COL = data.columns[1]

print(f"X_COLS: {X_COLS}")
print(f"y_COL: {y_COL}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# hold out 15% of the rows as the test set (fixed seed)
train, test = train_test_split(data, test_size=0.15, random_state=43)

# feature matrix X and target vector y for each partition, with NaN replaced by 0
# NOTE(review): filling missing *labels* with 0 creates a class 0 - confirm this is intended
X_train, y_train = train[X_COLS].fillna(0), train[y_COL].fillna(0)
X_test, y_test = test[X_COLS].fillna(0), test[y_COL].fillna(0)


---
# $k$-nn
We create a $k$-nn model that classifies each food into one of the food groups based on its nutritional information.

First we determine the best value of $k$ by comparing the mean 10-fold cross-validation accuracy of models with $k$ from 1 to 100.

In [None]:
# find the best k based on accuracy using cross validation

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

def cross_validate(X_train, y_train):
    """Choose the number of neighbours for k-NN by 10-fold cross validation.

    Evaluates k = 1..100, plots the mean accuracy per k, saves the plot to
    '../report/figs/knn-cross-validation.png' and prints the ranking.

    Args:
        X_train: training features.
        y_train: training labels.

    Returns:
        The k with the highest mean accuracy; when k = 1 ranks first, the
        runner-up is returned instead.
    """
    candidate_ks = list(range(1, 101))

    # mean accuracy over the 10 folds, one entry per candidate k
    mean_accuracies = [
        cross_val_score(
            KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=10, scoring='accuracy'
        ).mean()
        for k in candidate_ks
    ]

    # accuracy curve over k
    plt.plot(candidate_ks, mean_accuracies)
    plt.xlabel("k")
    plt.ylabel("Cross Validation Mean Accuracy")

    # tick at k = 1 and then at every 5th k
    plt.xticks([1] + list(range(5, 101, 5)))

    plt.grid()
    plt.gcf().set_size_inches(10, 5)
    plt.title("k-NN Accuracy for Different k Values")

    plt.savefig('../report/figs/knn-cross-validation.png', bbox_inches='tight')

    # rank [score, k] pairs from best to worst (ascending sort, then reversed)
    ranking = sorted([score, k] for score, k in zip(mean_accuracies, candidate_ks))
    ranking.reverse()

    # k = 1 is never selected: fall back to the second-best entry
    winner = ranking[1] if ranking[0][1] == 1 else ranking[0]
    best_k_accuracy, best_k = winner

    print(ranking)
    print(f"Best k: {best_k}")
    print(f"Best k accuracy: {best_k_accuracy}")

    return best_k

best_k = cross_validate(X_train, y_train)

In [None]:
# use the k returned by cross_validate as the number of neighbours
KNN_NEIGHBORS = best_k

# create knn with best_k neighbors
knn = KNeighborsClassifier(n_neighbors=KNN_NEIGHBORS)

# train the model using the training set
# (kept as the last expression of the cell)
knn.fit(X_train, y_train)

In [None]:
def predict_y(X_test, model=None):
    """Predict the labels for the given feature rows.

    Args:
        X_test: feature rows to classify.
        model: fitted classifier exposing ``predict``. Defaults to the
            notebook-level ``knn`` so existing calls keep working.

    Returns:
        Whatever ``model.predict(X_test)`` returns.
    """
    if model is None:
        # fall back to the notebook-level model
        model = knn
    return model.predict(X_test)

y_pred = predict_y(X_test)

## Evaluation
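Calculate several metrics to evaluate the model: accuracy, a confusion matrix, cross-validated accuracy, and bootstrap estimates of accuracy, recall, precision and F1.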

### Accuracy

In [None]:
# check the accuracy
from sklearn.metrics import accuracy_score

def print_accuracy_and_predictions(y_test, y_pred):
    """Print the accuracy and a sample of predicted vs. true labels.

    Args:
        y_test: true labels of the test set (its ``.values`` attribute is read).
        y_pred: predicted labels of the test set.

    Returns:
        The accuracy computed by ``accuracy_score(y_test, y_pred)``.
    """
    score = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {score}")

    # first ten predictions next to the first ten true labels for a manual check
    sample_pred = y_pred[:10]
    sample_true = y_test[:10].values
    print(f"First few predictions: {sample_pred}")
    print(f"First few true labels: {sample_true}")

    return score

print_accuracy_and_predictions(y_test, y_pred)

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def print_cm(y_test, y_pred, save_fig=False):
    """Print the correct predictions per food group and plot the confusion matrix.

    Args:
        y_test: pandas Series with the true labels.
        y_pred: predicted labels, in the same order as ``y_test``.
        save_fig: when True, also write the plot to
            '../report/figs/knn-confusion-matrix.png'.
    """
    # computed once and reused for the counts, the matrix and the tick labels
    labels = y_test.unique()
    true_values = y_test.values
    is_correct = true_values == y_pred

    # count the number of correct predictions
    print("Correct predictions per food group:")
    for value in labels:
        correct = int((is_correct & (true_values == value)).sum())
        matches = food_groups.loc[
            food_groups['Food Group Code'] == value, 'Food Group and Sub-Group Name'
        ]
        # a label without an entry in food_groups gets a placeholder name
        # instead of raising IndexError
        food_group_name = matches.values[0].strip() if len(matches) else "unknown food group"
        print(f"{value}: {correct} ({food_group_name})")

    # create confusion matrix
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    # create confusion matrix display
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)

    # plot confusion matrix
    disp.plot()
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")

    # plot the labels on the x and y axis
    tick_positions = range(len(labels))
    plt.xticks(tick_positions, labels)
    plt.yticks(tick_positions, labels)

    # save before showing: some backends release the figure once it has been shown
    if save_fig:
        disp.figure_.savefig('../report/figs/knn-confusion-matrix.png', bbox_inches='tight')

    plt.show()

print_cm(y_test, y_pred, save_fig=True)

### Cross Validation


In [None]:
from sklearn.model_selection import KFold
SPLIT_COUNT = 7

# splits the data into subsets
kfold = KFold(n_splits=SPLIT_COUNT, shuffle=True, random_state=43)

accuracy_scores = []
# Fold-local names are used on purpose: reusing train/test, X_train, y_train,
# X_test, y_test, knn and y_pred here would overwrite the notebook-wide
# hold-out split and the fitted model that other cells rely on.
for fold_train_idx, fold_test_idx in kfold.split(data):
    fold_train = data.iloc[fold_train_idx]
    fold_test = data.iloc[fold_test_idx]

    # create design matrix X and targets y,
    # with the same NaN handling as the initial train/test split
    X_fold_train = fold_train[X_COLS].fillna(0)
    y_fold_train = fold_train[y_COL].fillna(0)
    X_fold_test = fold_test[X_COLS].fillna(0)
    y_fold_test = fold_test[y_COL].fillna(0)

    # fresh model for this fold, trained on the fold's training subset
    fold_knn = KNeighborsClassifier(n_neighbors=KNN_NEIGHBORS)
    fold_knn.fit(X_fold_train, y_fold_train)

    # predict and show
    y_fold_pred = fold_knn.predict(X_fold_test)
    fold_accuracy = print_accuracy_and_predictions(y_fold_test, y_fold_pred)
    print_cm(y_fold_test, y_fold_pred)

    # add accuracy to list (already computed by print_accuracy_and_predictions)
    accuracy_scores.append(fold_accuracy)

# print average accuracy
print(f"Average accuracy score across {len(accuracy_scores)} CV splits: {sum(accuracy_scores) / len(accuracy_scores)}")

### Bootstrap Validation

In [None]:
import numpy as np
from sklearn.utils import resample
from sklearn.metrics import recall_score, precision_score, f1_score
import seaborn as sns

N_BOOTSTRAPS = 1000
boot_accuracies = []
boot_recalls = []
boot_precisions = []
boot_f1s = []

# not used in this cell; kept in case another cell reads it
dataid_x = range(len(X_COLS))

# seeded generator so the bootstrap draws are reproducible
rng = np.random.RandomState(43)

# Evaluate on the held-out rows: resampling the rows `knn` was fitted on
# would score the model on data it has already seen.
# The fitted model's predictions do not change between draws, so predict once
# and resample (true label, prediction) pairs instead of re-predicting 1000 times.
y_true_all = np.asarray(y_test)
y_pred_all = knn.predict(X_test)

for _ in range(N_BOOTSTRAPS):
    y_boot, y_pred_boot = resample(y_true_all, y_pred_all, replace=True, random_state=rng)

    # evaluate; zero_division=0 keeps the default value but silences the
    # warning raised when a class is absent from a bootstrap sample
    boot_accuracies.append(accuracy_score(y_boot, y_pred_boot))
    boot_recalls.append(recall_score(y_boot, y_pred_boot, average='macro', zero_division=0))
    boot_precisions.append(precision_score(y_boot, y_pred_boot, average='macro', zero_division=0))
    boot_f1s.append(f1_score(y_boot, y_pred_boot, average='macro', zero_division=0))

# print results
print(f"Bootstrapped accuracies: {boot_accuracies}")
print(f"Bootstrapped recalls: {boot_recalls}")
print(f"Bootstrapped precisions: {boot_precisions}")
print(f"Bootstrapped f1s: {boot_f1s}")
# print averages
print(f"Mean accuracy: {np.mean(boot_accuracies)}")
print(f"Mean recall: {np.mean(boot_recalls)}")
print(f"Mean precision: {np.mean(boot_precisions)}")
print(f"Mean f1: {np.mean(boot_f1s)}")

# plot the distribution of every metric, one figure per metric
metric_distributions = (
    ("Accuracy", boot_accuracies),
    ("Recall", boot_recalls),
    ("Precision", boot_precisions),
    ("F1", boot_f1s),
)
for metric_name, metric_values in metric_distributions:
    sns.kdeplot(metric_values)
    plt.title(f"{metric_name} across {N_BOOTSTRAPS} bootstrap samples")
    plt.xlabel(metric_name)
    plt.show()