# Import data
Import the three preprocessed versions of the classification dataset (one CSV file each) together with the food-group lookup table.

In [None]:
import pandas as pd
import os

# Labels of the three dataset versions; they are printed in the reports and,
# lower-cased, become part of the saved figure file names.
V1 = 'First'
V2 = 'Second'
V3 = 'Third'

# Build every path with os.path.join so the platform's own separator is used.
# This yields the same paths as the former Windows/POSIX branches, but also
# works on any other platform instead of leaving the frames undefined.
DATA_DIR = os.path.join('..', 'data')
GENERATED_DIR = os.path.join(DATA_DIR, 'generated')

first_data = pd.read_csv(os.path.join(GENERATED_DIR, 'preprocessed-data-classification-first.csv'))
second_data = pd.read_csv(os.path.join(GENERATED_DIR, 'preprocessed-data-classification-second.csv'))
third_data = pd.read_csv(os.path.join(GENERATED_DIR, 'preprocessed-data-classification-third.csv'))
food_groups = pd.read_excel(os.path.join(DATA_DIR, 'food-groups.xls'))

# keep only the food groups whose 'Food Group Code' is exactly two characters
# long once converted to a string
food_groups = food_groups[food_groups['Food Group Code'].apply(lambda x: len(str(x)) == 2)]


print('First version of food groups:')
first_data.head()

---
# Prepare data
1. Determine the target variable.
2. Determine the features.
3. Split the data into training and test sets.

In [None]:
def feat_and_target(data, v):
    """Choose the feature columns and the target column of one dataset version.

    The column at position 1 is taken as the target (the classification
    column) and every column from position 3 onwards as a feature (the
    nutrition columns). The selection is printed for inspection.

    Args:
        data: DataFrame of one dataset version.
        v: Version label shown in the printed header.

    Returns:
        A tuple ``(X_COLS, y_COL)``: the list of feature column names and the
        name of the target column.
    """
    all_columns = data.columns

    # classification column -> target
    y_COL = all_columns[1]

    # nutrition columns -> features
    X_COLS = [name for name in all_columns[3:]]

    print(f"{v} version:", f"X_COLS: {X_COLS}", f"y_COL: {y_COL}\n", sep="\n")
    return X_COLS, y_COL

# Select the feature and target column names separately for each of the three
# dataset versions (the selection is printed by feat_and_target).
first_X_COLS, first_y_COL = feat_and_target(first_data, V1)
second_X_COLS, second_y_COL = feat_and_target(second_data, V2)
third_X_COLS, third_y_COL = feat_and_target(third_data, V3)

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# fraction of the rows that is held out as test set
TEST_SIZE = 0.15


def train_test_data(data, X_COLS, y_COL):
    """Split one dataset version into train/test features and targets.

    The rows are split with a fixed random state (43) so the partition is the
    same on every run. Missing values are replaced by 0 in the features as
    well as in the targets.

    Args:
        data: DataFrame of one dataset version.
        X_COLS: Names of the feature columns.
        y_COL: Name of the target column.

    Returns:
        The tuple ``(X_train, y_train, X_test, y_test)``.
    """
    train_part, test_part = train_test_split(data, test_size=TEST_SIZE, random_state=43)

    # design matrix X and labels y of each part, with NaN replaced by 0
    X_train, y_train = train_part[X_COLS].fillna(0), train_part[y_COL].fillna(0)
    X_test, y_test = test_part[X_COLS].fillna(0), test_part[y_COL].fillna(0)

    return X_train, y_train, X_test, y_test

# Create the train/test split of every dataset version.
first_X_train, first_y_train, first_X_test, first_y_test = train_test_data(first_data, first_X_COLS, first_y_COL)
second_X_train, second_y_train, second_X_test, second_y_test = train_test_data(second_data, second_X_COLS, second_y_COL)
third_X_train, third_y_train, third_X_test, third_y_test = train_test_data(third_data, third_X_COLS, third_y_COL)

# quick sanity check: show the first 50 training labels of the first version
print(first_y_train.head(50))
# print(second_X_test)
# print(third_X_test)


---
# $k$-nn
We create a $k$-nn model which is used to classify a food into different food groups based on its nutritional information.

First we determine the best $k$ value based on the accuracy of different models.

In [None]:
# find the best k based on accuracy using cross validation

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold

def cross_validate(X_train, y_train, v):
    """Choose the number of neighbours k by 10-fold cross validation.

    A k-NN classifier is scored for every k from 1 to 100; the mean accuracy
    per k is plotted, saved to
    ``../report/figs/knn-cross-validation-<v lower-cased>.png`` and shown.
    The k with the highest mean accuracy is returned, except that k = 1 is
    skipped in favour of the runner-up when it ranks first.

    Args:
        X_train: Training features.
        y_train: Training labels.
        v: Version label, used in the printed report and the figure file name.

    Returns:
        The selected k.
    """
    candidate_ks = list(range(1, 101))

    # mean accuracy over the 10 folds for every candidate k
    mean_accuracies = [
        cross_val_score(
            KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=10, scoring='accuracy'
        ).mean()
        for k in candidate_ks
    ]

    # accuracy as a function of k
    plt.plot(candidate_ks, mean_accuracies)
    plt.xlabel("k")
    plt.ylabel("Cross Validation Mean Accuracy")

    # ticks at k = 1 and at every 10th k up to 100
    plt.xticks([1, *range(10, 101, 10)])

    plt.grid()
    plt.gcf().set_size_inches(4, 5)
    plt.title("k-NN Accuracy for Different k Values")

    # write the figure to disk before showing and clearing it
    plt.savefig(f'../report/figs/knn-cross-validation-{v.lower()}.png', bbox_inches='tight')
    plt.show()
    plt.clf()

    # [accuracy, k] pairs ordered from best to worst
    ranking = sorted([accuracy, k] for accuracy, k in zip(mean_accuracies, candidate_ks))[::-1]

    # take the best entry, unless that is k = 1: then fall back to the runner-up
    best_k_accuracy, best_k = ranking[1] if ranking[0][1] == 1 else ranking[0]

    print(f"{v} version:")
    print(ranking)
    print(f"Best k: {best_k}")
    print(f"Best k accuracy: {best_k_accuracy}\n")

    return best_k

# Determine the best k separately for every dataset version.
first_best_k = cross_validate(first_X_train, first_y_train, V1)
second_best_k = cross_validate(second_X_train, second_y_train, V2)
third_best_k = cross_validate(third_X_train, third_y_train, V3)

In [None]:
import numpy as np
# For every dataset version: build a k-NN classifier with the k selected by
# cross validation, train it on the training set (fit returns the fitted
# classifier itself) and predict the labels of the test set.

# first version
first_knn = KNeighborsClassifier(n_neighbors=first_best_k).fit(first_X_train, first_y_train)
first_y_pred = first_knn.predict(first_X_test)


# second version
second_knn = KNeighborsClassifier(n_neighbors=second_best_k).fit(second_X_train, second_y_train)
second_y_pred = second_knn.predict(second_X_test)


# third version
third_knn = KNeighborsClassifier(n_neighbors=third_best_k).fit(third_X_train, third_y_train)
third_y_pred = third_knn.predict(third_X_test)


## Evaluation
Evaluate the model with several metrics: accuracy, the confusion matrix, cross validation and bootstrap validation.

### Accuracy

In [None]:
# check the accuracy
from sklearn.metrics import accuracy_score

def print_accuracy_and_predictions(y_test, y_pred, v):
    """Print and return the accuracy of a set of predictions.

    Besides the accuracy, the first ten predicted labels and the first ten
    true labels are printed so they can be compared by eye.

    Args:
        y_test: True labels of the test set (must support ``[:10].values``,
            e.g. a pandas Series).
        y_pred: Predicted labels of the test set.
        v: Version label shown in the printed header.

    Returns:
        The accuracy as computed by ``accuracy_score``.
    """
    print(f"{v} version:")

    # share of test samples whose predicted label equals the true label
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\tAccuracy: {accuracy}")

    # first ten predictions next to the first ten true labels
    leading_predictions = y_pred[:10]
    leading_truth = y_test[:10].values
    print(f"\tFirst few predictions: {leading_predictions}")
    print(f"\tFirst few true labels: {leading_truth}\n")

    return accuracy
    
# Report the test-set accuracy of the model of every dataset version.
print_accuracy_and_predictions(first_y_test, first_y_pred, V1)
print_accuracy_and_predictions(second_y_test, second_y_pred, V2)
print_accuracy_and_predictions(third_y_test, third_y_pred, V3)

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def print_cm(y_test, y_pred, v, save_fig=False):
    """Print the correct predictions per food group and plot the confusion matrix.

    The classes are taken from ``y_test`` in order of first appearance; that
    order is used for the printed counts and for both axes of the matrix.
    Food-group names are looked up in the module-level ``food_groups`` table.

    Args:
        y_test: pandas Series with the true labels.
        y_pred: Predicted labels, positionally aligned with ``y_test``.
        v: Version label; printed as header and, lower-cased, used in the
            file name of the saved figure.
        save_fig: When True the figure is written to
            ``../report/figs/knn-confusion-matrix-<v lower-cased>.png``.
    """
    print(f"{v} version:")

    # compute the class list once; it is needed for the counts, the matrix
    # and the tick labels
    labels = y_test.unique()

    # element-wise mask of the samples that were predicted correctly
    y_true = y_test.values
    hits = y_true == y_pred

    # count the number of correct predictions
    print("Correct predictions per food group:")
    for value in labels:
        correct = int((hits & (y_true == value)).sum())

        # a label without an entry in the food-group table (for example the 0
        # that replaces missing targets) must not abort the whole report
        names = food_groups.loc[
            food_groups['Food Group Code'] == value, 'Food Group and Sub-Group Name'
        ].values
        food_group_name = names[0].strip() if len(names) else "unknown food group"
        print(f"{value}: {correct} ({food_group_name})")

    # create confusion matrix, rows/columns ordered like `labels`
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    # let the display put the class labels on both axes
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)

    # plot confusion matrix and label the matrix axes explicitly
    disp.plot()
    disp.ax_.set_title("Confusion Matrix")
    disp.ax_.set_xlabel("Predicted Labels")
    disp.ax_.set_ylabel("True Labels")

    # save before showing so the file never depends on the state of the
    # figure after plt.show()
    if save_fig:
        disp.figure_.savefig(f'../report/figs/knn-confusion-matrix-{v.lower()}.png', bbox_inches='tight')

    plt.show()

# Show and save the test-set confusion matrix of every dataset version.
print_cm(first_y_test, first_y_pred, V1, save_fig=True)
print_cm(second_y_test, second_y_pred, V2, save_fig=True)
print_cm(third_y_test, third_y_pred, V3, save_fig=True)


### Cross Validation


In [None]:
from sklearn.model_selection import KFold
# number of folds used by cross_val_print
SPLIT_COUNT = 7
# version labels (same values as defined in the first code cell)
V1 = 'First'
V2 = 'Second'
V3 = 'Third'


def cross_val_print(X_train, y_train, X_test, y_test, y_pred, k_val, data, X_COLS, y_COL, v):
    """Evaluate a k-NN model with shuffled k-fold cross validation on ``data``.

    For every fold a fresh classifier is trained and its accuracy, first
    predictions and confusion matrix are printed; finally the mean accuracy
    over all folds is printed.

    Args:
        X_train, y_train, X_test, y_test, y_pred: Not used. The folds are
            built from ``data``; the parameters are kept so existing calls
            keep working.
        k_val: Number of neighbours of the classifier.
        data: Complete DataFrame of one dataset version.
        X_COLS: Names of the feature columns.
        y_COL: Name of the target column.
        v: Version label used in the printed output.
    """
    # splits the data into subsets
    kfold = KFold(n_splits=SPLIT_COUNT, shuffle=True, random_state=43)

    # replace NaN values with 0, exactly as train_test_data does, so the
    # classifier is not handed missing values
    features = data[X_COLS].fillna(0)
    target = data[y_COL].fillna(0)

    accuracy_scores = []
    for train_idx, test_idx in kfold.split(data):
        # design matrix X and labels y of this fold
        fold_X_train, fold_y_train = features.iloc[train_idx], target.iloc[train_idx]
        fold_X_test, fold_y_test = features.iloc[test_idx], target.iloc[test_idx]

        # reinitialise knn model and train it on the training subset
        knn = KNeighborsClassifier(n_neighbors=k_val)
        knn.fit(fold_X_train, fold_y_train)

        # predict and show; the printing helper already returns the accuracy,
        # so it is not computed a second time
        fold_y_pred = knn.predict(fold_X_test)
        accuracy_scores.append(print_accuracy_and_predictions(fold_y_test, fold_y_pred, v))
        print_cm(fold_y_test, fold_y_pred, v)

    # print average accuracy
    print(f"{v} average accuracy score across {len(accuracy_scores)} CV splits: {sum(accuracy_scores) / len(accuracy_scores)}")

# Run the k-fold evaluation for every dataset version, each with its own best k.
cross_val_print(first_X_train, first_y_train, first_X_test, first_y_test, first_y_pred, first_best_k, first_data, first_X_COLS, first_y_COL, V1)
cross_val_print(second_X_train, second_y_train, second_X_test, second_y_test, second_y_pred, second_best_k, second_data, second_X_COLS, second_y_COL, V2)
cross_val_print(third_X_train, third_y_train, third_X_test, third_y_test, third_y_pred, third_best_k, third_data, third_X_COLS, third_y_COL, V3)


### Bootstrap Validation

In [None]:
import numpy as np
from sklearn.utils import resample
from sklearn.metrics import recall_score, precision_score, f1_score
import seaborn as sns

import numpy as np
from sklearn.utils import resample
from sklearn.metrics import recall_score, precision_score, f1_score
import seaborn as sns

def boot_val_print(X_train, y_train, X_test, y_test, y_pred, k_val, data, X_COLS, y_COL, v):
    """Estimate the spread of the k-NN metrics with bootstrap resampling.

    1000 bootstrap samples are drawn with replacement from the training set.
    A fresh classifier is trained on each sample and scored on the complete
    training set with accuracy and macro-averaged recall, precision and F1.
    All scores and their means are printed and the distribution of every
    metric is shown as a kernel density plot.

    Args:
        X_train: Training features; resampled and also used for scoring.
        y_train: Training labels; resampled and also used for scoring.
        X_test, y_test, y_pred, data, X_COLS, y_COL: Not used; kept so
            existing calls keep working.
        k_val: Number of neighbours of the classifier.
        v: Version label printed above the plots.
    """
    N_BOOTSTRAPS = 1000

    # One seeded generator shared by all draws: every iteration gets a
    # different sample, but the sequence (and thus every number and plot) is
    # identical from run to run, like the other splits in this file (seed 43).
    rng = np.random.RandomState(43)

    boot_accuracies = []
    boot_recalls = []
    boot_precisions = []
    boot_f1s = []

    for _ in range(N_BOOTSTRAPS):
        X_boot, y_boot = resample(X_train, y_train, replace=True, random_state=rng)

        # reinitialise knn model and train it on the bootstrap sample
        knn = KNeighborsClassifier(n_neighbors=k_val)
        knn.fit(X_boot, y_boot)

        # NOTE(review): the model is scored on the full training set, which
        # contains the rows it was just trained on, so the scores are
        # optimistic. Consider scoring on the rows left out of the sample
        # (or on X_test/y_test) instead - confirm the intended protocol.
        y_pred_boot = knn.predict(X_train)

        # evaluate
        boot_accuracies.append(accuracy_score(y_train, y_pred_boot))
        boot_recalls.append(recall_score(y_train, y_pred_boot, average='macro'))
        boot_precisions.append(precision_score(y_train, y_pred_boot, average='macro'))
        boot_f1s.append(f1_score(y_train, y_pred_boot, average='macro'))

    # (plural name, singular name, plot label, scores) of every metric
    metrics = [
        ("accuracies", "accuracy", "Accuracy", boot_accuracies),
        ("recalls", "recall", "Recall", boot_recalls),
        ("precisions", "precision", "Precision", boot_precisions),
        ("f1s", "f1", "F1", boot_f1s),
    ]

    # print results
    for plural, _, _, scores in metrics:
        print(f"Bootstrapped {plural}: {scores}")
    # print averages
    for _, singular, _, scores in metrics:
        print(f"Mean {singular}: {np.mean(scores)}")

    print(f"[{v}]")
    # plot the distribution of every metric in its own figure
    for _, _, label, scores in metrics:
        sns.kdeplot(scores)
        plt.title(f"{label} across {N_BOOTSTRAPS} bootstrap samples")
        plt.xlabel(label)
        plt.show()
    
# Run the bootstrap evaluation for every dataset version, each with its own best k.
boot_val_print(first_X_train, first_y_train, first_X_test, first_y_test, first_y_pred, first_best_k, first_data, first_X_COLS, first_y_COL, V1)
boot_val_print(second_X_train, second_y_train, second_X_test, second_y_test, second_y_pred, second_best_k, second_data, second_X_COLS, second_y_COL, V2)
boot_val_print(third_X_train, third_y_train, third_X_test, third_y_test, third_y_pred, third_best_k, third_data, third_X_COLS, third_y_COL, V3)