# Phase 3 - KNN Classifier Model

In [22]:
import pandas as pd
import numpy as np

In [23]:
# creating dataframes for both super and non-super foods
superfoods_df = pd.read_csv("superfoods_tagged.csv")
foods_df = pd.read_csv("Foods_tagged.csv")

# dropping all columns that don't exist in both dataframes
columns_to_drop = superfoods_df.columns.difference(foods_df.columns)
superfoods_df = superfoods_df.drop(columns=columns_to_drop)

columns_to_drop = foods_df.columns.difference(superfoods_df.columns)
foods_df = foods_df.drop(columns=columns_to_drop)

print(superfoods_df.shape)
print(foods_df.shape)

# creating a 'superfood' feature
superfoods_df["superfood"] = True
foods_df["superfood"] = False

# combining the dataframes into one
df = pd.concat([superfoods_df, foods_df])
df.head()

(1176, 157)
(4925, 157)


Unnamed: 0,fdcId,description,commonNames,additionalDescriptions,dataType,ndbNumber,publishedDate,foodCategory,allHighlightFields,score,...,"Zinc, Zn (MG) (% Daily Value)","Selenium, Se (UG) (% Daily Value)","Copper, Cu (MG) (% Daily Value)","Manganese, Mn (MG) (% Daily Value)","Potassium, K (MG) (% Daily Value)","Sodium, Na (MG) (% Daily Value)",Calories (% Daily Value),Total Nutrient % Daily Value,Nutrient Density Score,superfood
0,168208,"Fruit juice smoothie, ODWALLA, ORIGINAL SUPERFOOD",,,SR Legacy,9513.0,2019-04-01,Fruits and Fruit Juices,,321.29324,...,0.545455,0.545455,4.444444,3.304348,3.085106,0.130435,2.54345,38.041197,0.747827,True
1,2665686,"SUPERFOOD VEGGIE CAKES, SUPERFOOD",,,Branded,,2023-11-16,"Frozen Breakfast Sandwiches, Biscuits & Meals",,72.36845,...,,,,,3.12766,6.086957,3.502,33.208975,0.474143,True
2,2620391,"ORGANIC, SUPERFOODS VEGGIE BURGERS, SUPERFOODS",,,Branded,,2023-08-31,Frozen Patties and Burgers,,48.151413,...,,,,,5.085106,19.608696,6.8405,47.017905,0.343673,True
3,1882647,"SUPERFOOD PESTO SAUCE, SUPERFOOD PESTO",,,Branded,,2021-07-29,Prepared Pasta & Pizza Sauces,,48.151413,...,,,,,,28.73913,6.171,102.445284,0.830054,True
4,2145788,"SUPERFOOD POWER SALAD, SUPERFOOD POWER",,,Branded,,2021-10-28,"Pickles, Olives, Peppers & Relishes",,48.151413,...,,,,,3.446809,3.478261,4.2,22.172933,0.263963,True


In [24]:
# converts the label to 0 or 1
df["superfood"] = df["superfood"].astype(str).str.lower().map({"true": 1, "false": 0})
y = df["superfood"].values

# removes the label from full dataframe not from numeric-only selection
X = df.drop(columns=["superfood"])

# keeps only numeric nutrient features
X = X.select_dtypes(include=[np.number]).values

In [25]:
# train test split (70/30)
np.random.seed(1)
indices = np.arange(len(X))
np.random.shuffle(indices)

train_size = int(0.7 * len(X))
train_idx = indices[:train_size]
test_idx = indices[train_size:]

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [26]:
# min-max scaling
X_min = X_train.min(axis=0)
X_max = X_train.max(axis=0)
X_range = np.where(X_max - X_min == 0, 1, X_max - X_min)

X_train_scaled = (X_train - X_min) / X_range
X_test_scaled = (X_test - X_min) / X_range

In [27]:
# KNN classifier
def knn_predict(X_train, y_train, x_test, k = 5):
    distances = np.sqrt(np.sum((X_train - x_test)**2, axis=1))
    k_idx = np.argsort(distances)[:k]
    
    return int(y_train[k_idx].mean() >= 0.5)

def knn_predict_batch(X_train, y_train, X_test, k = 5):
    return np.array([knn_predict(X_train, y_train, x, k) for x in X_test])

In [28]:
# evaluates accuracy
k = 5
preds = knn_predict_batch(X_train_scaled, y_train, X_test_scaled, k)
accuracy = (preds == y_test).mean()

print(f"KNN Accuracy (k={k}, 70/30 split): {accuracy:.3f}")

KNN Accuracy (k=5, 70/30 split): 0.813


# For comparison:
Using a KNN modle isn’t a great fit for this dataset because the classes are really imbalanced and the nutrient data doesn’t give the model much to work with. Since most foods are labeled as non-superfoods, KNN ends up predicting that category almost every time, no matter what value of k you use. On top of that, many foods have similar or incomplete nutrient profiles, so the distance calculations that KNN relies on don’t actually help separate the two groups. The result is a model that basically ignores the minority class and doesn’t improve even when you tweak the parameters.