# Phase III - Clasification Model

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.linear_model import SGDClassifier
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'colab'

In [3]:
#creating dataframes for both super and non-super foods
superfoods_df = pd.read_csv("superfoods_tagged.csv")
foods_df = pd.read_csv("Foods_tagged.csv")

In [4]:
#dropping all columns that don't exist in both dataframes
columns_to_drop = superfoods_df.columns.difference(foods_df.columns)
superfoods_df = superfoods_df.drop(columns=columns_to_drop)

columns_to_drop = foods_df.columns.difference(superfoods_df.columns)
foods_df = foods_df.drop(columns=columns_to_drop)

print(superfoods_df.shape)
print(foods_df.shape)

(1176, 157)
(4925, 157)


In [5]:
#creating a 'superfood' feature
superfoods_df['superfood'] = True
foods_df['superfood'] = False

In [6]:
#combining the dataframes into one
df = pd.concat([superfoods_df, foods_df])

In [7]:
df.head()

Unnamed: 0,fdcId,description,commonNames,additionalDescriptions,dataType,ndbNumber,publishedDate,foodCategory,allHighlightFields,score,...,"Zinc, Zn (MG) (% Daily Value)","Selenium, Se (UG) (% Daily Value)","Copper, Cu (MG) (% Daily Value)","Manganese, Mn (MG) (% Daily Value)","Potassium, K (MG) (% Daily Value)","Sodium, Na (MG) (% Daily Value)",Calories (% Daily Value),Total Nutrient % Daily Value,Nutrient Density Score,superfood
0,168208,"Fruit juice smoothie, ODWALLA, ORIGINAL SUPERFOOD",,,SR Legacy,9513.0,2019-04-01,Fruits and Fruit Juices,,321.29324,...,0.545455,0.545455,4.444444,3.304348,3.085106,0.130435,2.54345,38.041197,0.747827,True
1,2665686,"SUPERFOOD VEGGIE CAKES, SUPERFOOD",,,Branded,,2023-11-16,"Frozen Breakfast Sandwiches, Biscuits & Meals",,72.36845,...,,,,,3.12766,6.086957,3.502,33.208975,0.474143,True
2,2620391,"ORGANIC, SUPERFOODS VEGGIE BURGERS, SUPERFOODS",,,Branded,,2023-08-31,Frozen Patties and Burgers,,48.151413,...,,,,,5.085106,19.608696,6.8405,47.017905,0.343673,True
3,1882647,"SUPERFOOD PESTO SAUCE, SUPERFOOD PESTO",,,Branded,,2021-07-29,Prepared Pasta & Pizza Sauces,,48.151413,...,,,,,,28.73913,6.171,102.445284,0.830054,True
4,2145788,"SUPERFOOD POWER SALAD, SUPERFOOD POWER",,,Branded,,2021-10-28,"Pickles, Olives, Peppers & Relishes",,48.151413,...,,,,,3.446809,3.478261,4.2,22.172933,0.263963,True


In [26]:
df.shape

(6101, 158)

# Required Nutrients on a label

Macronutrients:
- Total Fat
- Saturated Fat
- Trans Fat
- Cholesterol
- Total Carbohydrates
- Dietary Fiber
- Added Sugars
- Protein

Vitamins and Minerals:
- Vitamin D
- Calcium
- Iron
- Potassium

Other:
- Calories
- Sodium

In [27]:
# Define the feature columns
feature_columns = ['Total lipid (fat) (G)', 'Fatty acids, total saturated (G)', 'Fatty acids, total trans (G)', 'Cholesterol (MG)',
                   'Carbohydrate, by difference (G)', 'Fiber, total dietary (G)', 'Total Sugars (G)', 'Protein (G)', 'Vitamin D (D2 + D3) (UG)',
                   'Calcium, Ca (MG)', 'Iron, Fe (MG)', 'Potassium, K (MG)', 'Sodium, Na (MG)']

# Drop rows with any NaN values in the feature columns or the 'superfood' column
# This ensures that both X and y will be clean
df_cleaned = df.dropna(subset=feature_columns + ['superfood'])

print(df_cleaned.shape)
# standardize feature columns
for feat in df_cleaned.columns:
    if feat in feature_columns:
        df_cleaned[f'{feat}'] = (df_cleaned[feat] - df_cleaned[feat].mean()) / df_cleaned[feat].std()

# using required macronutrients as x features from the cleaned dataframe
x = [df_cleaned[col] for col in feature_columns]

(2081, 158)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [13]:
X = np.vstack([np.ones(len(x[0])), x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12]]).T
X

array([[ 1.00000000e+00, -8.38664031e-01, -8.61721528e-01, ...,
        -4.60367521e-01, -4.00992747e-01, -1.03898081e+00],
       [ 1.00000000e+00, -2.34996530e-01, -8.96456082e-01, ...,
        -6.02976445e-01, -3.95178443e-01, -5.91915440e-01],
       [ 1.00000000e+00, -2.33183715e-01, -4.30020636e-01, ...,
        -8.73903353e-02, -1.27720490e-01,  4.22955571e-01],
       ...,
       [ 1.00000000e+00,  3.25163403e-01,  4.66461675e-01, ...,
         3.69002502e-04, -3.63199775e-01,  1.29424354e+00],
       [ 1.00000000e+00,  7.33046850e-01,  9.72593755e-01, ...,
        -5.08239446e-02, -3.98085595e-01,  1.07886899e+00],
       [ 1.00000000e+00,  2.61714867e-01, -5.29026158e-02, ...,
        -2.33655898e-01,  4.38014576e-02,  8.68407344e-02]])

In [14]:
y = [1 if super_food else -1 for super_food in df_cleaned['superfood']]

In [54]:
w = np.array([1, -1, 0, -2, 1, 2, 1, 2, 1, -1, 0, -1, 1])
w

array([ 1, -1,  0, -2,  1,  2,  1,  2,  1, -1,  0, -1,  1])

In [55]:
perceptron = SGDClassifier(loss='hinge', max_iter=1000, random_state=42,
                           fit_intercept = True, alpha = 0.0001, shuffle = True)
perceptron.fit(X, y)

# Predicting the labels
print(perceptron.predict(X))
print(y)
fin_w = np.concatenate([perceptron.intercept_, perceptron.coef_[0]])
print(fin_w) #print final w vector
print(f'The final line is: x_2 = {-(fin_w[0]/fin_w[2]).round(2)} + {-(fin_w[1]/fin_w[2]).round(2)}x_1')

[-1 -1  1 ... -1 -1 -1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

In [56]:
actual_supers = 0
for i in y:
  if i == 1:
    actual_supers += 1

predicted_supers = 0
for i in perceptron.predict(X):
  if i == 1:
    predicted_supers += 1

print(actual_supers)
print(predicted_supers)

accuracy = np.mean(perceptron.predict(X) == y)
print(f"\nOverall Accuracy: {accuracy:.3f}")

156
84

Overall Accuracy: 0.965
