In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

import warnings

# Silence ImportWarning messages; this adds an "ignore" rule to the
# process-wide warning filters, so it affects every cell that runs afterwards.
warnings.filterwarnings("ignore", category=ImportWarning)

In [26]:
# Load the food table from a CSV file; the path is relative, so the file must
# be in the notebook's current working directory.
data = pd.read_csv('food_data.csv')
# Preview the first five rows to check the columns and value types.
data.head()

Unnamed: 0,Food,Calories,Calories from Fat,Total Fat,Total Fat.1,Sodium,Sodium.1,Potassium,Potassium.1,Total Carbo-hydrate,Total Carbo-hydrate.1,Protein,Vitamin A,Vitamin C,Calcium,Iron,Food Type
0,Asparagus,20,0,0.0,0,0,0,230,7,4,1,2,10,15,2,2,Vegetables
1,Bell Pepper,25,0,0.0,0,40,2,220,6,6,2,1,4,190,2,4,Vegetables
2,Broccoli,45,0,0.5,1,80,3,460,13,8,3,4,6,220,6,6,Vegetables
3,Carrot,30,0,0.0,0,60,3,250,7,7,2,1,110,10,2,2,Vegetables
4,Cauliflower,25,0,0.0,0,30,1,270,8,5,2,2,0,100,2,2,Vegetables


We are going to predict the food type (Vegetables, Fruits or Seafood) from all the numeric variables in the data.

In [27]:
## Select the independent and dependent variables

# Use the food names as the row index so that later outputs are labelled by food.
# `.iloc[:, 0]` picks the first column by position (the 'Food' column here).
# Note: the 'Food' column itself also remains in the frame as a regular column.
data.index = data.iloc[:,0]
data.index

Index(['Asparagus', 'Bell Pepper', 'Broccoli', 'Carrot', 'Cauliflower',
       'Celery', 'Cucumber', 'Green (Snap) Beans', 'Green Cabbage',
       'Green Onion', 'Iceberg Lettuce', 'Leaf Lettuce', 'Mushrooms', 'Onion',
       'Potato', 'Radishes', 'Summer Squash', 'Sweet Corn', 'Sweet Potato',
       'Tomato', 'Apple', 'Avocado', 'Banana', 'Cantaloupe', 'Grapefruit',
       'Grapes', 'Honeydew Melon', 'Kiwifruit', 'Lemon', 'Lime', 'Nectarine',
       'Orange', 'Peach', 'Pear', 'Pineapple', 'Plums', 'Strawberries',
       'Sweet Cherries', 'Tangerine', 'Watermelon', 'Blue Crab', 'Catfish',
       'Clams', 'Cod', 'Flounder/Sole', 'Haddock', 'Halibut', 'Lobster',
       'Ocean Perch', 'Orange Roughy', 'Oysters', 'Pollock', 'Rainbow Trout',
       'Rockfish', 'Salmon, Atlantic/Coho/Sockeye /Chinook', 'Salmon, Pink',
       'Scallops', 'Shrimp', 'Swordfish', 'Tilapia', 'Tuna'],
      dtype='object', name='Food')

In [28]:
# Independent variables (features): every column except the food name and the
# 'Food Type' label, i.e. the 15 numeric columns from 'Calories' to 'Iron'.
# The columns are excluded by name rather than with the positional slice 1:16,
# so the selection stays correct even if the column order in the CSV changes.
data_to_use = data.drop(columns=['Food', 'Food Type'])
data_to_use

Unnamed: 0_level_0,Calories,Calories from Fat,Total Fat,Total Fat.1,Sodium,Sodium.1,Potassium,Potassium.1,Total Carbo-hydrate,Total Carbo-hydrate.1,Protein,Vitamin A,Vitamin C,Calcium,Iron
Food,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Asparagus,20,0,0.0,0,0,0,230,7,4,1,2,10,15,2,2
Bell Pepper,25,0,0.0,0,40,2,220,6,6,2,1,4,190,2,4
Broccoli,45,0,0.5,1,80,3,460,13,8,3,4,6,220,6,6
Carrot,30,0,0.0,0,60,3,250,7,7,2,1,110,10,2,2
Cauliflower,25,0,0.0,0,30,1,270,8,5,2,2,0,100,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Scallops,140,10,1.0,2,310,13,430,12,5,2,27,2,0,4,14
Shrimp,100,10,1.5,2,240,10,220,6,0,0,21,4,4,6,10
Swordfish,120,50,6.0,9,100,4,310,9,0,0,16,2,2,0,6
Tilapia,110,20,2.5,4,30,1,360,10,0,0,22,0,2,0,2


In [29]:
# Dependent variable (target): the food category the classifier will predict.
# Selected by column name instead of position 16, so it does not silently pick
# a different column if the column order in the CSV changes.
data_to_target = data['Food Type']
data_to_target

Food
Asparagus      Vegetables
Bell Pepper    Vegetables
Broccoli       Vegetables
Carrot         Vegetables
Cauliflower    Vegetables
                  ...    
Scallops          Seafood
Shrimp            Seafood
Swordfish         Seafood
Tilapia           Seafood
Tuna              Seafood
Name: Food Type, Length: 61, dtype: object

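test_size: This parameter controls how much of the dataset is allocated to the testing set. A float between 0 and 1 is interpreted as a proportion of the data, while an integer is interpreted as an absolute number of samples. In this case, test_size=20 is an integer, so 20 of the 61 rows (about 33%) are allocated to the testing set and the remaining 41 rows are allocated to the training set. To hold out 20% of the data instead, pass test_size=0.2.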



In [30]:
# Split features and labels into training and test sets.
# test_size is an integer here, so exactly 20 rows are held out for testing
# (scikit-learn reads an int as a sample count and a float as a proportion).
# random_state fixes the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(data_to_use, data_to_target, test_size=20, random_state=32)

In [31]:
# Create a Gaussian naive Bayes classifier with scikit-learn's default settings
gaussian_model = GaussianNB()

# Train the classifier on the training features and their food-type labels
gaussian_model.fit(X_train, y_train)

In [32]:
# Predict a food type for every row of the held-out test features
prediction = gaussian_model.predict(X_test)

In [33]:
# Put the actual and predicted labels side by side, one row per test food
result = dict(Actual_food_type=y_test, Predicted_food_type=prediction)
result_df = pd.DataFrame(data=result)

# The predictions are not perfectly accurate, presumably because the model
# does not have enough examples to learn the patterns reliably
result_df

Unnamed: 0_level_0,Actual_food_type,Predicted_food_type
Food,Unnamed: 1_level_1,Unnamed: 2_level_1
Halibut,Seafood,Seafood
Mushrooms,Vegetables,Vegetables
Tuna,Seafood,Seafood
Rockfish,Seafood,Seafood
Avocado,Fruits,Seafood
Shrimp,Seafood,Seafood
"Salmon, Pink",Seafood,Seafood
Flounder/Sole,Seafood,Seafood
Lemon,Fruits,Fruits
Banana,Fruits,Fruits


In [34]:
## Accuracy
# Fraction of test rows whose predicted food type equals the actual food type;
# the recorded run prints 0.8, i.e. 80% of the test rows were classified correctly.
# Other metrics from sklearn.metrics can be used as well.
print('Accuracy:', metrics.accuracy_score(y_test, prediction))

Accuracy: 0.8
