In [1]:
import os
from google.colab import drive

# Mount Google Drive, then move into the project folder so the relative file
# names used by later cells resolve.
drive.mount('/content/drive')
# Use an absolute path: the relative form only resolves while the working
# directory is still /content, so re-running this cell after the first chdir
# would raise FileNotFoundError.
os.chdir('/content/drive/My Drive/Colab Notebooks/UMich/SI670/Final_Proj')

Mounted at /content/drive


In [2]:
# List the files in the current working directory.
!ls

calorie_regression_basic_densenet121.ipynb
calorie_regression_multiin_multiout_densenet121.ipynb
calorie_regression_multiout_densenet121.ipynb
clean_recipe_data_nutrition.csv
get_recipe_images.ipynb
recipe_data_nutrition.csv
Recipe_Images
Recipe_Images_224_x_224
Recipe_Images_224_x_224.zip


In [3]:
import pandas as pd
from tqdm import tqdm
import requests
import cv2
import numpy as np
import tensorflow as tf
from keras.applications.densenet import DenseNet121, preprocess_input
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Input, InputLayer, Flatten, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping
from PIL import Image
import zipfile
from  matplotlib import pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

In [4]:
# Random seed; passed as `random_state` to train_test_split so the split is reproducible.
SEED = 670

# Load Data and Preprocess

Basic preprocessing: keep only the needed columns and replace missing `meal_type` and `subregion` values with `UNK_*` placeholders.

In [5]:
# Columns kept from the raw CSV: recipe id, serving count, the four
# per-serving nutrition values, and the categorical descriptors.
keep_columns = [
    'id', 'servings', 'calories_per_serving', 'protein_per_serving',
    'fat_per_serving', 'carb_per_serving', 'meal_type', 'region', 'subregion'
]
# Build the frame in a single chain so `data` is a fresh object: assigning
# columns onto a column-subset selection can trigger pandas'
# SettingWithCopyWarning.  Missing meal_type / subregion values get explicit
# "unknown" placeholders; other columns are left untouched.
data = (
    pd.read_csv('clean_recipe_data_nutrition.csv')[keep_columns]
    .fillna({'meal_type': 'UNK_meal_type', 'subregion': 'UNK_subregion'})
)
data.head()

Unnamed: 0,id,servings,calories_per_serving,protein_per_serving,fat_per_serving,carb_per_serving,meal_type,region,subregion
0,1,2,388,17,22,29,main,latin_american,mexican
1,3,10,795,16,39,96,dessert,latin_american,mexican
2,4,12,384,17,25,25,main,latin_american,mexican
3,8,8,381,34,22,9,UNK_meal_type,latin_american,carribean
4,9,12,427,18,12,63,main,latin_american,mexican


# Split data into train and test

In [6]:
# Hold out 20% of the recipes for testing; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    random_state=SEED,
)

In [7]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,id,servings,calories_per_serving,protein_per_serving,fat_per_serving,carb_per_serving,meal_type,region,subregion
4971,8160,4,127,4,9,9,salad,european,italian
2573,4249,6,300,6,12,44,side,asian,indian
786,1354,8,395,11,17,48,UNK_meal_type,european,"scandi,vian"
3684,6038,25,318,26,6,41,UNK_meal_type,european,"scandi,vian"
435,784,35,141,7,8,11,UNK_meal_type,latin_american,mexican


In [8]:
# Recipe ids belonging to each split, as NumPy arrays.
train_idx, test_idx = (split['id'].values for split in (train_df, test_df))

In [9]:
# Target matrix per split: one row per recipe, columns ordered as
# calories, protein, fat, carbs.
target_columns = (
    'calories_per_serving',
    'protein_per_serving',
    'fat_per_serving',
    'carb_per_serving',
)
train_y = np.column_stack([train_df[col].values for col in target_columns])
test_y = np.column_stack([test_df[col].values for col in target_columns])

# Baseline 

## Baseline 1 using the overall training-set average

In [19]:
# Baseline 1: predict the training-set mean of each target for every test row.
n_test = len(test_y)
dummy_preds_cal, dummy_preds_pro, dummy_preds_fat, dummy_preds_carb = (
    [train_df[target].mean()] * n_test
    for target in (
        'calories_per_serving',
        'protein_per_serving',
        'fat_per_serving',
        'carb_per_serving',
    )
)

# Score each target against its own column of test_y (sliced so it stays 2-D).
dum_mae_cal, dum_mae_pro, dum_mae_fat, dum_mae_carb = (
    mean_absolute_error(test_y[:, col:col + 1], preds)
    for col, preds in enumerate(
        (dummy_preds_cal, dummy_preds_pro, dummy_preds_fat, dummy_preds_carb)
    )
)
print(dum_mae_cal, dum_mae_pro, dum_mae_fat, dum_mae_carb)

169.2281044806938 13.27684997820451 10.889347053020396 18.26553398949228


## Baseline 2 using average grouped by `meal_type`

In [10]:
# Some recipes carry several comma-separated meal types; each combination is
# collapsed to a single label.  The table below is organised by the label a
# combination is assigned to, then flattened into a {raw label: single label} dict.
_meal_type_sources = {
    'UNK_meal_type': ['UNK_meal_type'],
    'appetizer': ['appetizer', 'appetizer,main', 'appetizer,side'],
    'bread': ['appetizer,bread', 'appetizer,bread,side', 'bread', 'bread,main'],
    'dessert': ['appetizer,dessert', 'dessert', 'dessert,main', 'dessert,salad'],
    'drinks': ['drinks'],
    'main': [
        'appetizer,main,salad,side',
        'main',
        'main,side,soups_and_stews',
        'main,soups_and_stews',
        ',main',
    ],
    'salad': ['appetizer,salad', 'main,salad', 'salad', 'salad,side'],
    'side': ['main,side', 'side'],
    'soups_and_stews': [
        'appetizer,soups_and_stews',
        'side,soups_and_stews',
        'soups_and_stews',
    ],
}
fix_meal_type_dict = {
    raw_label: single_label
    for single_label, raw_labels in _meal_type_sources.items()
    for raw_label in raw_labels
}
# Apply the mapping to the meal_type column of both splits.
train_df, test_df = (
    split.replace({'meal_type': fix_meal_type_dict})
    for split in (train_df, test_df)
)

In [12]:
# Training-split mean of each target per meal type; each result is a frame
# holding the meal_type column next to the averaged target column.
by_meal_type = train_df.groupby('meal_type', as_index=False)
meal_type_means_cal = by_meal_type['calories_per_serving'].mean()
meal_type_means_pro = by_meal_type['protein_per_serving'].mean()
meal_type_means_fat = by_meal_type['fat_per_serving'].mean()
meal_type_means_carb = by_meal_type['carb_per_serving'].mean()

In [14]:
# Baseline 2 predictions: every test recipe gets the training mean of its meal type.
# A vectorised Series.map replaces filtering the means frame once per test row.
# Meal types that never occur in the training split fall back to the overall
# training mean; the previous row-wise `.values[0]` lookup raised IndexError for them.
_meal_type_means = {
    'calories_per_serving': meal_type_means_cal,
    'protein_per_serving': meal_type_means_pro,
    'fat_per_serving': meal_type_means_fat,
    'carb_per_serving': meal_type_means_carb,
}
_meal_type_preds = {}
for target_col, means_df in _meal_type_means.items():
    # meal_type -> mean lookup table for this target.
    lookup = means_df.set_index('meal_type')[target_col]
    preds = test_df['meal_type'].map(lookup).fillna(train_df[target_col].mean())
    # Column vector of shape (n_test, 1).
    _meal_type_preds[target_col] = preds.to_numpy(dtype=float).reshape(-1, 1)

dummy_preds_cal = _meal_type_preds['calories_per_serving']
dummy_preds_pro = _meal_type_preds['protein_per_serving']
dummy_preds_fat = _meal_type_preds['fat_per_serving']
dummy_preds_carb = _meal_type_preds['carb_per_serving']

In [15]:
# Mean absolute error per target: column k of test_y (sliced so it stays 2-D)
# is compared with the matching prediction vector.
dum_mae_cal, dum_mae_pro, dum_mae_fat, dum_mae_carb = (
    mean_absolute_error(test_y[:, col:col + 1], preds)
    for col, preds in enumerate(
        (dummy_preds_cal, dummy_preds_pro, dummy_preds_fat, dummy_preds_carb)
    )
)
print(dum_mae_cal, dum_mae_pro, dum_mae_fat, dum_mae_carb)

156.55781950810103 10.195439796931778 10.241503833421401 17.28250919577458


## Baseline 3 using average grouped by `region`

In [26]:
# Every distinct region label appearing in either split, sorted.
np.unique([*train_df['region'].unique(), *test_df['region'].unique()])

array(['african', 'african,european', 'african,european,usa',
       'african,middle_eastern', 'african,middle_eastern,usa',
       'african,usa', 'asian', 'asian,austrailia_new_zealand,usa',
       'asian,european', 'asian,usa', 'austrailia_new_zealand',
       'austrailia_new_zealand,european', 'austrailia_new_zealand,usa',
       'european', 'european,latin_american', 'european,middle_eastern',
       'european,middle_eastern,usa', 'european,usa', 'latin_american',
       'latin_american,usa', 'middle_eastern', 'middle_eastern,usa',
       'usa'], dtype='<U32')

In [29]:
# Recipes tagged with several comma-separated regions are assigned to the first
# region listed (e.g. 'african,european,usa' -> 'african').  The mapping is built
# from the labels actually present in the two splits instead of a hand-written
# table, so a combination that was never enumerated by hand is collapsed too.
_region_labels = pd.concat([train_df['region'], test_df['region']]).dropna().unique()
fix_region_dict = {label: label.split(',')[0] for label in _region_labels}
train_df = train_df.replace({'region': fix_region_dict})
test_df = test_df.replace({'region': fix_region_dict})

In [30]:
# Training-split mean of each target per region; each result is a frame
# holding the region column next to the averaged target column.
by_region = train_df.groupby('region', as_index=False)
region_means_cal = by_region['calories_per_serving'].mean()
region_means_pro = by_region['protein_per_serving'].mean()
region_means_fat = by_region['fat_per_serving'].mean()
region_means_carb = by_region['carb_per_serving'].mean()

In [31]:
# Baseline 3 predictions: every test recipe gets the training mean of its region.
# A vectorised Series.map replaces filtering the means frame once per test row.
# Regions that never occur in the training split fall back to the overall
# training mean; the previous row-wise `.values[0]` lookup raised IndexError for them.
_region_means = {
    'calories_per_serving': region_means_cal,
    'protein_per_serving': region_means_pro,
    'fat_per_serving': region_means_fat,
    'carb_per_serving': region_means_carb,
}
_region_preds = {}
for target_col, means_df in _region_means.items():
    # region -> mean lookup table for this target.
    lookup = means_df.set_index('region')[target_col]
    preds = test_df['region'].map(lookup).fillna(train_df[target_col].mean())
    # Column vector of shape (n_test, 1).
    _region_preds[target_col] = preds.to_numpy(dtype=float).reshape(-1, 1)

dummy_preds_cal = _region_preds['calories_per_serving']
dummy_preds_pro = _region_preds['protein_per_serving']
dummy_preds_fat = _region_preds['fat_per_serving']
dummy_preds_carb = _region_preds['carb_per_serving']

In [32]:
# Mean absolute error of the region baseline, one value per target
# (calories, protein, fat, carbs), each scored against its own test_y column.
_region_baseline_preds = (
    dummy_preds_cal,
    dummy_preds_pro,
    dummy_preds_fat,
    dummy_preds_carb,
)
dum_mae_cal, dum_mae_pro, dum_mae_fat, dum_mae_carb = [
    mean_absolute_error(test_y[:, k:k + 1], _region_baseline_preds[k])
    for k in range(len(_region_baseline_preds))
]
print(dum_mae_cal, dum_mae_pro, dum_mae_fat, dum_mae_carb)

168.17948249596085 13.00992934045323 10.86428218331197 18.283117015916783
