In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime.
from google.colab import drive, runtime
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Switch to the project folder on the mounted Drive. The path is absolute
# (rooted at the '/content/drive' mount point) so this cell can be re-run
# without failing once the working directory has already been changed.
os.chdir('/content/drive/My Drive/Colab Notebooks/UMich/SI670/Final_Proj')

In [None]:
import pandas as pd
from tqdm import tqdm
import requests
import cv2
import numpy as np
import tensorflow as tf
#from keras.applications.densenet import DenseNet121, preprocess_input
#from keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
#from keras.applications.efficientnet import EfficientNetB0, preprocess_input
from keras.applications.nasnet import NASNetMobile, preprocess_input
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Input, InputLayer, Flatten, GlobalAveragePooling2D, Dropout, concatenate, Embedding, Reshape
from keras.layers import Concatenate
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping
from PIL import Image
import zipfile
from  matplotlib import pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

In [None]:
# Single seed for the whole notebook. Besides being passed to
# train_test_split, it seeds NumPy and TensorFlow so that weight
# initialisation, dropout and image augmentation are repeatable across runs.
SEED = 670
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load Data and Preprocess
Basic preprocessing: keep only the columns we need, replace missing `meal_type` values with `UNK_meal_type`, and collapse multi-valued `region`/`meal_type` entries to a single label each.

In [None]:
# Columns kept from the recipe table: identifier, servings, the four
# per-serving nutrition targets, and the two categorical descriptors.
keep_cols = [
    'id', 'servings', 'calories_per_serving', 'protein_per_serving',
    'fat_per_serving', 'carb_per_serving', 'meal_type', 'region',
]
data = (
    pd.read_csv('clean_recipe_data_nutrition.csv')
    .loc[:, keep_cols]
    # Recipes without a meal type get an explicit placeholder category.
    .assign(meal_type=lambda df: df['meal_type'].fillna('UNK_meal_type'))
)
data.head()

Unnamed: 0,id,servings,calories_per_serving,protein_per_serving,fat_per_serving,carb_per_serving,meal_type,region
0,1,2,388,17,22,29,main,latin_american
1,3,10,795,16,39,96,dessert,latin_american
2,4,12,384,17,25,25,main,latin_american
3,8,8,381,34,22,9,UNK_meal_type,latin_american
4,9,12,427,18,12,63,main,latin_american


In [None]:
# Some recipes carry several comma-separated meal types or regions.
# Each combination seen in the data is mapped to exactly one label;
# single labels map to themselves.
fix_meal_type_dict = {
    'UNK_meal_type': 'UNK_meal_type',
    'appetizer': 'appetizer',
    'appetizer,bread': 'bread',
    'appetizer,bread,side': 'bread',
    'appetizer,dessert': 'dessert',
    'appetizer,main': 'appetizer',
    'appetizer,main,salad,side': 'main',
    'appetizer,salad': 'salad',
    'appetizer,side': 'appetizer',
    'appetizer,soups_and_stews': 'soups_and_stews',
    'bread': 'bread',
    'bread,main': 'bread',
    'dessert': 'dessert',
    'dessert,main': 'dessert',
    'dessert,salad': 'dessert',
    'drinks': 'drinks',
    'main': 'main',
    'main,salad': 'salad',
    'main,side': 'side',
    'main,side,soups_and_stews': 'main',
    'main,soups_and_stews': 'main',
    'salad': 'salad',
    'salad,side': 'salad',
    'side': 'side',
    'side,soups_and_stews': 'soups_and_stews',
    'soups_and_stews': 'soups_and_stews',
    ',main': 'main',
}
fix_region_dict = {
    'african': 'african',
    'african,european': 'african',
    'african,european,usa': 'african',
    'african,middle_eastern': 'african',
    'african,middle_eastern,usa': 'african',
    'african,usa': 'african',
    'asian': 'asian',
    'asian,austrailia_new_zealand,usa': 'asian',
    'asian,european': 'asian',
    'asian,usa': 'asian',
    'austrailia_new_zealand': 'austrailia_new_zealand',
    'austrailia_new_zealand,european': 'austrailia_new_zealand',
    'austrailia_new_zealand,usa': 'austrailia_new_zealand',
    'european': 'european',
    'european,latin_american': 'european',
    'european,middle_eastern': 'european',
    'european,middle_eastern,usa': 'european',
    'european,usa': 'european',
    'latin_american': 'latin_american',
    'latin_american,usa': 'latin_american',
    'middle_eastern': 'middle_eastern',
    'middle_eastern,usa': 'middle_eastern',
    'usa': 'usa',
}
# Apply both per-column mappings in a single pass; values without an entry
# in the corresponding dict are left untouched.
data = data.replace({
    'meal_type': fix_meal_type_dict,
    'region': fix_region_dict,
})

Load the weighted nutrition predictions (calories, protein, fat, carbs) derived from Nutritionix and the Food-101 classification.

In [None]:
# The CSV's unnamed first column is renamed to `id` so it can serve as the
# join key against the recipe table.
weighted_nutrition = (
    pd.read_csv('weighted_nutrition.csv')
    .rename(columns={'Unnamed: 0': 'id'})
)
weighted_nutrition.head()

Unnamed: 0,id,weighted_cal,weighted_pro,weighted_fat,weighted_carb
0,0,286.304811,29.143274,12.753691,12.897601
1,1,264.732791,21.452818,13.690746,13.481654
2,2,326.277472,4.935468,17.25068,39.309275
3,3,380.316082,32.387473,26.780495,0.184563
4,4,137.537077,7.995832,6.126593,13.032693


In [None]:
# Inner join on `id`: only recipes that have weighted predictions are kept.
data = data.merge(weighted_nutrition, how='inner', on='id')

In [None]:
# Preview the first rows of `data`.
data.head()

Unnamed: 0,id,servings,calories_per_serving,protein_per_serving,fat_per_serving,carb_per_serving,meal_type,region,weighted_cal,weighted_pro,weighted_fat,weighted_carb
0,1,2,388,17,22,29,main,latin_american,264.732791,21.452818,13.690746,13.481654
1,3,10,795,16,39,96,dessert,latin_american,380.316082,32.387473,26.780495,0.184563
2,4,12,384,17,25,25,main,latin_american,137.537077,7.995832,6.126593,13.032693
3,8,8,381,34,22,9,UNK_meal_type,latin_american,321.578552,15.328963,12.359435,35.918468
4,9,12,427,18,12,63,main,latin_american,394.862481,43.675238,22.45396,1.500357


# Load images, split data into train and test

In [None]:
# Decode every member of the image archive into a NumPy array, in archive
# order. Context managers make sure the archive and each member stream are
# closed once read instead of leaking open handles for the rest of the session.
images = []
with zipfile.ZipFile("Recipe_Images_224_x_224.zip") as imgzip:
    for f in tqdm(imgzip.infolist()):
        with imgzip.open(f) as ifile:
            # np.asarray materialises the pixel data while the stream is
            # still open, so the array stays valid after the member closes.
            img = np.asarray(Image.open(ifile))
        images.append(img)

100%|██████████| 8573/8573 [00:14<00:00, 610.22it/s]


In [None]:
# Combined meal_type+region label (plain string concatenation) used as the
# stratification key for the train/test split.
data = data.assign(stratify_split=data['meal_type'] + data['region'])

In [None]:
# Count how many recipes fall into each meal_type+region combination.
data['stratify_split'].value_counts()

UNK_meal_typeusa                       1075
UNK_meal_typeeuropean                   542
dessertusa                              451
maineuropean                            252
UNK_meal_typelatin_american             223
mainasian                               215
mainlatin_american                      214
mainusa                                 202
saladusa                                177
sideusa                                 138
desserteuropean                         135
breadusa                                114
soups_and_stewsusa                      108
appetizereuropean                       107
UNK_meal_typemiddle_eastern             102
breadeuropean                           100
UNK_meal_typeasian                       92
saladeuropean                            83
appetizerusa                             81
sideeuropean                             74
soups_and_stewseuropean                  64
appetizerasian                           56
dessertlatin_american           

In [None]:
# Remove the recipes belonging to these meal_type+region combinations
# (presumably too rare to be split in a stratified way -- confirm against
# the counts above).
drop_stratify_split = [
    'dessertaustrailia_new_zealand',
    'breadmiddle_eastern',
    'dessertmiddle_eastern',
]
in_dropped_stratum = data['stratify_split'].isin(drop_stratify_split)
data = data.loc[~in_dropped_stratum]

In [None]:
# 80/20 train/test split, stratified on the combined meal_type+region label
# and made repeatable through SEED.
train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    random_state=SEED,
    stratify=data[['stratify_split']],
)

In [None]:
# Pick the image for every recipe in each split.
# NOTE(review): this indexes `images` by recipe `id`, i.e. it assumes the
# archive listing order matches the ids -- confirm against the zip contents.
train_idx = train_df['id'].to_numpy()
test_idx = test_df['id'].to_numpy()
train_images = np.array([images[recipe_id] for recipe_id in train_idx])
test_images = np.array([images[recipe_id] for recipe_id in test_idx])

In [None]:
# Regression targets, one column each in this fixed order:
# calories, protein, fat, carbs (all per serving).
target_cols = [
    'calories_per_serving',
    'protein_per_serving',
    'fat_per_serving',
    'carb_per_serving',
]
train_y = np.column_stack([train_df[col].values for col in target_cols])
test_y = np.column_stack([test_df[col].values for col in target_cols])

In [None]:
# One-hot encode the two categorical columns into a dense array.
# scikit-learn renamed `sparse` to `sparse_output` (1.2) and later removed
# the old keyword (1.4), so try the new spelling first and fall back to the
# old one on releases that do not know it yet.
try:
    one_hot_enc = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_enc = OneHotEncoder(sparse=False)

# Fit the categories on the training split only, then reuse them for test.
train_df_tabular = train_df[['meal_type', 'region']]
train_tabular = one_hot_enc.fit_transform(train_df_tabular)
test_df_tabular = test_df[['meal_type', 'region']]
test_tabular = one_hot_enc.transform(test_df_tabular)

In [None]:
# Continuous inputs: the four weighted nutrition predictions.
cont_col = [
    'weighted_cal',
    'weighted_pro',
    'weighted_fat',
    'weighted_carb',
]
train_df_cont = train_df.loc[:, cont_col]
test_df_cont = test_df.loc[:, cont_col]

In [None]:
# Sanity check: shape of the one-hot encoded meal_type/region matrix.
train_tabular.shape

(4172, 16)

In [None]:
# Sanity check: shape of the continuous weighted_* feature block.
train_df_cont.to_numpy().shape

(4172, 4)

In [None]:
# Tabular model input: one-hot columns first, continuous columns after them.
train_tab_arr = np.hstack([train_tabular, train_df_cont.to_numpy()])
test_tab_arr = np.hstack([test_tabular, test_df_cont.to_numpy()])

In [None]:
# Standardise every tabular column using statistics learned from the
# training split only; the test split is transformed with the same scaler.
scaler = StandardScaler().fit(train_tab_arr)
train_tab_arr = scaler.transform(train_tab_arr)
test_tab_arr = scaler.transform(test_tab_arr)

# Create model


In [None]:
# Input image shape handed to create_cnn (and from there to NASNetMobile's input_shape).
IMG_SHAPE = (224, 224, 3)

In [None]:
def create_cnn(IMG_SHAPE, fine_tune=False, fine_tune_at=0):
    """Build the image branch: augmentation -> NASNetMobile -> pooled features.

    Args:
        IMG_SHAPE: input image shape; used for both the backbone's
            `input_shape` and the model's input layer.
        fine_tune: when True, make the last `fine_tune_at` backbone layers
            trainable (BatchNormalization layers always stay frozen).
        fine_tune_at: number of trailing backbone layers to unfreeze.
            0 keeps the whole backbone frozen.

    Returns:
        A Keras `Model` mapping an image batch to the globally average-pooled
        backbone features after dropout.
    """
    data_augmentation = tf.keras.Sequential([
        tf.keras.layers.RandomFlip('horizontal'),
        tf.keras.layers.RandomRotation(0.2),
    ])
    base_model = NASNetMobile(weights='imagenet', include_top=False, input_shape=IMG_SHAPE)
    base_model.trainable = False
    if fine_tune and fine_tune_at > 0:
        # A model whose own `trainable` flag is False exposes no trainable
        # weights, whatever its inner layers say. So switch the backbone on
        # as a whole first, then freeze everything that must stay fixed.
        base_model.trainable = True
        n_frozen = max(len(base_model.layers) - fine_tune_at, 0)
        for layer in base_model.layers[:n_frozen]:
            layer.trainable = False
        for layer in base_model.layers[n_frozen:]:
            if isinstance(layer, layers.BatchNormalization):
                layer.trainable = False

    inputs = tf.keras.Input(shape=IMG_SHAPE)
    x = data_augmentation(inputs)
    x = preprocess_input(x)
    # training=False keeps the backbone in inference mode even when some of
    # its layers are trainable.
    x = base_model(x, training=False)
    x = GlobalAveragePooling2D()(x)
    x = Dropout(0.2)(x)
    # The pooled features are the output; no Dense head is attached here.
    model = Model(inputs, x)
    return model


def create_mlp(dim):
    """Build the tabular branch: Dense(16, relu) -> Dropout(0.2) -> Dense(4, relu).

    Args:
        dim: number of input features per sample.

    Returns:
        An uncompiled Keras `Sequential` model.
    """
    return Sequential([
        Dense(16, input_dim=dim, activation='relu'),
        Dropout(0.2),
        Dense(4, activation='relu'),
    ])

In [None]:
# Two input branches: tabular features through the MLP, images through the CNN.
mlp = create_mlp(train_tab_arr.shape[1])
cnn = create_cnn(IMG_SHAPE)

# Fuse both branch outputs and regress the four nutrition targets.
combined_input = concatenate([mlp.output, cnn.output])
x = Dense(8, activation='relu')(combined_input)
x = Dense(4, activation='linear')(x)
model = tf.keras.Model(inputs=[mlp.input, cnn.input], outputs=x)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
model.compile(
    optimizer=optimizer,
    loss='huber_loss',
    metrics=['mean_absolute_error'],
)
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/nasnet/NASNet-mobile-no-top.h5
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 sequential_1 (Sequential)      (None, 224, 224, 3)  0           ['input_2[0][0]']                
                                                                                                  
 tf.math.truediv (TFOpLambda)   (None, 224, 224, 3)  0           ['sequential_1[0][0]']           
                                                                             

In [None]:
# Train with a generous epoch cap and let early stopping decide when to end:
# stop after 3 epochs without a better val_loss and restore the best weights.
initial_epochs = 500
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    [train_tab_arr, train_images],
    train_y,
    batch_size=32,
    epochs=initial_epochs,
    validation_split=0.2,  # 20% of the training arrays held out for validation
    callbacks=[es],
)

# Keep the weights of this first training stage on disk.
model.save_weights('initial_multiin_101_multiout_NASNetMobile.h5')

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500


In [None]:
#model.load_weights('initial_multiin_101_multiout_NASNetMobile.h5')

In [None]:
# Per-target mean absolute error on the held-out test split.
test_preds = model.predict([test_tab_arr, test_images])

# Column order follows test_y: calories, protein, fat, carbs.
mae_cal, mae_pro, mae_fat, mae_carb = [
    mean_absolute_error(test_y[:, col:col + 1], test_preds[:, col:col + 1])
    for col in range(4)
]
print(mae_cal, mae_pro, mae_fat, mae_carb)

147.1869055210859 9.839766522591141 9.898559170207758 17.424946706413767
