In [None]:
import os
from google.colab import drive

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')
# Use an absolute path so this cell can be re-run: a relative 'drive/...'
# path only resolves while the working directory is still /content.
os.chdir('/content/drive/My Drive/Colab Notebooks/UMich/SI670/Final_Proj')

Mounted at /content/drive


In [None]:
%pip install transformers
%pip install datasets

In [None]:
from transformers import AutoFeatureExtractor, AutoModelForImageClassification
from datasets import load_dataset, Image, Dataset
import torch
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
import numpy as np
from scipy.special import softmax
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Seed constant; not referenced by any of the cells visible in this notebook.
SEED = 670

1. Load recipe data to get IDs
2. Load nutrition101 data to get the corresponding nutrition values of the 101 items
3. Load food101 dataset to get dicts of labels to IDs (save dicts to JSON so we don't need to load food 101 in the future)
4. Load pretrained image classification model
5. Create dataset from scraped images
6. dataset -> extractor -> model -> logits
7. Get indexes of top 5 logits and their corresponding logit values
8. Compute a weighted average of the corresponding nutrition101 values for each image
9. Save weighted averages to CSV and use as features

In [None]:
# Columns of the recipe table that this notebook keeps.
recipe_columns = [
    'id', 'servings', 'calories_per_serving', 'protein_per_serving',
    'fat_per_serving', 'carb_per_serving', 'meal_type', 'region', 'subregion',
]
# Select and fill in a single chain so the result is a fresh DataFrame;
# assigning columns on a column-subset can trigger SettingWithCopyWarning.
# Only 'meal_type' and 'subregion' get a placeholder for missing values.
data = (
    pd.read_csv('clean_recipe_data_nutrition.csv')[recipe_columns]
    .fillna({'meal_type': 'UNK_meal_type', 'subregion': 'UNK_subregion'})
)
data

Unnamed: 0,id,servings,calories_per_serving,protein_per_serving,fat_per_serving,carb_per_serving,meal_type,region,subregion
0,1,2,388,17,22,29,main,latin_american,mexican
1,3,10,795,16,39,96,dessert,latin_american,mexican
2,4,12,384,17,25,25,main,latin_american,mexican
3,8,8,381,34,22,9,UNK_meal_type,latin_american,carribean
4,9,12,427,18,12,63,main,latin_american,mexican
...,...,...,...,...,...,...,...,...,...
5214,8566,6,59,3,3,7,side,"european,usa","italian,jewish"
5215,8567,6,235,5,10,34,side,usa,jewish
5216,8568,8,125,6,4,19,main,usa,jewish
5217,8569,4,290,10,16,29,UNK_meal_type,middle_eastern,turkish


In [None]:
# Read the per-dish nutrition table and expose the CSV's unnamed leading
# column under the name 'id'.
nutrition_101 = pd.read_csv('food101_nutritionix.csv').rename(
    columns={'Unnamed: 0': 'id'}
)

In [None]:
# Preview the first rows of the nutrition table.
nutrition_101.head()

Unnamed: 0,id,total_calories,total_protein,total_fat,total_carb,dish_name
0,0,296.25,2.38,13.75,42.5,apple_pie
1,1,110.34,8.01,7.48,2.2,baby_back_ribs
2,2,306.32,5.46,19.83,29.41,baklava
3,3,180.97,13.19,13.13,1.97,beef_carpaccio
4,4,551.58,32.61,43.91,5.72,beef_tartare


In [None]:
# The preprocessor and the classifier are both loaded from the same checkpoint.
checkpoint = "skylord/swin-finetuned-food101"
extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
model = AutoModelForImageClassification.from_pretrained(checkpoint)
#food = load_dataset("food101")

Downloading:   0%|          | 0.00/240 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.81k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/348M [00:00<?, ?B/s]

In [None]:
#labels = food['validation'].features["label"].names
#label2id, id2label = dict(), dict()
#for i, label in enumerate(labels):
#    label2id[label] = str(i)
#    id2label[str(i)] = label

In [None]:
#with open("food101_label2id.json", "w") as outfile:
#    json.dump(label2id, outfile)
#
#with open("food101_id2label.json", "w") as outfile:
#    json.dump(id2label, outfile)

In [None]:
# Read the label -> id and id -> label lookups back from their JSON files.
with open("food101_label2id.json", 'r') as label2id_file:
    label2id = json.loads(label2id_file.read())

with open("food101_id2label.json", 'r') as id2label_file:
    id2label = json.loads(id2label_file.read())

In [None]:
# os.listdir returns entries in arbitrary order, so sort the names: the row
# order of everything derived from this list is then reproducible. Anything
# that aligns rows by re-listing this directory must use the same sorted order.
img_path_list = [
    f"./Recipe_Images_224_x_224/{img}"
    for img in sorted(os.listdir('Recipe_Images_224_x_224'))
]

In [None]:
# Wrap the image paths in a Dataset and cast the column to the Image feature.
image_paths = {"image": img_path_list}
dataset = Dataset.from_dict(image_paths).cast_column("image", Image())

In [None]:
# Number of images sent through the extractor and model per loop iteration.
BATCH_SIZE = 16

In [None]:
# Classify the images batch by batch, keeping one list of raw logits per image.
logits_list = []
for start in tqdm(range(0, len(dataset), BATCH_SIZE)):
    batch_images = dataset[start:start + BATCH_SIZE]['image']
    inputs = extractor(batch_images, return_tensors='pt')
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**inputs)
    logits_list.extend(outputs.logits.tolist())


100%|██████████| 536/536 [1:36:03<00:00, 10.75s/it]


In [None]:
# Dump the logits as the text representation of the nested Python list.
with open("file.txt", "w") as logits_file:
    logits_file.write(repr(logits_list))

In [None]:
def get_indices_and_weights(logits_list, top_n=5):
    """Pick the ``top_n`` most probable classes per row and renormalise them.

    Parameters
    ----------
    logits_list : array-like of shape (n_rows, n_classes)
        Raw classifier logits, one row per input.
    top_n : int, default 5
        Number of highest-probability classes to keep per row.

    Returns
    -------
    indices : numpy.ndarray of shape (n_rows, top_n)
        Column indices of the kept classes, sorted ascending within each row.
    rescaled_probs : numpy.ndarray of shape (n_rows, top_n)
        Softmax probabilities of those classes, in the same order as
        ``indices``, divided by their row sum so that each row sums to 1.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1 or larger than the number of classes,
        or if a non-empty input is not two-dimensional.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    logits_arr = np.asarray(logits_list, dtype=float)
    if logits_arr.ndim in (1, 2) and logits_arr.shape[0] == 0:
        # No rows at all: return empty results with the expected width.
        return (
            np.empty((0, top_n), dtype=np.intp),
            np.empty((0, top_n), dtype=float),
        )
    if logits_arr.ndim != 2:
        raise ValueError(
            f"logits_list must be 2-D (rows x classes), got {logits_arr.ndim}-D"
        )

    n_classes = logits_arr.shape[1]
    if top_n > n_classes:
        raise ValueError(
            f"top_n={top_n} exceeds the number of classes ({n_classes})"
        )

    softmax_arr = softmax(logits_arr, axis=1)
    # argpartition places the top_n largest entries in the last top_n slots
    # (in no particular order); sorting the indices fixes the column order.
    top_unsorted = np.argpartition(softmax_arr, -top_n, axis=1)[:, -top_n:]
    indices = np.sort(top_unsorted, axis=1)
    # Gather all rows at once instead of looping over them in Python.
    probs_arr = np.take_along_axis(softmax_arr, indices, axis=1)
    rescaled_probs = probs_arr / probs_arr.sum(axis=1, keepdims=True)
    return indices, rescaled_probs


def get_weighted_nutrition_avg(indices_row, weights, nutrition_101):
    """Weighted sum of the nutrition values of the selected table rows.

    Parameters
    ----------
    indices_row : sequence of int
        Row positions into ``nutrition_101`` (looked up with ``.iloc``).
    weights : sequence of float
        One weight per entry of ``indices_row``, in the same order.
    nutrition_101 : pandas.DataFrame
        Table with the columns 'total_calories', 'total_protein',
        'total_fat' and 'total_carb'.

    Returns
    -------
    list of float
        ``[calories, protein, fat, carb]``, each the sum over the selected
        rows of ``weight * value``. All zeros when no rows are selected.

    Raises
    ------
    ValueError
        If ``indices_row`` and ``weights`` differ in length.
    """
    # NOTE(review): rows are addressed by position, which presumes the table's
    # row order matches the classifier's class ids - confirm against the CSV.
    nutrient_cols = ['total_calories', 'total_protein', 'total_fat', 'total_carb']

    positions = np.asarray(indices_row, dtype=np.intp)
    row_weights = np.asarray(weights, dtype=float)
    if positions.shape != row_weights.shape:
        raise ValueError(
            "indices_row and weights must have the same length, got "
            f"{positions.shape} and {row_weights.shape}"
        )

    # One positional lookup for all selected rows instead of four per row.
    values = nutrition_101.iloc[positions][nutrient_cols].to_numpy(dtype=float)
    weighted = (row_weights[:, np.newaxis] * values).sum(axis=0)
    return weighted.tolist()

In [None]:
# Top-class indices and renormalised probabilities for every image.
indices, rescaled_probs = get_indices_and_weights(logits_list)
df_cols = ['weighted_cal', 'weighted_pro', 'weighted_fat', 'weighted_carb']
# Collect one row of weighted nutrition values per image and build the
# DataFrame once: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row copies it on every iteration.
nutrition_rows = [
    get_weighted_nutrition_avg(indices[row], rescaled_probs[row], nutrition_101)
    for row in tqdm(range(indices.shape[0]))
]
weighted_nutrition = pd.DataFrame(nutrition_rows, columns=df_cols)

100%|██████████| 8573/8573 [00:00<00:00, 139515.58it/s]
100%|██████████| 8573/8573 [00:28<00:00, 303.79it/s]


In [None]:
# Display the computed per-image weighted nutrition features.
weighted_nutrition

Unnamed: 0,weighted_cal,weighted_pro,weighted_fat,weighted_carb
0,286.304811,29.143274,12.753691,12.897601
1,264.732791,21.452818,13.690746,13.481654
2,326.277472,4.935468,17.250680,39.309275
3,380.316082,32.387473,26.780495,0.184563
4,137.537077,7.995832,6.126593,13.032693
...,...,...,...,...
8568,462.054413,49.766016,27.606067,0.003394
8569,352.948130,5.463504,29.552034,16.782724
8570,652.438868,28.582712,27.925315,71.613772
8571,631.974366,27.580930,27.191890,69.172316


In [None]:
# Save the features to CSV in the working directory. The `index` argument is
# left at its default, so the row index is presumably written as an unnamed
# leading column.
weighted_nutrition.to_csv('weighted_nutrition.csv')