Version 3 of FoodItemClassification based on CNN by Kim https://arxiv.org/abs/1408.5882

FoodItem2.1: Has 4 filters (1,3,4,5) 




In [None]:
# Mount Google Drive at /content/drive; later cells copy models and datasets
# from paths below drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Set `createData` to `True` to extract the train and test data from the USDA dataset. Otherwise, the code expects two pickled datasets at:
* `data/dataset_train`
* `data/dataset_test`

In [None]:
# True: download the USDA dump and rebuild the train/test datasets.
# False: load the previously saved datasets from data/dataset_train and data/dataset_test.
createData = False

Copy model data (embedding model and bigram model) created by `WordEmbeddingModel.ipynb`

In [None]:
!mkdir data
!mkdir data/models
!cp drive/MyDrive/Uni/Masterarbeit/data/embeddingmodel.zip data/models/zip.zip
!unzip -o data/models/zip.zip -d  data/models 

* Copy train dataset from google drive (if not available set `createData=True` to create)
* Copy test dataset from google drive (if not available set `createData=True` to create)
* Copy a selection from the crawled data from google drive (use `misc/export_data.py` for export)

In [None]:
# Copy the saved train/test datasets and the crawled CSV from Drive into ./data.
!cp drive/MyDrive/Uni/Masterarbeit/data/dataset_train data/dataset_train
!cp drive/MyDrive/Uni/Masterarbeit/data/dataset_test data/dataset_test
!cp drive/MyDrive/Uni/Masterarbeit/data/crawled.csv data/crawled.csv

In [None]:
%%capture
# Install or upgrade the wandb client; %%capture suppresses the cell output.
!pip install wandb --upgrade

In [None]:
import pandas as pd
import math
import numpy as np
import torch
import torchvision
from torch.nn import Module
from torch.utils.data import Dataset, DataLoader
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import random
from gensim.models.phrases import Phrases, Phraser
import nltk
from nltk.tokenize import RegexpTokenizer
import re
import time
import urllib.request
import os.path
import wandb

Define the mapping dict used to reduce the number of categories: every category in a value list is mapped to the corresponding dict key. Categories listed in `remove` are dropped entirely.

In [None]:
# Merged category name (key) -> list of original category names that get_data()
# folds into it via unify_cat(). Several entries appear in multiple spellings
# (double space vs. en dash); they are kept verbatim because they are compared
# against the raw category strings.
mapping = {'Baking':['Baking Additives & Extracts','Baking Decorations & Dessert Toppings','Baking/Cooking Mixes/Supplies'],
'Biscuits/Cookies':['Biscuits/Cookies','Biscuits/Cookies (Shelf Stable)'],
'Bread':['Bread','Bread & Muffin Mixes','Breads & Buns'],
'Cakes':['Cake, Cookie & Cupcake Mixes','Cakes - Sweet (Frozen)','Cakes, Cupcakes, Snack Cakes','Croissants, Sweet Rolls, Muffins & Other Pastries','Pies/Pastries - Sweet (Shelf Stable)'],
'Soup':['Canned Soup','Canned Condensed Soup','Other Soups','Chili & Stew'],
'Cereal':['Cereal','Cereals Products - Ready to Eat (Shelf Stable)','Processed Cereal Products'],
'Chips, Pretzels & Snacks': ['Chips, Pretzels & Snacks','Chips/Crisps/Snack Mixes - Natural/Extruded (Shelf Stable)','Popcorn, Peanuts, Seeds & Related Snacks','Wholesome Snacks','Crackers & Biscotti','Flavored Snack Crackers','Other Snacks','Snacks'],
'Fish & Seafood': ['Fish  Unprepared/Unprocessed','Fish & Seafood','Fish – Unprepared/Unprocessed','Canned Seafood','Canned Tuna','Frozen Fish & Seafood','Shellfish Unprepared/Unprocessed'],
'Chocolate':['Confectionery Products','Chocolate'],
'Oils & Butters':['Oils Edible','Nut & Seed Butters','Butter & Spread','Vegetable & Cooking Oils'],
'Dough Based Products':['Dough Based Products / Meals','Frozen Bread & Dough'],
'Bacon, Sausages & Ribs':['Frozen Bacon, Sausages & Ribs','Frozen Sausages, Hotdogs & Brats','Bacon, Sausages & Ribs','Sausages, Hotdogs & Brats'],
'Flours & Grains':['Flours & Corn Meal','Grains/Flour'],
'Herbs & Spices':['Herbs & Spices','Herbs/Spices/Extracts'],
'Meat/Poultry/Other Animals':['Meat/Poultry/Other Animals  Prepared/Processed','Meat/Poultry/Other Animals  Unprepared/Unprocessed',
                              'Meat/Poultry/Other Animals Sausages  Prepared/Processed','Meat/Poultry/Other Animals Sausages – Prepared/Processed',
                              'Meat/Poultry/Other Animals – Prepared/Processed','Meat/Poultry/Other Animals – Unprepared/Unprocessed',
                              'Other Frozen Meats','Poultry, Chicken & Turkey','Canned Meat','Frozen Poultry, Chicken & Turkey','Other Meats','Frozen Patties and Burgers'],
'Non Alcoholic Beverages':['Non Alcoholic Beverages  Not Ready to Drink','Non Alcoholic Beverages  Ready to Drink','Non Alcoholic Beverages – Ready to Drink'],
'Cooking Sauces':['Cooking Sauces','Oriental, Mexican & Ethnic Sauces','Other Cooking Sauces'],
'Pasta/Noodles':['Pasta by Shape & Type','Pasta/Noodles','All Noodles'],
'Subs, Sandwiches, Wraps & Burittos':['Prepared Wraps and Burittos','Prepared Subs & Sandwiches'],
'Vegetables':['Vegetables  Prepared/Processed','Vegetables – Prepared/Processed','Canned Vegetables','Tomatoes','Frozen Vegetables'],
'Vegetable Based Products / Meals':['Vegetable Based Products / Meals','Vegetable Based Products / Meals - Not Ready to Eat (Frozen)'],
'Pancakes, Waffles, French Toast & Crepes':['Frozen Pancakes, Waffles, French Toast & Crepes', 'Pancakes, Waffles, French Toast & Crepes'],
'Entrees, Sides & Small Meals': ['Entrees, Sides & Small Meals',"Frozen Appetizers & Hors D'oeuvres",'Frozen Prepared Sides'],
'Fruit  Prepared/Processed':['Fruit  Prepared/Processed','Fruit - Prepared/Processed (Shelf Stable)']}
# Categories whose rows get_data() drops completely.
remove = ["Baby/Infant  Foods/Beverages","Miscellanious"]

Define functions later used:


*   `get_device()`: returns cuda if available otherwise cpu
*   `unify_cat(dataframe, col, cat_list,new_cat)`: put all categories in `cat_list` into `new_cat` for a given pandas `dataframe`
*   `get_data(file_path)`: reads in the original usda csv data branded_food.csv  and food.csv
*   `get_splitted_data(file_path, stratify=True, max_data= -1)`: reads in the original usda csv data and splits it into a train and a test set (80/20). `stratify` controls whether the split is stratified by category; `max_data` limits the number of sampled rows (`-1` uses all rows)
*   `early_stopping(early_stop_after)`: returns a function that decides if early stopping should be applied based on `early_stop_after`
*   `test_crawled(curr_model,categories)`: returns dataframe that gives an overview about the categories for the MyFitnessPal data in `crawled.csv`

In [None]:
def get_device():
    """Return the `cuda:0` device when CUDA is available, otherwise the CPU device."""
    return torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  
def unify_cat(dataframe, col, cat_list,new_cat):
  """Merge several categories of a categorical column into one category.

  Every row whose value in `col` is contained in `cat_list` is relabelled to
  `new_cat`; categories that end up without rows are dropped afterwards.
  The dataframe is modified in place and nothing is returned.

  Args:
    dataframe: pandas DataFrame holding the column.
    col: name of a column with categorical dtype.
    cat_list: category names to be replaced.
    new_cat: name of the merged category (added if it does not exist yet).
  """
  # The categorical accessor methods no longer accept `inplace=True`
  # (removed in pandas 2.0), so the results are assigned back explicitly.
  if new_cat not in dataframe[col].cat.categories:
    dataframe[col] = dataframe[col].cat.add_categories(new_cat)
  dataframe.loc[dataframe[col].isin(cat_list), col] = new_cat
  dataframe[col] = dataframe[col].cat.remove_unused_categories()

def get_data(file_path):
  """Load the USDA branded food data and return a cleaned dataframe.

  Args:
    file_path: pair `(branded_food_csv, food_csv)`; both files are joined
      on their `fdc_id` column.

  Returns:
    DataFrame with the string columns `description` and `brand` and the
    categorical column `category`. Only categories with more than 100 rows
    are kept, the categories listed in `remove` are dropped, and the
    categories listed in `mapping` are merged into their dict key.
  """
  merged = pd.read_csv(file_path[0]).merge(pd.read_csv(file_path[1]), on='fdc_id')
  food = pd.DataFrame(merged["description"])
  food["brand"] = merged["brand_name"].fillna("").astype(str)
  food["category"] = merged["branded_food_category"].astype('category')
  del merged  # release the wide joined frame before further processing

  # Keep only categories that occur more than 100 times.
  counts = food["category"].value_counts()
  frequent = counts[counts > 100].index
  # .copy() so the column assignments below act on an independent frame
  # instead of a filtered view (avoids chained-assignment warnings).
  food = food[food["category"].isin(frequent)].copy()
  food["category"] = food["category"].cat.remove_unused_categories()
  food["description"] = food["description"].fillna("").astype(str)
  food = food[~food["category"].isnull()]

  # Drop unwanted categories, then put similar categories together.
  food = food[~food["category"].isin(remove)].copy()
  for new_cat, old_cats in mapping.items():
    unify_cat(food, "category", old_cats, new_cat)
  food["category"] = food["category"].cat.remove_unused_categories()
  return food

def get_splitted_data(file_path, stratify=True, max_data= -1):
  """Load the USDA data via `get_data` and split it 80/20 into train and test.

  Args:
    file_path: pair of csv paths forwarded to `get_data`.
    stratify: when truthy, the split is stratified on the `category` column.
    max_data: number of rows to sample before splitting; -1 keeps all rows.

  Returns:
    The result of `train_test_split` (train frame, test frame).
  """
  frame = get_data(file_path)
  if max_data != -1:
    frame = frame.sample(max_data)
  strata = frame["category"] if stratify else None
  return train_test_split(frame, test_size=0.2, stratify=strata)

def early_stopping(early_stop_after):
  """Build a stateful early-stopping predicate.

  The returned function is called with the current score (higher is better).
  It counts consecutive calls whose score is below the best score seen so
  far (starting at 0) and returns True once that count reaches
  `early_stop_after`. A score that is not below the best one becomes the
  new best and resets the count.
  """
  state = {"best": 0, "misses": 0}

  def inner_early_stopping(current):
    if state["best"] > current:
      state["misses"] += 1
    else:
      state["best"], state["misses"] = current, 0
    return state["misses"] >= early_stop_after

  return inner_early_stopping

def test_crawled(curr_model,categories):
  """Classify the crawled meal names in `data/crawled.csv`.

  Args:
    curr_model: model mapping a batch of embedded token matrices to class logits.
    categories: sequence of category names, indexed by class id.

  Returns:
    The crawled dataframe extended by the columns `tokens` (preprocessed
    bigram tokens), `category` (predicted category name) and `pct`
    (softmax probability of that prediction).

  Note:
    Relies on the notebook globals `fd`, `bigram_model`, `embedd` and
    `device`, and leaves `curr_model` in eval mode.
  """
  categories = np.array(categories)
  crawled_food = pd.read_csv("data/crawled.csv")
  crawled_food["tokens"] = crawled_food["meal_name"].map(
      lambda name: bigram_model[fd.preprocess(name)])
  # Iterate the Series directly: Series.iteritems() was removed in pandas 2.0.
  crawled_x = torch.stack([embedd(tokens) for tokens in crawled_food["tokens"]])
  crawled_x = crawled_x.float().to(device)
  curr_model.eval()
  # Pure inference: no autograd graph is needed.
  with torch.no_grad():
    probs = torch.nn.functional.softmax(curr_model(crawled_x), dim=1).cpu()
  val, idx = torch.topk(probs, 1, dim=1)
  crawled_food["category"] = categories[idx[:, 0].numpy()]
  crawled_food["pct"] = val[:, 0].numpy()
  return crawled_food

Load the embedding and bigram model

In [None]:
# Word embedding model; embedd()/embedd_mean() look vectors up through its `.wv` attribute.
embedding_model = KeyedVectors.load("data/models/mymodel")
# Width of one embedding vector; the CNN kernels below hard-code the same value (300).
embedding_size = 300
# Load the trained bigram model and wrap it in a Phraser for applying it to token lists.
bigram_model = Phrases.load("data/models/bigram_model.pkl")
bigram_model = Phraser(bigram_model)
# NLTK resources for the stopword list and the WordNet lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
lst_stopwords = nltk.corpus.stopwords.words("english")
# Extra stopwords: units of measure and filler words.
# NOTE(review): "oz" is listed twice and "pund" is presumably a typo for "pound" - confirm.
custom = ["gal", "oz", "t", "tsp", "teaspoon", 
          "tablespoon", "tbl", "tbs", "tbsp",
          "fl", "oz", "gil", "ounce", "ml", "l",
          "dl", "lb", "pund", "mg", "g", "kg", "gram", "cup","cups","container","avg","homemade","piece","serving","spam","servings","grams"]
lst_stopwords.extend(custom)
# Tokenizer that extracts runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

## Dataset
Create the USDA dataset class

In [None]:
class FoodItemDataset(Dataset):
    """Dataset of tokenized food items with their category codes.

    Each item is a pair `(category_code, tokens)`, where `tokens` is the
    bigram-merged token list built from the brand and the description of
    one row. Embedding the tokens is left to the collate function.
    """

    def __init__(self, data, token_count,device):
      """Tokenize every row of `data`.

      Args:
        data: DataFrame with the columns `brand`, `description` and a
          categorical column `category`.
        token_count: stored as `self.token_count`; not used during tokenization.
        device: accepted for interface compatibility; not used.
      """
      self.token_count = token_count
      self.no_of_classes = len(data["category"].value_counts())
      self.cats = list(data["category"].cat.categories)
      self.embedding = [self.calc(i,row) for i,(_,row) in enumerate(data.iterrows())]
      self.codes = np.array(list(data["category"].cat.codes), dtype=np.int_)

    def __len__(self):
      return len(self.embedding)

    def __getitem__(self, idx):
      return (self.codes[idx],self.embedding[idx])

    def preprocess(self,name):
      """Lowercase, strip punctuation, tokenize and lemmatize `name`.

      Stopwords, tokens of at most two characters and tokens containing a
      digit are dropped.
      """
      name = re.sub(r'[^\w\s]', '', str(name).lower().strip())
      name = tokenizer.tokenize(name)
      return [lemmatizer.lemmatize(x) for x in name if x not in lst_stopwords and len(x)>2 and not any(char.isdigit() for char in x)]

    def calc(self, index, row):
      """Return the bigram-merged tokens of one row (brand tokens first).

      `index` is only used to print a progress marker every 100000 rows.
      """
      if index%100000 == 0:
        print(index)
      # No per-row array is allocated here any more: the former zero matrix
      # was never used and only cost time and memory for every row.
      tokens = self.preprocess(row["brand"]) + self.preprocess(row["description"])
      return bigram_model[tokens]

Now load the data. If `createData` is `True`, download the data from USDA and build the dataset objects (this can take a while). Otherwise, load the datasets from the saved pickles.

In [None]:
device = get_device()
if createData:
  # Guard: abort with an AssertionError if a previous download (zip.zip) already exists.
  if os.path.exists("zip.zip"):
    assert False
  urllib.request.urlretrieve("https://fdc.nal.usda.gov/fdc-datasets/FoodData_Central_branded_food_csv_2021-10-28.zip", "zip.zip")
  !unzip zip.zip -d data
  data_train, data_test = get_splitted_data(("data/branded_food.csv","data/food.csv"))
  # Tokenize both splits and save the complete dataset objects for later runs.
  fd = FoodItemDataset(data_train,10,device)
  fd_test = FoodItemDataset(data_test,10,device)
  torch.save(fd,"data/dataset_train")
  torch.save(fd_test,"data/dataset_test")
else:
  # torch.load unpickles whole FoodItemDataset objects: only load trusted files.
  # NOTE(review): recent PyTorch versions default to weights_only=True, which
  # presumably rejects these pickled objects - confirm for the installed version.
  fd = torch.load("data/dataset_train")
  fd_test = torch.load("data/dataset_test")

Define the embedding helpers and the `collate_batch` functions used by the DataLoaders

In [None]:
def embedd(tokens):
  """Embed up to the first 10 tokens into a zero-padded (10, embedding_size) FloatTensor.

  Row i holds the vector of token i; empty, blank and out-of-vocabulary
  tokens leave their row at zero.
  """
  matrix = np.zeros((10, embedding_size), dtype=float)
  for row, word in enumerate(tokens[:10]):
    if word in ("", " ") or word not in embedding_model.wv:
      continue
    matrix[row, :] = embedding_model.wv[word]
  return torch.FloatTensor(matrix)

def embedd_mean(tokens):
  """Return the mean vector of the known tokens among the first 10 as a FloatTensor.

  Empty, blank and out-of-vocabulary tokens are ignored; if no token is
  known, a zero vector of length embedding_size is returned.
  """
  total = np.zeros((embedding_size,), dtype=float)
  hits = 0
  for word in tokens[:10]:
    if word in ("", " ") or word not in embedding_model.wv:
      continue
    total += embedding_model.wv[word]
    hits += 1
  mean = total if hits == 0 else total/hits
  return torch.FloatTensor(mean)

def collate_batch(batch):
  """Collate `(code, tokens)` pairs into `(LongTensor of codes, stacked embeddings)`.

  Every token list is embedded with `embedd` and the results are stacked
  along a new first dimension.
  """
  codes = torch.LongTensor([code for code, _ in batch])
  embeddings = torch.stack([embedd(tokens) for _, tokens in batch])
  return codes, embeddings

def collate_batch_eval(batch):
  """Like `collate_batch`, but additionally return the raw token lists as third element."""
  codes, embeddings = collate_batch(batch)
  return codes, embeddings, [item[1] for item in batch]


Define the `FoodClassificationCnnModel` network with 4 filter sizes (window heights 1, 3, 4 and 5)

In [None]:
class FoodClassificationCnnModel(Module):
  """CNN text classifier with four parallel convolution windows (3, 4, 5, 1).

  Every window is convolved over the token axis across the full embedding
  width, max-pooled over the remaining positions, and the pooled features
  are concatenated, passed through dropout and classified by a small MLP.
  """

  def __init__(self, no_of_classes,device,dropout=0.2,n_filters=100,embedding_size=300):
    """Create the layers on `device`.

    Args:
      no_of_classes: number of output logits.
      device: torch device all layers are moved to.
      dropout: dropout probability applied to the pooled features.
      n_filters: output channels per convolution window.
      embedding_size: width of one token embedding (kernel width).
    """
    super().__init__()
    self.device = device
    self.no_of_classes = no_of_classes

    def make_filter(window):
      # One convolution spanning `window` tokens and the whole embedding width.
      return torch.nn.Conv2d(in_channels = 1,
                             out_channels = n_filters,
                             kernel_size = (window, embedding_size)).to(device)

    # Attribute names and creation order are kept so saved models and
    # seeded initialisation stay compatible.
    self.filter1 = make_filter(3)
    self.filter2 = make_filter(4)
    self.filter3 = make_filter(5)
    self.filter4 = make_filter(1)
    self.linear = torch.nn.Sequential(torch.nn.Linear(4*n_filters,100),
                                      torch.nn.LeakyReLU(),
                                      torch.nn.BatchNorm1d(100),
                                      torch.nn.Linear(100,self.no_of_classes)).to(device)
    self.relu = torch.nn.ReLU().to(device)
    self.dropout = torch.nn.Dropout(dropout).to(device)

  def forward(self,x):
    """Map a batch of shape (batch, tokens, embedding_size) to class logits."""
    x = x.unsqueeze(1)  # add the channel dimension expected by Conv2d
    pooled = []
    for conv in (self.filter1, self.filter2, self.filter3, self.filter4):
      feature = self.relu(conv(x)).squeeze(3)
      # Max over all window positions -> one value per filter.
      pooled.append(torch.nn.functional.max_pool1d(feature, feature.shape[2]))
    features = self.dropout(torch.cat(pooled, dim=1)).squeeze(2)
    return self.linear(features)

  def get_accuracy(self,X,y):
    """Return the fraction of rows in `X` whose argmax prediction equals `y`.

    The forward pass runs without autograd. The train/eval mode of the
    model is not changed, so dropout and batch norm behave according to
    the current mode.
    """
    with torch.no_grad():
      predicted = torch.argmax(self(X), dim=1)
    total = predicted.shape[0]
    correct = int((predicted == y).sum())
    return correct/total

In [None]:
# Hyperparameters and run metadata; also passed to wandb.init as run config.
config = {
    "lr": 1e-4,
    "no_of_classes": fd.no_of_classes,
    "batch_size": 50000,
    "num_workers": 2,
    "loss": torch.nn.CrossEntropyLoss(),
    "dataset": "FoodItem2.1(1,300)Filter",
    "epochs": 30,
    "test_set_size": len(fd_test),
    "train_set_size": len(fd),
    # Number of consecutive non-improving evaluations before training stops.
    "early_stop_after": 2,
}

In [None]:
# Build the CNN with one output per category; its layers are created on `device`.
model = FoodClassificationCnnModel(config["no_of_classes"],device)

Define the dataloaders for train and test Dataset

In [None]:
# Training loader: shuffled, token lists are embedded per batch by collate_batch.
dataloader = DataLoader(fd,batch_size=config["batch_size"], 
                        collate_fn=collate_batch,pin_memory=True, 
                        shuffle=True,
                        num_workers=config["num_workers"])
# Test loader: same batching, no shuffling.
dataloader_test = DataLoader(fd_test,batch_size=config["batch_size"], 
                             collate_fn=collate_batch, pin_memory=True,
                             num_workers=config["num_workers"])
optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
# Sanity check: the training set must be larger than the test set.
assert len(dataloader.dataset) > len(dataloader_test.dataset)
# Stateful predicate fed with the test accuracy after each epoch.
stopping_func = early_stopping(config["early_stop_after"])
loss_fn = config["loss"]

In [None]:
# Sanity check: the training set must be larger than the test set.
assert len(dataloader.dataset) > len(dataloader_test.dataset)

In [None]:
# Authenticate with Weights & Biases and start a run that records `config`.
wandb.login()
run = wandb.init(project="food-classification", entity="gege-hoho", config=config)

Training loop:

1.   get food item name and label from dataset
2.   forward it to the model
3.   compare ground truth with prediction
4.   calculate loss
5.   backpropagate the loss and update the model weights
6.   log accuracy to wandb




In [None]:
print(f"Train with LR:{config['lr']}")
assert len(dataloader.dataset) > len(dataloader_test.dataset)
print(device)

for epoch in range(config["epochs"]):
  # --- one pass over the training data ---
  model.train()
  x1 = time.perf_counter()
  for data in dataloader:
    y_train = data[0].to(device)
    X_train = data[1].to(device)
    y_pred = model(X_train)
    loss = loss_fn(y_pred, y_train)
    # Log a plain float instead of the tensor that still references the graph.
    wandb.log({"train-loss": loss.item()})
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  x2 = time.perf_counter()
  print(f"Epoch Time: {x2-x1}")
  wandb.log({"epoch-time": x2-x1})

  # --- evaluation: no gradients needed ---
  with torch.no_grad():
    # Train accuracy is measured on the last training batch only, with the
    # model still in train mode (as before).
    acc_tr = model.get_accuracy(X_train,y_train)
    print(f"{epoch} Accuracy Train {acc_tr}")
    wandb.log({"train-acc": acc_tr})

    model.eval()
    correct = 0.0
    seen = 0
    for test in dataloader_test:
      y_test = test[0].to(device)
      X_test = test[1].to(device)
      batch_len = y_test.shape[0]
      # Weight every batch by its size so a smaller final batch does not
      # distort the overall test accuracy.
      correct += model.get_accuracy(X_test,y_test) * batch_len
      seen += batch_len
  acc_te = correct/seen
  wandb.log({"test-acc": acc_te})
  print(f"{epoch} Accuracy Test {acc_te}")
  if stopping_func(acc_te):
    print("early stopping")
    break

  # Every fifth epoch, log predictions for the first 500 crawled items.
  if epoch % 5 ==0:
    crawled_food = test_crawled(model,fd.cats)[0:500][["meal_name","tokens","category","pct"]]
    wandb.log({"crawled_food": crawled_food})
    print("logged cralwed food")  

Save the created model

In [None]:
# Save the complete model object (not just its state dict) and close the wandb run.
torch.save(model,"model93.2")
run.finish()

Load the model again for evaluation

In [None]:
# Restore the complete model object saved above; the class definition must be available.
# NOTE(review): recent PyTorch versions default to weights_only=True, which
# presumably rejects a fully pickled model - confirm for the installed version.
model = torch.load("model93.2")

In [None]:
def get_confusion_matrix(y_pred,y_true,labels):
  """Return the confusion matrix of predicted vs. true class ids.

  Args:
    y_pred: tensor of predicted class ids.
    y_true: tensor of true class ids.
    labels: sequence of class names; its length fixes the number of classes.

  Returns:
    Array of shape (len(labels), len(labels)); rows are true classes and
    columns are predicted classes.
  """
  y_pred = y_pred.cpu()
  y_true = y_true.cpu()
  # Pass the full id range so classes that do not occur in this batch still
  # get a row/column and indices line up with `labels`.
  return confusion_matrix(y_true, y_pred, labels=list(range(len(labels))))
  
def get_evaluation(y_pred,y_true,labels):
  """Compute per-class recall and precision.

  Args:
    y_pred: predicted class ids (tensor, array or list).
    y_true: true class ids (same length as `y_pred`).
    labels: sequence of class names; only its length is used.

  Returns:
    Tuple `(recall, precision)` of numpy arrays indexed by class id. An
    entry is NaN when its denominator is zero (class never true /
    class never predicted).
  """
  def to_int_list(values):
    # Convert tensors/arrays once instead of touching them element by element.
    if hasattr(values, "tolist"):
      values = values.tolist()
    return [int(v) for v in values]

  n_classes = len(labels)
  tp = [0] * n_classes
  fp = [0] * n_classes
  fn = [0] * n_classes
  for predicted, actual in zip(to_int_list(y_pred), to_int_list(y_true)):
    if predicted == actual:
      tp[actual] += 1
    else:
      # A wrong prediction is a false positive for the predicted class and
      # a false negative for the true class (these were swapped before, so
      # the two return values were exchanged).
      fp[predicted] += 1
      fn[actual] += 1
  recall = [tp[c]/(tp[c]+fn[c]) if tp[c]+fn[c] != 0 else np.nan for c in range(n_classes)]
  prec = [tp[c]/(tp[c]+fp[c]) if tp[c]+fp[c] != 0 else np.nan for c in range(n_classes)]
  return np.array(recall),np.array(prec)

def print_evaluation(values,labels,categories):
  """Print one `label: value` line per pair, values with two decimals.

  `categories` is accepted but not used.
  """
  for pair in zip(values,labels):
    score, label = pair
    print("{}: {:.2f}".format(label, score))

Calculate recall, precision and the confusion matrix for the model, and collect the misclassified items

In [None]:
model.eval()
dataloader_eval = DataLoader(fd_test,batch_size=150000, collate_fn=collate_batch_eval, pin_memory=True)
wrong_data =  []
for i,test in enumerate(dataloader_eval):
  y_test = test[0].to(device)
  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    y_pred = torch.argmax(model(test[1].to(device)), dim=1)
  tokens = test[2]
  # Compare plain Python ints (one transfer) instead of iterating tensors.
  for y_p, y_t, item_tokens in zip(y_pred.tolist(), y_test.tolist(), tokens):
    if y_p != y_t:
      wrong_data.append((fd_test.cats[y_t],fd_test.cats[y_p],item_tokens)) #actual cat, predicted cat, tokens

  recall,prec = get_evaluation(y_pred,y_test, fd_test.cats)
  cm = get_confusion_matrix(y_pred,y_test,fd_test.cats)
  # Only the first batch (at most 150000 items) is evaluated; `test` is
  # reused by the top-k cell below.
  break

In [None]:
# Summary statistics over the per-class scores. The nan-aware reductions
# skip classes whose score is undefined (NaN) instead of turning every
# statistic into NaN.
print("Recall")
print(f"Mean: {np.nanmean(recall)}")
print(f"Std: {np.nanstd(recall)}")
print(f"Min: {np.nanmin(recall)}")
print(f"Max: {np.nanmax(recall)}")
print("Precision")
print(f"Mean: {np.nanmean(prec)}")
print(f"Std: {np.nanstd(prec)}")
print(f"Min: {np.nanmin(prec)}")
print(f"Max: {np.nanmax(prec)}")

In [None]:
# Categories with the lowest/highest recall and precision. The nan-aware
# variants ignore classes with an undefined (NaN) score, which plain
# argmin/argmax would otherwise select.
print(fd_test.cats[np.nanargmin(recall)])
print(fd_test.cats[np.nanargmax(recall)])
print(fd_test.cats[np.nanargmin(prec)])
print(fd_test.cats[np.nanargmax(prec)])

In [None]:
# One `category;precision;recall` line per class, scores as percentages.
for i,scores in enumerate(zip(prec, recall)):
  p, r = scores
  category = fd_test.cats[i].lower()
  print(f"{category};{(p*100):.2f}%;{(r*100):.2f}%")

Top-k accuracy:

In [None]:
# Top-k accuracy on the batch left over from the evaluation cell (`test`).
k = 3
with torch.no_grad():  # inference only
  y_pred = model(test[1].to(device))
y_test = test[0].to(device)
topk = torch.topk(y_pred, k, dim=1)
count = y_pred.shape[0]
# A row counts as a hit when the true label is among its k best classes.
s = (topk.indices == y_test.unsqueeze(1)).any(dim=1).sum()

print(s/count)


Look at the created categories for MyFitnessPal data

In [None]:
# Classify the crawled items and display the first 400 rows; the option
# context lifts the row limit and allows up to 100 columns.
crawled_food = test_crawled(model,fd.cats)
with pd.option_context('display.max_rows', None, 'display.max_columns', 100):
  display(crawled_food[0:400][["meal_name","tokens","category","pct"]])

Example of how to get a prediction for a given name

In [None]:
name = "FRUIT WAVE"
# preprocess() expects the raw string and tokenizes it itself.
x = embedd(bigram_model[fd.preprocess(name)]).float().to(device)
x = x.unsqueeze(0)  # add the batch dimension
print(x.shape)
model.eval()
with torch.no_grad():  # inference only
  z = model(x)
# Softmax over the class dimension (explicit dim avoids the implicit-dim warning).
z2 = torch.nn.functional.softmax(z, dim=1)
fd.cats[torch.argmax(z2, dim=1)]