In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Version 3 of FoodClassification Experiments
This Version contains a generalized Dataset from the Version 2. In order to allow for CNN usage. It contains the definition of two CNN models 

1.   FoodItem2.1: Has 4 filters (1,3,4,5)
2.   FoodItem2.0: Has 3 filters (3,4,5)
3.   FoodItem1.0: Feed Forward which only works on the mean embedding



In [2]:
!mkdir data
!mkdir data/models
!cp drive/MyDrive/Uni/Masterarbeit/data/embeddingmodel.zip data/models/zip.zip
!unzip -o data/models/zip.zip -d  data/models 
!cp drive/MyDrive/Uni/Masterarbeit/data/dataset data/dataset.csv
!cp drive/MyDrive/Uni/Masterarbeit/data/dataset_train data/dataset_train
!cp drive/MyDrive/Uni/Masterarbeit/data/dataset_test data/dataset_test
!cp drive/MyDrive/Uni/Masterarbeit/data/crawled.csv data/crawled.csv

Archive:  data/models/zip.zip
  inflating: data/models/bigram_model.pkl  
  inflating: data/models/mymodel     
  inflating: data/models/mymodel.wv.vectors.npy  
  inflating: data/models/mymodel.trainables.syn1neg.npy  


In [3]:
%%capture
!pip install wandb --upgrade

In [4]:
import pandas as pd
import math
import numpy as np
import torch
import torchvision
from torch.nn import Module
from torch.utils.data import Dataset, DataLoader
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import random
from gensim.models.phrases import Phrases, Phraser
import nltk
from nltk.tokenize import RegexpTokenizer
import re
import time
import urllib.request
import os.path
import wandb

In [5]:
mapping = {'Baking':['Baking Additives & Extracts','Baking Decorations & Dessert Toppings','Baking/Cooking Mixes/Supplies'],
'Biscuits/Cookies':['Biscuits/Cookies','Biscuits/Cookies (Shelf Stable)'],
'Bread':['Bread','Bread & Muffin Mixes','Breads & Buns'],
'Cakes':['Cake, Cookie & Cupcake Mixes','Cakes - Sweet (Frozen)','Cakes, Cupcakes, Snack Cakes','Croissants, Sweet Rolls, Muffins & Other Pastries','Pies/Pastries - Sweet (Shelf Stable)'],
'Soup':['Canned Soup','Canned Condensed Soup','Other Soups','Chili & Stew'],
'Cereal':['Cereal','Cereals Products - Ready to Eat (Shelf Stable)','Processed Cereal Products'],
'Chips, Pretzels & Snacks': ['Chips, Pretzels & Snacks','Chips/Crisps/Snack Mixes - Natural/Extruded (Shelf Stable)','Popcorn, Peanuts, Seeds & Related Snacks','Wholesome Snacks','Crackers & Biscotti','Flavored Snack Crackers','Other Snacks','Snacks'],
'Fish & Seafood': ['Fish  Unprepared/Unprocessed','Fish & Seafood','Fish – Unprepared/Unprocessed','Canned Seafood','Canned Tuna','Frozen Fish & Seafood','Shellfish Unprepared/Unprocessed'],
'Chocolate':['Confectionery Products','Chocolate'],
'Oils & Butters':['Oils Edible','Nut & Seed Butters','Butter & Spread','Vegetable & Cooking Oils'],
'Dough Based Products':['Dough Based Products / Meals','Frozen Bread & Dough'],
'Bacon, Sausages & Ribs':['Frozen Bacon, Sausages & Ribs','Frozen Sausages, Hotdogs & Brats','Bacon, Sausages & Ribs','Sausages, Hotdogs & Brats'],
'Flours & Grains':['Flours & Corn Meal','Grains/Flour'],
'Herbs & Spices':['Herbs & Spices','Herbs/Spices/Extracts'],
'Meat/Poultry/Other Animals':['Meat/Poultry/Other Animals  Prepared/Processed','Meat/Poultry/Other Animals  Unprepared/Unprocessed',
                              'Meat/Poultry/Other Animals Sausages  Prepared/Processed','Meat/Poultry/Other Animals Sausages – Prepared/Processed',
                              'Meat/Poultry/Other Animals – Prepared/Processed','Meat/Poultry/Other Animals – Unprepared/Unprocessed',
                              'Other Frozen Meats','Poultry, Chicken & Turkey','Canned Meat','Frozen Poultry, Chicken & Turkey','Other Meats','Frozen Patties and Burgers'],
'Non Alcoholic Beverages':['Non Alcoholic Beverages  Not Ready to Drink','Non Alcoholic Beverages  Ready to Drink','Non Alcoholic Beverages – Ready to Drink'],
'Cooking Sauces':['Cooking Sauces','Oriental, Mexican & Ethnic Sauces','Other Cooking Sauces'],
'Pasta/Noodles':['Pasta by Shape & Type','Pasta/Noodles','All Noodles'],
'Subs, Sandwiches, Wraps & Burittos':['Prepared Wraps and Burittos','Prepared Subs & Sandwiches'],
'Vegetables':['Vegetables  Prepared/Processed','Vegetables – Prepared/Processed','Canned Vegetables','Tomatoes','Frozen Vegetables'],
'Vegetable Based Products / Meals':['Vegetable Based Products / Meals','Vegetable Based Products / Meals - Not Ready to Eat (Frozen)'],
'Pancakes, Waffles, French Toast & Crepes':['Frozen Pancakes, Waffles, French Toast & Crepes', 'Pancakes, Waffles, French Toast & Crepes'],
'Entrees, Sides & Small Meals': ['Entrees, Sides & Small Meals',"Frozen Appetizers & Hors D'oeuvres",'Frozen Prepared Sides'],
'Fruit  Prepared/Processed':['Fruit  Prepared/Processed','Fruit - Prepared/Processed (Shelf Stable)']}
remove = ["Baby/Infant  Foods/Beverages","Miscellanious"]

In [6]:
def get_device():
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu') 
    return device
  
def unify_cat(dataframe, col, cat_list,new_cat):
  #unify several categories into 1 with new name
  if new_cat not in dataframe[col].cat.categories:
    dataframe[col].cat.add_categories(new_cat,inplace = True)
  dataframe.loc[dataframe[col].isin(cat_list),col] = new_cat
  dataframe[col].cat.remove_unused_categories(inplace=True)

def getData(file_path):
  branded_food = pd.read_csv(file_path[0])
  food2 = pd.read_csv(file_path[1])
  food2 = branded_food.merge(food2, on='fdc_id')
  del branded_food
  food = pd.DataFrame(food2["description"])
  food["brand"] = food2["brand_name"].fillna("").astype(str)
  food["category"] = food2["branded_food_category"].astype('category')
  del food2
  vc = food["category"].value_counts()
  vc = vc[vc > 100]
  no_of_classes = len(vc)
  food = food[food["category"].isin(vc.index)]
  food["category"] = food["category"].cat.remove_unused_categories()
  food["description"] = food["description"].fillna("").astype(str)
  food = food[~food["category"].isnull()]
  #put similar stuff together
  for x in remove:
    food = food[food["category"] != x]
  for x,y in mapping.items():
    unify_cat(food,"category",y,x)
  food["category"] = food["category"].cat.remove_unused_categories()
  return food

def getSplittedData(file_path, stratify=True, max_data= -1,filter=10):
  data = getData(file_path)
  if max_data != -1:
    data = data.sample(max_data)
  if stratify:
    return train_test_split(data,test_size=0.2,stratify=data["category"])
  return train_test_split(data,test_size=0.2)

def early_stopping(early_stop_after):
  early_stop_counter=0
  best_early_stopping = 0
  def inner_early_stopping(current):
    nonlocal early_stop_counter, best_early_stopping
    if best_early_stopping > current:
      early_stop_counter +=1
    else:
      best_early_stopping = current
      early_stop_counter =0
    return early_stop_counter >= early_stop_after

  return inner_early_stopping

def get_confusion_matrix(y_pred,y_true,labels):
  y_pred = y_pred.cpu()
  y_true = y_true.cpu()
  cm = confusion_matrix(y_true, y_pred)
  return cm

def get_evaluation(y_pred,y_true,labels):
  d = [[0,0,0] for _ in range(len(labels))]
  for y_p, y_t in zip(y_pred,y_true):
    if y_p == y_t:
      d[y_t][0] += 1
    else:
      d[y_p][2] += 1 # false negative for class y_p
      d[y_t][1] += 1 # false positive for class y_t
  recall = [x[0]/(x[0]+x[2]) if x[0]+x[2] != 0 else np.nan for x in d]
  prec = [x[0]/(x[0]+x[1])if x[0]+x[1] != 0 else np.nan for x in d]
  return np.array(recall),np.array(prec)

def print_evaluation(values,labels,categories):
  for x,l in zip(values,labels):
    print(f"{l}: {x:.2f}")

def test_crawled(curr_model,categories):
  categories = np.array(categories)
  crawled_food = pd.read_csv("data/crawled.csv")
  crawled_food["tokens"] = crawled_food.apply(lambda row: bigram_model[fd.preprocess(row["meal_name"])], axis=1)
  crawled_x = [embedd(v) for _,v in crawled_food["tokens"].iteritems()]
  crawled_x = torch.stack(crawled_x).float().to(device)
  curr_model.eval()
  y = curr_model(crawled_x)
  y = torch.nn.functional.softmax(y,dim=1)
  y = y.cpu()
  val,idx = torch.topk(y, 1, dim=1)
  crawled_food["category"] = categories[np.array(idx[:,0])]
  crawled_food["pct"] = val[:,0].detach().numpy()
  return crawled_food

In [7]:
embedding_model = KeyedVectors.load("data/models/mymodel")
embedding_size = 300
bigram_model = Phrases.load("data/models/bigram_model.pkl")
bigram_model = Phraser(bigram_model)
nltk.download('stopwords')
nltk.download('wordnet')
lst_stopwords = nltk.corpus.stopwords.words("english")
custom = ["gal", "oz", "t", "tsp", "teaspoon", 
          "tablespoon", "tbl", "tbs", "tbsp",
          "fl", "oz", "gil", "ounce", "ml", "l",
          "dl", "lb", "pund", "mg", "g", "kg", "gram", "cup","cups","container","avg","homemade","piece","serving","spam","servings","grams"]
lst_stopwords.extend(custom)
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [8]:
class FoodItemDataset(Dataset):
    def __init__(self, data, token_count,device):
      self.token_count = token_count
      self.no_of_classes = len(data["category"].value_counts())
      self.codes = list(data["category"].cat.codes)
      self.cats = list(data["category"].cat.categories)
      self.embedding = [self.calc(i,row[1]) for i,row in enumerate(data.iterrows())]
      
      
      self.codes = np.array(self.codes, dtype=np.int_)
    def __len__(self):
      return len(self.embedding)

    def __getitem__(self, idx):
      return (self.codes[idx],self.embedding[idx])

    def preprocess(self,name):
      name = re.sub(r'[^\w\s]', '', str(name).lower().strip())
      name = tokenizer.tokenize(name)
      return [lemmatizer.lemmatize(x) for x in name if x not in lst_stopwords and len(x)>2 and not any(char.isdigit() for char in x)]

    def calc(self, index, row):
      if index%100000 == 0:
        print(index)
      embedding = np.zeros((self.token_count,embedding_size),dtype=float)
      tokens_brand = self.preprocess(row["brand"])
      tokens = self.preprocess(row["description"])
      #if not any(x in tokens for x in tokens_brand):
      tokens = tokens_brand + tokens
      tokens = bigram_model[tokens]
      return tokens

In [9]:
createData = False
device = get_device()
if createData:
  if os.path.exists("zip.zip"):
    assert False
  urllib.request.urlretrieve("https://fdc.nal.usda.gov/fdc-datasets/FoodData_Central_branded_food_csv_2021-10-28.zip", "zip.zip")
  !unzip zip.zip -d data
  data_train, data_test = getSplittedData(("data/branded_food.csv","data/food.csv"))
  fd = FoodItemDataset(data_train,10,device)
  fd_test = FoodItemDataset(data_test,10,device)
  torch.save(fd,"data/dataset_train")
  torch.save(fd_test,"data/dataset_test")
else:
  fd = torch.load("data/dataset_train")
  fd_test = torch.load("data/dataset_test")

In [10]:
fd.cats

['Alcohol',
 'Bacon, Sausages & Ribs',
 'Biscuits/Cookies',
 'Bread',
 'Breakfast Drinks',
 'Breakfast Sandwiches, Biscuits & Meals',
 'Candy',
 'Canned & Bottled Beans',
 'Canned Fruit',
 'Cereal',
 'Cereal/Muesli Bars',
 'Cheese',
 'Cheese/Cheese Substitutes',
 'Chewing Gum & Mints',
 'Chips, Pretzels & Snacks',
 'Chocolate',
 'Coffee',
 'Coffee/Tea/Substitutes',
 'Cooked & Prepared',
 'Cookies & Biscuits',
 'Cream',
 'Crusts & Dough',
 'Deli Salads',
 'Desserts/Dessert Sauces/Toppings',
 'Dips & Salsa',
 'Eggs & Egg Substitutes',
 'Energy, Protein & Muscle Recovery Drinks',
 'Entrees, Sides & Small Meals',
 'Fish & Seafood',
 'Flavored Rice Dishes',
 'French Fries, Potatoes & Onion Rings',
 'Frozen Breakfast Sandwiches, Biscuits & Meals',
 'Frozen Dinners & Entrees',
 'Frozen Fruit & Fruit Juice Concentrates',
 'Fruit  Prepared/Processed',
 'Fruit & Vegetable Juice, Nectars & Fruit Drinks',
 'Gelatin, Gels, Pectins & Desserts',
 'Grain Based Products / Meals',
 'Granulated, Brown & 

In [11]:
def embedd(tokens):
  embedding = np.zeros((10,embedding_size),dtype=float)
  for i,token in enumerate(tokens[:10]):
    if not(token == "" or token == " " or token not in embedding_model.wv):
      embedding[i,:] = embedding_model.wv[token]
  return torch.FloatTensor(embedding)

def embedd_mean(tokens):
  i = 0
  embedding = np.zeros((embedding_size,),dtype=float)
  for token in tokens[:10]:
    if not(token == "" or token == " " or token not in embedding_model.wv):
      embedding += embedding_model.wv[token]
      i += 1
  if i == 0:
    return torch.FloatTensor(embedding)
  return torch.FloatTensor(embedding/i)

def collate_batch(batch):
  code_list, embedding_list = [],[]
  for code, tokens in batch:
    code_list.append(code)
    embedding_list.append(embedd(tokens))
  code_list = torch.LongTensor(code_list)
  embedding_list = torch.stack(embedding_list)
  return code_list, embedding_list

def collate_batch_eval(batch):
  code_list, embedding_list = collate_batch(batch)
  tokens_list = [x for _,x in batch]
  return code_list,embedding_list, tokens_list


Load the crawled items from file for inloop testing

In [12]:
class FoodClassificationModel(Module):
  def __init__(self, no_of_classes,device):
    super().__init__()
    p = 0.2
    self.device = device
    self.no_of_classes = no_of_classes
    self.model = torch.nn.Sequential(
        torch.nn.Linear(300, 200),
        torch.nn.LeakyReLU(),
        torch.nn.BatchNorm1d(200),
        torch.nn.Linear(200, 100),
        torch.nn.LeakyReLU(),
        torch.nn.BatchNorm1d(100),
        torch.nn.Linear(100,self.no_of_classes)).to(self.device)
  def forward(self,x):
    return self.model(x)
    
  def get_accuracy(self,X,y):
    y_pred = self.forward(X)
    res = torch.argmax(y_pred, dim=1)
    res = y-res
    l = int(torch.count_nonzero(res))
    count = list(y_pred.shape)[0]
    return (count-l)/count

In [13]:
class FoodClassificationCnnModel(Module):
  def __init__(self, no_of_classes,device,dropout=0.2):
    super().__init__()
    n_filters = 100
    self.device = device
    self.no_of_classes = no_of_classes
    self.filter1 =  torch.nn.Conv2d(in_channels = 1, 
                                    out_channels = n_filters, 
                                    kernel_size = (3, 300)).to(device)
    self.filter2 =  torch.nn.Conv2d(in_channels = 1, 
                                    out_channels = n_filters, 
                                    kernel_size = (4, 300)).to(device) 
    self.filter3 =  torch.nn.Conv2d(in_channels = 1, 
                                    out_channels = n_filters, 
                                    kernel_size = (5, 300)).to(device)
    self.filter4 =  torch.nn.Conv2d(in_channels = 1, 
                                    out_channels = n_filters, 
                                    kernel_size = (1, 300)).to(device)
    self.linear = torch.nn.Sequential(torch.nn.Linear(4*n_filters,100),
                                      torch.nn.LeakyReLU(),
                                      torch.nn.BatchNorm1d(100),
                                      torch.nn.Linear(100,self.no_of_classes)).to(device)
    self.relu = torch.nn.ReLU().to(device)
    self.dropout = torch.nn.Dropout(dropout).to(device)
  
  def forward(self,x):
    x = x.unsqueeze(1)
    f1 = self.relu(self.filter1(x)).squeeze(3)
    f2 = self.relu(self.filter2(x)).squeeze(3)
    f3 = self.relu(self.filter3(x)).squeeze(3)
    f4 = self.relu(self.filter4(x)).squeeze(3)
    f1 = torch.nn.functional.max_pool1d(f1,f1.shape[2])
    f2 = torch.nn.functional.max_pool1d(f2,f2.shape[2])
    f3 = torch.nn.functional.max_pool1d(f3,f3.shape[2])
    f4 = torch.nn.functional.max_pool1d(f4,f4.shape[2])
    linear = self.dropout(torch.cat((f1,f2,f3,f4),dim=1)).squeeze(2)
    out = self.linear(linear)
    return out
    
  def get_accuracy(self,X,y):
    y_pred = self.forward(X)
    res = torch.argmax(y_pred, dim=1)
    res = y-res
    l = int(torch.count_nonzero(res))
    count = list(y_pred.shape)[0]
    return (count-l)/count

In [14]:
config = dict(
    lr = 1e-2,
    no_of_classes = fd.no_of_classes,
    batch_size= 50000,
    num_workers=2,
    loss=torch.nn.CrossEntropyLoss(),
    dataset="FoodItem2.1(1,300)Filter",
    epochs=30,
    test_set_size= len(fd_test),
    train_set_size= len(fd),
    early_stop_after=2
)

In [15]:
config["no_of_classes"]

108

In [16]:
model = FoodClassificationCnnModel(config["no_of_classes"],device)

In [17]:
dataloader = DataLoader(fd,batch_size=config["batch_size"], 
                        collate_fn=collate_batch,pin_memory=True, 
                        shuffle=True,
                        num_workers=config["num_workers"])
dataloader_test = DataLoader(fd_test,batch_size=config["batch_size"], 
                             collate_fn=collate_batch, pin_memory=True,
                             num_workers=config["num_workers"])
optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
assert len(dataloader.dataset) > len(dataloader_test.dataset)
stopping_func = early_stopping(config["early_stop_after"])
loss_fn = config["loss"]

In [None]:
assert len(dataloader.dataset) > len(dataloader_test.dataset)

In [None]:
wandb.login()
run = wandb.init(project="food-classification", entity="gege-hoho", config=config)

[34m[1mwandb[0m: Currently logged in as: [33mgege-hoho[0m (use `wandb login --relogin` to force relogin)


In [None]:
config['lr'] = 1e-4

In [None]:
print(f"Train with LR:{config['lr']}")
assert len(dataloader.dataset) > len(dataloader_test.dataset)
print(device)

for epoch in range(config["epochs"]):
  model.train()
  x1 = time.perf_counter()
  for data in dataloader:    
    y_train = data[0].to(device)
    X_train = data[1].to(device)
    #X_train = X_train.mean(dim=1)
    y_pred = model.forward(X_train)
    loss = loss_fn(y_pred, y_train)
    wandb.log({"train-loss": loss})
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  #scheduler.step()
  x2 = time.perf_counter()
  print(f"Epoch Time: {x2-x1}")
  wandb.log({"epoch-time": x2-x1})
  if epoch % 1 == 0:
    acc_tr = model.get_accuracy(X_train,y_train)
    print(f"{epoch} Accuracy Train {acc_tr}")
    wandb.log({"train-acc": acc_tr})
    acc_te = 0
    rec_te = np.zeros((len(fd_test.cats),))
    pre_te = np.zeros((len(fd_test.cats),))
    model.eval()
    for i,test in enumerate(dataloader_test):
      y_test = test[0].to(device)
      X_test = test[1].to(device)
      #X_test = X_test.mean(dim=1)
      acc_te += model.get_accuracy(X_test,y_test)
    acc_te = acc_te/(i+1)
    wandb.log({"test-acc": acc_te})
    print(f"{epoch} Accuracy Test {acc_te}")
    if stopping_func(acc_te):
      print("early stopping")
      break
  if epoch % 5 ==0:
    crawled_food = test_crawled(model,fd.cats)[0:500][["meal_name","tokens","category","pct"]]
    wandb.log({"crawled_food": crawled_food})
    print("logged cralwed food")
  
      
      
  

Train with LR:0.001
cuda:0
Epoch Time: 80.27048318800007
0 Accuracy Train 0.9260555028462998
0 Accuracy Test 0.927246424505286
Epoch Time: 80.03988636400027
1 Accuracy Train 0.9271228652751423
1 Accuracy Test 0.9276568500948768
Epoch Time: 79.92357244600134
2 Accuracy Train 0.9285460151802657
2 Accuracy Test 0.9287695798319326
Epoch Time: 80.26891889000035
3 Accuracy Train 0.9285163662239089
3 Accuracy Test 0.9289629655733262
Epoch Time: 79.97859717000028
4 Accuracy Train 0.9273897058823529
4 Accuracy Test 0.9296051179181352
Epoch Time: 80.37067238300006
5 Accuracy Train 0.9302063567362429
5 Accuracy Test 0.9302052697207915
Epoch Time: 80.10600337099822
6 Accuracy Train 0.9295837286527514
6 Accuracy Test 0.9296266738953646
Epoch Time: 80.12673555299989
7 Accuracy Train 0.9296726755218216
7 Accuracy Test 0.9318929357549471
Epoch Time: 80.3200975189975
8 Accuracy Train 0.9293465370018975
8 Accuracy Test 0.9315237137435618
Epoch Time: 80.09411333300159
9 Accuracy Train 0.9291093453510436


In [None]:
torch.save(model,"model93.2")
run.finish()

VBox(children=(Label(value=' 6.46MB of 6.46MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch-time,█▂▄▄▄▂▁▁▄▄▂▁▅▆▂▃▂▂▂▅▃▃▄▄▄▄▁▂▄▂▃▃▄▁▄▂▄▃▃▄
test-acc,▁▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████████████
train-acc,▁▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████████████████
train-loss,█▅▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch-time,80.09411
model,FoodClassificationCn...
test-acc,0.93168
train-acc,0.92911
train-loss,0.23755


In [18]:
model = torch.load("model93.2")

In [19]:
model.eval()
dataloader_eval = DataLoader(fd_test,batch_size=150000, collate_fn=collate_batch_eval, pin_memory=True)
wrong_data =  []
for i,test in enumerate(dataloader_eval):
  y_test = test[0].to(device)
  y_pred = model(test[1].to(device))
  tokens = test[2]
  y_pred = torch.argmax(y_pred, dim=1)
  for y_p, y_t,tokens in zip(y_pred,y_test,tokens):
    if y_p != y_t:
      #print(f"Expected {fd_test.cats[y_t]} Got {fd_test.cats[y_p]} for {tokens}")
      wrong_data.append((fd_test.cats[y_t],fd_test.cats[y_p],tokens)) #actual cat, predicted cat, tokens
    #X_test = X_test.mean(dim=1)
  
  recall,prec = get_evaluation(y_pred,y_test, fd_test.cats)
  cm = get_confusion_matrix(y_pred,y_test,fd_test.cats)
  break


In [None]:
print("Recall")
print(f"Mean: {np.mean(recall)}")
print(f"Std: {np.std(recall)}")
print(f"Min: {np.min(recall)}")
print(f"Max: {np.max(recall)}")
print("Precision")
print(f"Mean: {np.mean(prec)}")
print(f"Std: {np.std(prec)}")
print(f"Min: {np.min(prec)}")
print(f"Max: {np.max(prec)}")

Recall
Mean: 0.8978495460956116
Std: 0.09468527891325575
Min: 0.4444444444444444
Max: 0.9819819819819819
Precision
Mean: 0.8603575885866698
Std: 0.13536598460951252
Min: 0.15384615384615385
Max: 0.9827132038195587


In [None]:
print(fd_test.cats[np.argmin(recall)])
print(fd_test.cats[np.argmax(recall)])
print(fd_test.cats[np.argmin(prec)])
print(fd_test.cats[np.argmax(prec)])

Weight Control
Gelatin, Gels, Pectins & Desserts
Prepared/Preserved Foods Variety Packs
Cheese


In [24]:
for i,(p,r) in enumerate(zip(prec, recall)):
  print(f"{fd_test.cats[i].lower()};{(p*100):.2f}%;{(r*100):.2f}%")

alcohol;85.06%;91.36%
bacon, sausages & ribs;96.24%;93.57%
biscuits/cookies;88.69%;94.20%
bread;95.46%;96.16%
breakfast drinks;73.33%;68.75%
breakfast sandwiches, biscuits & meals;54.65%;88.68%
candy;92.47%;92.82%
canned & bottled beans;86.35%;90.04%
canned fruit;95.10%;95.72%
cereal;97.14%;97.30%
cereal/muesli bars;83.72%;97.30%
cheese;98.27%;97.80%
cheese/cheese substitutes;69.23%;90.00%
chewing gum & mints;94.83%;93.75%
chips, pretzels & snacks;96.53%;95.71%
chocolate;87.03%;85.42%
coffee;77.88%;88.00%
coffee/tea/substitutes;97.06%;91.67%
cooked & prepared;74.31%;82.09%
cookies & biscuits;94.30%;94.28%
cream;90.94%;97.15%
crusts & dough;85.69%;87.57%
deli salads;83.91%;90.16%
desserts/dessert sauces/toppings;63.64%;60.87%
dips & salsa;96.19%;95.48%
eggs & egg substitutes;97.45%;95.02%
energy, protein & muscle recovery drinks;77.79%;59.28%
entrees, sides & small meals;82.72%;82.09%
fish & seafood;97.11%;96.14%
flavored rice dishes;83.04%;91.48%
french fries, potatoes & onion rings;92

TopK accuraccy:

In [None]:
k = 3
y_pred = model(test[1].to(device))
y_test = test[0].to(device)
topk = torch.topk(y_pred, k, dim=1)
s = 0
count = y_pred.shape[0]
for i in range(topk.indices.shape[1]):
  res = y_test -topk.indices[:,i]
  s += (res == 0).sum()

print(s/count)


tensor(0.9918, device='cuda:0')


In [None]:
crawled_food = test_crawled(model,fd.cats)

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', 100):
  display(crawled_food[0:400][["meal_name","tokens","category","pct"]])

Unnamed: 0,meal_name,tokens,category,pct
0,"Rockstar - Punched, 8 fl.oz.","[rockstar punched, floz]",Soda,0.303856
1,Airbourne - Airbourne Plus beta immune booster...,"[airbourne, airbourne, plus, beta, immune boos...","Fruit & Vegetable Juice, Nectars & Fruit Drinks",0.800493
2,"Powerade - Zero - Fruit Punch (32 Fl oz), 1 co...","[powerade zero, fruit punch]",Other Drinks,0.305022
3,"Homemade - Grilled Chicken Tenders, 4 oz","[grilled chicken, tender]",Frozen Dinners & Entrees,0.604318
4,"Parmesan cheese, 4 tbsp","[parmesan, cheese]",Cheese,0.978229
5,"Rotini - Pasta, 4 oz",[rotini pasta],Pasta/Noodles,0.913943
6,"Kroger - Mini Chocolate Cupcakes, 3 cupcakes","[kroger, mini, chocolate, cupcake, cupcake]",Cakes,0.999972
7,"Powerade - Twisted Blackberry, 1 container (30...","[powerade, twisted, blackberry]",Candy,0.8184
8,"Homemade - Toasted Ham and Cheese Sandwich, 1 ...","[toasted, ham, cheese, sandwich, sandwich]","Subs, Sandwiches, Wraps & Burittos",0.950372
9,"Banana, 1 medium","[banana, medium]",Cheese,0.156353


In [None]:
bigram_model[fd.preprocess("red bull".split(" "))]

['red bull']

In [None]:
x = embedd(bigram_model[fd.preprocess("FRUIT WAVE".split(" "))]).float().to(device)
x = x.unsqueeze(0)
print(x.shape)
model.eval()
z = model.forward(x)
ss = torch.nn.Softmax()
z2 = ss(z)
fd.cats[torch.argmax(z2, dim=1)]
# z = m2(x)
# z2 = ss(z)
#topk = torch.topk(z2,3, dim=1).values.cpu()#torch.argmax(z2, dim=1)
#print(topk)
#for x in topk:
#  print(fd.cats[x])

torch.Size([1, 10, 300])


  import sys


'Soda'

In [None]:
#Model as in Convolutional Neural Networks for Sentence Classification
class FoodClassificationCnnModelUsualElevator7(Module):
  def __init__(self, no_of_classes,device):
    super().__init__()
    n_filters = 100
    self.device = device
    self.no_of_classes = no_of_classes
    self.filter1 =  torch.nn.Conv2d(in_channels = 1, 
                                    out_channels = n_filters, 
                                    kernel_size = (3, 300)).to(device)
    self.filter2 =  torch.nn.Conv2d(in_channels = 1, 
                                    out_channels = n_filters, 
                                    kernel_size = (4, 300)).to(device) 
    self.filter3 =  torch.nn.Conv2d(in_channels = 1, 
                                    out_channels = n_filters, 
                                    kernel_size = (5, 300)).to(device)
    self.linear = torch.nn.Sequential(torch.nn.Linear(3*n_filters,100),
                                      torch.nn.LeakyReLU(),
                                      torch.nn.BatchNorm1d(100),
                                      torch.nn.Linear(100,self.no_of_classes)).to(device)
    self.relu = torch.nn.ReLU().to(device)
    self.dropout = torch.nn.Dropout(0.2).to(device)
  
  def forward(self,x):
    x = x.unsqueeze(1)
    f1 = self.relu(self.filter1(x)).squeeze(3)
    f2 = self.relu(self.filter2(x)).squeeze(3)
    f3 = self.relu(self.filter3(x)).squeeze(3)
    f1 = torch.nn.functional.max_pool1d(f1,f1.shape[2])
    f2 = torch.nn.functional.max_pool1d(f2,f2.shape[2])
    f3 = torch.nn.functional.max_pool1d(f3,f3.shape[2])

    linear = self.dropout(torch.cat((f1,f2,f3),dim=1)).squeeze(2)
    out = self.linear(linear)
    return out
    
  def get_accuracy(self,X,y):
    y_pred = self.forward(X)
    res = torch.argmax(y_pred, dim=1)
    res = y-res
    l = int(torch.count_nonzero(res))
    count = list(y_pred.shape)[0]
    return (count-l)/count


In [None]:
"""
wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                        y_true=np.array(y_test.cpu()), preds=np.array(y_pred.cpu()),
                        class_names=fd_test.cats)})
                        """

In [None]:
#version for mean embedding
"""
m2 = model.eval()

m2.eval()
crawled_food = pd.read_csv("data/crawled.csv")
crawled_food["tokens"] = crawled_food.apply(lambda row: bigram_model[fd.preprocess(row["meal_name"])], axis=1)

print("tokenized now lets go to embedding")
#todo check if any tokens have 0 embedding
crawled_food["embedding"] = crawled_food.apply(lambda row: embedd_mean(row["tokens"]),axis=1)


print("embedded")
#filter things with 0 embedding
null_embedding = crawled_food["embedding"].isnull()
food_null_embedding = crawled_food[null_embedding]
crawled_food= crawled_food[~null_embedding]
print(len(crawled_food))
x = crawled_food.embedding.apply(pd.Series)
x = torch.from_numpy(x.values).float().to(device)
y = m2(x)
y = torch.argmax(y, dim=1)
y = y.cpu().numpy()
crawled_food["category"] = np.nan
y
j = 0
for i,z in enumerate(~null_embedding):

  if z:
    crawled_food["category"][i] = fd.cats[y[j]]
    j+= 1
"""

tokenized now lets go to embedding
embedded
21039


RuntimeError: ignored