# **Imports and Utilities**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Folder inside Google Drive that holds this notebook's helper modules
# (e.g. filtering_model.py). Change this if the project is moved in Drive.
FOLDERNAME = 'AI Capstone/Colab Notebooks/Data Filtering'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Now that Drive is mounted, add the project folder to sys.path so the
# Python interpreter of the Colab VM can import .py files stored in it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

# Auto-reload external modules so edits to the .py files are picked up
# without restarting the runtime.
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

# NOTE: this setup cell was adapted from last semester's homework Colab notebooks.

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import requests
import pandas as pd
from io import BytesIO

import torch.nn as nn
import torch as T
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

from collections import OrderedDict

from filtering_model import FilteringModel

In [None]:
#Credit to https://huggingface.co/thenlper/gte-base
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mask-aware mean pooling over dim 1.

    Positions whose mask value is zero are zeroed out before summing, and the
    sum is divided by the per-row sum of the mask.

    Args:
        last_hidden_states: hidden states to pool; assumed to be
            (batch, seq_len, hidden) as in the gte-base example -- TODO confirm.
        attention_mask: mask aligned with the first two dims of
            `last_hidden_states`; non-zero marks positions to keep.

    Returns:
        Tensor with dim 1 reduced away (one pooled vector per row).
    """
    # True where a position must be ignored (mask value == 0).
    ignored = attention_mask.unsqueeze(-1).bool().logical_not()
    kept_states = last_hidden_states.masked_fill(ignored, 0.0)
    # Per-row mask total, kept as a trailing singleton dim for broadcasting.
    mask_totals = attention_mask.sum(dim=1).unsqueeze(-1)
    return kept_states.sum(dim=1) / mask_totals

In [None]:
def loadData(link, timeout=30):
  """Download a CSV file over HTTP and parse it into a DataFrame.

  Args:
    link: URL of the CSV file.
    timeout: seconds to wait for the server before giving up; forwarded to
      `requests.get` so a stalled connection cannot hang the notebook.

  Returns:
    pandas.DataFrame parsed from the response body.

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status.
    requests.RequestException: on connection errors or timeouts.
  """
  req = requests.get(link, timeout=timeout)
  # Fail loudly on an error status instead of feeding an error page to the CSV parser.
  req.raise_for_status()
  content = BytesIO(req.content)
  return pd.read_csv(content)

# **Dataset Loading**

In [None]:
# Fetch the labelled common-ingredients table from the project repository
# and show it as the cell output.
food_data = loadData(
    "https://raw.githubusercontent.com/ezramuskat/Ingredient-Substitution-Capstone/main/data_preparation/classification_dataset/common_ingredients.csv"
)
food_data

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,salt,yes,yes,yes,yes
1,olive oil,yes,yes,yes,yes
2,onions,yes,yes,yes,yes
3,water,yes,yes,yes,yes
4,garlic,yes,yes,yes,yes
...,...,...,...,...,...
494,boneless chicken breast,no,no,yes,yes
495,crème fraîche,yes,no,no,yes
496,cooked white rice,yes,yes,yes,yes
497,pecans,yes,yes,yes,yes


# **Best Common Ingredients Using GTE-BASE**

In [None]:
#Credit to this source: https://huggingface.co/thenlper/gte-base
def get_closest_n(input_texts,compare_texts,n=5,tokenizer=None,model=None):
  """For each input text, return the `n` most similar texts from `compare_texts`.

  All texts are embedded in one batch, mean-pooled with `average_pool`,
  L2-normalized, and compared with a dot product (i.e. cosine similarity).

  Args:
    input_texts: iterable of query strings.
    compare_texts: iterable of candidate strings to rank against each query.
    n: number of candidates to return per query; capped at the number of
      candidates available.
    tokenizer: optional pre-loaded tokenizer; defaults to loading
      "thenlper/gte-base". Pass one in to avoid reloading on every call.
    model: optional pre-loaded encoder; defaults to loading
      "thenlper/gte-base".

  Returns:
    A list with one entry per query, each a list of up to `n` candidate
    strings ordered from most to least similar. Returns [] when there are no
    queries and a list of empty lists when there are no candidates.
  """
  # Copy into fresh lists so the caller's sequences are never mutated and
  # one-shot iterables are consumed exactly once.
  queries = list(input_texts)
  candidates = list(compare_texts)

  # Nothing to rank: answer without loading the model.
  if not queries:
    return []
  if not candidates:
    return [[] for _ in queries]

  #thenlper/gte-base
  if tokenizer is None:
    tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-base")
  if model is None:
    model = AutoModel.from_pretrained("thenlper/gte-base")

  # Tokenize queries and candidates together; queries occupy the first rows.
  batch_dict = tokenizer(queries + candidates, max_length=512, padding=True, truncation=True, return_tensors='pt')

  # Inference only: skip building the autograd graph.
  with T.no_grad():
    outputs = model(**batch_dict)
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

    # Normalize so the dot product below is a cosine similarity.
    embeddings = nn.functional.normalize(embeddings, p=2, dim=1)
    scores = embeddings[:len(queries)] @ embeddings[len(queries):].T

    # topk raises if k exceeds the number of candidates, so cap it.
    k = min(n, len(candidates))
    top_indices = T.topk(scores, k).indices.tolist()

  return [[candidates[i] for i in row] for row in top_indices]

In [None]:
#input_texts = food_data["ingredient"].tolist()

#get_closest_n(["beans","milk","flour"],input_texts)

# **Filtering Model Class Experimentation**

In [None]:
# Classifier head handed to FilteringModel. Layers are registered one by one,
# in the order they are applied, under the same names as before.
# NOTE(review): 768 inputs presumably match the encoder's embedding size and the
# 4 sigmoid outputs presumably map to the vegetarian / vegan / dairy_free /
# gluten_free labels -- confirm against filtering_model.py.
fmodel_internal_model = nn.Sequential()
fmodel_internal_model.add_module('fc1', nn.Linear(768, 256))
fmodel_internal_model.add_module('relu1', nn.LeakyReLU())
fmodel_internal_model.add_module('bn1', nn.BatchNorm1d(256))
fmodel_internal_model.add_module('fc2', nn.Linear(256, 64))
fmodel_internal_model.add_module('dr1', nn.Dropout(0.3))
fmodel_internal_model.add_module('relu2', nn.LeakyReLU())
fmodel_internal_model.add_module('bn2', nn.BatchNorm1d(64))
fmodel_internal_model.add_module('fc3', nn.Linear(64, 4))
fmodel_internal_model.add_module('sg1', nn.Sigmoid())

# Wrap the ingredient table, the name of its text column, the encoder
# checkpoint name and the head defined above.
fmodel = FilteringModel(
    food_data,
    "ingredient",
    "facebook/drama-base",
    fmodel_internal_model,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Train the filtering model; per-epoch losses and validation accuracy are
# printed as the cell output.
fmodel.train_model(
    epochs=20,
    batch_size=33,
    val_split=0.2,
)

Epoch: 0 | Train Loss: 9.155964612960815 | Val Loss: 11.586176991462708 | Val Acc: tensor(0.3000)
Epoch: 1 | Train Loss: 7.617407441139221 | Val Loss: 11.389505505561829 | Val Acc: tensor(0.4900)
Epoch: 2 | Train Loss: 7.3942758440971375 | Val Loss: 10.952295184135437 | Val Acc: tensor(0.5500)
Epoch: 3 | Train Loss: 6.509452790021896 | Val Loss: 10.09820032119751 | Val Acc: tensor(0.6725)
Epoch: 4 | Train Loss: 6.112509101629257 | Val Loss: 8.453011751174927 | Val Acc: tensor(0.8675)
Epoch: 5 | Train Loss: 5.510549396276474 | Val Loss: 7.402165800333023 | Val Acc: tensor(0.8825)
Epoch: 6 | Train Loss: 5.3928505182266235 | Val Loss: 7.52691787481308 | Val Acc: tensor(0.8650)
Epoch: 7 | Train Loss: 4.717501252889633 | Val Loss: 6.542365461587906 | Val Acc: tensor(0.8625)
Epoch: 8 | Train Loss: 4.414358615875244 | Val Loss: 5.614591464400291 | Val Acc: tensor(0.8875)
Epoch: 9 | Train Loss: 4.076723754405975 | Val Loss: 5.069181248545647 | Val Acc: tensor(0.9000)
Epoch: 10 | Train Loss: 3.

In [None]:
# Recipe for a non-kosher cheeseburger, plus a few nonsense items to see how
# the model labels out-of-vocabulary "ingredients".
fmodel.filter(
    [
        "salt",
        "bread",
        "meat",
        "cheese",
        "cheddar cheese",
        "onions",
        "chili",
        "taco seasoned meat",
        "worcestershire sauce",
        "bug infested lettuce",
        "cookie labeled eat me",
        "lead pipe",
        "your hopes and dreams",
    ],
    threshold=0.5,
)

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,salt,yes,yes,yes,yes
1,bread,yes,no,yes,no
2,meat,no,no,yes,yes
3,cheese,yes,no,no,yes
4,cheddar cheese,yes,no,no,yes
5,onions,yes,yes,yes,yes
6,chili,yes,yes,yes,yes
7,taco seasoned meat,no,no,yes,yes
8,worcestershire sauce,no,no,yes,no
9,bug infested lettuce,yes,yes,yes,yes


In [None]:
#Used Models
#"allenai/scibert_scivocab_uncased"
#"facebook/drama-base"