#**Imports and Util**

In [1]:
#Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

#Folder (relative to "My Drive") that is appended to the import path below.
FOLDERNAME = 'AI Capstone/Colab Notebooks/Data Filtering'
#NOTE(review): FOLDERNAME is a hard-coded string literal, so this assert can never fail as written.
assert FOLDERNAME is not None, "[!] Enter the foldername."


#Make Python modules stored in the Drive folder importable from this notebook.
#NOTE(review): presumably this is where filtering_model (imported in a later cell) lives - confirm.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

#Enable the autoreload extension in mode 2 so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Mounted at /content/drive


In [2]:
#Was used for a test requirement. I might re-add it in the future, but right now it is not needed.
#!pip install sdv

In [3]:
import requests
import pandas as pd
from io import BytesIO

import torch.nn as nn
import torch as T
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

from collections import OrderedDict

from filtering_model import FilteringModel

In [4]:
#Credit to https://huggingface.co/thenlper/gte-base
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean-pool hidden states over dim 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are zeroed before summing, and
    the sum is divided by the number of non-masked positions in each row.

    Args:
        last_hidden_states: Hidden states to pool; dim 1 is the one reduced.
            (Assumed to be (batch, sequence, hidden) - TODO confirm with caller.)
        attention_mask: Mask with one entry per pooled position; non-zero
            marks a position that should contribute to the mean.

    Returns:
        The masked mean of ``last_hidden_states`` along dim 1.
    """
    # Add a trailing axis so the mask broadcasts across the hidden dimension.
    is_padding = ~attention_mask.unsqueeze(-1).bool()
    zeroed = last_hidden_states.masked_fill(is_padding, 0.0)
    tokens_per_row = attention_mask.sum(dim=1).unsqueeze(-1)
    return zeroed.sum(dim=1) / tokens_per_row

In [5]:
def loadData(link, timeout=30):
  """Download a CSV file over HTTP(S) and parse it into a DataFrame.

  Args:
    link: URL of the CSV file.
    timeout: Seconds to wait for the server before giving up. Passed
      straight to ``requests.get``.

  Returns:
    A ``pd.DataFrame`` parsed from the response body by ``pd.read_csv``.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.Timeout: If the server does not respond within ``timeout``.
  """
  #Without a timeout requests waits forever on an unresponsive server
  req = requests.get(link, timeout=timeout)
  #Fail loudly on an error status instead of parsing the error page as CSV
  req.raise_for_status()
  content = BytesIO(req.content)
  df = pd.read_csv(content)
  return df

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np

#Function for testing a model
def test_model(predictions:pd.DataFrame,test_data:pd.DataFrame,name_column:str="ingredient",print_results=False):
  #Trim DataFrames
  test_data = test_data.drop(columns=[name_column])
  predictions = predictions.drop(columns=[name_column])

  #Convert to binary
  test_data = (test_data == 'yes').to_numpy()
  predictions = (predictions == 'yes').to_numpy()

  #Get Metrics
  accuracy = accuracy_score(test_data,predictions)
  precision = precision_score(test_data,predictions,average=None)
  recall = recall_score(test_data,predictions,average=None)

  #Print Results
  if print_results:
    print("Accuracy:",accuracy)
    print("Avg. Precision:",np.average(precision),"| Full Precision",precision)
    print("Avg. Recall",np.average(recall),"| Full Recall",recall)

  return [accuracy,precision,recall]

#Function for getting incorrect results of a model
def get_incorrect_results(predictions:pd.DataFrame,test_data:pd.DataFrame,name_column:str="ingredient"):
  #Reset indices of inputs for consistency
  test_data = test_data.reset_index(drop=True)
  predictions = predictions.reset_index(drop=True)

  #Gather false positives and negatives
  false_positives_df = (test_data == predictions) | (predictions == "no")
  false_negatives_df = (test_data == predictions) | (predictions == "yes")

  #Filter the data to show only rows with incorrect results
  incorrect_classifications = predictions[false_positives_df.eq(0).any(axis=1) | false_negatives_df.eq(0).any(axis=1)]
  incorrect_classifications = incorrect_classifications.where(false_positives_df==True,"false positive")
  incorrect_classifications = incorrect_classifications.where(false_negatives_df==True,"false negative")
  incorrect_classifications = incorrect_classifications.where((false_positives_df==False) | (false_negatives_df==False),"correct")

  #Reset the name column and return
  incorrect_classifications[name_column] = test_data[name_column]
  return incorrect_classifications

#**Dataset Loading and Analysis**

In [7]:
from sklearn.model_selection import train_test_split

#Fixed seed so the same 10% of rows is held out on every run; without it each
#session scores the models on a different test set and results are not comparable
SPLIT_SEED = 42

#Keep the unsplit frame under its own name so the split below can be re-derived
food_data_full = loadData("https://raw.githubusercontent.com/ezramuskat/Ingredient-Substitution-Capstone/main/data_preparation/classification_dataset/common_ingredients_1000.csv")
food_data, food_test_data = train_test_split(food_data_full, test_size=0.1, random_state=SPLIT_SEED)

food_data

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
685,blackberries,yes,yes,yes,yes
950,pepperoni,no,no,yes,yes
880,bird chile,yes,yes,yes,yes
62,lemon,yes,yes,yes,yes
205,active dry yeast,yes,yes,yes,yes
...,...,...,...,...,...
825,konbu,yes,yes,yes,yes
794,lemon slices,yes,yes,yes,yes
418,grated lemon peel,yes,yes,yes,yes
543,seeds,yes,yes,yes,yes


In [8]:
#Per-column ratio of 'yes' entries to non-'yes' entries in the training split
print(food_data.eq('yes').sum().div(food_data.ne('yes').sum()))

ingredient      0.000000
vegetarian      6.250000
vegan           2.730290
dairy_free      9.215909
gluten_free    13.983333
dtype: float64


In [9]:
#Hand-picked ingredient names; the experiment cells pass this list to fmodel.filter(..., bool_format=False) to inspect the numeric scores
specific_examples = ['rice','meat','bread','cheeseburger','veal']

#**Filtering Model Class Experimentation**

##**facebook/drama-base**

In [None]:
model_name = "facebook/drama-base"

###**Regression Model**

In [None]:
fmodel_internal_model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(768, 4)),
    ('sg1', nn.Sigmoid())
]))

fmodel = FilteringModel(food_data,"ingredient",model_name,fmodel_internal_model)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
fmodel.train_model(epochs=20,batch_size=33,val_split=0.2)

Epoch: 1 | Train Loss: 14.05 | Val Loss: 3.48 | Train Acc: 0.648 | Val Acc: 0.705 | Train Pre: 0.854 | Val Pre: 0.862 | Train Rec: 0.854 | Val Rec: 0.862
Epoch: 2 | Train Loss: 11.958 | Val Loss: 2.952 | Train Acc: 0.679 | Val Acc: 0.741 | Train Pre: 0.854 | Val Pre: 0.878 | Train Rec: 0.854 | Val Rec: 0.878
Epoch: 3 | Train Loss: 10.621 | Val Loss: 2.673 | Train Acc: 0.679 | Val Acc: 0.723 | Train Pre: 0.854 | Val Pre: 0.871 | Train Rec: 0.854 | Val Rec: 0.871
Epoch: 4 | Train Loss: 9.764 | Val Loss: 2.524 | Train Acc: 0.678 | Val Acc: 0.711 | Train Pre: 0.854 | Val Pre: 0.863 | Train Rec: 0.854 | Val Rec: 0.863
Epoch: 5 | Train Loss: 9.206 | Val Loss: 2.378 | Train Acc: 0.68 | Val Acc: 0.717 | Train Pre: 0.855 | Val Pre: 0.866 | Train Rec: 0.855 | Val Rec: 0.866
Epoch: 6 | Train Loss: 8.882 | Val Loss: 2.251 | Train Acc: 0.678 | Val Acc: 0.729 | Train Pre: 0.854 | Val Pre: 0.872 | Train Rec: 0.854 | Val Rec: 0.872
Epoch: 7 | Train Loss: 8.622 | Val Loss: 2.25 | Train Acc: 0.678 | Val

In [None]:
#Test the model
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(test_results,food_test_data,print_results=True)
get_incorrect_results(test_results,food_test_data)

Accuracy: 0.83
Avg. Precision: 0.9563119790862787 | Full Precision [0.95454545 0.93421053 0.96774194 0.96875   ]
Avg. Recall 0.9799002301104032 | Full Recall [0.98823529 0.97260274 1.         0.95876289]


Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
15,noodles,correct,correct,correct,false positive
16,bacon drippings,false positive,correct,correct,correct
17,celery ribs,false negative,false negative,correct,correct
43,amchur,correct,correct,correct,false negative
46,rib,false positive,false positive,correct,correct
48,low sodium soy sauce,correct,false negative,correct,false positive
55,urad dal,correct,correct,correct,false negative
60,ramen noodles,correct,correct,correct,false positive
65,salami,false positive,correct,correct,correct
68,ghee,correct,false positive,false positive,correct


In [None]:
fmodel.filter(specific_examples,bool_format=False)

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,veal,0.839466,0.734906,0.894085,0.901564


In [None]:
del fmodel_internal_model
del fmodel

###**Small Model**

In [None]:
fmodel_internal_model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(768, 128)),
    ('dr1', nn.Dropout(0.25)),
    ('relu2', nn.LeakyReLU()),
    ('bn2', nn.BatchNorm1d(128)),
    ('fc3', nn.Linear(128, 4)),
    ('sg1', nn.Sigmoid())
]))

fmodel = FilteringModel(food_data,"ingredient",model_name,fmodel_internal_model)


In [None]:
fmodel.train_model(epochs=20,batch_size=33,val_split=0.2)

Epoch: 1 | Train Loss: 13.296 | Val Loss: 3.603 | Train Acc: 0.274 | Val Acc: 0.736 | Train Pre: 0.971 | Val Pre: 0.969 | Train Rec: 0.971 | Val Rec: 0.969
Epoch: 2 | Train Loss: 10.44 | Val Loss: 2.42 | Train Acc: 0.508 | Val Acc: 0.642 | Train Pre: 0.988 | Val Pre: 0.98 | Train Rec: 0.988 | Val Rec: 0.98
Epoch: 3 | Train Loss: 8.462 | Val Loss: 1.991 | Train Acc: 0.711 | Val Acc: 0.721 | Train Pre: 0.989 | Val Pre: 0.985 | Train Rec: 0.989 | Val Rec: 0.985
Epoch: 4 | Train Loss: 6.393 | Val Loss: 1.527 | Train Acc: 0.832 | Val Acc: 0.851 | Train Pre: 0.993 | Val Pre: 0.975 | Train Rec: 0.993 | Val Rec: 0.975
Epoch: 5 | Train Loss: 4.993 | Val Loss: 1.288 | Train Acc: 0.891 | Val Acc: 0.849 | Train Pre: 0.99 | Val Pre: 0.974 | Train Rec: 0.99 | Val Rec: 0.974
Epoch: 6 | Train Loss: 3.53 | Val Loss: 1.048 | Train Acc: 0.931 | Val Acc: 0.871 | Train Pre: 0.991 | Val Pre: 0.966 | Train Rec: 0.991 | Val Rec: 0.966
Epoch: 7 | Train Loss: 2.714 | Val Loss: 0.898 | Train Acc: 0.958 | Val Acc

In [None]:
#Test the model
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(test_results,food_test_data,print_results=True)
get_incorrect_results(test_results,food_test_data)

Accuracy: 0.88
Avg. Precision: 0.9640541840372776 | Full Precision [0.96551724 0.92307692 0.97826087 0.9893617 ]
Avg. Recall 0.9833248876446498 | Full Recall [0.98823529 0.98630137 1.         0.95876289]


Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
15,noodles,correct,correct,correct,false positive
18,corn flour,correct,correct,correct,false negative
22,light mayonnaise,correct,false positive,correct,correct
24,light soy sauce,correct,correct,correct,false negative
65,salami,false positive,false positive,correct,correct
68,ghee,correct,false positive,false positive,correct
70,pepperoni,false positive,false positive,correct,correct
72,rice flour,correct,correct,correct,false negative
79,greek yogurt,correct,correct,false positive,correct
80,dry white wine,correct,false positive,correct,correct


In [None]:
fmodel.filter(specific_examples,bool_format=False)

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,veal,0.916824,0.938004,0.992918,0.98354


In [None]:
del fmodel_internal_model
del fmodel

###**Large Model**

In [None]:
fmodel_internal_model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(768, 256)),
    ('relu1', nn.LeakyReLU()),
    ('bn1', nn.BatchNorm1d(256)),
    ('fc2', nn.Linear(256, 64)),
    ('dr1', nn.Dropout(0.3)),
    ('relu2', nn.LeakyReLU()),
    ('bn2', nn.BatchNorm1d(64)),
    ('fc3', nn.Linear(64, 4)),
    ('sg1', nn.Sigmoid())
]))

fmodel = FilteringModel(food_data,"ingredient",model_name,fmodel_internal_model)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
fmodel.train_model(epochs=20,batch_size=33,val_split=0.2)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch: 1 | Train Loss: 13.022 | Val Loss: 3.887 | Train Acc: 0.322 | Val Acc: 0.068 | Train Pre: 0.984 | Val Pre: 0.941 | Train Rec: 0.984 | Val Rec: 0.941
Epoch: 2 | Train Loss: 10.414 | Val Loss: 2.644 | Train Acc: 0.537 | Val Acc: 0.748 | Train Pre: 0.994 | Val Pre: 0.969 | Train Rec: 0.994 | Val Rec: 0.969
Epoch: 3 | Train Loss: 9.266 | Val Loss: 2.602 | Train Acc: 0.658 | Val Acc: 0.649 | Train Pre: 0.996 | Val Pre: 0.975 | Train Rec: 0.996 | Val Rec: 0.975
Epoch: 4 | Train Loss: 7.87 | Val Loss: 2.263 | Train Acc: 0.75 | Val Acc: 0.726 | Train Pre: 0.996 | Val Pre: 0.964 | Train Rec: 0.996 | Val Rec: 0.964
Epoch: 5 | Train Loss: 6.734 | Val Loss: 1.923 | Train Acc: 0.81 | Val Acc: 0.826 | Train Pre: 0.995 | Val Pre: 0.958 | Train Rec: 0.995 | Val Rec: 0.958
Epoch: 6 | Train Loss: 5.482 | Val Loss: 1.782 | Train Acc: 0.861 | Val Acc: 0.819 | Train Pre: 0.998 | Val Pre: 0.955 | Train Rec: 0.998 | Val Rec: 0.955
Epoch: 7 | Train Loss: 4.376 | Val Loss: 1.635 | Train Acc: 0.901 | Val

In [None]:
#Test the model
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(test_results,food_test_data,print_results=True)
get_incorrect_results(test_results,food_test_data)

Accuracy: 0.86
Avg. Precision: 0.9435712161549791 | Full Precision [0.9375     0.9        0.97802198 0.95876289]
Avg. Recall 0.9643754633450558 | Full Recall [0.94936709 0.94029851 0.98888889 0.97894737]


Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
7,cream of tartar,false negative,false negative,false negative,false negative
14,steak,false positive,correct,correct,correct
17,semisweet chocolate,correct,false positive,false positive,correct
34,poultry seasoning,false negative,false negative,correct,correct
46,bow-tie pasta,false negative,false negative,correct,false positive
57,lard,false positive,false positive,correct,correct
63,tortillas,correct,correct,correct,false positive
78,pancetta,false positive,false positive,correct,correct
80,bittersweet chocolate,correct,false positive,false positive,correct
81,oyster sauce,false positive,false positive,correct,false positive


In [None]:
fmodel.filter(specific_examples,bool_format=False)

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,veal,0.983033,0.949614,0.989136,0.970964


In [None]:
del fmodel_internal_model
del fmodel

##**sentence-transformers/all-MiniLM-L6-v2**

In [10]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"

###**Regression Model**

In [None]:
fmodel_internal_model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(384, 4)),
    ('sg1', nn.Sigmoid())
]))

fmodel = FilteringModel(food_data,"ingredient",model_name,fmodel_internal_model)


In [None]:
fmodel.train_model(epochs=20,batch_size=33,val_split=0.2)

Epoch: 1 | Train Loss: 14.682 | Val Loss: 3.822 | Train Acc: 0.525 | Val Acc: 0.708 | Train Pre: 0.868 | Val Pre: 0.867 | Train Rec: 0.868 | Val Rec: 0.867
Epoch: 2 | Train Loss: 13.481 | Val Loss: 3.51 | Train Acc: 0.691 | Val Acc: 0.72 | Train Pre: 0.86 | Val Pre: 0.873 | Train Rec: 0.86 | Val Rec: 0.873
Epoch: 3 | Train Loss: 12.467 | Val Loss: 3.264 | Train Acc: 0.691 | Val Acc: 0.708 | Train Pre: 0.86 | Val Pre: 0.868 | Train Rec: 0.86 | Val Rec: 0.868
Epoch: 4 | Train Loss: 11.604 | Val Loss: 3.07 | Train Acc: 0.693 | Val Acc: 0.702 | Train Pre: 0.861 | Val Pre: 0.865 | Train Rec: 0.861 | Val Rec: 0.865
Epoch: 5 | Train Loss: 10.922 | Val Loss: 2.87 | Train Acc: 0.691 | Val Acc: 0.714 | Train Pre: 0.86 | Val Pre: 0.87 | Train Rec: 0.86 | Val Rec: 0.87
Epoch: 6 | Train Loss: 10.313 | Val Loss: 2.697 | Train Acc: 0.692 | Val Acc: 0.72 | Train Pre: 0.86 | Val Pre: 0.876 | Train Rec: 0.86 | Val Rec: 0.876
Epoch: 7 | Train Loss: 9.835 | Val Loss: 2.591 | Train Acc: 0.691 | Val Acc: 0.

In [None]:
#Test the model
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(test_results,food_test_data,print_results=True)
get_incorrect_results(test_results,food_test_data)

Accuracy: 0.73
Avg. Precision: 0.9310122115856652 | Full Precision [0.9047619  0.9375     0.93333333 0.94845361]
Avg. Recall 0.9398255226200775 | Full Recall [0.96202532 0.89552239 0.93333333 0.96842105]


Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
7,cream of tartar,correct,false negative,correct,correct
17,semisweet chocolate,correct,correct,false positive,correct
20,anchovies,false positive,correct,correct,correct
30,granny smith apples,correct,false negative,false negative,correct
34,poultry seasoning,false negative,false negative,correct,correct
38,gari,correct,correct,false negative,false negative
41,yoghurt,correct,correct,false positive,correct
46,bow-tie pasta,correct,false negative,correct,false positive
48,goat cheese,correct,correct,false positive,correct
57,lard,false positive,correct,correct,correct


In [None]:
fmodel.filter(specific_examples,bool_format=False)

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,veal,0.704619,0.660747,0.771408,0.758128


In [None]:
del fmodel_internal_model
del fmodel

###**Small Model**

In [28]:
fmodel_internal_model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(384, 128)),
    ('dr1', nn.Dropout(0.25)),
    ('relu2', nn.LeakyReLU()),
    ('bn2', nn.BatchNorm1d(128)),
    ('fc3', nn.Linear(128, 4)),
    ('sg1', nn.Sigmoid())
]))

fmodel = FilteringModel(food_data,"ingredient",model_name,fmodel_internal_model)


In [29]:
fmodel.train_model(epochs=20,batch_size=33,val_split=0.2)

Epoch: 1 | Train Loss: 13.468 | Val Loss: 3.541 | Train Acc: 0.261 | Val Acc: 0.591 | Train Pre: 0.976 | Val Pre: 0.988 | Train Rec: 0.647 | Val Rec: 0.837
Epoch: 2 | Train Loss: 10.456 | Val Loss: 2.514 | Train Acc: 0.488 | Val Acc: 0.558 | Train Pre: 0.994 | Val Pre: 0.988 | Train Rec: 0.794 | Val Rec: 0.837
Epoch: 3 | Train Loss: 8.652 | Val Loss: 2.173 | Train Acc: 0.652 | Val Acc: 0.691 | Train Pre: 0.993 | Val Pre: 0.979 | Train Rec: 0.878 | Val Rec: 0.893
Epoch: 4 | Train Loss: 6.814 | Val Loss: 1.849 | Train Acc: 0.787 | Val Acc: 0.728 | Train Pre: 0.993 | Val Pre: 0.984 | Train Rec: 0.932 | Val Rec: 0.91
Epoch: 5 | Train Loss: 5.209 | Val Loss: 1.54 | Train Acc: 0.869 | Val Acc: 0.806 | Train Pre: 0.994 | Val Pre: 0.982 | Train Rec: 0.963 | Val Rec: 0.946
Epoch: 6 | Train Loss: 3.868 | Val Loss: 1.096 | Train Acc: 0.933 | Val Acc: 0.883 | Train Pre: 0.994 | Val Pre: 0.969 | Train Rec: 0.983 | Val Rec: 0.979
Epoch: 7 | Train Loss: 2.93 | Val Loss: 1.053 | Train Acc: 0.961 | Val

In [None]:
#Test the model
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(test_results,food_test_data,print_results=True)
get_incorrect_results(test_results,food_test_data)

Accuracy: 0.86
Avg. Precision: 0.9675398427870596 | Full Precision [0.98850575 0.94594595 0.97826087 0.95744681]
Avg. Recall 0.9646305193319608 | Full Recall [1.         0.93333333 0.96774194 0.95744681]


Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
4,creamy peanut butter,correct,false negative,false negative,correct
11,soba noodles,correct,correct,correct,false negative
24,hominy,correct,correct,correct,false negative
31,fish sauce,correct,correct,correct,false positive
36,cotija,correct,false positive,false positive,false negative
39,rigatoni,correct,false negative,correct,false positive
46,confectioners sugar,correct,false negative,false negative,correct
49,tamari soy sauce,correct,correct,correct,false negative
59,puff pastry,correct,false negative,correct,false positive
61,ranch dressing,correct,false positive,false positive,correct


In [None]:
fmodel.filter(specific_examples,bool_format=False)

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,rice,0.994061,0.994568,0.99295,0.992791
1,meat,6e-06,6.6e-05,0.998308,0.996874
2,bread,0.942928,0.00533,0.944616,3.9e-05
3,cheeseburger,0.88269,0.014949,0.04209,0.956107
4,veal,0.984752,0.927512,0.989918,0.983377


In [27]:
del fmodel_internal_model
del fmodel

###**Large Model**

In [None]:
fmodel_internal_model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(384, 256)),
    ('relu1', nn.LeakyReLU()),
    ('bn1', nn.BatchNorm1d(256)),
    ('fc2', nn.Linear(256, 64)),
    ('dr1', nn.Dropout(0.3)),
    ('relu2', nn.LeakyReLU()),
    ('bn2', nn.BatchNorm1d(64)),
    ('fc3', nn.Linear(64, 4)),
    ('sg1', nn.Sigmoid())
]))

fmodel = FilteringModel(food_data,"ingredient",model_name,fmodel_internal_model)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [None]:
fmodel.train_model(epochs=20,batch_size=33,val_split=0.2)

Epoch: 1 | Train Loss: 13.149 | Val Loss: 3.735 | Train Acc: 0.332 | Val Acc: 0.19 | Train Pre: 0.985 | Val Pre: 0.985 | Train Rec: 0.985 | Val Rec: 0.985
Epoch: 2 | Train Loss: 10.136 | Val Loss: 2.689 | Train Acc: 0.553 | Val Acc: 0.666 | Train Pre: 0.996 | Val Pre: 0.977 | Train Rec: 0.996 | Val Rec: 0.977
Epoch: 3 | Train Loss: 8.511 | Val Loss: 2.305 | Train Acc: 0.716 | Val Acc: 0.753 | Train Pre: 0.995 | Val Pre: 0.98 | Train Rec: 0.995 | Val Rec: 0.98
Epoch: 4 | Train Loss: 7.28 | Val Loss: 2.066 | Train Acc: 0.768 | Val Acc: 0.772 | Train Pre: 0.997 | Val Pre: 0.965 | Train Rec: 0.997 | Val Rec: 0.965
Epoch: 5 | Train Loss: 5.956 | Val Loss: 1.751 | Train Acc: 0.838 | Val Acc: 0.837 | Train Pre: 0.997 | Val Pre: 0.973 | Train Rec: 0.997 | Val Rec: 0.973
Epoch: 6 | Train Loss: 4.979 | Val Loss: 1.564 | Train Acc: 0.883 | Val Acc: 0.853 | Train Pre: 0.997 | Val Pre: 0.971 | Train Rec: 0.997 | Val Rec: 0.971
Epoch: 7 | Train Loss: 4.007 | Val Loss: 1.357 | Train Acc: 0.911 | Val 

In [None]:
#Test the model
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(test_results,food_test_data,print_results=True)
get_incorrect_results(test_results,food_test_data)

Accuracy: 0.84
Avg. Precision: 0.9474844220254257 | Full Precision [0.96511628 0.87837838 0.97802198 0.96842105]
Avg. Recall 0.9801880974406413 | Full Recall [1.         0.94202899 1.         0.9787234 ]


Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,white wine,correct,false negative,correct,correct
5,queso fresco,correct,false positive,false positive,correct
6,white wine vinegar,correct,false negative,correct,correct
10,confectioners sugar,correct,false negative,correct,correct
13,mascarpone,correct,false positive,false positive,correct
29,Gochujang base,correct,correct,correct,false positive
47,pizza sauce,correct,correct,correct,false negative
48,sea scallops,false positive,false positive,correct,correct
50,unsweetened coconut milk,correct,false negative,correct,correct
54,honey,correct,false positive,correct,correct


In [None]:
fmodel.filter(specific_examples,bool_format=False)

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,veal,0.991437,0.994498,0.988592,0.932378


In [None]:
del fmodel_internal_model
del fmodel

##**nomic-ai/nomic-embed-text-v1**

In [None]:
model_name = "nomic-ai/nomic-embed-text-v1"

###**Regression Model**

In [None]:
fmodel_internal_model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(768, 4)),
    ('sg1', nn.Sigmoid())
]))

fmodel = FilteringModel(food_data,"ingredient",model_name,fmodel_internal_model)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
fmodel.train_model(epochs=20,batch_size=33,val_split=0.2)

In [None]:
#Test the model
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(test_results,food_test_data,print_results=True)
get_incorrect_results(test_results,food_test_data)

In [None]:
fmodel.filter(specific_examples,bool_format=False)

In [None]:
del fmodel_internal_model
del fmodel

###**Small Model**

In [None]:
fmodel_internal_model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(768, 128)),
    ('dr1', nn.Dropout(0.25)),
    ('relu2', nn.LeakyReLU()),
    ('bn2', nn.BatchNorm1d(128)),
    ('fc3', nn.Linear(128, 4)),
    ('sg1', nn.Sigmoid())
]))

fmodel = FilteringModel(food_data,"ingredient",model_name,fmodel_internal_model)


In [None]:
fmodel.train_model(epochs=20,batch_size=33,val_split=0.2)

In [None]:
#Test the model
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(test_results,food_test_data,print_results=True)
get_incorrect_results(test_results,food_test_data)

In [None]:
fmodel.filter(specific_examples,bool_format=False)

In [None]:
del fmodel_internal_model
del fmodel

###**Large Model**

In [None]:
fmodel_internal_model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(768, 256)),
    ('relu1', nn.LeakyReLU()),
    ('bn1', nn.BatchNorm1d(256)),
    ('fc2', nn.Linear(256, 64)),
    ('dr1', nn.Dropout(0.3)),
    ('relu2', nn.LeakyReLU()),
    ('bn2', nn.BatchNorm1d(64)),
    ('fc3', nn.Linear(64, 4)),
    ('sg1', nn.Sigmoid())
]))

fmodel = FilteringModel(food_data,"ingredient",model_name,fmodel_internal_model)


In [None]:
fmodel.train_model(epochs=20,batch_size=33,val_split=0.2)

In [None]:
#Test the model
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(test_results,food_test_data,print_results=True)
get_incorrect_results(test_results,food_test_data)

In [None]:
fmodel.filter(specific_examples,bool_format=False)

In [None]:
del fmodel_internal_model
del fmodel

##**jonny9f/food_embeddings2**

In [None]:
model_name = "jonny9f/food_embeddings2"

###**Regression Model**

In [None]:
fmodel_internal_model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(768, 4)),
    ('sg1', nn.Sigmoid())
]))

fmodel = FilteringModel(food_data,"ingredient",model_name,fmodel_internal_model)


In [None]:
fmodel.train_model(epochs=20,batch_size=33,val_split=0.2)

Epoch: 1 | Train Loss: 14.924 | Val Loss: 3.968 | Train Acc: 0.381 | Val Acc: 0.539 | Train Pre: 0.931 | Val Pre: 0.944 | Train Rec: 0.931 | Val Rec: 0.944
Epoch: 2 | Train Loss: 14.042 | Val Loss: 3.773 | Train Acc: 0.572 | Val Acc: 0.6 | Train Pre: 0.94 | Val Pre: 0.953 | Train Rec: 0.94 | Val Rec: 0.953
Epoch: 3 | Train Loss: 13.297 | Val Loss: 3.604 | Train Acc: 0.601 | Val Acc: 0.604 | Train Pre: 0.939 | Val Pre: 0.947 | Train Rec: 0.939 | Val Rec: 0.947
Epoch: 4 | Train Loss: 12.65 | Val Loss: 3.447 | Train Acc: 0.627 | Val Acc: 0.621 | Train Pre: 0.943 | Val Pre: 0.944 | Train Rec: 0.943 | Val Rec: 0.944
Epoch: 5 | Train Loss: 12.093 | Val Loss: 3.297 | Train Acc: 0.636 | Val Acc: 0.625 | Train Pre: 0.943 | Val Pre: 0.948 | Train Rec: 0.943 | Val Rec: 0.948
Epoch: 6 | Train Loss: 11.594 | Val Loss: 3.193 | Train Acc: 0.644 | Val Acc: 0.648 | Train Pre: 0.944 | Val Pre: 0.948 | Train Rec: 0.944 | Val Rec: 0.948
Epoch: 7 | Train Loss: 11.186 | Val Loss: 3.061 | Train Acc: 0.656 | 

In [None]:
#Test the model
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(test_results,food_test_data,print_results=True)
get_incorrect_results(test_results,food_test_data)

Accuracy: 0.69
Avg. Precision: 0.9068061369220133 | Full Precision [0.90243902 0.82191781 0.93478261 0.96808511]
Avg. Recall 0.936420385304214 | Full Recall [0.93670886 0.89552239 0.95555556 0.95789474]


Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
7,cream of tartar,correct,false negative,correct,correct
9,baby spinach leaves,correct,false negative,correct,correct
14,steak,false positive,correct,correct,correct
17,semisweet chocolate,correct,false positive,false positive,correct
20,anchovies,false positive,false positive,correct,correct
21,shredded parmesan cheese,correct,false positive,false positive,correct
23,pecorino romano cheese,correct,false positive,false positive,correct
31,ground turkey,correct,false positive,correct,correct
32,beef brisket,correct,correct,false negative,correct
34,poultry seasoning,false negative,false negative,correct,correct


In [None]:
fmodel.filter(specific_examples,bool_format=False)

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,veal,0.596579,0.561355,0.662399,0.612412


In [None]:
del fmodel_internal_model
del fmodel

###**Small Model**

In [None]:
fmodel_internal_model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(768, 128)),
    ('dr1', nn.Dropout(0.25)),
    ('relu2', nn.LeakyReLU()),
    ('bn2', nn.BatchNorm1d(128)),
    ('fc3', nn.Linear(128, 4)),
    ('sg1', nn.Sigmoid())
]))

fmodel = FilteringModel(food_data,"ingredient",model_name,fmodel_internal_model)


In [None]:
fmodel.train_model(epochs=20,batch_size=33,val_split=0.2)

Epoch: 1 | Train Loss: 13.119 | Val Loss: 3.536 | Train Acc: 0.333 | Val Acc: 0.688 | Train Pre: 0.976 | Val Pre: 0.955 | Train Rec: 0.976 | Val Rec: 0.955
Epoch: 2 | Train Loss: 10.641 | Val Loss: 2.67 | Train Acc: 0.529 | Val Acc: 0.741 | Train Pre: 0.986 | Val Pre: 0.967 | Train Rec: 0.986 | Val Rec: 0.967
Epoch: 3 | Train Loss: 8.887 | Val Loss: 2.411 | Train Acc: 0.683 | Val Acc: 0.755 | Train Pre: 0.99 | Val Pre: 0.955 | Train Rec: 0.99 | Val Rec: 0.955
Epoch: 4 | Train Loss: 7.38 | Val Loss: 2.236 | Train Acc: 0.796 | Val Acc: 0.786 | Train Pre: 0.987 | Val Pre: 0.962 | Train Rec: 0.987 | Val Rec: 0.962
Epoch: 5 | Train Loss: 5.963 | Val Loss: 1.943 | Train Acc: 0.835 | Val Acc: 0.804 | Train Pre: 0.983 | Val Pre: 0.957 | Train Rec: 0.983 | Val Rec: 0.957
Epoch: 6 | Train Loss: 4.678 | Val Loss: 1.605 | Train Acc: 0.882 | Val Acc: 0.853 | Train Pre: 0.987 | Val Pre: 0.952 | Train Rec: 0.987 | Val Rec: 0.952
Epoch: 7 | Train Loss: 3.806 | Val Loss: 1.389 | Train Acc: 0.904 | Val 

In [None]:
#Test the model
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(test_results,food_test_data,print_results=True)
get_incorrect_results(test_results,food_test_data)

Accuracy: 0.87
Avg. Precision: 0.9498721576175544 | Full Precision [0.96296296 0.89041096 0.9673913  0.9787234 ]
Avg. Recall 0.9787002418509275 | Full Recall [0.98734177 0.97014925 0.98888889 0.96842105]


Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
15,fish stock,correct,correct,correct,false negative
20,anchovies,correct,false positive,correct,correct
23,pecorino romano cheese,correct,false positive,false positive,correct
32,beef brisket,correct,false positive,correct,correct
34,poultry seasoning,false negative,false negative,correct,correct
57,lard,false positive,false positive,correct,correct
68,crawfish,false positive,false positive,correct,correct
78,pancetta,false positive,false positive,correct,false negative
80,bittersweet chocolate,correct,false positive,false positive,correct
81,oyster sauce,correct,correct,correct,false positive


In [None]:
fmodel.filter(specific_examples,bool_format=False)

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,veal,0.953537,0.901045,0.94686,0.940306


In [None]:
del fmodel_internal_model
del fmodel

###**Large Model**

In [None]:
fmodel_internal_model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(768, 256)),
    ('relu1', nn.LeakyReLU()),
    ('bn1', nn.BatchNorm1d(256)),
    ('fc2', nn.Linear(256, 64)),
    ('dr1', nn.Dropout(0.3)),
    ('relu2', nn.LeakyReLU()),
    ('bn2', nn.BatchNorm1d(64)),
    ('fc3', nn.Linear(64, 4)),
    ('sg1', nn.Sigmoid())
]))

fmodel = FilteringModel(food_data,"ingredient",model_name,fmodel_internal_model)


In [None]:
fmodel.train_model(epochs=20,batch_size=33,val_split=0.2)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch: 1 | Train Loss: 14.102 | Val Loss: 3.82 | Train Acc: 0.204 | Val Acc: 0.053 | Train Pre: 0.974 | Val Pre: 0.714 | Train Rec: 0.974 | Val Rec: 0.714
Epoch: 2 | Train Loss: 11.728 | Val Loss: 3.088 | Train Acc: 0.389 | Val Acc: 0.52 | Train Pre: 0.984 | Val Pre: 0.987 | Train Rec: 0.984 | Val Rec: 0.987
Epoch: 3 | Train Loss: 10.388 | Val Loss: 2.691 | Train Acc: 0.556 | Val Acc: 0.68 | Train Pre: 0.985 | Val Pre: 0.984 | Train Rec: 0.985 | Val Rec: 0.984
Epoch: 4 | Train Loss: 8.862 | Val Loss: 2.446 | Train Acc: 0.662 | Val Acc: 0.708 | Train Pre: 0.991 | Val Pre: 0.988 | Train Rec: 0.991 | Val Rec: 0.988
Epoch: 5 | Train Loss: 7.643 | Val Loss: 2.063 | Train Acc: 0.738 | Val Acc: 0.8 | Train Pre: 0.987 | Val Pre: 0.981 | Train Rec: 0.987 | Val Rec: 0.981
Epoch: 6 | Train Loss: 6.533 | Val Loss: 1.914 | Train Acc: 0.793 | Val Acc: 0.817 | Train Pre: 0.988 | Val Pre: 0.972 | Train Rec: 0.988 | Val Rec: 0.972
Epoch: 7 | Train Loss: 5.861 | Val Loss: 1.717 | Train Acc: 0.81 | Val A

In [None]:
#Test the model: predict on the test ingredients, print the metrics, then list the misclassified rows
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(
    predictions=test_results,
    test_data=food_test_data,
    print_results=True,
)
get_incorrect_results(test_results, food_test_data)

Accuracy: 0.82
Avg. Precision: 0.9263549031521578 | Full Precision [0.91666667 0.8630137  0.95698925 0.96875   ]
Avg. Recall 0.9707045772691064 | Full Recall [0.97468354 0.94029851 0.98888889 0.97894737]


Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
20,anchovies,false positive,false positive,correct,correct
23,pecorino romano cheese,correct,false positive,false positive,correct
32,beef brisket,false positive,false positive,correct,correct
34,poultry seasoning,false negative,false negative,correct,correct
46,bow-tie pasta,correct,false negative,correct,correct
48,goat cheese,false negative,correct,false positive,correct
52,eggs,correct,false positive,correct,correct
57,lard,false positive,false positive,correct,false negative
63,tortillas,correct,correct,correct,false positive
68,crawfish,false positive,false positive,correct,correct


In [None]:
#Inspect the hand-picked examples; with bool_format=False the displayed table holds numeric values per label (see output)
fmodel.filter(specific_examples,bool_format=False)

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,veal,0.741713,0.347084,0.953862,0.918298


In [None]:
#Drop both references before the next experiment rebinds these names
del fmodel_internal_model, fmodel

##**DivyaMereddy007/RecipeBert_v5**

In [None]:
#Model id passed to FilteringModel by the cells in this section
model_name = "DivyaMereddy007/RecipeBert_v5"

###**Regression Model**

In [None]:
#Regression baseline: one linear layer from the 768-wide input to 4 outputs, followed by a sigmoid
fmodel_internal_model = nn.Sequential(OrderedDict(
    fc1=nn.Linear(768, 4),
    sg1=nn.Sigmoid(),
))

fmodel = FilteringModel(food_data, "ingredient", model_name, fmodel_internal_model)


In [None]:
#Train the model built in the previous cell
fmodel.train_model(
    epochs=20,
    batch_size=33,
    val_split=0.2,
)

Epoch: 1 | Train Loss: 13.944 | Val Loss: 3.508 | Train Acc: 0.696 | Val Acc: 0.693 | Train Pre: 0.863 | Val Pre: 0.855 | Train Rec: 0.863 | Val Rec: 0.855
Epoch: 2 | Train Loss: 11.997 | Val Loss: 3.105 | Train Acc: 0.694 | Val Acc: 0.681 | Train Pre: 0.862 | Val Pre: 0.848 | Train Rec: 0.862 | Val Rec: 0.848
Epoch: 3 | Train Loss: 10.592 | Val Loss: 2.819 | Train Acc: 0.697 | Val Acc: 0.681 | Train Pre: 0.864 | Val Pre: 0.848 | Train Rec: 0.864 | Val Rec: 0.848
Epoch: 4 | Train Loss: 9.638 | Val Loss: 2.573 | Train Acc: 0.696 | Val Acc: 0.699 | Train Pre: 0.864 | Val Pre: 0.858 | Train Rec: 0.864 | Val Rec: 0.858
Epoch: 5 | Train Loss: 8.964 | Val Loss: 2.434 | Train Acc: 0.696 | Val Acc: 0.687 | Train Pre: 0.864 | Val Pre: 0.855 | Train Rec: 0.864 | Val Rec: 0.855
Epoch: 6 | Train Loss: 8.48 | Val Loss: 2.333 | Train Acc: 0.696 | Val Acc: 0.687 | Train Pre: 0.865 | Val Pre: 0.852 | Train Rec: 0.865 | Val Rec: 0.852
Epoch: 7 | Train Loss: 8.107 | Val Loss: 2.259 | Train Acc: 0.696 | 

In [None]:
#Test the model: predict on the test ingredients, print the metrics, then list the misclassified rows
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(
    predictions=test_results,
    test_data=food_test_data,
    print_results=True,
)
get_incorrect_results(test_results, food_test_data)

Accuracy: 0.79
Avg. Precision: 0.932466982173731 | Full Precision [0.92771084 0.87671233 0.97752809 0.94791667]
Avg. Recall 0.9636172071023961 | Full Recall [0.97468354 0.95522388 0.96666667 0.95789474]


Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
2,salsa,false negative,correct,correct,correct
5,corn flour,correct,correct,correct,false negative
17,semisweet chocolate,correct,correct,correct,false negative
20,anchovies,false positive,false positive,correct,correct
34,poultry seasoning,false negative,false negative,correct,correct
41,yoghurt,correct,false positive,false positive,correct
46,bow-tie pasta,correct,correct,correct,false positive
53,canned low sodium chicken broth,correct,correct,false negative,false negative
57,lard,false positive,false positive,correct,correct
63,tortillas,correct,correct,correct,false positive


In [None]:
#Inspect the hand-picked examples; with bool_format=False the displayed table holds numeric values per label (see output)
fmodel.filter(specific_examples,bool_format=False)

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,veal,0.942064,0.822479,0.916806,0.933062


In [None]:
#Drop both references before the next experiment rebinds these names
del fmodel_internal_model, fmodel

###**Small Model**

In [None]:
#Small decoder: 768 -> 128 -> 4 with dropout, LeakyReLU and batch norm on the hidden layer, sigmoid on the output
fmodel_internal_model = nn.Sequential(OrderedDict(
    fc1=nn.Linear(768, 128),
    dr1=nn.Dropout(0.25),
    relu2=nn.LeakyReLU(),
    bn2=nn.BatchNorm1d(128),
    fc3=nn.Linear(128, 4),
    sg1=nn.Sigmoid(),
))

fmodel = FilteringModel(food_data, "ingredient", model_name, fmodel_internal_model)


In [None]:
#Train the model built in the previous cell
fmodel.train_model(
    epochs=20,
    batch_size=33,
    val_split=0.2,
)

Epoch: 1 | Train Loss: 12.711 | Val Loss: 3.462 | Train Acc: 0.306 | Val Acc: 0.675 | Train Pre: 0.964 | Val Pre: 0.966 | Train Rec: 0.964 | Val Rec: 0.966
Epoch: 2 | Train Loss: 10.202 | Val Loss: 2.457 | Train Acc: 0.506 | Val Acc: 0.708 | Train Pre: 0.971 | Val Pre: 0.981 | Train Rec: 0.971 | Val Rec: 0.981
Epoch: 3 | Train Loss: 8.645 | Val Loss: 2.032 | Train Acc: 0.66 | Val Acc: 0.772 | Train Pre: 0.977 | Val Pre: 0.978 | Train Rec: 0.977 | Val Rec: 0.978
Epoch: 4 | Train Loss: 7.003 | Val Loss: 1.656 | Train Acc: 0.768 | Val Acc: 0.815 | Train Pre: 0.976 | Val Pre: 0.978 | Train Rec: 0.976 | Val Rec: 0.978
Epoch: 5 | Train Loss: 5.872 | Val Loss: 1.344 | Train Acc: 0.827 | Val Acc: 0.874 | Train Pre: 0.978 | Val Pre: 0.97 | Train Rec: 0.978 | Val Rec: 0.97
Epoch: 6 | Train Loss: 4.435 | Val Loss: 1.199 | Train Acc: 0.875 | Val Acc: 0.873 | Train Pre: 0.979 | Val Pre: 0.974 | Train Rec: 0.979 | Val Rec: 0.974
Epoch: 7 | Train Loss: 3.675 | Val Loss: 1.012 | Train Acc: 0.909 | Val

In [None]:
#Test the model: predict on the test ingredients, print the metrics, then list the misclassified rows
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(
    predictions=test_results,
    test_data=food_test_data,
    print_results=True,
)
get_incorrect_results(test_results, food_test_data)

Accuracy: 0.84
Avg. Precision: 0.945075421472937 | Full Precision [0.92857143 0.91428571 0.97826087 0.95918367]
Avg. Recall 0.98300983423986 | Full Recall [0.98734177 0.95522388 1.         0.98947368]


Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
7,cream of tartar,correct,false negative,correct,correct
20,anchovies,false positive,false positive,correct,correct
34,poultry seasoning,false negative,false negative,correct,correct
41,yoghurt,correct,correct,false positive,correct
57,lard,false positive,false positive,correct,correct
63,tortillas,correct,correct,correct,false positive
64,unsweetened cocoa powder,correct,false negative,correct,correct
78,pancetta,false positive,correct,correct,correct
80,bittersweet chocolate,correct,false positive,false positive,correct
81,oyster sauce,correct,correct,correct,false positive


In [None]:
#Inspect the hand-picked examples; with bool_format=False the displayed table holds numeric values per label (see output)
fmodel.filter(specific_examples,bool_format=False)

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,veal,0.99652,0.998576,0.998644,0.982679


In [None]:
#Drop both references before the next experiment rebinds these names
del fmodel_internal_model, fmodel

###**Large Model**

In [None]:
#Large decoder: 768 -> 256 -> 64 -> 4; dropout only on the second hidden layer, sigmoid on the output
fmodel_internal_model = nn.Sequential(OrderedDict(
    fc1=nn.Linear(768, 256),
    relu1=nn.LeakyReLU(),
    bn1=nn.BatchNorm1d(256),
    fc2=nn.Linear(256, 64),
    dr1=nn.Dropout(0.3),
    relu2=nn.LeakyReLU(),
    bn2=nn.BatchNorm1d(64),
    fc3=nn.Linear(64, 4),
    sg1=nn.Sigmoid(),
))

fmodel = FilteringModel(food_data, "ingredient", model_name, fmodel_internal_model)


In [None]:
#Train the model built in the previous cell
fmodel.train_model(
    epochs=20,
    batch_size=33,
    val_split=0.2,
)

Epoch: 1 | Train Loss: 13.287 | Val Loss: 3.701 | Train Acc: 0.284 | Val Acc: 0.13 | Train Pre: 0.973 | Val Pre: 0.979 | Train Rec: 0.973 | Val Rec: 0.979
Epoch: 2 | Train Loss: 11.153 | Val Loss: 2.695 | Train Acc: 0.487 | Val Acc: 0.704 | Train Pre: 0.98 | Val Pre: 0.979 | Train Rec: 0.98 | Val Rec: 0.979
Epoch: 3 | Train Loss: 9.522 | Val Loss: 2.423 | Train Acc: 0.656 | Val Acc: 0.711 | Train Pre: 0.984 | Val Pre: 0.98 | Train Rec: 0.984 | Val Rec: 0.98
Epoch: 4 | Train Loss: 8.154 | Val Loss: 2.114 | Train Acc: 0.715 | Val Acc: 0.756 | Train Pre: 0.98 | Val Pre: 0.981 | Train Rec: 0.98 | Val Rec: 0.981
Epoch: 5 | Train Loss: 6.92 | Val Loss: 1.737 | Train Acc: 0.779 | Val Acc: 0.827 | Train Pre: 0.986 | Val Pre: 0.967 | Train Rec: 0.986 | Val Rec: 0.967
Epoch: 6 | Train Loss: 6.275 | Val Loss: 1.677 | Train Acc: 0.791 | Val Acc: 0.808 | Train Pre: 0.984 | Val Pre: 0.971 | Train Rec: 0.984 | Val Rec: 0.971
Epoch: 7 | Train Loss: 4.938 | Val Loss: 1.233 | Train Acc: 0.85 | Val Acc: 

In [None]:
#Test the model: predict on the test ingredients, print the metrics, then list the misclassified rows
test_results = fmodel.filter(food_test_data['ingredient'].to_list())
test_model(
    predictions=test_results,
    test_data=food_test_data,
    print_results=True,
)
get_incorrect_results(test_results, food_test_data)

Accuracy: 0.81
Avg. Precision: 0.9480060938298924 | Full Precision [0.92857143 0.92753623 0.97802198 0.95789474]
Avg. Recall 0.9723373196199769 | Full Recall [0.98734177 0.95522388 0.98888889 0.95789474]


Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
8,dry mustard,correct,false negative,correct,correct
20,anchovies,false positive,false positive,correct,correct
23,pecorino romano cheese,correct,correct,correct,false negative
34,poultry seasoning,false negative,false negative,correct,correct
39,parmigiano reggiano cheese,correct,correct,correct,false negative
41,yoghurt,correct,correct,false positive,correct
57,lard,false positive,false positive,correct,correct
63,tortillas,correct,correct,correct,false positive
64,unsweetened cocoa powder,correct,false negative,false negative,correct
78,pancetta,false positive,correct,correct,correct


In [None]:
#Inspect the hand-picked examples; with bool_format=False the displayed table holds numeric values per label (see output)
fmodel.filter(specific_examples,bool_format=False)

Unnamed: 0,ingredient,vegetarian,vegan,dairy_free,gluten_free
0,veal,0.988894,0.991628,0.987421,0.969273


In [None]:
#Drop both references before the next experiment rebinds these names
del fmodel_internal_model, fmodel

#**Single-Encoding Model Cross-Val Testing**

In [None]:
from pandas import DataFrame
import gc

#Method for mass-testing decoder architectures through cross-validation
def mass_cross_val_test(model_name:str,data:DataFrame,model_arcs:list,model_arc_names:list,cv_epochs:list,k_folds=5,verbose=True,batch_size=33):
  """Cross-validate several decoder architectures and tabulate their metrics.

  Args:
    model_name: Model id passed through to FilteringModel.
    data: Labelled data passed through to FilteringModel; its text column is "ingredient".
    model_arcs: Decoder modules to evaluate; one FilteringModel is built per entry.
    model_arc_names: Column label for each entry of model_arcs (same length, same order).
    cv_epochs: Epoch counts at which metrics are read back; each run lasts max(cv_epochs) epochs.
    k_folds: Number of folds passed to k_fold_validate.
    verbose: Print a line when each architecture starts.
    batch_size: Batch size passed to k_fold_validate (33 matches the rest of the notebook).

  Returns:
    dict mapping every epoch in cv_epochs to a DataFrame with one row per
    statistic and one column per architecture name.

  Raises:
    ValueError: if model_arcs and model_arc_names differ in length.
  """
  #Fail before any training starts instead of after the last (expensive) run
  if len(model_arcs) != len(model_arc_names):
    raise ValueError(
        f"model_arcs has {len(model_arcs)} entries but model_arc_names has {len(model_arc_names)}")

  stats = ['train_loss','train_acc','train_pre_avg','train_rec_avg',
           'val_loss','val_acc','val_pre_avg','val_rec_avg']
  results = {x : {y : [] for y in stats} for x in cv_epochs}

  for arc_name, model_arc in zip(model_arc_names, model_arcs):
    if verbose:
      print("Started Test: model_arc =",arc_name)

    fmodel = FilteringModel(data,"ingredient",model_name,model_arc)
    try:
      cv_results = fmodel.k_fold_validate(epochs=max(cv_epochs),batch_size=batch_size,k_folds=k_folds,benchmark_at=cv_epochs,verbose=False)
    finally:
      #Release this architecture's memory before the next one is built.
      #Collect garbage first so the freed blocks can actually leave the CUDA cache.
      del fmodel
      gc.collect()
      T.cuda.empty_cache()

    for benchmark in cv_epochs:
      for stat in stats:
        results[benchmark][stat].append(cv_results[benchmark][stat])

  return {epoch : DataFrame.from_dict(results[epoch], orient='index',columns=model_arc_names) for epoch in cv_epochs}


##**sentence-transformers/all-MiniLM-L6-v2**

In [None]:
#Model id and the embedding width used as the input size of every decoder's first Linear layer below
model_name, model_embedding_size = "sentence-transformers/all-MiniLM-L6-v2", 384

###Dropout

In [None]:
#Define Parameters and Run
#Same 128-unit decoder each time; only the dropout rate varies (None = no dropout layer at all)
model_arcs = [
    nn.Sequential(OrderedDict(
        [('fc1', nn.Linear(model_embedding_size, 128))]
        + ([] if rate is None else [('dr1', nn.Dropout(rate))])
        + [('relu2', nn.LeakyReLU()),
           ('bn2', nn.LayerNorm(128)),
           ('fc3', nn.Linear(128, 4)),
           ('sg1', nn.Sigmoid())]
    ))
    for rate in (0.45, 0.25, 0.1, None)
]
model_arc_names = ["high dropout","medium dropout","low dropout","no dropout"]
cv_epochs = [5,7,10,15,20]
mass_cross_val_test(model_name,food_data,model_arcs,model_arc_names,cv_epochs)

Started Test: model_arc = high dropout
Started Test: model_arc = medium dropout
Started Test: model_arc = low dropout
Started Test: model_arc = no dropout


{5:                high dropout  medium dropout  low dropout  no dropout
 train_loss            2.179           1.626        1.321       1.128
 train_acc             0.920           0.944        0.964       0.974
 train_pre_avg         0.977           0.984        0.990       0.992
 train_rec_avg         0.994           0.996        0.997       0.998
 val_loss              0.791           0.737        0.766       0.712
 val_acc               0.876           0.883        0.889       0.892
 val_pre_avg           0.964           0.969        0.969       0.970
 val_rec_avg           0.984           0.980        0.981       0.983,
 7:                high dropout  medium dropout  low dropout  no dropout
 train_loss            1.444           0.961        0.724       0.611
 train_acc             0.954           0.977        0.985       0.992
 train_pre_avg         0.987           0.994        0.997       0.998
 train_rec_avg         0.997           0.999        0.999       0.999
 val_loss    

###Norm Type

In [None]:
#Define Parameters and Run
#Same 128-unit decoder each time; only the normalisation after the activation varies (None = no norm layer)
model_arcs = [
    nn.Sequential(OrderedDict(
        [('fc1', nn.Linear(model_embedding_size, 128)),
         ('dr1', nn.Dropout(0.1)),
         ('relu2', nn.LeakyReLU())]
        + ([] if norm_layer is None else [('bn2', norm_layer(128))])
        + [('fc3', nn.Linear(128, 4)),
           ('sg1', nn.Sigmoid())]
    ))
    for norm_layer in (nn.BatchNorm1d, nn.LayerNorm, None)
]
model_arc_names = ["batch norm","layer norm","no norm"]
cv_epochs = [5,7,10,15,20]
mass_cross_val_test(model_name,food_data,model_arcs,model_arc_names,cv_epochs)

Started Test: model_arc = batch norm
Started Test: model_arc = layer norm
Started Test: model_arc = no norm


{5:                batch norm  layer norm  no norm
 train_loss          4.435       1.342    6.108
 train_acc           0.920       0.957    0.698
 train_pre_avg       0.996       0.988    0.883
 train_rec_avg       0.980       0.997    0.999
 val_loss            1.367       0.774    1.586
 val_acc             0.857       0.891    0.699
 val_pre_avg         0.974       0.967    0.888
 val_rec_avg         0.967       0.983    0.998,
 7:                batch norm  layer norm  no norm
 train_loss          2.380       0.764    4.535
 train_acc           0.974       0.984    0.784
 train_pre_avg       0.998       0.996    0.928
 train_rec_avg       0.995       0.998    0.996
 val_loss            0.960       0.755    1.214
 val_acc             0.889       0.899    0.810
 val_pre_avg         0.974       0.971    0.932
 val_rec_avg         0.978       0.984    0.996,
 10:                batch norm  layer norm  no norm
 train_loss          1.037       0.388    3.153
 train_acc           0.991  

###Intermediate Layer Size

In [None]:
#Define Parameters and Run
#Same decoder each time; only the hidden width varies (the "N params" labels refer to this width)
model_arcs = [
    nn.Sequential(OrderedDict([
        ('fc1', nn.Linear(model_embedding_size, width)),
        ('dr1', nn.Dropout(0.1)),
        ('relu2', nn.LeakyReLU()),
        ('bn2', nn.LayerNorm(width)),
        ('fc3', nn.Linear(width, 4)),
        ('sg1', nn.Sigmoid())
    ]))
    for width in (256, 128, 64, 32)
]
model_arc_names = ["256 params","128 params","64 params","32 params"]
cv_epochs = [5,7,10,15,20]
mass_cross_val_test(model_name,food_data,model_arcs,model_arc_names,cv_epochs)

Started Test: model_arc = 256 params
Started Test: model_arc = 128 params
Started Test: model_arc = 64 params
Started Test: model_arc = 32 params


{5:                256 params  128 params  64 params  32 params
 train_loss          0.844       1.502      2.444      3.416
 train_acc           0.979       0.951      0.921      0.905
 train_pre_avg       0.995       0.986      0.976      0.969
 train_rec_avg       0.998       0.996      0.995      0.995
 val_loss            0.636       0.742      0.868      0.978
 val_acc             0.905       0.889      0.874      0.869
 val_pre_avg         0.972       0.965      0.961      0.958
 val_rec_avg         0.987       0.984      0.986      0.988,
 7:                256 params  128 params  64 params  32 params
 train_loss          0.425       0.830      1.535      2.369
 train_acc           0.993       0.984      0.960      0.939
 train_pre_avg       0.998       0.996      0.988      0.981
 train_rec_avg       1.000       0.999      0.998      0.998
 val_loss            0.680       0.702      0.789      0.858
 val_acc             0.896       0.898      0.889      0.875
 val_pre_avg     