## Libraries:

In [2]:
import requests
import json
import difflib
import pandas as pd
import pickle
import numpy as np
import warnings

from json import JSONDecodeError

from src.data.ingredients_llm_processed import final_ingredient_list_df

## Configuration

In [3]:
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" can also hide pandas warnings raised by
# later cells; consider narrowing it to the specific categories that are noisy.
warnings.filterwarnings("ignore")

## Inputs

In [4]:
# When reading a pickle, the "molecules" key is kept as a list of values, not a string.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it comes from a trusted source.
with open('../data/external/flavor_db.pkl', 'rb') as f:
    flavor_db_pkl = pickle.load(f)
# In this case, flavor_db_pkl is a dictionary.

## Functions

In [7]:
def get_list_from_synonyms(basket):
    """Split a comma-separated synonym string into a list of lower-cased items.

    Args:
        basket: Text whose items are separated by ``', '``; may be empty or None.

    Returns:
        The lower-cased items in their original order, or an empty list when
        ``basket`` is falsy.
    """
    return basket.lower().split(', ') if basket else []

def get_similarity_word(x, word):
    """Score how well the search term ``word`` matches the candidate text ``x``.

    Both arguments are normalised the same way (lower-cased, hyphens replaced
    by spaces) before they are compared, so that ``'beef-fat'`` matches
    ``'Beef Fat'`` and a hyphenated term matches itself.

    Args:
        x: Candidate text to search in (e.g. one synonym string).
        word: Term to look for.

    Returns:
        ``1.0`` when ``word`` occurs in ``x`` as a whole, space-delimited word
        or phrase; otherwise the ``difflib.SequenceMatcher`` ratio of the two
        normalised strings (a float between 0.0 and 1.0).
    """
    x = x.lower().replace('-', ' ')
    word = word.lower().replace('-', ' ')
    # Padding both sides with spaces turns the substring test into a
    # whole-word test ('fat' must not match inside 'fatty').
    if ' ' + word + ' ' in ' ' + x + ' ':
        return 1.0
    return difflib.SequenceMatcher(None, word, x).ratio()

def json_reader(x):
    """Parse a JSON-like string, tolerating Python-style literals.

    Three strategies are tried in order and the first that succeeds wins:

    1. Replace every single quote with a double quote and parse as JSON
       (the historical behaviour, kept first so existing inputs parse the same).
    2. Parse ``x`` unchanged as JSON (covers valid JSON whose strings contain
       apostrophes, which step 1 corrupts).
    3. Evaluate ``x`` as a Python literal with ``ast.literal_eval`` (covers
       ``repr``-style text using ``None``/``True``/``False`` or mixed quotes).

    Args:
        x: Text to parse.

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: When none of the strategies can parse ``x``;
            the error from the first strategy is the one raised.
    """
    import ast  # local import: only needed for the Python-literal fallback

    try:
        return json.loads(x.replace("'", '"'))
    except json.JSONDecodeError as legacy_error:
        # The `as` name is cleared when the except block ends, so keep a reference.
        first_error = legacy_error

    try:
        return json.loads(x)
    except json.JSONDecodeError:
        pass  # not plain JSON either; fall through to the Python-literal parser

    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        # Surface the same exception type callers have always seen.
        raise first_error from None

## Data Processing

In [8]:
# Turn the loaded pickle content into a DataFrame: transpose it, keep only the
# rows with fewer than five missing fields, and renumber the index from zero.
flavor_db = (
    pd.DataFrame(flavor_db_pkl)
    .T
    .loc[lambda frame: frame.isna().sum(axis=1) < 5]
    .reset_index(drop=True)
)

In [11]:
# Add helper columns: one string joining the three alias fields, and that
# string split into a list of lower-cased synonyms.
flavor_db_df = flavor_db.copy()
alias_text = flavor_db_df["entity_alias"].astype('str')
basket_text = flavor_db_df["entity_alias_basket"].astype('str')
synonyms_text = flavor_db_df['entity_alias_synonyms'].astype('str')
flavor_db_df['all_synonyms'] = alias_text + ', ' + basket_text + ', ' + synonyms_text
flavor_db_df["entity_alias_synonyms_list"] = (
    flavor_db_df["all_synonyms"].astype("str").apply(get_list_from_synonyms)
)

# One row per (entity_id, synonym) pair.
exploded_flavor_db = flavor_db_df[['entity_id', 'entity_alias_synonyms_list']].explode(
    "entity_alias_synonyms_list"
)
# Rows whose synonym is the literal text 'nan' are blanked out and then
# removed together with any other row containing a missing value.
is_nan_text = exploded_flavor_db.entity_alias_synonyms_list == 'nan'
exploded_flavor_db.loc[is_nan_text] = np.nan
exploded_flavor_db = exploded_flavor_db.dropna()

In [13]:
# The unique list of USDA ingredients that were processed by an LLM is
# available through the imported `final_ingredient_list_df`.
# Note: this binds a second name to the same object; it is not a copy.
ingredients_description = final_ingredient_list_df

In [14]:
# Terms that are skipped when an ingredient's fields are matched against
# the FlavorDB synonyms.
stopwords = [
    'table',
    'water',
]

In [15]:
preliminary_results = {}
# For each processed ingredient, pick the FlavorDB entity whose synonyms match
# the largest number of the ingredient's terms. A term matches a synonym when
# get_similarity_word scores the pair above 0.9.
for ingredient_pos in range(ingredients_description.shape[0]):
    # Keep only the non-empty fields of the row that are not stopwords.
    word_list = [
        word
        for word in ingredients_description.iloc[ingredient_pos]
        if word and word not in stopwords
    ]

    # One boolean column per term: which exploded synonym rows it matches.
    results_test = pd.DataFrame()
    for word in word_list:
        results_test[word] = exploded_flavor_db.entity_alias_synonyms_list.apply(get_similarity_word, word=word) > 0.9

    most_relevant_index = -1  # sentinel: no entity matched
    if word_list:
        # pd.concat raises ValueError on an empty list, so only count votes
        # when at least one term survived the filtering above.
        most_relevant_info = pd.concat(
            [exploded_flavor_db[results_test[word]] for word in word_list]
        ).entity_id.value_counts()
        if not most_relevant_info.empty:
            # value_counts sorts by frequency, so the first label is the winner.
            most_relevant_index = most_relevant_info.index[0]

    # .copy() so the new column is written to an independent frame rather
    # than to a filtered slice of flavor_db_df.
    r = flavor_db_df[flavor_db_df.entity_id == int(most_relevant_index)].copy()
    # One entry per selected row; this yields an empty column (instead of a
    # length-mismatch error) when no entity was matched.
    r['ingredient_list'] = [word_list] * len(r)
    preliminary_results[ingredient_pos] = r

In [33]:
# Inspect the processed ingredient row at position 10.
ingredients_description.iloc[10]

ingredient     cheese
procedence           
type                 
descriptors          
Name: 10, dtype: object

In [34]:
# Show the FlavorDB entity that was matched to the ingredient at position 10.
preliminary_results[10]

Unnamed: 0,category,entity_id,category_readable,entity_alias_basket,entity_alias_readable,natural_source_name,entity_alias,molecules,natural_source_url,entity_alias_url,entity_alias_synonyms,all_synonyms,entity_alias_synonyms_list,ingredient_list
69,dairy,69,Dairy,"cheese, cheese-domiati",Domiati Cheese,Water Buffalo,cheese-domiati,"[{'bond_stereo_count': 0, 'undefined_atom_ster...",https://en.wikipedia.org/wiki/Water_buffalo,https://en.wikipedia.org/wiki/Domiati,"Domiati Cheese, White Cheese","cheese-domiati, cheese, cheese-domiati, Domiat...","[cheese-domiati, cheese, cheese-domiati, domia...",[cheese]


In [31]:
# NOTE(review): the result of .keys() is discarded; only the last expression
# of a cell is displayed.
preliminary_results.keys()
# Distinct `taste` values among the molecules of the entity matched to ingredient 10.
pd.DataFrame(preliminary_results[10].molecules.iloc[0]).taste.unique()

array(['', 'sweet whiskey taste', 'sour', 'slightly bitter taste',
       'cheesy taste', 'fleeting, fruity taste',
       'unpleasant cabbage taste',
       'pleasant, bittersweet flavor reminiscent of pear on dilution.',
       'camphor taste@cheese@bitter', 'fatty, fruity@aromatic flavor',
       'pungent, spicy taste', 'acid taste', 'old fish@saline taste',
       'sweet, pineapple taste@can be tasted in water at a level of 0.450 ppm and at 0.015 ppm in milk',
       'characeteristic fruity taste (on dilution)@green, woody, vegetative, apple, grassy, citrus and orange with a fresh lingering aftertaste',
       'oily, sweet, slightly herbaceous taste',
       'acrid taste@it has sweetish, afterward acrid, taste',
       'slightly bitter taste@burning taste', 'bitter', 'fatty taste',
       'sharp, burning taste@when in very weak solution it has a sweetish taste.',
       'oily, somewhat orange taste', 'characteristic flavor',
       'fruity taste',
       'honey & orris-like flavor 

In [134]:
flavor_dict = {}
# For every test word, collect the distinct synonyms that score above 0.8.
# NOTE(review): `test_set` is not defined in any cell visible in this part of
# the notebook; make sure it is created before this cell on a fresh kernel.
for word in test_set:
    word = word.lower()
    # Keep the match mask in a local variable instead of adding a column named
    # after the word to exploded_flavor_db: that mutated the shared frame and
    # could overwrite 'entity_id' / 'entity_alias_synonyms_list'.
    is_match = exploded_flavor_db.entity_alias_synonyms_list.apply(get_similarity_word, word=word) > 0.8
    # NOTE(review): to_dict() keys are index labels; if two distinct matching
    # synonyms share a label, only the last one is kept. Confirm this is acceptable.
    food_list = exploded_flavor_db[is_match].entity_alias_synonyms_list.drop_duplicates().to_dict()
    flavor_dict[word] = food_list

In [137]:
# Display the synonym matches collected for every test word.
flavor_dict

{'fat': {89: 'milk fat',
  270: 'beef-fat',
  272: 'chicken-fat',
  275: 'lamb-fat',
  277: 'mutton-fat',
  278: 'pork-fat',
  573: 'fat hen'},
 'fish': {151: 'fish',
  152: 'fatty fish',
  153: 'lean fish',
  154: 'fish oil',
  155: 'smoked fish',
  499: 'devil fish',
  517: 'coney-fish',
  583: 'frog fish and sea devils',
  647: 'bay fish',
  735: 'rose fish'},
 'vanilla extract': {},
 'onion': {348: 'bulb onion', 407: 'redskin onion', 469: 'welsh onion'},
 'vitamin d': {},
 'vinegar': {322: 'vinegar', 383: 'apple cider vinegar'},
 'cereal': {478: 'breakfast cereal'},
 'meat': {276: 'meat-roasted'},
 'vegetable oil': {},
 'tomato': {364: 'tomato-puree',
  443: 'cherry tomato',
  653: 'mexican husk tomato'},
 'celery': {329: 'celery-stalk'},
 'milk': {88: 'milk-heated',
  89: 'milk fat',
  90: 'goat milk',
  91: 'milk powder',
  92: 'sheep milk',
  93: 'skimmed milk'},
 'oil': {57: 'maize oil',
  97: 'buchu-oil',
  98: 'cajeput-oil',
  101: 'cedar-leaf-oil',
  103: 'citronella-oil',
 