In [None]:
### FAZ O CRUZAMENTO DAS BASES DE INGREDIENTES E MOLECULAS (EXTRAIDOS DO FLAVORDB) ###
### A BASE DE MOLECULAS É A QUE CONTEM SABORES ###

In [8]:
# import

import pandas as pd
import ast

import urllib.request
import json
import time

from translatepy import Translator
from collections import Counter





In [8]:
# NOME DAS COLUNAS DOS DATAFRAMES A SEREM CRIADOS


# colunas de moléculas e perfis de sabor
def molecules_df_cols():
    return ['pubchem id', 'common name', 'flavor profile']  



# colunas de comidas
def flavordb_df_cols():
    return [
        'entity id', 'alias', 'synonyms',
        'scientific name', 'category', 'molecules'
    ]


 

In [10]:
def clean_flavordb_dataframes(food_df, molecules_df):
    """
    Helps ensure consistent intra-column typing and converts all strings to lowercase.
    """
    strtype = type('')
    settype = type(set())
    
    # ensuring that these columns have type str
    for k in ['alias', 'scientific name', 'category']:
        food_df[k] = [
            elem.strip().lower() if isinstance(elem, strtype) else ''
            for elem in food_df[k]
        ]
    
    # ensuring that these columns are always a set of str
    def map_to_synonyms_set(elem):
        if isinstance(elem, settype):
            return elem
        elif isinstance(elem, strtype):
            # if it's a string of a set,
            if elem[0] == '{' and elem[-1] == '}':
                # convert it to a set
                return eval(elem)
            else:
                # else it's probably directly from source
                return set(elem.strip().lower().split(', '))
        else:
            return set()
    
    food_df['synonyms'] = [
        map_to_synonyms_set(elem)
        for elem in food_df['synonyms']
    ]
    
    molecules_df['flavor profile'] = [
        set([x.strip().lower() for x in elem])
        for elem in molecules_df['flavor profile']
    ]
    
    return [
        food_df.groupby('entity id').first().reset_index(),
        molecules_df.groupby('pubchem id').first().reset_index()
    ]

In [12]:
# get the missing entries
def missing_entity_ids(food_df):
    """
    Get the IDs of the missing JSON entries for this particular food DataFrame.
    """
    out = []
    entity_id_set = set(food_df['entity id'])
    for i in range(1, 1 + max(entity_id_set)):
        if i not in entity_id_set:
            out.append(i)
    return out


# loads the dataframes from csv files
def load_db():
    settype = type(set())
    
    df0 = pd.read_csv('./datasets/raw/INGREDIENTES.csv')[flavordb_df_cols()]
    df0['synonyms'] = [eval(x) if isinstance(x, settype) else x for x in df0['synonyms']]
    df0['molecules'] = [eval(x) for x in df0['molecules']]
    
    df1 = pd.read_csv('./datasets/raw/MOLECULAS.csv')[molecules_df_cols()]
    df1['flavor profile'] = [eval(x) for x in df1['flavor profile']]
    
    df0, df1 = clean_flavordb_dataframes(df0, df1)
    return df0, df1, missing_entity_ids(df0)

In [14]:
# missing_ids = the missing ids that are less than the max one found
food_df, molecules_df, missing_ids = load_db()


In [16]:
str(set(food_df['category']))


"{'vegetable fruit', 'fruit citrus', 'dish', 'fruit', 'meat', 'dairy', 'plant', 'cabbage', 'berry', 'plant derivative', 'vegetable root', 'fruit essence', 'legume', 'maize', 'seafood', 'herb', 'cereal', 'flower', 'spice', 'beverage alcoholic', 'beverage', 'fruit-berry', 'beverage caffeinated', 'vegetable stem', 'nut', 'essential oil', 'gourd', 'bakery', 'fish', 'fungus', 'additive', 'vegetable', 'seed', 'vegetable tuber'}"

In [18]:
aliases_by_category = ''
for c in set(food_df['category']):
    aliases_by_category += (
        c + ': '
        + str(list(food_df[food_df['category'] == c]['alias']))
        + '\n\n'
    )
# check out the output of this yourself, if you like
print(aliases_by_category)

vegetable fruit: ['capsicum', 'cherry pepper', 'tomato', 'turkey berry']

fruit citrus: ['bergamot', 'citrus fruits', 'grapefruit', 'kumquat', 'lemon', 'lime', 'mandarin orange', 'satsuma orange', 'tangerine', 'pummelo']

dish: ['frankfurter sausage', 'ice cream', 'nougat', 'toffee', 'cake', 'pizza', 'other snack food', 'pastry', 'dragée', 'chewing gum', 'marzipan', 'salad dressing', 'sausage', 'meatball', 'pate', 'meat bouillon', 'dumpling', 'soup', 'remoulade', 'fruit gum', 'zwieback', 'snack bar', 'burrito', 'hamburger', 'chili', 'taco', 'tortilla', 'nachos', 'salad', 'egg roll', 'stew', 'falafel', 'frybread', 'other frozen dessert', 'lasagna', 'pancake', 'pudding', 'waffle', 'meatloaf', 'couscous', 'chimichanga', 'tostada', 'quesadilla', 'baked potato', 'hot dog', 'enchilada', 'other sandwich', 'breakfast sandwich', 'adobo', 'macaroni and cheese', 'hushpuppy', 'relish', 'fruit salad', 'vegetarian food', 'cold cut', 'pie', 'soy cream', 'ice cream cone', 'natto', 'ravioli', 'scrapple

In [20]:
def food_groups():
    return set([
        'grain', 'vegetable', 'fruit', 'protein',
        'dairy', 'fat', 'sugar', 'seasoning',
        'beverage', 'alcohol'
    ])


# don't worry about the details in this! It's just a lot of sorting.
def get_food_group(food, category):
    """
    Maps each food category to a food group.
    
    The food groups include the main five: grain, vegetable, fruit, protein,
    dairy, fat, and sugar. However, they also include others: seasonings,
    beverages, alcohol.
    """
    
    out = None # return None if you don't know/want to classify it
    
    # broadly classify the major food groups
    if category in ['bakery', 'vegetable tuber', 'cereal']:
        out = 'grain'
    elif category in [
        'flower', 'fungus', 'plant', 'cabbage',
        'vegetable fruit', 'herb', 'gourd', 'vegetable'
    ]:
        out = 'vegetable'
    elif category in [
        'fruit-berry', 'berry', 'fruit', 'fruit citrus'
    ]:
        out = 'fruit'
    elif category in [
        'legume', 'nut', 'seed', 'seafood', 'fish', 'meat'
    ]:
        out = 'protein'
    elif category in ['dairy']:
        out = 'dairy'
    elif category in [
        'fruit essence', 'additive', 'spice', 'essential oil'
    ]:
        out = 'seasoning'
    elif category in ['beverage alcoholic']:
        out = 'alcohol'
    elif 'beverage' in category:
        out = 'beverage'
    elif category == 'maize':
        if food in ['corn', 'sweetcorn']:
            out = 'vegetable'
        elif food in ['cornbread', 'corn grits', 'popcorn']:
            out = 'grain'
        elif food == 'corn oil':
            out = 'fat'
    elif category == 'plant derivative':
        if (any(x in food for x in ['sauce', 'vinegar', 'cocoa'])
            or food in ['creosote', 'storax']):
            # creosote is what gives smoky foods that smoky flavor
            # storax is...weird
            out = 'seasoning'
        elif 'seed' in food or food == 'peanut butter':
            # cottonseeds are now available for people to eat!
            out = 'protein'
        elif any([x in food for x in ['butter', 'oil']]):
            out = 'fat'
        elif food == 'fermented tea':
            out = 'beverage'
        elif food in ['honey', 'chocolate', 'chocolate spread']:
            out = 'sugar'
        elif food == 'macaroni':
            out = 'grain'
        elif food in ['jute', 'tofu']:
            out = 'vegetable'
        elif food == 'soy yogurt':
            out = 'dairy'
    elif category == 'additive':
        if 'sugar' in food or food in [
            'fruit preserve', 'syrup', 'icing', 'molasses'
        ]:
            out = 'sugar'
        elif 'margarine' in food or food in ['cooking oil', 'shortening']:
            out = 'fat'
        elif food in ['sauce', 'gelatin dessert', 'spread', 'topping', 'water']:
            out = None # don't know how to classify these items
        elif food == 'stuffing':
            out = 'grain'
        else:
            out = 'seasoning'
    
    # cover exceptions to the rule
    if (
        any([
            food == x + ' oil'
            for x in ['soybean', 'cooking', 'fish', 'peanut', 'canola', 'corn']
        ])
        or food in ['butter', 'ghee']
        or (' butter' in food and food != 'peanut butter')
        or 'margarine' in food
    ):
        out = 'fat'
    elif food in [
        'sugar', 'honey', 'molasses', 'agave', 'dulce de leche'
    ]:
        # these were classified under 'additives/dairy/plant derivative'
        out = 'sugar'
    elif food in ['irish moss', 'kelp', 'kombu', 'wakame']:
        # these were classified under 'seafood'
        out = 'vegetable'
    elif food in ['butternut squash', 'winter squash', 'japanese pumpkin']:
        # these were classified under 'fruit'
        out = 'vegetable'
    elif food in ['sweet custard', 'candy bar', 'chocolate mousse', 'fudge']:
        out = 'sugar'
    elif 'cocoa' in food:
        out = 'seasoning'

    return out


# make a DataFrame saving the results & food groups
food_group_df = food_df.copy()
food_group_df['group'] = [
    get_food_group(food_group_df.at[i, 'alias'], food_group_df.at[i, 'category'])
    for i in food_group_df.index
]
food_group_df = food_group_df[[
    g is not None
    for g in food_group_df['group']
]]
food_group_df = food_group_df.reset_index()
food_group_df.head()

Unnamed: 0,index,entity id,alias,synonyms,scientific name,category,molecules,group
0,0,1,bakery products,{bakery products},poacceae,bakery,"{27457, 31252, 7976, 22201, 26331, 26808}",grain
1,1,2,bread,{bread},poacceae,bakery,"{1031, 1032, 644104, 527, 8723, 31260, 15394, ...",grain
2,2,3,rye bread,{rye bread},rye,bakery,"{32065, 644104, 72, 18635, 460, 332, 12366, 89...",grain
3,3,4,wheaten bread,"{soda farls, soda scones}",wheat,bakery,"{30914, 6915, 5365891, 12170, 14286, 8082, 312...",grain
4,4,5,white bread,{white bread},wheat,bakery,"{7361, 994, 10883, 7362, 11173, 5365891, 11559...",grain


In [22]:
molecules_df.head(10)

Unnamed: 0,pubchem id,common name,flavor profile
0,4,1-Aminopropan-2-ol,{fishy}
1,49,3-Methyl-2-oxobutanoic acid,{fruity}
2,58,2-oxobutanoic acid,"{caramel, creamy, sweet, lactonic, brown}"
3,70,4-Methyl-2-oxovaleric acid,{fruity}
4,72,"3,4-Dihydroxybenzoic Acid","{phenolic, mild, balsamic}"
5,107,3-Phenylpropanoic acid,"{rose, balsamic, sweet, cinnamon, musk, fatty}"
6,125,4-hydroxybenzyl alcohol,"{almond, bitter, sweet, fruity, coconut}"
7,126,4-hydroxybenzaldehyde,"{woody, almond, nutty, sweet, balsam}"
8,135,4-Hydroxybenzoic Acid,"{phenolic, nutty}"
9,176,acetic acid,"{sour, pungent, sharp, vinegar}"


In [24]:
# Função para extrair os sabores associados a um alimento
def contar_sabores_alimento(molecule_ids, molecules_df):
    # Inicializa um contador para armazenar a frequência dos sabores
    contador_sabores = Counter()

    # Para cada ID de molécula associado ao alimento
    for mol_id in molecule_ids:
        # Filtra a molécula correspondente no dataframe
        molecula = molecules_df[molecules_df['pubchem id'] == mol_id]

        if not molecula.empty:
            # Obtém o perfil de sabor
            flavor_profile = molecula['flavor profile'].values[0]

            # Verifica se o perfil de sabor é um conjunto ou uma string
            if isinstance(flavor_profile, set):
                flavors = flavor_profile  # Usar diretamente
            elif isinstance(flavor_profile, str):
                # Converte a string de 'flavor_profile' para um conjunto
                flavors = set(flavor_profile.strip('{}').split(', '))
            else:
                flavors = set()  # Se não for nenhum dos formatos esperados

            # Atualiza o contador com os sabores encontrados
            contador_sabores.update(flavors)

    return contador_sabores


# Função para avaliar o formato do campo 'molecules' e aplicar a função de contagem
def quantificar_sabores_por_alimento(food_group_df, molecules_df):
    def avaliar_molecule_ids(molecule_ids_str):
        # Verifica se o valor é um conjunto de moléculas
        if isinstance(molecule_ids_str, set):
            return molecule_ids_str
        try:
            # Verifica se a string já está no formato de conjunto
            if molecule_ids_str.startswith('{') and molecule_ids_str.endswith('}'):
                return eval(molecule_ids_str)
            # Caso contrário, tenta converter normalmente
            return ast.literal_eval(molecule_ids_str)
        except (ValueError, SyntaxError):
            print(f"Erro ao converter {molecule_ids_str}")
            return set()  # Retorna um conjunto vazio em caso de erro

    food_group_df['flavor_profile_count'] = food_group_df['molecules'].apply(
        lambda x: contar_sabores_alimento(avaliar_molecule_ids(x), molecules_df)
    )
    return food_group_df

# Chama a função para obter o DataFrame com a contagem dos sabores
food_flavor = quantificar_sabores_por_alimento(food_group_df, molecules_df)

# Seleciona as colunas relevantes
cols_food_flavor = ['alias', 'synonyms', 'group', 'flavor_profile_count']
food_flavor_df = food_flavor[cols_food_flavor]

# Exibe os 10 primeiros alimentos com os perfis de sabor
food = food_flavor_df.head(10)
print(food)


              alias                                           synonyms  \
0   bakery products                                  {bakery products}   
1             bread                                            {bread}   
2         rye bread                                        {rye bread}   
3     wheaten bread                          {soda farls, soda scones}   
4       white bread                                      {white bread}   
5  wholewheat bread                                 {wholewheat bread}   
6              wort                                             {wort}   
7            arrack                                             {arak}   
8              beer                                             {beer}   
9        bantu beer  {kaffir beer, opaque beer, malwa, millet beer,...   

      group                               flavor_profile_count  
0     grain  {'raw': 1, 'hazelnut': 2, 'bread': 1, 'nutty':...  
1     grain  {'alcohol': 4, 'fermented': 2, 'alcoholic'

In [28]:
# renomeando colunas
food_flavor_df = food_flavor_df.rename(columns={'alias': 'ingrediente', 'synonyms':'sinonimos', 'group':'categoria', 'flavor_profile_count':'peso_sabores'})



In [37]:
alimentos_expanded_df = food_flavor_df.copy()


In [39]:
# EXPANDINDO A LISTA DE INGREDIENTES COM OS SINONIMOS DA COLUNA 'SINONIMOS'

# Função para garantir que 'synonyms' seja um conjunto
def convert_to_set(x):
    if isinstance(x, set):
        return x  # Retorna o conjunto original se já for um conjunto
    elif isinstance(x, str):
        try:
            return set(eval(x))  # Tenta converter string para conjunto
        except (SyntaxError, ValueError):
            return set()  # Retorna um conjunto vazio se a conversão falhar
    return set()  # Retorna um conjunto vazio se não for nem str nem set

# Converte a coluna 'synonyms' de strings para conjuntos
alimentos_expanded_df['sinonimos'] = alimentos_expanded_df['sinonimos'].apply(convert_to_set)

# Função para expandir os sinônimos e criar novas linhas
def expand_synonyms(df):
    rows = []
    for _, row in df.iterrows():
        # Adiciona a linha original
        rows.append(row.copy())  # Certifique-se de usar copy() aqui
        
        # Verifica se 'synonyms' é um conjunto e não está vazio
        synonyms = row['sinonimos']
        
        if isinstance(synonyms, set) and synonyms:  # Verifica se é um conjunto e não vazio
            for synonym in synonyms:
                if synonym != row['ingrediente']:  # Ignora se for igual ao ingrediente original
                    new_row = row.copy()  # Copia a linha original
                    new_row['ingrediente'] = synonym  # Substitui o nome do ingrediente
                    rows.append(new_row)  # Adiciona a nova linha à lista
    
    # Cria um novo DataFrame com as linhas expandidas e remove duplicatas
    expanded_df = pd.DataFrame(rows).drop(columns=['sinonimos'])  # Remove a coluna de sinônimos
    return expanded_df.drop_duplicates(subset=['ingrediente'])  # Remove duplicatas se houver

# Aplicar a função no DataFrame alimentos_df
expanded_ingredients = expand_synonyms(alimentos_expanded_df)

# Exibir o DataFrame final
print("DataFrame expandido:")
print(expanded_ingredients)



DataFrame expandido:
          ingrediente  categoria  \
0     bakery products      grain   
1               bread      grain   
2           rye bread      grain   
3       wheaten bread      grain   
3          soda farls      grain   
..                ...        ...   
847        guinea hen    protein   
847     original fowl    protein   
847  pet speckled hen    protein   
847       guinea fowl    protein   
848         cucurbita  vegetable   

                                          peso_sabores  
0    {'raw': 1, 'hazelnut': 2, 'bread': 1, 'nutty':...  
1    {'alcohol': 4, 'fermented': 2, 'alcoholic': 4,...  
2    {'roasted': 1, 'baked': 2, 'roast': 1, 'earthy...  
3    {'hazelnut': 2, 'tomato': 1, 'bitter': 3, 'nut...  
3    {'hazelnut': 2, 'tomato': 1, 'bitter': 3, 'nut...  
..                                                 ...  
847         {'very mild': 1, 'grassy': 1, 'bitter': 1}  
847         {'very mild': 1, 'grassy': 1, 'bitter': 1}  
847         {'very mild': 1, 'gra

In [43]:
print(expanded_ingredients)

          ingrediente  categoria  \
0     bakery products      grain   
1               bread      grain   
2           rye bread      grain   
3       wheaten bread      grain   
3          soda farls      grain   
..                ...        ...   
847        guinea hen    protein   
847     original fowl    protein   
847  pet speckled hen    protein   
847       guinea fowl    protein   
848         cucurbita  vegetable   

                                          peso_sabores  
0    {'raw': 1, 'hazelnut': 2, 'bread': 1, 'nutty':...  
1    {'alcohol': 4, 'fermented': 2, 'alcoholic': 4,...  
2    {'roasted': 1, 'baked': 2, 'roast': 1, 'earthy...  
3    {'hazelnut': 2, 'tomato': 1, 'bitter': 3, 'nut...  
3    {'hazelnut': 2, 'tomato': 1, 'bitter': 3, 'nut...  
..                                                 ...  
847         {'very mild': 1, 'grassy': 1, 'bitter': 1}  
847         {'very mild': 1, 'grassy': 1, 'bitter': 1}  
847         {'very mild': 1, 'grassy': 1, 'bitter': 1}

In [45]:
expanded_ingredients.to_csv("./datasets/preprocessed/INGREDIENTES_preprocessed.csv", index=False)

In [12]:
exp_ing = pd.read_csv('./datasets/preprocessed/INGREDIENTES_preprocessed.csv')

In [14]:
print(exp_ing.dtypes)

ingrediente     object
categoria       object
peso_sabores    object
dtype: object
