In [5]:
import pandas as pd
from IPython.core.display_functions import display
from owlready2 import get_ontology, sync_reasoner
import re
from unidecode import unidecode


def load_data(file_path):
    """Read the product-name and ingredient columns of a tab-separated export.

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame restricted to the ``product_name_pt`` and
        ``ingredients_text_pt`` columns.
    """
    wanted_columns = ["product_name_pt", "ingredients_text_pt"]
    frame = pd.read_csv(
        file_path,
        sep="\t",
        usecols=wanted_columns,
        low_memory=False,
    )
    return frame


def extract_ingredients(text):
    """Split an ingredient string into individual ingredient names.

    The text is split on commas and on the Portuguese conjunction ``e``
    ("and") when it is surrounded by whitespace and followed by a letter.
    Each piece is stripped of surrounding whitespace.

    Pieces that are empty after stripping (produced by consecutive or
    trailing separators, or by an empty input) are discarded so that callers
    never receive ``""`` as an ingredient.

    Args:
        text: Ingredient list as a single string.

    Returns:
        A list of non-empty ingredient strings, in their original order.
    """
    pattern = r"\s*,\s*|\s+e\s+(?=[a-zA-Z])"
    fragments = (fragment.strip() for fragment in re.split(pattern, text))
    return [fragment for fragment in fragments if fragment]


def detect_allergens(ingredients, allergens_set):
    """Return the ingredients that contain at least one allergen name.

    Matching is plain substring containment: an ingredient is kept when any
    string from ``allergens_set`` occurs anywhere inside it.

    Args:
        ingredients: Iterable of ingredient strings.
        allergens_set: Collection of allergen name strings.

    Returns:
        A list of the matching ingredients, in input order (duplicates kept).
    """
    flagged = []
    for item in ingredients:
        for allergen in allergens_set:
            if allergen in item:
                flagged.append(item)
                # One hit is enough; do not add the same item twice.
                break
    return flagged


def preprocess_data(df):
    """Normalize the ``ingredients_text_pt`` column for allergen matching.

    Rows without ingredient text are dropped. The remaining text is
    lower-cased, stripped, transliterated with ``unidecode``, and then cleaned
    with three regex substitutions applied in this order:

    1. remove every character that is not a letter, whitespace or a comma;
    2. collapse runs of commas into a single comma;
    3. collapse runs of spaces into a single space.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame containing an ``ingredients_text_pt`` column.

    Returns:
        The same DataFrame, with rows lacking ingredients removed and the
        ingredient text normalized.
    """
    column = 'ingredients_text_pt'
    df.dropna(subset=[column], inplace=True)

    text = df[column].str.lower().str.strip().apply(unidecode)

    # Raw strings: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    # NOTE(review): the accented letters in the class are presumably never
    # present after unidecode; kept so the pattern matches the original.
    text = text.str.replace(r'[^a-zA-Záéíóúçãõôê\s,]', '', regex=True)
    text = text.str.replace(r',+', ',', regex=True)
    text = text.str.replace(r' +', ' ', regex=True)

    df[column] = text
    return df


def load_allergens_from_ontology(ontology_path):
    """Collect normalized allergen names from an OWL ontology.

    The ontology is loaded, the reasoner is run, and the instances of the
    classes labelled "Alérgeno" and "Alérgeno por Derivação" are gathered.
    Each instance's first label is lower-cased and transliterated with
    ``unidecode``.

    Instances without a (non-empty) label are skipped: calling ``.lower()``
    on a missing label would crash, and an empty string in the result would
    match every ingredient in a substring test.

    Args:
        ontology_path: Path or IRI passed to ``get_ontology``.

    Returns:
        A set of normalized allergen name strings.

    Raises:
        ValueError: If one of the expected classes is not found by label.
    """
    onto = get_ontology(ontology_path).load()
    sync_reasoner()

    allergens = set()
    for cls_label in ["Alérgeno", "Alérgeno por Derivação"]:
        cls = onto.search_one(label=cls_label)
        if cls is None:
            # Fail loudly: silently returning fewer allergens would hide
            # a mismatch between the code and the ontology.
            raise ValueError(
                f"Class labelled {cls_label!r} not found in ontology {ontology_path!r}"
            )
        for instance in cls.instances():
            label = instance.label.first()
            if label:
                allergens.add(unidecode(label.lower()))
    return allergens


def main():
    """Run the allergen-detection pipeline and return the annotated frame.

    Loads and preprocesses 'openfoodfacts_export.csv', loads the allergen
    names from "ontologia.owl", and stores the detected allergens for each
    row in a new ``alergenos`` column.

    Returns:
        The preprocessed DataFrame with the added ``alergenos`` column.
    """
    products = preprocess_data(load_data('openfoodfacts_export.csv'))
    known_allergens = load_allergens_from_ontology("ontologia.owl")

    def find_allergens(ingredients_text):
        # Split the row's text into ingredients, then keep the allergenic ones.
        return detect_allergens(extract_ingredients(ingredients_text), known_allergens)

    products['alergenos'] = products['ingredients_text_pt'].apply(find_allergens)
    return products


# Entry point: run the pipeline and render the resulting DataFrame.
# The result is bound to a module-level name so it stays available afterwards.
if __name__ == '__main__':
    result_df = main()
    display(result_df)



* Owlready2 * Running HermiT...
    java -Xmx2000M -cp /home/bridge/Documents/tcc/venv/lib/python3.10/site-packages/owlready2/hermit:/home/bridge/Documents/tcc/venv/lib/python3.10/site-packages/owlready2/hermit/HermiT.jar org.semanticweb.HermiT.cli.CommandLine -c -O -D -I file:////tmp/tmp3ic_5ehn
* Owlready2 * HermiT took 0.21178865432739258 seconds
* Owlready * (NB: only changes on entities loaded in Python are shown, other changes are done but not listed)


Unnamed: 0,product_name_pt,ingredients_text_pt,alergenos
1,Thé vert au jasmin,milho,[]
3,Pão para hamburger tradicional,farinha de trigo enriquecida com ferro e acido...,"[farinha de trigo enriquecida com ferro, glute..."
8,,"batata, oleo misto vegetal de palma e soja e s...","[soja, sal alergicos contem derivados de soja,..."
26,Wafer recheado sabor Chocolate e Avelã,"acucar, gordura vegetal, farinha de trigo enri...","[farinha de trigo enriquecida com ferro, leite..."
27,Feijão preto,feijao preto,[]
...,...,...,...
5078,,informacao nutricional informacion nutricional...,[]
5079,,oleos vegetals quidos e interesterificados con...,[lecitina de soja]
5089,Aveia em flocos finos,graos laminados de aveia em particulas finas\n...,"[suas estirpes hibridizadas pode conter trigo,..."
5095,Manteiga com sal,"creme de leite pasteurizado, cloreto de sodio ...",[creme de leite pasteurizado]
