# Introduction

Ce document résume les jeu de données, méthodologie, et statistiques utilisées pour l'estimation de la souffrance contenue dans les boîtes d'oeufs.

Nous commençons par l'import de la base de données complète d'open food facts obtenue le 31 mars 2025.

De cette base de données, nous ne retenons que les colonnes (goodcol) nécessaires au calcul du poids de souffrance, telles que définies dans le code.




In [1]:
import sys
import pandas as pd
import plotly.express as px
import json
from pathlib import Path
from typing import Dict, List, Optional, Any
import logging
import unicodedata
import re

sys.path.insert(0, "../../backend")

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 1000)

In [2]:
eggs_from_parquet = pd.read_csv("../data/eggs_from_parquet.csv")
eggs_from_parquet

Unnamed: 0,code,categories_tags,labels_tags,product_name,generic_name,quantity,product_quantity_unit,product_quantity,allergens_tags,ingredients_tags,ingredients,countries_tags,images
0,00003100,"[""en:farming-products"", ""en:eggs""]",[],"[{""lang"": ""main"", ""text"": ""Hard Boiled Eggs""}, {""lang"": ""fr"", ""text"": ""Hard Boiled Eggs""}]",[],2,,0.0,"[""en:eggs""]","[""fr:eggs"", ""en:e330"", ""fr:sodium-benzoate"", ""fr:nisin-preparation""]","[{""percent_max"":100.0,""percent_min"":100.0,""is_in_taxonomy"":0,""percent_estimate"":100.0,""vegan"":null,""id"":""fr:eggs"",""text"":""Eggs"",""vegetarian"":null,""ciqual_food_code"":null,""percent"":null,""from_palm_oil"":null,""ingredients"":[{""percent_max"":100.0,""percent_min"":25.0,""is_in_taxonomy"":0,""percent_estimate"":62.5,""vegan"":null,""id"":""fr:eggs"",""text"":""Eggs"",""vegetarian"":null,""ciqual_food_code"":null,""percent"":null,""from_palm_oil"":null,""ingredients"":null,""ecobalyse_code"":null,""processing"":null,""labels"":null,""origins"":null,""ecobalyse_proxy_code"":null,""quantity"":null,""quantity_g"":null,""ciqual_proxy_food_code"":null},{""percent_max"":50.0,""percent_min"":0.0,""is_in_taxonomy"":1,""percent_estimate"":18.75,""vegan"":""yes"",""id"":""en:e330"",""text"":""Citric Acid"",""vegetarian"":""yes"",""ciqual_food_code"":null,""percent"":null,""from_palm_oil"":null,""ingredients"":null,""ecobalyse_code"":null,""processing"":null,""labels"":null,""origins"":null,""ecobalyse_proxy_code"":null,""quantity"":null,""quantity_g"":null,""ciqual_proxy_food_code"":null}...","[""en:france""]","[{""key"": ""front"", ""imgid"": 1, ""rev"": 3, ""sizes"": {""100"": {""h"": 100, ""w"": 75}, ""200"": {""h"": 200, ""w"": 150}, ""400"": {""h"": 400, ""w"": 300}, ""full"": {""h"": 2666, ""w"": 2000}}, ""uploaded_t"": null, ""uploader"": null}, {""key"": ""nutrition_fr"", ""imgid"": 3, ""rev"": 18, ""sizes"": {""100"": {""h"": 100, ""w"": 85}, ""200"": {""h"": 200, ""w"": 170}, ""400"": {""h"": 400, ""w"": 340}, ""full"": {""h"": 785, ""w"": 668}}, ""uploaded_t"": null, ""uploader"": null}, {""key"": ""1"", ""imgid"": null, ""rev"": null, ""sizes"": {""100"": {""h"": 100, ""w"": 75}, ""200"": null, ""400"": {""h"": 400, ""w"": 300}, ""full"": {""h"": 2666, ""w"": 2000}}, ""uploaded_t"": 1415119256, ""uploader"": ""openfoodfacts-contributors""}, {""key"": ""ingredients_en"", ""imgid"": 3, ""rev"": 22, ""sizes"": {""100"": {""h"": 16, ""w"": 100}, ""200"": {""h"": 31, ""w"": 200}, ""400"": {""h"": 63, ""w"": 400}, ""full"": {""h"": 98, ""w"": 624}}, ""uploaded_t"": null, ""uploader"": null}, {""key"": ""2"", ""imgid"": null, ""rev"": null, ""sizes"": {""100"": {""h"": 100, ""w"": 70}, ""200"": null, ""400"": {""h"": 400, ""w"": 278}, ""full"": {""h"": 1002,..."
1,0011110797698,"[""en:farming-products"", ""en:eggs"", ""en:undefined""]",,"[{""lang"": ""main"", ""text"": ""Natural Grade Aa Large Brown Eggs""}, {""lang"": ""en"", ""text"": ""Natural Grade Aa Large Brown Eggs""}]",[],50 g,g,50.0,[],"[""en:large-brown-eggs""]","[{""percent_max"":100.0,""percent_min"":100.0,""is_in_taxonomy"":0,""percent_estimate"":100.0,""vegan"":null,""id"":""en:large-brown-eggs"",""text"":""LARGE BROWN EGGS"",""vegetarian"":null,""ciqual_food_code"":null,""percent"":null,""from_palm_oil"":null,""ingredients"":null,""ecobalyse_code"":null,""processing"":null,""labels"":null,""origins"":null,""ecobalyse_proxy_code"":null,""quantity"":null,""quantity_g"":null,""ciqual_proxy_food_code"":null}]","[""en:united-states""]","[{""key"": ""nutrition_en"", ""imgid"": 2, ""rev"": 6, ""sizes"": {""100"": {""h"": 100, ""w"": 43}, ""200"": {""h"": 200, ""w"": 86}, ""400"": {""h"": 400, ""w"": 172}, ""full"": {""h"": 1200, ""w"": 515}}, ""uploaded_t"": null, ""uploader"": null}, {""key"": ""front_en"", ""imgid"": 1, ""rev"": 4, ""sizes"": {""100"": {""h"": 100, ""w"": 45}, ""200"": {""h"": 200, ""w"": 89}, ""400"": {""h"": 400, ""w"": 178}, ""full"": {""h"": 1200, ""w"": 534}}, ""uploaded_t"": null, ""uploader"": null}, {""key"": ""2"", ""imgid"": null, ""rev"": null, ""sizes"": {""100"": {""h"": 100, ""w"": 43}, ""200"": null, ""400"": {""h"": 400, ""w"": 172}, ""full"": {""h"": 1200, ""w"": 515}}, ""uploaded_t"": 1626629588, ""uploader"": ""kiliweb""}, {""key"": ""3"", ""imgid"": null, ""rev"": null, ""sizes"": {""100"": {""h"": 100, ""w"": 83}, ""200"": null, ""400"": {""h"": 400, ""w"": 333}, ""full"": {""h"": 600, ""w"": 500}}, ""uploaded_t"": 1649794716, ""uploader"": ""foodvisor""}, {""key"": ""ingredients_en"", ""imgid"": 4, ""rev"": 10, ""sizes"": {""100"": {""h"": 100, ""w"": 75}, ""200"": {""h"": 200, ""w"": 150}, ""400"": {""h"": 400, ""w"": 300}, ""full"": {""h"": 4000, ""w""..."
2,0011110806543,"[""en:farming-products"", ""en:eggs""]",,"[{""lang"": ""main"", ""text"": ""100% Egg Whites""}, {""lang"": ""en"", ""text"": ""100% Egg Whites""}]",[],,,,"[""en:eggs""]","[""en:egg-white"", ""en:egg""]","[{""percent_max"":100.0,""percent_min"":100.0,""is_in_taxonomy"":1,""percent_estimate"":100.0,""vegan"":""no"",""id"":""en:egg-white"",""text"":""egg whites"",""vegetarian"":""yes"",""ciqual_food_code"":""22001"",""percent"":null,""from_palm_oil"":null,""ingredients"":null,""ecobalyse_code"":""egg-organic-code0"",""processing"":null,""labels"":""en:organic"",""origins"":null,""ecobalyse_proxy_code"":null,""quantity"":null,""quantity_g"":null,""ciqual_proxy_food_code"":null}]","[""en:united-states""]",[]
3,0011110828897,"[""en:farming-products"", ""en:eggs""]",,"[{""lang"": ""main"", ""text"": ""Kroger, break-free, real egg product""}, {""lang"": ""en"", ""text"": ""Kroger, break-free, real egg product""}]",[],,,,"[""en:eggs""]","[""en:egg-white"", ""en:egg"", ""en:contains-1-and-less-of-the-following"", ""en:e415"", ""en:salt"", ""en:onion"", ""en:vegetable"", ""en:root-vegetable"", ""en:onion-family-vegetable"", ""en:natural-flavouring"", ""en:flavouring"", ""en:colour"", ""en:vitamins"", ""en:minerals"", ""en:iron"", ""en:d-alpha-tocopheryl-acetate"", ""en:vitamin-e"", ""en:zinc-sulfate"", ""en:zinc"", ""en:calcium-pantothenate"", ""en:pantothenic-acid"", ""en:vitamin-b12"", ""en:e101"", ""en:thiamin-mononitrate"", ""en:thiamin"", ""en:pyridoxine-hydrochloride"", ""en:vitamin-b6"", ""en:folic-acid"", ""en:folate"", ""en:biotin"", ""en:cholecalciferol"", ""en:vitamin-d"", ""en:e412"", ""en:includes-beta-carotene"", ""en:e516"", ""en:ferric-orthophosphate""]","[{""percent_max"":99.0,""percent_min"":99.0,""is_in_taxonomy"":1,""percent_estimate"":99.0,""vegan"":""no"",""id"":""en:egg-white"",""text"":""Egg whites"",""vegetarian"":""yes"",""ciqual_food_code"":""22001"",""percent"":99.0,""from_palm_oil"":null,""ingredients"":null,""ecobalyse_code"":""egg-indoor-code3"",""processing"":null,""labels"":null,""origins"":null,""ecobalyse_proxy_code"":null,""quantity"":null,""quantity_g"":null,""ciqual_proxy_food_code"":null},{""percent_max"":1.0,""percent_min"":0.0,""is_in_taxonomy"":0,""percent_estimate"":0.5,""vegan"":null,""id"":""en:contains-1-and-less-of-the-following"",""text"":""contains 1% and less of the following"",""vegetarian"":null,""ciqual_food_code"":null,""percent"":null,""from_palm_oil"":null,""ingredients"":[{""percent_max"":1.0,""percent_min"":0.0,""is_in_taxonomy"":1,""percent_estimate"":0.5,""vegan"":""yes"",""id"":""en:e412"",""text"":""guar gum"",""vegetarian"":""yes"",""ciqual_food_code"":null,""percent"":null,""from_palm_oil"":null,""ingredients"":null,""ecobalyse_code"":null,""processing"":null,""labels"":null,""origins"":null,""ecobalyse_...","[""en:united-states""]",[]
4,0011110846037,"[""en:farming-products"", ""en:eggs""]",,"[{""lang"": ""main"", ""text"": ""100% Liquid Egg Whites""}, {""lang"": ""en"", ""text"": ""100% Liquid Egg Whites""}]",[],,,,"[""en:eggs""]","[""en:liquid-egg-white"", ""en:egg"", ""en:egg-white""]","[{""percent_max"":100.0,""percent_min"":100.0,""is_in_taxonomy"":1,""percent_estimate"":100.0,""vegan"":""no"",""id"":""en:liquid-egg-white"",""text"":""liquid egg whites"",""vegetarian"":""yes"",""ciqual_food_code"":""22001"",""percent"":100.0,""from_palm_oil"":null,""ingredients"":null,""ecobalyse_code"":""egg-indoor-code3"",""processing"":null,""labels"":null,""origins"":null,""ecobalyse_proxy_code"":null,""quantity"":null,""quantity_g"":null,""ciqual_proxy_food_code"":null}]","[""en:united-states""]",[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,6287027360032,"[""en:farming-products"", ""en:eggs""]",,"[{""lang"": ""main"", ""text"": ""Rahima Fresh Egg""}, {""lang"": ""en"", ""text"": ""Rahima Fresh Egg""}]",[],,,,[],,,"[""en:saudi-arabia""]","[{""key"": ""front_en"", ""imgid"": 1, ""rev"": 3, ""sizes"": {""100"": {""h"": 100, ""w"": 75}, ""200"": {""h"": 200, ""w"": 150}, ""400"": {""h"": 400, ""w"": 300}, ""full"": {""h"": 1920, ""w"": 1440}}, ""uploaded_t"": null, ""uploader"": null}, {""key"": ""1"", ""imgid"": null, ""rev"": null, ""sizes"": {""100"": {""h"": 100, ""w"": 75}, ""200"": null, ""400"": {""h"": 400, ""w"": 300}, ""full"": {""h"": 1920, ""w"": 1440}}, ""uploaded_t"": 1747978006, ""uploader"": ""openfoodfacts-contributors""}]"
7646,6287027360049,"[""en:farming-products"", ""en:eggs""]",,"[{""lang"": ""main"", ""text"": ""Fresh Egg""}, {""lang"": ""en"", ""text"": ""Fresh Egg""}]",[],,,,[],,,"[""en:saudi-arabia""]","[{""key"": ""front_en"", ""imgid"": 1, ""rev"": 3, ""sizes"": {""100"": {""h"": 100, ""w"": 75}, ""200"": {""h"": 200, ""w"": 150}, ""400"": {""h"": 400, ""w"": 300}, ""full"": {""h"": 1920, ""w"": 1440}}, ""uploaded_t"": null, ""uploader"": null}, {""key"": ""1"", ""imgid"": null, ""rev"": null, ""sizes"": {""100"": {""h"": 100, ""w"": 75}, ""200"": null, ""400"": {""h"": 400, ""w"": 300}, ""full"": {""h"": 1920, ""w"": 1440}}, ""uploaded_t"": 1747978055, ""uploader"": ""openfoodfacts-contributors""}]"
7647,6287004270057,"[""en:farming-products"", ""en:eggs""]",,"[{""lang"": ""main"", ""text"": ""Maknoon Egg""}, {""lang"": ""en"", ""text"": ""Maknoon Egg""}]",[],,,,[],,,"[""en:saudi-arabia""]","[{""key"": ""front_en"", ""imgid"": 1, ""rev"": 3, ""sizes"": {""100"": {""h"": 75, ""w"": 100}, ""200"": {""h"": 150, ""w"": 200}, ""400"": {""h"": 300, ""w"": 400}, ""full"": {""h"": 1440, ""w"": 1920}}, ""uploaded_t"": null, ""uploader"": null}, {""key"": ""1"", ""imgid"": null, ""rev"": null, ""sizes"": {""100"": {""h"": 75, ""w"": 100}, ""200"": null, ""400"": {""h"": 300, ""w"": 400}, ""full"": {""h"": 1440, ""w"": 1920}}, ""uploaded_t"": 1747978104, ""uploader"": ""openfoodfacts-contributors""}]"
7648,6281106110266,"[""en:farming-products"", ""en:eggs""]",,"[{""lang"": ""main"", ""text"": ""Rahima Egg""}, {""lang"": ""en"", ""text"": ""Rahima Egg""}]",[],,,,[],,,"[""en:saudi-arabia""]","[{""key"": ""1"", ""imgid"": null, ""rev"": null, ""sizes"": {""100"": {""h"": 100, ""w"": 75}, ""200"": null, ""400"": {""h"": 400, ""w"": 300}, ""full"": {""h"": 1920, ""w"": 1440}}, ""uploaded_t"": 1747978147, ""uploader"": ""openfoodfacts-contributors""}, {""key"": ""front_en"", ""imgid"": 1, ""rev"": 3, ""sizes"": {""100"": {""h"": 100, ""w"": 75}, ""200"": {""h"": 200, ""w"": 150}, ""400"": {""h"": 400, ""w"": 300}, ""full"": {""h"": 1920, ""w"": 1440}}, ""uploaded_t"": null, ""uploader"": null}]"


In [3]:

def safe_json_loads(s):
    if isinstance(s, str):
        s_strip = s.strip()
        if s_strip.startswith(('[', '{')):
            try:
                return json.loads(s_strip)
            except json.JSONDecodeError:
                pass
    return s

with open("../data/cols_to_json.txt", "r") as f:
    cols_to_json = json.load(f)

for col in cols_to_json:
    eggs_from_parquet[col] = eggs_from_parquet[col].apply(safe_json_loads)

eggs_from_parquet

Unnamed: 0,code,categories_tags,labels_tags,product_name,generic_name,quantity,product_quantity_unit,product_quantity,allergens_tags,ingredients_tags,ingredients,countries_tags,images
0,00003100,"[en:farming-products, en:eggs]",[],"[{'lang': 'main', 'text': 'Hard Boiled Eggs'}, {'lang': 'fr', 'text': 'Hard Boiled Eggs'}]",[],2,,0.0,[en:eggs],"[fr:eggs, en:e330, fr:sodium-benzoate, fr:nisin-preparation]","[{'percent_max': 100.0, 'percent_min': 100.0, 'is_in_taxonomy': 0, 'percent_estimate': 100.0, 'vegan': None, 'id': 'fr:eggs', 'text': 'Eggs', 'vegetarian': None, 'ciqual_food_code': None, 'percent': None, 'from_palm_oil': None, 'ingredients': [{'percent_max': 100.0, 'percent_min': 25.0, 'is_in_taxonomy': 0, 'percent_estimate': 62.5, 'vegan': None, 'id': 'fr:eggs', 'text': 'Eggs', 'vegetarian': None, 'ciqual_food_code': None, 'percent': None, 'from_palm_oil': None, 'ingredients': None, 'ecobalyse_code': None, 'processing': None, 'labels': None, 'origins': None, 'ecobalyse_proxy_code': None, 'quantity': None, 'quantity_g': None, 'ciqual_proxy_food_code': None}, {'percent_max': 50.0, 'percent_min': 0.0, 'is_in_taxonomy': 1, 'percent_estimate': 18.75, 'vegan': 'yes', 'id': 'en:e330', 'text': 'Citric Acid', 'vegetarian': 'yes', 'ciqual_food_code': None, 'percent': None, 'from_palm_oil': None, 'ingredients': None, 'ecobalyse_code': None, 'processing': None, 'labels': None, 'origins': Non...",[en:france],"[{'key': 'front', 'imgid': 1, 'rev': 3, 'sizes': {'100': {'h': 100, 'w': 75}, '200': {'h': 200, 'w': 150}, '400': {'h': 400, 'w': 300}, 'full': {'h': 2666, 'w': 2000}}, 'uploaded_t': None, 'uploader': None}, {'key': 'nutrition_fr', 'imgid': 3, 'rev': 18, 'sizes': {'100': {'h': 100, 'w': 85}, '200': {'h': 200, 'w': 170}, '400': {'h': 400, 'w': 340}, 'full': {'h': 785, 'w': 668}}, 'uploaded_t': None, 'uploader': None}, {'key': '1', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 100, 'w': 75}, '200': None, '400': {'h': 400, 'w': 300}, 'full': {'h': 2666, 'w': 2000}}, 'uploaded_t': 1415119256, 'uploader': 'openfoodfacts-contributors'}, {'key': 'ingredients_en', 'imgid': 3, 'rev': 22, 'sizes': {'100': {'h': 16, 'w': 100}, '200': {'h': 31, 'w': 200}, '400': {'h': 63, 'w': 400}, 'full': {'h': 98, 'w': 624}}, 'uploaded_t': None, 'uploader': None}, {'key': '2', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 100, 'w': 70}, '200': None, '400': {'h': 400, 'w': 278}, 'full': {'h': 1002,..."
1,0011110797698,"[en:farming-products, en:eggs, en:undefined]",,"[{'lang': 'main', 'text': 'Natural Grade Aa Large Brown Eggs'}, {'lang': 'en', 'text': 'Natural Grade Aa Large Brown Eggs'}]",[],50 g,g,50.0,[],[en:large-brown-eggs],"[{'percent_max': 100.0, 'percent_min': 100.0, 'is_in_taxonomy': 0, 'percent_estimate': 100.0, 'vegan': None, 'id': 'en:large-brown-eggs', 'text': 'LARGE BROWN EGGS', 'vegetarian': None, 'ciqual_food_code': None, 'percent': None, 'from_palm_oil': None, 'ingredients': None, 'ecobalyse_code': None, 'processing': None, 'labels': None, 'origins': None, 'ecobalyse_proxy_code': None, 'quantity': None, 'quantity_g': None, 'ciqual_proxy_food_code': None}]",[en:united-states],"[{'key': 'nutrition_en', 'imgid': 2, 'rev': 6, 'sizes': {'100': {'h': 100, 'w': 43}, '200': {'h': 200, 'w': 86}, '400': {'h': 400, 'w': 172}, 'full': {'h': 1200, 'w': 515}}, 'uploaded_t': None, 'uploader': None}, {'key': 'front_en', 'imgid': 1, 'rev': 4, 'sizes': {'100': {'h': 100, 'w': 45}, '200': {'h': 200, 'w': 89}, '400': {'h': 400, 'w': 178}, 'full': {'h': 1200, 'w': 534}}, 'uploaded_t': None, 'uploader': None}, {'key': '2', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 100, 'w': 43}, '200': None, '400': {'h': 400, 'w': 172}, 'full': {'h': 1200, 'w': 515}}, 'uploaded_t': 1626629588, 'uploader': 'kiliweb'}, {'key': '3', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 100, 'w': 83}, '200': None, '400': {'h': 400, 'w': 333}, 'full': {'h': 600, 'w': 500}}, 'uploaded_t': 1649794716, 'uploader': 'foodvisor'}, {'key': 'ingredients_en', 'imgid': 4, 'rev': 10, 'sizes': {'100': {'h': 100, 'w': 75}, '200': {'h': 200, 'w': 150}, '400': {'h': 400, 'w': 300}, 'full': {'h': 4000, 'w'..."
2,0011110806543,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': '100% Egg Whites'}, {'lang': 'en', 'text': '100% Egg Whites'}]",[],,,,[en:eggs],"[en:egg-white, en:egg]","[{'percent_max': 100.0, 'percent_min': 100.0, 'is_in_taxonomy': 1, 'percent_estimate': 100.0, 'vegan': 'no', 'id': 'en:egg-white', 'text': 'egg whites', 'vegetarian': 'yes', 'ciqual_food_code': '22001', 'percent': None, 'from_palm_oil': None, 'ingredients': None, 'ecobalyse_code': 'egg-organic-code0', 'processing': None, 'labels': 'en:organic', 'origins': None, 'ecobalyse_proxy_code': None, 'quantity': None, 'quantity_g': None, 'ciqual_proxy_food_code': None}]",[en:united-states],[]
3,0011110828897,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Kroger, break-free, real egg product'}, {'lang': 'en', 'text': 'Kroger, break-free, real egg product'}]",[],,,,[en:eggs],"[en:egg-white, en:egg, en:contains-1-and-less-of-the-following, en:e415, en:salt, en:onion, en:vegetable, en:root-vegetable, en:onion-family-vegetable, en:natural-flavouring, en:flavouring, en:colour, en:vitamins, en:minerals, en:iron, en:d-alpha-tocopheryl-acetate, en:vitamin-e, en:zinc-sulfate, en:zinc, en:calcium-pantothenate, en:pantothenic-acid, en:vitamin-b12, en:e101, en:thiamin-mononitrate, en:thiamin, en:pyridoxine-hydrochloride, en:vitamin-b6, en:folic-acid, en:folate, en:biotin, en:cholecalciferol, en:vitamin-d, en:e412, en:includes-beta-carotene, en:e516, en:ferric-orthophosphate]","[{'percent_max': 99.0, 'percent_min': 99.0, 'is_in_taxonomy': 1, 'percent_estimate': 99.0, 'vegan': 'no', 'id': 'en:egg-white', 'text': 'Egg whites', 'vegetarian': 'yes', 'ciqual_food_code': '22001', 'percent': 99.0, 'from_palm_oil': None, 'ingredients': None, 'ecobalyse_code': 'egg-indoor-code3', 'processing': None, 'labels': None, 'origins': None, 'ecobalyse_proxy_code': None, 'quantity': None, 'quantity_g': None, 'ciqual_proxy_food_code': None}, {'percent_max': 1.0, 'percent_min': 0.0, 'is_in_taxonomy': 0, 'percent_estimate': 0.5, 'vegan': None, 'id': 'en:contains-1-and-less-of-the-following', 'text': 'contains 1% and less of the following', 'vegetarian': None, 'ciqual_food_code': None, 'percent': None, 'from_palm_oil': None, 'ingredients': [{'percent_max': 1.0, 'percent_min': 0.0, 'is_in_taxonomy': 1, 'percent_estimate': 0.5, 'vegan': 'yes', 'id': 'en:e412', 'text': 'guar gum', 'vegetarian': 'yes', 'ciqual_food_code': None, 'percent': None, 'from_palm_oil': None, 'ingredients':...",[en:united-states],[]
4,0011110846037,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': '100% Liquid Egg Whites'}, {'lang': 'en', 'text': '100% Liquid Egg Whites'}]",[],,,,[en:eggs],"[en:liquid-egg-white, en:egg, en:egg-white]","[{'percent_max': 100.0, 'percent_min': 100.0, 'is_in_taxonomy': 1, 'percent_estimate': 100.0, 'vegan': 'no', 'id': 'en:liquid-egg-white', 'text': 'liquid egg whites', 'vegetarian': 'yes', 'ciqual_food_code': '22001', 'percent': 100.0, 'from_palm_oil': None, 'ingredients': None, 'ecobalyse_code': 'egg-indoor-code3', 'processing': None, 'labels': None, 'origins': None, 'ecobalyse_proxy_code': None, 'quantity': None, 'quantity_g': None, 'ciqual_proxy_food_code': None}]",[en:united-states],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,6287027360032,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Rahima Fresh Egg'}, {'lang': 'en', 'text': 'Rahima Fresh Egg'}]",[],,,,[],,,[en:saudi-arabia],"[{'key': 'front_en', 'imgid': 1, 'rev': 3, 'sizes': {'100': {'h': 100, 'w': 75}, '200': {'h': 200, 'w': 150}, '400': {'h': 400, 'w': 300}, 'full': {'h': 1920, 'w': 1440}}, 'uploaded_t': None, 'uploader': None}, {'key': '1', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 100, 'w': 75}, '200': None, '400': {'h': 400, 'w': 300}, 'full': {'h': 1920, 'w': 1440}}, 'uploaded_t': 1747978006, 'uploader': 'openfoodfacts-contributors'}]"
7646,6287027360049,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Fresh Egg'}, {'lang': 'en', 'text': 'Fresh Egg'}]",[],,,,[],,,[en:saudi-arabia],"[{'key': 'front_en', 'imgid': 1, 'rev': 3, 'sizes': {'100': {'h': 100, 'w': 75}, '200': {'h': 200, 'w': 150}, '400': {'h': 400, 'w': 300}, 'full': {'h': 1920, 'w': 1440}}, 'uploaded_t': None, 'uploader': None}, {'key': '1', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 100, 'w': 75}, '200': None, '400': {'h': 400, 'w': 300}, 'full': {'h': 1920, 'w': 1440}}, 'uploaded_t': 1747978055, 'uploader': 'openfoodfacts-contributors'}]"
7647,6287004270057,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Maknoon Egg'}, {'lang': 'en', 'text': 'Maknoon Egg'}]",[],,,,[],,,[en:saudi-arabia],"[{'key': 'front_en', 'imgid': 1, 'rev': 3, 'sizes': {'100': {'h': 75, 'w': 100}, '200': {'h': 150, 'w': 200}, '400': {'h': 300, 'w': 400}, 'full': {'h': 1440, 'w': 1920}}, 'uploaded_t': None, 'uploader': None}, {'key': '1', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 75, 'w': 100}, '200': None, '400': {'h': 300, 'w': 400}, 'full': {'h': 1440, 'w': 1920}}, 'uploaded_t': 1747978104, 'uploader': 'openfoodfacts-contributors'}]"
7648,6281106110266,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Rahima Egg'}, {'lang': 'en', 'text': 'Rahima Egg'}]",[],,,,[],,,[en:saudi-arabia],"[{'key': '1', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 100, 'w': 75}, '200': None, '400': {'h': 400, 'w': 300}, 'full': {'h': 1920, 'w': 1440}}, 'uploaded_t': 1747978147, 'uploader': 'openfoodfacts-contributors'}, {'key': 'front_en', 'imgid': 1, 'rev': 3, 'sizes': {'100': {'h': 100, 'w': 75}, '200': {'h': 200, 'w': 150}, '400': {'h': 400, 'w': 300}, 'full': {'h': 1920, 'w': 1440}}, 'uploaded_t': None, 'uploader': None}]"


## Résultats

On obtient par cette méthode 6228 éléments, soit plus du double.

Des dix éléments affichés, on récupère surtour des oeufs, mais il y a des faux positifs, par exemple 0012009012168 : Chef d'oeuf™avec fromage sur muffin anglais, qui du reste ne serait pas exclu non plus en cherchant "oeuf" dans le champ "product_name".

En revanche, en échantillonnant 50 autres éléments, il semble que les éléments aberrants soient rares, et qu'on ait surtout, à part les packs, des blancs d'oeufs, qui ne posent pas de problème.

Ce filtre pourra être utilisé dans le code principal pour filtrer les éléments ; nous le conservons dans la suite de cette étude en gardant à l'esprit que quelques pourcents des résultats peuvent être incorrects.

## Import de l'OCR

On importe l'analyse par OCR de toutes les images d'oeufs + prédictions de catégories, en vue d'un parsing par regex

In [4]:

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class JSONLProcessor:
    """A class to process JSONL files and convert them to pandas DataFrames."""
    
    DEFAULT_COLUMNS = [
        'code', 'texte_ocr', 'breeding_type_related', 'weight_related',
        'proba_1', 'proba_2', 'proba_3'
    ]
    
    def __init__(self, file_path: str):
        """
        Initialize the JSONL processor.
        
        Args:
            file_path (str): Path to the .jsonl file
        """
        self.file_path = Path(file_path)
        self.processed_data: List[Dict[str, Any]] = []
    
    def _validate_file(self) -> bool:
        """
        Validate if the file exists and is readable.
        
        Returns:
            bool: True if file is valid, False otherwise
        """
        if not self.file_path.exists():
            logger.error(f"File '{self.file_path}' not found")
            return False
        
        if not self.file_path.is_file():
            logger.error(f"'{self.file_path}' is not a file")
            return False
        
        try:
            with open(self.file_path, 'r', encoding='utf-8') as f:
                f.read(1)  # Try to read first character
            return True
        except (PermissionError, UnicodeDecodeError) as e:
            logger.error(f"Cannot read file '{self.file_path}': {e}")
            return False
    
    def _extract_nested_field(self, record: Dict, *keys: str, default: Any = None) -> Any:
        """
        Safely extract nested fields from a dictionary.
        
        Args:
            record (Dict): The dictionary to extract from
            *keys: Sequence of keys to traverse
            default: Default value if any key is missing
            
        Returns:
            The extracted value or default
        """
        current = record
        for key in keys:
            if isinstance(current, dict) and key in current:
                current = current[key]
            else:
                return default
        return current
    
    def _process_record(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a single JSON record and extract required fields.
        
        Args:
            record (Dict): JSON record to process
            
        Returns:
            Dict: Processed record with extracted fields
        """
        return {
            'code': record.get('code', {}),
            'texte_ocr': record.get('ocr_text', {}),
            'breeding_type_related': self._extract_nested_field(
                record, 'groq_spans', 'breeding_type_related', default={}
            ),
            'weight_related': self._extract_nested_field(
                record, 'groq_spans', 'weight_related', default={}
            ),
            'proba_1': self._extract_nested_field(
                record, 'lewagon_prediction', 'proba_1'
            ),
            'proba_2': self._extract_nested_field(
                record, 'lewagon_prediction', 'proba_2'
            ),
            'proba_3': self._extract_nested_field(
                record, 'lewagon_prediction', 'proba_3'
            )
        }
    
    def _process_line(self, line: str, line_num: int) -> Optional[Dict[str, Any]]:
        """
        Process a single line from the JSONL file.
        
        Args:
            line (str): Line to process
            line_num (int): Line number for error reporting
            
        Returns:
            Optional[Dict]: Processed record or None if error occurred
        """
        stripped_line = line.strip()
        if not stripped_line:
            logger.debug(f"Line {line_num} is empty, skipping")
            return None
        
        try:
            record = json.loads(stripped_line)
            return self._process_record(record)
        except json.JSONDecodeError as e:
            logger.warning(f"JSON decode error at line {line_num}: {e}")
            logger.debug(f"Problematic line: {stripped_line[:100]}...")
            return None
        except Exception as e:
            logger.warning(f"Unexpected error processing line {line_num}: {e}")
            return None
    
    def process_file(self) -> pd.DataFrame:
        """
        Process the entire JSONL file and return a DataFrame.
        
        Returns:
            pd.DataFrame: DataFrame with extracted data
        """
        if not self._validate_file():
            return pd.DataFrame(columns=self.DEFAULT_COLUMNS)
        
        self.processed_data = []
        successful_lines = 0
        total_lines = 0
        
        try:
            with open(self.file_path, 'r', encoding='utf-8') as f:
                logger.info(f"Processing file: {self.file_path}")
                for line_num, line in enumerate(f, 1):
                    total_lines += 1
                    processed_record = self._process_line(line, line_num)
                    if processed_record is not None:
                        self.processed_data.append(processed_record)
                        successful_lines += 1
        
        except Exception as e:
            logger.error(f"Unexpected error reading file: {e}")
            return pd.DataFrame(columns=self.DEFAULT_COLUMNS)
        
        logger.info(f"Processing complete. Successfully processed {successful_lines}/{total_lines} lines")
        return pd.DataFrame(self.processed_data)


def create_dataframe_from_jsonl(file_path: str) -> pd.DataFrame:
    """
    Create a pandas DataFrame from a JSONL file.
    
    This function extracts specific fields from each JSON record:
    - code, ocr_text from root level
    - breeding_type_related, weight_related from groq_spans
    - proba_1, proba_2, proba_3 from lewagon_prediction
    
    Args:
        file_path (str): Path to the .jsonl file
        
    Returns:
        pd.DataFrame: DataFrame with extracted data, or empty DataFrame if error occurs
    """
    processor = JSONLProcessor(file_path)
    return processor.process_file()


# Configuration
JSONL_FILE_PATH = r"..\neural_category_predictions\data\dfoeufs_with_predictions_with_ground_truth_with_groq.jsonl"

# Process the file
try:
    code_ocr = create_dataframe_from_jsonl(JSONL_FILE_PATH)
    
    if not code_ocr.empty:
        print(f"DataFrame created successfully with {len(code_ocr)} rows and {len(code_ocr.columns)} columns")
        display(code_ocr)
    else:
        print("Empty DataFrame created - check file path and content")
        
except Exception as e:
    logger.error(f"Failed to process file: {e}")

print("Merge avec l'import eggs, renommé eggs")
eggs = eggs_from_parquet.merge(code_ocr, on='code', how = 'left')
eggs

INFO:__main__:Processing file: ..\neural_category_predictions\data\dfoeufs_with_predictions_with_ground_truth_with_groq.jsonl
INFO:__main__:Processing complete. Successfully processed 7647/7650 lines


DataFrame created successfully with 7647 rows and 7 columns


Unnamed: 0,code,texte_ocr,breeding_type_related,weight_related,proba_1,proba_2,proba_3
0,00003100,"10 CADROJAC\nFR\n(31.091.010)\nCE\npc10\nSovivo\nCovivo 31150 BRUG\n44493652\n10000 31001\nCHAIR A SAUCISSE\nINGREDIENTS VIANDES DE PORC (86%),EAU,ACIDIFIANT\n(E326),SEL, DEXTROSE,ACIDIFIANT(E263),\nEXHAUSTEUR DE GOUT(E26-1),ANTIOXYGENE(E316),\nCOLORANTS(E 120,E160C),AROMES.\nCONDITIONNE SOLIS ATMOSPHERE PROTECTRICE\nA consommer apres cuisson a cosur\nOrigine FRANCE\nElabore en FRANCE\namballe Le 31.10.14\nà consommer jusqu'au 10.11.14\nPrix/kg\n5.99€\nPRIX\nA conserver entre 0 et\nPoids net\n246277 031099\n0,792kg\n4,74€ cheffrest\nfresh made goodness to go\nHard Boiled Eggs (2 ea)\nPrep\n03/29/22\nEnjoy By 04/03/22\nCONTAINS EGGS\n$1.39\n0100000 00310\nPRODUCED BY METZ CULINARY Am unt\nrvin\nHard Boiled Eggs (2 ea)\nNutrition Facts\nServ Size: EA\nServings\nDV Amount Serving\nCaloriesh hade go\nTM\nDV\nTotal Fat 10g\n13%\nSat Fat 3g\n15% Total Carb. 2g\n1%\nTrans Fat 0.0g\nFiber Og\n0%\nCholest. 375mg\n125% Sugars\n2g\nSodium 140mg\n6% Protein\n12g\nVitamin A\n10% Vitamin C\n0%\n...",,"2 ea, 0.792kg",processed meat,meat,sweetened beverages
1,0011110797698,"SELL BY\n100 mg Omega 3 Fatty Acids Per Egg\nExcellent Source of Vitamin E\nSEE NUTRITION INFORMATION INSIDE FOR CHOLESTEROL CONTENT\nsimple\ntruth\nNatural\nCage Free\nGrain Fed\nLARGE\nBROWN EGGS\nGRADE AA\nBELOW\n12 EGGS NET WT 24 OZ (1 LB 8 02) 6819ULATIONS PROMCAST THE USE OF MORMONES\nruth\nNatural Cage Free, Grain Fed\nLARGE BROWN EGGS (GRADE AA\nCA SEFS COMPLIANT\n12 EGGS\nFrom hens raised\nin a humane\nenvironment with\nvegetarian feed.\nEggs do not\ncontain added\nhormones\nSELL BY Y saning of food\nontribuiss to e\ndaly\ncalories a day is\nIt's the Simple Truth\nEasy to find, understand & afford,\nSimple Truth from Kroger makes\nenjoying organic, natural & wellness\nproducts simply better.\nAll Items are free from 101 artificial\npreservatives & ingredients so you can\nfeel better about the delicious goodness\nWe promise\nquality and delicious\nevery\nselighted,\nhigh\nWe will make it right with\n& replacement er refund\nat your store.\n800-632-6900\nthat's in the food y...","Natural Cage Free, Grain Fed",12 EGGS NET WT 24 OZ (1 LB 8 02) 681g,biscuits and cakes,eggs,one dish meals
2,0011110806543,,,,,,
3,0011110828897,,,,,,
4,0011110846037,,,,,,
...,...,...,...,...,...,...,...
7642,6287004520343,صحه\nSAHA\nSELENIUM EGGS\nصحة\nبيض بالسيلينيوم\nSAHA\nSELENIUM EGGS\nبيض بالسيلينيوم\nيده\nصد\ness\nيحمي\nوالتلف,,,milk and yogurt,salty and fatty products,one dish meals
7643,6287027360032,7027 360032\nمؤسسة عنوان الثقة للتجارة\nENWAN AL THEOQAEST FOR TRADING\nالعنوان\nالنقم\nمؤسسة عنوان الثقة للتجارة\nبيض طازج يومياً\nحقائق غذائية لكل 100 غ\nالسعرات الحرارية ۲۷ کیلو کالوري.\nاجمالي الدهون.\nكوليسترول\nجمالي الكربوهيدرات\nإجمالي السكريات\nتتضمن عسكر مناف\nبوتاسيوم\nNutrition Facts Per 100 g\nSodium\nCalories\nTotal Fat\nSaturated Fats\n3.10\n277 kcal\n13%\n16%\nXIT\nE\nTrans Fats\n.0.19\nCholesterol\n394 mg\n131%\n145 mg\nTotal Carbohydrate-\nDietary Fibers\n510\n19\n7%\n0%\n4%\nZY\n24\nTotal Sugers\ninciuates 0 Added Sugar\n42.20\n0%\nProtein\n1-8 xog\n13%\nمراجع\n36 ing\n6%\nPolysarum\nVitamin A\n138 mg\n78\n20%\n2:05 mpg\n14908\nSA\nبيض مائدة طازج ٦ حبة\nس - ١٠١٠٦٤٧٨١٤٠٥ - المملكة العربية السعودية - الرياض - الربوة - جوال ٠٥٥٨٨٨٦١٠٨\nKingdom of Saudi Arabia-Riyadh-Al-Rabwa-Mobile: 0558886108 - C.R1010647814,,"٦ حبة, 100 غ",biscuits and cakes,one dish meals,vegetables
7644,6287027360049,KDF\nKSA 10\nمؤسسة حلوان القصة للتجارة\nENVOVANYA SUZE CAEST FORTRADING\nبيض طازج يومياً\nبيض مائدة طازج ١٥ حبة\nت المملكة العربية السعودية الرياضي\nKingdom gisand Arab Syst. A.Rabwe Mobile 0554885109-CR-010647313\nمؤسسة عنوان الثقة للتجارة\nاعنوان\nنقم\nمؤسسهعنوان الثقة للتجارة\nEMMAN AL THEDA EST FOR TRADING\nENWAN AL THEQA EST FOR TRADING\nبيض طازج يومياً\nحقائق غذائية لكل 100 -\nبيض مائدة طازج ١٥ حبة\nست : ١٠١٠٦٤٧٨١٤ - المملكة العربية السعودية - الرياض - الربوة - جوال ٠٥٥٨٦١٠٨\nKingdom of Saudi Arabia Riyadh Al-Rabwa-Mobile: 0558886108-CR 1010647814\nPC BAR 10.95,,بيض مائدة طازج ١٥ حبة,vegetables,biscuits and cakes,dressings and sauces
7645,6287004270057,"طارع يوميا\nمن المزرعة\nلغازية طبيعية 100\nبيض\nاللؤلؤ\nالمكنون\nALLOLO AL MAKNOON\n15\npieces\nCERTIFIED\nISO\n9001:2015\nOMPAN\nVis0\n22000:2005\n053 214 8056\n054 007 4000\nalolo.m.poultry@gmail.com\n6 287004 270057""",من المزرعة لغازية طبيعية,15 pieces,cereals,fish and seafood,one dish meals


Merge avec l'import eggs, renommé eggs


Unnamed: 0,code,categories_tags,labels_tags,product_name,generic_name,quantity,product_quantity_unit,product_quantity,allergens_tags,ingredients_tags,ingredients,countries_tags,images,texte_ocr,breeding_type_related,weight_related,proba_1,proba_2,proba_3
0,00003100,"[en:farming-products, en:eggs]",[],"[{'lang': 'main', 'text': 'Hard Boiled Eggs'}, {'lang': 'fr', 'text': 'Hard Boiled Eggs'}]",[],2,,0.0,[en:eggs],"[fr:eggs, en:e330, fr:sodium-benzoate, fr:nisin-preparation]","[{'percent_max': 100.0, 'percent_min': 100.0, 'is_in_taxonomy': 0, 'percent_estimate': 100.0, 'vegan': None, 'id': 'fr:eggs', 'text': 'Eggs', 'vegetarian': None, 'ciqual_food_code': None, 'percent': None, 'from_palm_oil': None, 'ingredients': [{'percent_max': 100.0, 'percent_min': 25.0, 'is_in_taxonomy': 0, 'percent_estimate': 62.5, 'vegan': None, 'id': 'fr:eggs', 'text': 'Eggs', 'vegetarian': None, 'ciqual_food_code': None, 'percent': None, 'from_palm_oil': None, 'ingredients': None, 'ecobalyse_code': None, 'processing': None, 'labels': None, 'origins': None, 'ecobalyse_proxy_code': None, 'quantity': None, 'quantity_g': None, 'ciqual_proxy_food_code': None}, {'percent_max': 50.0, 'percent_min': 0.0, 'is_in_taxonomy': 1, 'percent_estimate': 18.75, 'vegan': 'yes', 'id': 'en:e330', 'text': 'Citric Acid', 'vegetarian': 'yes', 'ciqual_food_code': None, 'percent': None, 'from_palm_oil': None, 'ingredients': None, 'ecobalyse_code': None, 'processing': None, 'labels': None, 'origins': Non...",[en:france],"[{'key': 'front', 'imgid': 1, 'rev': 3, 'sizes': {'100': {'h': 100, 'w': 75}, '200': {'h': 200, 'w': 150}, '400': {'h': 400, 'w': 300}, 'full': {'h': 2666, 'w': 2000}}, 'uploaded_t': None, 'uploader': None}, {'key': 'nutrition_fr', 'imgid': 3, 'rev': 18, 'sizes': {'100': {'h': 100, 'w': 85}, '200': {'h': 200, 'w': 170}, '400': {'h': 400, 'w': 340}, 'full': {'h': 785, 'w': 668}}, 'uploaded_t': None, 'uploader': None}, {'key': '1', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 100, 'w': 75}, '200': None, '400': {'h': 400, 'w': 300}, 'full': {'h': 2666, 'w': 2000}}, 'uploaded_t': 1415119256, 'uploader': 'openfoodfacts-contributors'}, {'key': 'ingredients_en', 'imgid': 3, 'rev': 22, 'sizes': {'100': {'h': 16, 'w': 100}, '200': {'h': 31, 'w': 200}, '400': {'h': 63, 'w': 400}, 'full': {'h': 98, 'w': 624}}, 'uploaded_t': None, 'uploader': None}, {'key': '2', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 100, 'w': 70}, '200': None, '400': {'h': 400, 'w': 278}, 'full': {'h': 1002,...","10 CADROJAC\nFR\n(31.091.010)\nCE\npc10\nSovivo\nCovivo 31150 BRUG\n44493652\n10000 31001\nCHAIR A SAUCISSE\nINGREDIENTS VIANDES DE PORC (86%),EAU,ACIDIFIANT\n(E326),SEL, DEXTROSE,ACIDIFIANT(E263),\nEXHAUSTEUR DE GOUT(E26-1),ANTIOXYGENE(E316),\nCOLORANTS(E 120,E160C),AROMES.\nCONDITIONNE SOLIS ATMOSPHERE PROTECTRICE\nA consommer apres cuisson a cosur\nOrigine FRANCE\nElabore en FRANCE\namballe Le 31.10.14\nà consommer jusqu'au 10.11.14\nPrix/kg\n5.99€\nPRIX\nA conserver entre 0 et\nPoids net\n246277 031099\n0,792kg\n4,74€ cheffrest\nfresh made goodness to go\nHard Boiled Eggs (2 ea)\nPrep\n03/29/22\nEnjoy By 04/03/22\nCONTAINS EGGS\n$1.39\n0100000 00310\nPRODUCED BY METZ CULINARY Am unt\nrvin\nHard Boiled Eggs (2 ea)\nNutrition Facts\nServ Size: EA\nServings\nDV Amount Serving\nCaloriesh hade go\nTM\nDV\nTotal Fat 10g\n13%\nSat Fat 3g\n15% Total Carb. 2g\n1%\nTrans Fat 0.0g\nFiber Og\n0%\nCholest. 375mg\n125% Sugars\n2g\nSodium 140mg\n6% Protein\n12g\nVitamin A\n10% Vitamin C\n0%\n...",,"2 ea, 0.792kg",processed meat,meat,sweetened beverages
1,0011110797698,"[en:farming-products, en:eggs, en:undefined]",,"[{'lang': 'main', 'text': 'Natural Grade Aa Large Brown Eggs'}, {'lang': 'en', 'text': 'Natural Grade Aa Large Brown Eggs'}]",[],50 g,g,50.0,[],[en:large-brown-eggs],"[{'percent_max': 100.0, 'percent_min': 100.0, 'is_in_taxonomy': 0, 'percent_estimate': 100.0, 'vegan': None, 'id': 'en:large-brown-eggs', 'text': 'LARGE BROWN EGGS', 'vegetarian': None, 'ciqual_food_code': None, 'percent': None, 'from_palm_oil': None, 'ingredients': None, 'ecobalyse_code': None, 'processing': None, 'labels': None, 'origins': None, 'ecobalyse_proxy_code': None, 'quantity': None, 'quantity_g': None, 'ciqual_proxy_food_code': None}]",[en:united-states],"[{'key': 'nutrition_en', 'imgid': 2, 'rev': 6, 'sizes': {'100': {'h': 100, 'w': 43}, '200': {'h': 200, 'w': 86}, '400': {'h': 400, 'w': 172}, 'full': {'h': 1200, 'w': 515}}, 'uploaded_t': None, 'uploader': None}, {'key': 'front_en', 'imgid': 1, 'rev': 4, 'sizes': {'100': {'h': 100, 'w': 45}, '200': {'h': 200, 'w': 89}, '400': {'h': 400, 'w': 178}, 'full': {'h': 1200, 'w': 534}}, 'uploaded_t': None, 'uploader': None}, {'key': '2', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 100, 'w': 43}, '200': None, '400': {'h': 400, 'w': 172}, 'full': {'h': 1200, 'w': 515}}, 'uploaded_t': 1626629588, 'uploader': 'kiliweb'}, {'key': '3', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 100, 'w': 83}, '200': None, '400': {'h': 400, 'w': 333}, 'full': {'h': 600, 'w': 500}}, 'uploaded_t': 1649794716, 'uploader': 'foodvisor'}, {'key': 'ingredients_en', 'imgid': 4, 'rev': 10, 'sizes': {'100': {'h': 100, 'w': 75}, '200': {'h': 200, 'w': 150}, '400': {'h': 400, 'w': 300}, 'full': {'h': 4000, 'w'...","SELL BY\n100 mg Omega 3 Fatty Acids Per Egg\nExcellent Source of Vitamin E\nSEE NUTRITION INFORMATION INSIDE FOR CHOLESTEROL CONTENT\nsimple\ntruth\nNatural\nCage Free\nGrain Fed\nLARGE\nBROWN EGGS\nGRADE AA\nBELOW\n12 EGGS NET WT 24 OZ (1 LB 8 02) 6819ULATIONS PROMCAST THE USE OF MORMONES\nruth\nNatural Cage Free, Grain Fed\nLARGE BROWN EGGS (GRADE AA\nCA SEFS COMPLIANT\n12 EGGS\nFrom hens raised\nin a humane\nenvironment with\nvegetarian feed.\nEggs do not\ncontain added\nhormones\nSELL BY Y saning of food\nontribuiss to e\ndaly\ncalories a day is\nIt's the Simple Truth\nEasy to find, understand & afford,\nSimple Truth from Kroger makes\nenjoying organic, natural & wellness\nproducts simply better.\nAll Items are free from 101 artificial\npreservatives & ingredients so you can\nfeel better about the delicious goodness\nWe promise\nquality and delicious\nevery\nselighted,\nhigh\nWe will make it right with\n& replacement er refund\nat your store.\n800-632-6900\nthat's in the food y...","Natural Cage Free, Grain Fed",12 EGGS NET WT 24 OZ (1 LB 8 02) 681g,biscuits and cakes,eggs,one dish meals
2,0011110806543,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': '100% Egg Whites'}, {'lang': 'en', 'text': '100% Egg Whites'}]",[],,,,[en:eggs],"[en:egg-white, en:egg]","[{'percent_max': 100.0, 'percent_min': 100.0, 'is_in_taxonomy': 1, 'percent_estimate': 100.0, 'vegan': 'no', 'id': 'en:egg-white', 'text': 'egg whites', 'vegetarian': 'yes', 'ciqual_food_code': '22001', 'percent': None, 'from_palm_oil': None, 'ingredients': None, 'ecobalyse_code': 'egg-organic-code0', 'processing': None, 'labels': 'en:organic', 'origins': None, 'ecobalyse_proxy_code': None, 'quantity': None, 'quantity_g': None, 'ciqual_proxy_food_code': None}]",[en:united-states],[],,,,,,
3,0011110828897,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Kroger, break-free, real egg product'}, {'lang': 'en', 'text': 'Kroger, break-free, real egg product'}]",[],,,,[en:eggs],"[en:egg-white, en:egg, en:contains-1-and-less-of-the-following, en:e415, en:salt, en:onion, en:vegetable, en:root-vegetable, en:onion-family-vegetable, en:natural-flavouring, en:flavouring, en:colour, en:vitamins, en:minerals, en:iron, en:d-alpha-tocopheryl-acetate, en:vitamin-e, en:zinc-sulfate, en:zinc, en:calcium-pantothenate, en:pantothenic-acid, en:vitamin-b12, en:e101, en:thiamin-mononitrate, en:thiamin, en:pyridoxine-hydrochloride, en:vitamin-b6, en:folic-acid, en:folate, en:biotin, en:cholecalciferol, en:vitamin-d, en:e412, en:includes-beta-carotene, en:e516, en:ferric-orthophosphate]","[{'percent_max': 99.0, 'percent_min': 99.0, 'is_in_taxonomy': 1, 'percent_estimate': 99.0, 'vegan': 'no', 'id': 'en:egg-white', 'text': 'Egg whites', 'vegetarian': 'yes', 'ciqual_food_code': '22001', 'percent': 99.0, 'from_palm_oil': None, 'ingredients': None, 'ecobalyse_code': 'egg-indoor-code3', 'processing': None, 'labels': None, 'origins': None, 'ecobalyse_proxy_code': None, 'quantity': None, 'quantity_g': None, 'ciqual_proxy_food_code': None}, {'percent_max': 1.0, 'percent_min': 0.0, 'is_in_taxonomy': 0, 'percent_estimate': 0.5, 'vegan': None, 'id': 'en:contains-1-and-less-of-the-following', 'text': 'contains 1% and less of the following', 'vegetarian': None, 'ciqual_food_code': None, 'percent': None, 'from_palm_oil': None, 'ingredients': [{'percent_max': 1.0, 'percent_min': 0.0, 'is_in_taxonomy': 1, 'percent_estimate': 0.5, 'vegan': 'yes', 'id': 'en:e412', 'text': 'guar gum', 'vegetarian': 'yes', 'ciqual_food_code': None, 'percent': None, 'from_palm_oil': None, 'ingredients':...",[en:united-states],[],,,,,,
4,0011110846037,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': '100% Liquid Egg Whites'}, {'lang': 'en', 'text': '100% Liquid Egg Whites'}]",[],,,,[en:eggs],"[en:liquid-egg-white, en:egg, en:egg-white]","[{'percent_max': 100.0, 'percent_min': 100.0, 'is_in_taxonomy': 1, 'percent_estimate': 100.0, 'vegan': 'no', 'id': 'en:liquid-egg-white', 'text': 'liquid egg whites', 'vegetarian': 'yes', 'ciqual_food_code': '22001', 'percent': 100.0, 'from_palm_oil': None, 'ingredients': None, 'ecobalyse_code': 'egg-indoor-code3', 'processing': None, 'labels': None, 'origins': None, 'ecobalyse_proxy_code': None, 'quantity': None, 'quantity_g': None, 'ciqual_proxy_food_code': None}]",[en:united-states],[],,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,6287027360032,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Rahima Fresh Egg'}, {'lang': 'en', 'text': 'Rahima Fresh Egg'}]",[],,,,[],,,[en:saudi-arabia],"[{'key': 'front_en', 'imgid': 1, 'rev': 3, 'sizes': {'100': {'h': 100, 'w': 75}, '200': {'h': 200, 'w': 150}, '400': {'h': 400, 'w': 300}, 'full': {'h': 1920, 'w': 1440}}, 'uploaded_t': None, 'uploader': None}, {'key': '1', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 100, 'w': 75}, '200': None, '400': {'h': 400, 'w': 300}, 'full': {'h': 1920, 'w': 1440}}, 'uploaded_t': 1747978006, 'uploader': 'openfoodfacts-contributors'}]",7027 360032\nمؤسسة عنوان الثقة للتجارة\nENWAN AL THEOQAEST FOR TRADING\nالعنوان\nالنقم\nمؤسسة عنوان الثقة للتجارة\nبيض طازج يومياً\nحقائق غذائية لكل 100 غ\nالسعرات الحرارية ۲۷ کیلو کالوري.\nاجمالي الدهون.\nكوليسترول\nجمالي الكربوهيدرات\nإجمالي السكريات\nتتضمن عسكر مناف\nبوتاسيوم\nNutrition Facts Per 100 g\nSodium\nCalories\nTotal Fat\nSaturated Fats\n3.10\n277 kcal\n13%\n16%\nXIT\nE\nTrans Fats\n.0.19\nCholesterol\n394 mg\n131%\n145 mg\nTotal Carbohydrate-\nDietary Fibers\n510\n19\n7%\n0%\n4%\nZY\n24\nTotal Sugers\ninciuates 0 Added Sugar\n42.20\n0%\nProtein\n1-8 xog\n13%\nمراجع\n36 ing\n6%\nPolysarum\nVitamin A\n138 mg\n78\n20%\n2:05 mpg\n14908\nSA\nبيض مائدة طازج ٦ حبة\nس - ١٠١٠٦٤٧٨١٤٠٥ - المملكة العربية السعودية - الرياض - الربوة - جوال ٠٥٥٨٨٨٦١٠٨\nKingdom of Saudi Arabia-Riyadh-Al-Rabwa-Mobile: 0558886108 - C.R1010647814,,"٦ حبة, 100 غ",biscuits and cakes,one dish meals,vegetables
7646,6287027360049,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Fresh Egg'}, {'lang': 'en', 'text': 'Fresh Egg'}]",[],,,,[],,,[en:saudi-arabia],"[{'key': 'front_en', 'imgid': 1, 'rev': 3, 'sizes': {'100': {'h': 100, 'w': 75}, '200': {'h': 200, 'w': 150}, '400': {'h': 400, 'w': 300}, 'full': {'h': 1920, 'w': 1440}}, 'uploaded_t': None, 'uploader': None}, {'key': '1', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 100, 'w': 75}, '200': None, '400': {'h': 400, 'w': 300}, 'full': {'h': 1920, 'w': 1440}}, 'uploaded_t': 1747978055, 'uploader': 'openfoodfacts-contributors'}]",KDF\nKSA 10\nمؤسسة حلوان القصة للتجارة\nENVOVANYA SUZE CAEST FORTRADING\nبيض طازج يومياً\nبيض مائدة طازج ١٥ حبة\nت المملكة العربية السعودية الرياضي\nKingdom gisand Arab Syst. A.Rabwe Mobile 0554885109-CR-010647313\nمؤسسة عنوان الثقة للتجارة\nاعنوان\nنقم\nمؤسسهعنوان الثقة للتجارة\nEMMAN AL THEDA EST FOR TRADING\nENWAN AL THEQA EST FOR TRADING\nبيض طازج يومياً\nحقائق غذائية لكل 100 -\nبيض مائدة طازج ١٥ حبة\nست : ١٠١٠٦٤٧٨١٤ - المملكة العربية السعودية - الرياض - الربوة - جوال ٠٥٥٨٦١٠٨\nKingdom of Saudi Arabia Riyadh Al-Rabwa-Mobile: 0558886108-CR 1010647814\nPC BAR 10.95,,بيض مائدة طازج ١٥ حبة,vegetables,biscuits and cakes,dressings and sauces
7647,6287004270057,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Maknoon Egg'}, {'lang': 'en', 'text': 'Maknoon Egg'}]",[],,,,[],,,[en:saudi-arabia],"[{'key': 'front_en', 'imgid': 1, 'rev': 3, 'sizes': {'100': {'h': 75, 'w': 100}, '200': {'h': 150, 'w': 200}, '400': {'h': 300, 'w': 400}, 'full': {'h': 1440, 'w': 1920}}, 'uploaded_t': None, 'uploader': None}, {'key': '1', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 75, 'w': 100}, '200': None, '400': {'h': 300, 'w': 400}, 'full': {'h': 1440, 'w': 1920}}, 'uploaded_t': 1747978104, 'uploader': 'openfoodfacts-contributors'}]","طارع يوميا\nمن المزرعة\nلغازية طبيعية 100\nبيض\nاللؤلؤ\nالمكنون\nALLOLO AL MAKNOON\n15\npieces\nCERTIFIED\nISO\n9001:2015\nOMPAN\nVis0\n22000:2005\n053 214 8056\n054 007 4000\nalolo.m.poultry@gmail.com\n6 287004 270057""",من المزرعة لغازية طبيعية,15 pieces,cereals,fish and seafood,one dish meals
7648,6281106110266,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Rahima Egg'}, {'lang': 'en', 'text': 'Rahima Egg'}]",[],,,,[],,,[en:saudi-arabia],"[{'key': '1', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 100, 'w': 75}, '200': None, '400': {'h': 400, 'w': 300}, 'full': {'h': 1920, 'w': 1440}}, 'uploaded_t': 1747978147, 'uploader': 'openfoodfacts-contributors'}, {'key': 'front_en', 'imgid': 1, 'rev': 3, 'sizes': {'100': {'h': 100, 'w': 75}, '200': {'h': 200, 'w': 150}, '400': {'h': 400, 'w': 300}, 'full': {'h': 1920, 'w': 1440}}, 'uploaded_t': None, 'uploader': None}]","رحيمة\nRAHIMA\neggs\n15 بيضة\nدرجة A\nالصلاحية ٣ أشهر. يُحفظ مبرداً بين إلى المئوية .\nFor 3 months validity, store between 4 to 7°c.\nمتناعية الجيدة\nVegetarian-fed\nتغذية بأعلاف نباتية .\nبيض طازج من دجاج مغذي بأعلاف نباتية ١٠٠٪\nخالية من أي هرمونات أو إضافات حيوانية\nية للممارسف\nSaudi G.AP\nSG/2021/13/000001\nFresh eggs from hens fed by 100% vegetarian\nfeed free from hormones or animal products\nبيض طازج غير مخصب من إنتاج\nمزرعة دواجن رديمة\nمن الجودة VALITY\nمند 1969 Since\nعلامة الجودة\nSAUDI SASO\nMADE\nQUALITY MARK\nعاماً\nYEARS\nFresh table eggs unfertilized produced\nby Rahima Poultry Farm\nwww.rahimaeggs.com\nClass A\nEGGS\n5355\n6 281106 110266",تغذية بأعلاف نباتية . بيض طازج من دجاج مغذي بأعلاف نباتية ١٠٠٪ ية للممارسف Vegetarian-fed من أي هرمونات أو إضافات حيوانية مزرعة دواجن رديمة من الجودة VALITY,15 بيضة 5355,biscuits and cakes,milk and yogurt,one dish meals


In [5]:
eggs['texte_ocr'] = eggs['texte_ocr'].str.replace(r'\n|\r\n|\r', ' . ', regex=True).str.lower()
eggs['texte_ocr']

0       10 cadrojac . fr . (31.091.010) . ce . pc10 . sovivo . covivo 31150 brug . 44493652 . 10000 31001 . chair a saucisse . ingredients viandes de porc (86%),eau,acidifiant . (e326),sel, dextrose,acidifiant(e263), . exhausteur de gout(e26-1),antioxygene(e316), . colorants(e 120,e160c),aromes. . conditionne solis atmosphere protectrice . a consommer apres cuisson a cosur . origine france . elabore en france . amballe le 31.10.14 . à consommer jusqu'au 10.11.14 . prix/kg . 5.99€ . prix . a conserver entre 0 et . poids net . 246277 031099 . 0,792kg . 4,74€ cheffrest . fresh made goodness to go . hard boiled eggs (2 ea) . prep . 03/29/22 . enjoy by 04/03/22 . contains eggs . $1.39 . 0100000 00310 . produced by metz culinary am unt . rvin . hard boiled eggs (2 ea) . nutrition facts . serv size: ea . servings . dv amount serving . caloriesh hade go . tm . dv . total fat 10g . 13% . sat fat 3g . 15% total carb. 2g . 1% . trans fat 0.0g . fiber og . 0% . cholest. 375mg . 125% sugars . 2g . 

# Analyse OCR

In [6]:
from app.enums.open_food_facts.breeding_type_enums import (
    COUNTRIES_WHERE_CAGES_ARE_FURNISHED,
    get_barn_regex,
    get_cage_regex,
    get_free_range_regex,
    BREEDING_PATTERNS_ALL_LANGUAGES,
    FREE_RANGE_BREEDINGS,
)
from app.enums.open_food_facts.enums import AnimalType, BreedingType, LayingHenBreedingType
from app.schemas.open_food_facts.external import ProductData
from app.schemas.open_food_facts.internal import ProductType
from app.business.open_food_facts.egg_weight_calculator import get_number_of_eggs
from app.business.open_food_facts.breeding_type_calculator import BreedingTypeCalculator
from app.business.open_food_facts.egg_weight_calculator import get_egg_weight_from_quantity


In [8]:
def get_regex(breeding_type) -> str:
    """
    Constructs a regex pattern that matches 'barn' breeding types.
    Here no need for exclusions
    Returns:
        str: A regex pattern that matches any of the 'barn' breeding types.
    """
    if breeding_type == "free-range":
        set_all_free_range_not_organic = set()
        for breeding in FREE_RANGE_BREEDINGS:
            if breeding == "organic":
                continue
            set_all_free_range_not_organic.update(BREEDING_PATTERNS_ALL_LANGUAGES[breeding])
        return r"\b(?:" + "|".join(set_all_free_range_not_organic) + r")\b"

    else:
        return r"\b(?:" + "|".join(BREEDING_PATTERNS_ALL_LANGUAGES[breeding_type]) + r")\b"


def clean(s: str | None) -> str:
    """
    Cleans a string by removing accents, replacing punctuation and digits,
    converting to lowercase, and replacing 'œ' with 'oe' before regex matching.
    Args:     s (str | None): The string to clean.

    Returns:  str: The cleaned string.
    """

    if pd.isna(s):
        return ''
    if not s:
        return ""
    s = s.lower().replace("œ", "oe").replace("\n", " ")
    s = unicodedata.normalize("NFD", s)
    s = "".join(c for c in s if unicodedata.category(c) != "Mn")
    s = re.sub(r"[^\w\s]|\d+", " ", s)
    return s

eggs['cage_from_OCR'] = eggs['breeding_type_related'].apply(clean).str.findall(get_regex('cage'))
eggs['cage_from_OCR_2'] = eggs['texte_ocr'].apply(clean).str.findall(get_regex('cage'))
eggs['barn_from_OCR'] = eggs['breeding_type_related'].apply(clean).str.findall(get_regex('barn'))
eggs['barn_from_OCR_2'] = eggs['texte_ocr'].apply(clean).str.findall(get_regex('barn'))
eggs['free_range_from_OCR'] = eggs['breeding_type_related'].apply(clean).str.findall(get_regex('free-range'))
eggs['free_range_from_OCR_2'] = eggs['texte_ocr'].apply(clean).str.findall(get_regex('free-range'))
eggs['organic_from_OCR'] = eggs['breeding_type_related'].apply(clean).str.findall(get_regex('organic'))
eggs['organic_from_OCR_2'] = eggs['texte_ocr'].apply(clean).str.findall(get_regex('organic'))


eggs

Unnamed: 0,code,categories_tags,labels_tags,product_name,generic_name,quantity,product_quantity_unit,product_quantity,allergens_tags,ingredients_tags,...,proba_2,proba_3,cage_from_OCR,cage_from_OCR_2,barn_from_OCR,barn_from_OCR_2,free_range_from_OCR,free_range_from_OCR_2,organic_from_OCR,organic_from_OCR_2
0,00003100,"[en:farming-products, en:eggs]",[],"[{'lang': 'main', 'text': 'Hard Boiled Eggs'}, {'lang': 'fr', 'text': 'Hard Boiled Eggs'}]",[],2,,0.0,[en:eggs],"[fr:eggs, en:e330, fr:sodium-benzoate, fr:nisin-preparation]",...,meat,sweetened beverages,[],[],[],[],[],[],[],[]
1,0011110797698,"[en:farming-products, en:eggs, en:undefined]",,"[{'lang': 'main', 'text': 'Natural Grade Aa Large Brown Eggs'}, {'lang': 'en', 'text': 'Natural Grade Aa Large Brown Eggs'}]",[],50 g,g,50.0,[],[en:large-brown-eggs],...,eggs,one dish meals,[cage],"[cage, cage, cage]",[],[],[],[],[],[organic]
2,0011110806543,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': '100% Egg Whites'}, {'lang': 'en', 'text': '100% Egg Whites'}]",[],,,,[en:eggs],"[en:egg-white, en:egg]",...,,,[],[],[],[],[],[],[],[]
3,0011110828897,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Kroger, break-free, real egg product'}, {'lang': 'en', 'text': 'Kroger, break-free, real egg product'}]",[],,,,[en:eggs],"[en:egg-white, en:egg, en:contains-1-and-less-of-the-following, en:e415, en:salt, en:onion, en:vegetable, en:root-vegetable, en:onion-family-vegetable, en:natural-flavouring, en:flavouring, en:colour, en:vitamins, en:minerals, en:iron, en:d-alpha-tocopheryl-acetate, en:vitamin-e, en:zinc-sulfate, en:zinc, en:calcium-pantothenate, en:pantothenic-acid, en:vitamin-b12, en:e101, en:thiamin-mononitrate, en:thiamin, en:pyridoxine-hydrochloride, en:vitamin-b6, en:folic-acid, en:folate, en:biotin, en:cholecalciferol, en:vitamin-d, en:e412, en:includes-beta-carotene, en:e516, en:ferric-orthophosphate]",...,,,[],[],[],[],[],[],[],[]
4,0011110846037,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': '100% Liquid Egg Whites'}, {'lang': 'en', 'text': '100% Liquid Egg Whites'}]",[],,,,[en:eggs],"[en:liquid-egg-white, en:egg, en:egg-white]",...,,,[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,6287027360032,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Rahima Fresh Egg'}, {'lang': 'en', 'text': 'Rahima Fresh Egg'}]",[],,,,[],,...,one dish meals,vegetables,[],[],[],[],[],[],[],[]
7646,6287027360049,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Fresh Egg'}, {'lang': 'en', 'text': 'Fresh Egg'}]",[],,,,[],,...,biscuits and cakes,dressings and sauces,[],[],[],[],[],[],[],[]
7647,6287004270057,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Maknoon Egg'}, {'lang': 'en', 'text': 'Maknoon Egg'}]",[],,,,[],,...,fish and seafood,one dish meals,[],[],[],[],[],[],[],[]
7648,6281106110266,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Rahima Egg'}, {'lang': 'en', 'text': 'Rahima Egg'}]",[],,,,[],,...,milk and yogurt,one dish meals,[],[],[],[],[],[],[],[]


In [9]:
eggs['weight_from_OCR'] = eggs['weight_related'].apply(lambda x: 0 if pd.isna(x) else  get_egg_weight_from_quantity(x))
eggs['quantity_from_OCR'] = eggs['weight_related'].apply(lambda x: 0 if pd.isna(x) else  get_egg_weight_from_quantity(x)/50)
eggs['size_from_OCR'] = ""

eggs


Could not parse quantity: large size calibre gros
Could not parse quantity: One Dozen Large
Could not parse quantity: NET WT 1602 (1LB) 454g
Could not parse quantity: Minimum net weight 328g
Could not parse quantity: Gewichtsklasse
Could not parse quantity: Poids net minimal 53g
Could not parse quantity: Poids net minimal des oeufs: inférieur à 53g
Could not parse quantity: pour 100g:
Could not parse quantity: pour 100g
Could not parse quantity: neuvième jour après la date de ponte
Could not parse quantity: Poids net minimal des oeufs : 43g
Could not parse quantity: CALIBRE GROS
Could not parse quantity: CAL GROS
Could not parse quantity: Calibre moyen
Could not parse quantity: poids net minimal 48g
Could not parse quantity: pour 100g
Could not parse quantity: Poids net minimal 53g
Could not parse quantity: 760150
Could not parse quantity: Poids net minimal:63g, Valeurs nutritionnelles moyennes pour 100g
Could not parse quantity: Gewichtsverlust durch Kochen. Mindestens haltbar
Could n

Unnamed: 0,code,categories_tags,labels_tags,product_name,generic_name,quantity,product_quantity_unit,product_quantity,allergens_tags,ingredients_tags,...,cage_from_OCR_2,barn_from_OCR,barn_from_OCR_2,free_range_from_OCR,free_range_from_OCR_2,organic_from_OCR,organic_from_OCR_2,weight_from_OCR,quantity_from_OCR,size_from_OCR
0,00003100,"[en:farming-products, en:eggs]",[],"[{'lang': 'main', 'text': 'Hard Boiled Eggs'}, {'lang': 'fr', 'text': 'Hard Boiled Eggs'}]",[],2,,0.0,[en:eggs],"[fr:eggs, en:e330, fr:sodium-benzoate, fr:nisin-preparation]",...,[],[],[],[],[],[],[],100.0,2.0,
1,0011110797698,"[en:farming-products, en:eggs, en:undefined]",,"[{'lang': 'main', 'text': 'Natural Grade Aa Large Brown Eggs'}, {'lang': 'en', 'text': 'Natural Grade Aa Large Brown Eggs'}]",[],50 g,g,50.0,[],[en:large-brown-eggs],...,"[cage, cage, cage]",[],[],[],[],[],[organic],600.0,12.0,
2,0011110806543,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': '100% Egg Whites'}, {'lang': 'en', 'text': '100% Egg Whites'}]",[],,,,[en:eggs],"[en:egg-white, en:egg]",...,[],[],[],[],[],[],[],0.0,0.0,
3,0011110828897,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Kroger, break-free, real egg product'}, {'lang': 'en', 'text': 'Kroger, break-free, real egg product'}]",[],,,,[en:eggs],"[en:egg-white, en:egg, en:contains-1-and-less-of-the-following, en:e415, en:salt, en:onion, en:vegetable, en:root-vegetable, en:onion-family-vegetable, en:natural-flavouring, en:flavouring, en:colour, en:vitamins, en:minerals, en:iron, en:d-alpha-tocopheryl-acetate, en:vitamin-e, en:zinc-sulfate, en:zinc, en:calcium-pantothenate, en:pantothenic-acid, en:vitamin-b12, en:e101, en:thiamin-mononitrate, en:thiamin, en:pyridoxine-hydrochloride, en:vitamin-b6, en:folic-acid, en:folate, en:biotin, en:cholecalciferol, en:vitamin-d, en:e412, en:includes-beta-carotene, en:e516, en:ferric-orthophosphate]",...,[],[],[],[],[],[],[],0.0,0.0,
4,0011110846037,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': '100% Liquid Egg Whites'}, {'lang': 'en', 'text': '100% Liquid Egg Whites'}]",[],,,,[en:eggs],"[en:liquid-egg-white, en:egg, en:egg-white]",...,[],[],[],[],[],[],[],0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,6287027360032,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Rahima Fresh Egg'}, {'lang': 'en', 'text': 'Rahima Fresh Egg'}]",[],,,,[],,...,[],[],[],[],[],[],[],300.0,6.0,
7646,6287027360049,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Fresh Egg'}, {'lang': 'en', 'text': 'Fresh Egg'}]",[],,,,[],,...,[],[],[],[],[],[],[],750.0,15.0,
7647,6287004270057,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Maknoon Egg'}, {'lang': 'en', 'text': 'Maknoon Egg'}]",[],,,,[],,...,[],[],[],[],[],[],[],750.0,15.0,
7648,6281106110266,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': 'Rahima Egg'}, {'lang': 'en', 'text': 'Rahima Egg'}]",[],,,,[],,...,[],[],[],[],[],[],[],750.0,15.0,


# Proportion d'oeufs identifiés

Nous récupérons les fonctions correspondantes dans le code principal, et définissons quelques fonctions utilitaires de conversion.

In [10]:
import app.business.open_food_facts.pain_report_calculator as prc
from app.schemas.open_food_facts.external import ProductData
from app.business.open_food_facts.egg_weight_calculator import calculate_egg_weight


def is_egg_packb(product_data: ProductData, strict=False) -> bool:
    """
    Quick function to check whether we're dealing with egg pack
    product_data : product data
    strict: if true, returns only "en:chicken-eggs" in category,
    otherwise must have "en:eggs" but not other identified animals.

    Returns:
        True if egg, False if ovoproduct or otherwise
    """
    tags=product_data.categories_tags
    if tags is None:
        return False
    elif 'en:eggs' not in tags:
        return False
    elif strict:
        return  "en:chicken-eggs" in tags
    else:
        no_chicken={'en:chocolate-eggs',
            'en:duck-eggs',
            'en:easter-eggs',
            'en:fish-eggs',
            'en:free-range-duck-eggs',
            'en:quail-eggs',
            'en:raw-quail-eggs',
            'en:savoury-eggs',
            'en:scotch-eggs',
            'en:streamed-eggs',
            'en:meals',
            'en:snacks',
            'en:meats-and-their-products',
            'en:breads'
        }
        return len(no_chicken.intersection(tags)) == 0
        
def clean_value(val):
    if isinstance(val, (list, dict)):
        return val  # on ne touche pas aux objets JSON désérialisés
    else:
        return None if pd.isna(val) else val


def row2productdata(row):
    drow=row.to_dict()

    for key in drow:
        drow[key] = clean_value(drow[key])

    if drow["ingredients"] is not None:
        drow["ingredients"]=(drow["ingredients"])
    if len(drow["product_name"])>0:
        drow["product_name"]=drow["product_name"][0]["text"]
    else:
        drow["product_name"]=""
    if len(drow["generic_name"])>0:
        drow["generic_name"]=drow["generic_name"][0]["text"]
    else:
        drow["generic_name"]=""
    product_data=ProductData.model_validate(drow)
    return product_data

def row2number(row):
    product_data=row2productdata(row)
    return calculate_egg_weight(product_data)


def row2breedingtype(row):
    product_data=row2productdata(row)
    report=prc.PainReportCalculator(product_data)
    gbt=report._get_breeding_types()
    return gbt['laying_hen'].value if 'laying_hen' in gbt else "None"


def testrow(df, nrow):
    row=df.iloc[nrow]
    return row2number, row2breedingtype(row), row


testrow(eggs, 0)


(<function __main__.row2number(row)>,
 'unknown',
 code                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [11]:
eggs['w_eggs'] = eggs.apply(row2number, axis=1)
eggs['breeding'] = eggs.apply(row2breedingtype, axis=1)
eggs["product_quantity"]=eggs["product_quantity"].astype(float)


Could not parse quantity: pièces
Could not parse quantity: pièces
Could not parse quantity: Single twin pack 
Could not parse quantity: Biographie suisses 
Could not parse quantity: talla l
Could not parse quantity: hi
Could not parse quantity: pcs
Could not parse quantity: calibre moyen
Could not parse quantity: pièces
Could not parse quantity: pièces
Could not parse quantity: Single twin pack 
Could not parse quantity: Biographie suisses 
Could not parse quantity: talla l
Could not parse quantity: hi
Could not parse quantity: pcs
Could not parse quantity: calibre moyen


In [12]:
eggs.breeding.value_counts(dropna=False)

breeding
unknown              3554
free_range           2991
barn                  865
furnished_cage        213
conventional_cage      27
Name: count, dtype: int64

In [13]:
"en:france" in eggs["countries_tags"]

False

In [14]:
eggs.groupby('w_eggs').agg( sample=('code', lambda x: x.head(10).tolist()), w_eggs=('w_eggs', lambda x: x.head(10).tolist()),  total_count=('w_eggs', 'size') )

Unnamed: 0_level_0,sample,w_eggs,total_count
w_eggs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.000,"[0011110806543, 0011110828897, 0011110846037, 0013372111168, 0014616221100, 0014616221612, 0015204113111, 0018894318965, 0018894318972, 0018894319016]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",4506
4.000,[3251320070410],[4.0],1
6.000,"[3760214470293, 4260115850082, 4005211120468, 4250780307134, 8681695138050]","[6.0, 6.0, 6.0, 6.0, 6.0]",5
10.000,"[3760073541042, 4316268579179]","[10.0, 10.0]",2
12.360,[4316268567985],[12.36],1
...,...,...,...
26500.000,[5701607583167],[26500.0],1
28800.000,[3596710521463],[28800.0],1
38989.723,[4311501315798],[38989.723],1
57600.000,"[3596710521494, 3596710521500]","[57600.0, 57600.0]",2


In [15]:
eggs["has_breeding_type"]=eggs["breeding"].apply(lambda x: "computed" if x == "barn" or x == "furnished_cage" or x == "conventional_cage" else "Aucun" if x == "None" else x)
eggs["has_egg_weight"]= eggs["w_eggs"]>0 & ~eggs["w_eggs"].isna()
eggs["has_egg_weight_s"] = eggs["has_egg_weight"].apply(lambda x: "has weight" if x else "no weight")
eggs["french"]=eggs["countries_tags"].fillna("").apply(lambda x:  len(x)>0 and "en:france" in x)
eggs["french_s"]=eggs["french"].apply(lambda x: "français" if x else "pas français")
eggs[["has_breeding_type", "has_egg_weight"]].value_counts().to_frame().unstack().fillna(0).astype(int).style.background_gradient(axis=None)

Unnamed: 0_level_0,count,count
has_egg_weight,False,True
has_breeding_type,Unnamed: 1_level_2,Unnamed: 2_level_2
computed,484,621
free_range,1500,1491
unknown,2522,1032


In [16]:
eggs[["breeding", "has_egg_weight"]].value_counts(normalize=True).to_frame().unstack().fillna(0).style.format('{:.1%}').background_gradient(axis=None)

Unnamed: 0_level_0,proportion,proportion
has_egg_weight,False,True
breeding,Unnamed: 1_level_2,Unnamed: 2_level_2
barn,5.5%,5.8%
conventional_cage,0.0%,0.3%
free_range,19.6%,19.5%
furnished_cage,0.8%,2.0%
unknown,33.0%,13.5%


In [17]:
eggs_fr = eggs[eggs["french"]]
eggs_fr.to_csv("../data/eggs_is_suffering_computed_fr.csv", index=False)
eggs.to_csv("../data/eggs_is_suffering_computed.csv", index=False)
eggs_fr


Unnamed: 0,code,categories_tags,labels_tags,product_name,generic_name,quantity,product_quantity_unit,product_quantity,allergens_tags,ingredients_tags,...,weight_from_OCR,quantity_from_OCR,size_from_OCR,w_eggs,breeding,has_breeding_type,has_egg_weight,has_egg_weight_s,french,french_s
0,00003100,"[en:farming-products, en:eggs]",[],"[{'lang': 'main', 'text': 'Hard Boiled Eggs'}, {'lang': 'fr', 'text': 'Hard Boiled Eggs'}]",[],2,,0.0,[en:eggs],"[fr:eggs, en:e330, fr:sodium-benzoate, fr:nisin-preparation]",...,100.0,2.0,,100.0,unknown,unknown,True,has weight,True,français
22,00257961,"[en:farming-products, en:eggs]",,"[{'lang': 'main', 'text': '6 Oeufs Bleus'}, {'lang': 'fr', 'text': '6 Oeufs Bleus'}, {'lang': 'en', 'text': '6 eggs'}]",[],,,,[],,...,0.0,0.0,,0.0,unknown,unknown,False,no weight,True,français
33,0055000002240,"[en:farming-products, en:eggs]",[],"[{'lang': 'main', 'text': 'Œufs'}, {'lang': 'fr', 'text': 'Œufs'}]",[],,,,[],,...,0.0,0.0,,0.0,unknown,unknown,False,no weight,True,français
38,0061719011930,"[en:farming-products, en:eggs, en:chicken-eggs, en:barn-chicken-eggs]",[],"[{'lang': 'main', 'text': '12 œufs blanc calibre gros'}, {'lang': 'fr', 'text': '12 œufs blanc calibre gros'}, {'lang': 'en', 'text': 'Free Run Large White Eggs'}]",[],12,,0.0,[],,...,720.0,14.4,,600.0,barn,computed,True,has weight,True,français
106,00901796,"[en:farming-products, en:eggs, en:chicken-eggs, en:free-range-chicken-eggs]","[en:green-dot, fr:categorie-a]","[{'lang': 'main', 'text': '6 Œufs Moyens de Poules Élevées en Plein Air'}, {'lang': 'fr', 'text': '6 Œufs Moyens de Poules Élevées en Plein Air'}]",[],6 œufs,,0.0,[en:eggs],[en:egg],...,300.0,6.0,,300.0,free_range,free_range,True,has weight,True,français
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7614,0764450111661,"[en:farming-products, en:eggs]","[en:nutriscore, fr:bleu-blanc-coeur]",[],[],,,,[],,...,0.0,0.0,,0.0,unknown,unknown,False,no weight,True,français
7615,3700864040064,"[en:farming-products, en:eggs, en:chicken-eggs, en:free-range-chicken-eggs]",[en:nutriscore],"[{'lang': 'main', 'text': 'Gros oeufs plein air x6'}, {'lang': 'en', 'text': 'Gros oeufs plein air x6'}]",[],,,,[],,...,0.0,0.0,,0.0,free_range,free_range,False,no weight,True,français
7627,3700133918872,"[en:farming-products, en:eggs, en:chicken-eggs, en:free-range-chicken-eggs]",[fr:label-rouge],"[{'lang': 'main', 'text': 'Oeufs frais plein air x12'}, {'lang': 'fr', 'text': 'Oeufs frais plein air x12'}]",[],,,,[],,...,0.0,0.0,,0.0,free_range,free_range,False,no weight,True,français
7628,3700864017127,"[en:farming-products, en:eggs, en:chicken-eggs, en:free-range-chicken-eggs]",[],"[{'lang': 'main', 'text': 'Gros oeufs plein air x12'}, {'lang': 'fr', 'text': 'Gros oeufs plein air x12'}]",[],,,,[],,...,0.0,0.0,,0.0,free_range,free_range,False,no weight,True,français


In [18]:
fig = px.sunburst(
    eggs,
    path=[px.Constant("all"), 'french_s', 'has_egg_weight_s', 'has_breeding_type']
)

fig.update_traces(texttemplate="%{label} : %{value}")

# 🔍 Agrandir la figure
fig.update_layout(
    title = "All eggs : is french, has weight, has breeding type - World",
    width=600,   # Largeur en pixels
    height=600,   # Hauteur en pixels
    margin=dict(t=40, l=10, r=10, b=10)  # Réduit les marges pour maximiser l’espace utile
)


fig.show()

In [19]:

fig = px.sunburst(
    eggs_fr,
    path=[px.Constant("all"), 'has_egg_weight_s', 'has_breeding_type']
)

fig.update_traces(
    texttemplate="%{label}<br>%{percentRoot:.1%}<br>%{value}",
    textfont=dict(size=12),
    insidetextorientation='horizontal'

)

fig.update_layout(
    title = "French eggs : has weight, has breeding type",
    width=500,
    height=500,
    margin=dict(t=40, l=10, r=10, b=10)
)

fig.show()


In [20]:
fig = px.sunburst(
    eggs,
    path=[px.Constant("all"), 'has_egg_weight_s', 'has_breeding_type']
)

fig.update_traces(
    texttemplate="%{label}<br>%{percentRoot:.1%}<br>%{value}",
    textfont=dict(size=12),
    insidetextorientation='horizontal'

)

fig.update_layout(
    title = "All eggs (World) : has weight, has breeding type",
    width=500,
    height=500,
    margin=dict(t=40, l=10, r=10, b=10)
)

fig.show()
