# Introduction

Ce document résume les jeux de données, la méthodologie et les statistiques utilisés pour l'estimation de la souffrance contenue dans les boîtes d'oeufs.

Nous commençons par l'import de la base de données complète d'Open Food Facts, obtenue le 31 mars 2025.

De cette base de données, nous ne retenons que les colonnes (goodcol) nécessaires au calcul du poids de souffrance, telles que définies dans le code.




In [None]:
import sys
import pandas as pd
import plotly.express as px
import json
from pathlib import Path
from typing import Dict, List, Optional, Any
import logging
import unicodedata
import re

# Put ../../backend first on the import path (presumably where the `app`
# package imported further down lives — confirm against the repo layout).
sys.path.insert(0, "../../backend")

# Show up to 100 rows and up to 1000 characters per cell in DataFrame output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 1000)

In [None]:
# Load the egg products table from CSV; the bare name on the last line
# displays it in the notebook.
# NOTE(review): no dtype is given, so barcodes in `code` with leading zeros
# may be parsed as integers — confirm this is intended.
eggs_from_parquet = pd.read_csv("../data/eggs_from_parquet.csv")
eggs_from_parquet

In [None]:

def safe_json_loads(s):
    """
    Decode a JSON array/object held in a string, leaving anything else alone.

    Args:
        s: Any value. Only strings whose stripped form starts with '[' or '{'
            are passed to ``json.loads``.

    Returns:
        The decoded object on success; otherwise ``s`` itself, unchanged
        (non-strings, strings that do not look like JSON, invalid JSON).
    """
    if not isinstance(s, str):
        return s

    candidate = s.strip()
    if not candidate.startswith(('[', '{')):
        return s

    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        # Looked like JSON but was not: hand back the original string.
        return s

# cols_to_json.txt is a JSON document listing the columns whose cells hold
# JSON-encoded values. Read it as UTF-8 explicitly (the platform default
# encoding is not guaranteed to be UTF-8), then decode those columns in place.
with open("../data/cols_to_json.txt", "r", encoding="utf-8") as f:
    cols_to_json = json.load(f)

for col in cols_to_json:
    eggs_from_parquet[col] = eggs_from_parquet[col].apply(safe_json_loads)

# Display the decoded table.
eggs_from_parquet

## Résultats

On obtient par cette méthode 6228 éléments, soit plus du double.

Des dix éléments affichés, on récupère surtout des oeufs, mais il y a des faux positifs, par exemple 0012009012168 : « Chef d'oeuf™ avec fromage sur muffin anglais », qui du reste ne serait pas exclu non plus en cherchant "oeuf" dans le champ "product_name".

En revanche, en échantillonnant 50 autres éléments, il semble que les éléments aberrants soient rares, et qu'on ait surtout, à part les packs, des blancs d'oeufs, qui ne posent pas de problème.

Ce filtre pourra être utilisé dans le code principal pour filtrer les éléments ; nous le conservons dans la suite de cette étude en gardant à l'esprit que quelques pourcents des résultats peuvent être incorrects.

## Import de l'OCR

On importe l'analyse par OCR de toutes les images d'oeufs ainsi que les prédictions de catégories, en vue d'un parsing par regex.

In [None]:

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class JSONLProcessor:
    """Read a JSONL file and flatten selected fields of each record into a DataFrame."""

    # Columns of the resulting DataFrame, in order. Also used to shape the
    # result when no record could be read.
    DEFAULT_COLUMNS = [
        'code', 'texte_ocr', 'breeding_type_related', 'weight_related',
        'proba_1', 'proba_2', 'proba_3'
    ]

    def __init__(self, file_path: str):
        """
        Initialize the JSONL processor.

        Args:
            file_path (str): Path to the .jsonl file
        """
        self.file_path = Path(file_path)
        # Records extracted by the most recent call to process_file().
        self.processed_data: List[Dict[str, Any]] = []

    def _validate_file(self) -> bool:
        """
        Check that the path exists, is a regular file and can be read as UTF-8.

        Returns:
            bool: True if the file is usable, False otherwise (the reason is logged)
        """
        if not self.file_path.exists():
            logger.error(f"File '{self.file_path}' not found")
            return False

        if not self.file_path.is_file():
            logger.error(f"'{self.file_path}' is not a file")
            return False

        try:
            with open(self.file_path, 'r', encoding='utf-8') as f:
                f.read(1)  # Try to read first character
            return True
        except (OSError, UnicodeDecodeError) as e:
            # OSError covers PermissionError and any other I/O failure, so the
            # caller gets an empty result instead of an exception.
            logger.error(f"Cannot read file '{self.file_path}': {e}")
            return False

    def _extract_nested_field(self, record: Dict, *keys: str, default: Any = None) -> Any:
        """
        Safely extract a nested field from a dictionary.

        Args:
            record (Dict): The dictionary to extract from
            *keys: Sequence of keys to traverse, outermost first
            default: Value returned if a key is missing or an intermediate
                value is not a dictionary

        Returns:
            The extracted value or default
        """
        current = record
        for key in keys:
            if isinstance(current, dict) and key in current:
                current = current[key]
            else:
                return default
        return current

    def _process_record(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract the fields listed in DEFAULT_COLUMNS from one JSON record.

        Missing 'code', 'ocr_text' and groq_spans fields fall back to an empty
        dict; missing probabilities fall back to None.

        Args:
            record (Dict): JSON record to process

        Returns:
            Dict: Processed record with extracted fields
        """
        return {
            'code': record.get('code', {}),
            'texte_ocr': record.get('ocr_text', {}),
            'breeding_type_related': self._extract_nested_field(
                record, 'groq_spans', 'breeding_type_related', default={}
            ),
            'weight_related': self._extract_nested_field(
                record, 'groq_spans', 'weight_related', default={}
            ),
            'proba_1': self._extract_nested_field(
                record, 'lewagon_prediction', 'proba_1'
            ),
            'proba_2': self._extract_nested_field(
                record, 'lewagon_prediction', 'proba_2'
            ),
            'proba_3': self._extract_nested_field(
                record, 'lewagon_prediction', 'proba_3'
            )
        }

    def _process_line(self, line: str, line_num: int) -> Optional[Dict[str, Any]]:
        """
        Parse one line of the JSONL file.

        Args:
            line (str): Line to process
            line_num (int): Line number for error reporting

        Returns:
            Optional[Dict]: Processed record, or None for a blank line or a
            line that could not be parsed or processed
        """
        stripped_line = line.strip()
        if not stripped_line:
            logger.debug(f"Line {line_num} is empty, skipping")
            return None

        try:
            record = json.loads(stripped_line)
            return self._process_record(record)
        except json.JSONDecodeError as e:
            logger.warning(f"JSON decode error at line {line_num}: {e}")
            logger.debug(f"Problematic line: {stripped_line[:100]}...")
            return None
        except Exception as e:
            # Best effort: e.g. a line holding valid JSON that is not an
            # object. Skip it and keep going.
            logger.warning(f"Unexpected error processing line {line_num}: {e}")
            return None

    def process_file(self) -> pd.DataFrame:
        """
        Process the entire JSONL file and return a DataFrame.

        Returns:
            pd.DataFrame: One row per successfully parsed line, with the
            columns of DEFAULT_COLUMNS. The frame is empty, but still has
            those columns, when the file is unusable or has no valid record.
        """
        if not self._validate_file():
            return pd.DataFrame(columns=self.DEFAULT_COLUMNS)

        self.processed_data = []
        successful_lines = 0
        total_lines = 0

        try:
            with open(self.file_path, 'r', encoding='utf-8') as f:
                logger.info(f"Processing file: {self.file_path}")
                for line_num, line in enumerate(f, 1):
                    total_lines += 1
                    processed_record = self._process_line(line, line_num)
                    if processed_record is not None:
                        self.processed_data.append(processed_record)
                        successful_lines += 1

        except Exception as e:
            logger.error(f"Unexpected error reading file: {e}")
            return pd.DataFrame(columns=self.DEFAULT_COLUMNS)

        logger.info(f"Processing complete. Successfully processed {successful_lines}/{total_lines} lines")
        # Pass the columns explicitly so a file without any valid record still
        # yields the expected schema (a later merge on 'code' relies on it).
        return pd.DataFrame(self.processed_data, columns=self.DEFAULT_COLUMNS)


def create_dataframe_from_jsonl(file_path: str) -> pd.DataFrame:
    """
    Load a JSONL file into a pandas DataFrame through ``JSONLProcessor``.

    Fields taken from each JSON record:
    - code and ocr_text (root level)
    - breeding_type_related and weight_related (under groq_spans)
    - proba_1, proba_2 and proba_3 (under lewagon_prediction)

    Args:
        file_path (str): Path to the .jsonl file

    Returns:
        pd.DataFrame: The extracted data; an empty DataFrame when the file
        cannot be processed
    """
    return JSONLProcessor(file_path).process_file()


# Configuration
# Forward slashes keep this relative path valid on POSIX as well as on
# Windows (on POSIX a backslash is an ordinary filename character, so the
# previous spelling could not resolve there).
JSONL_FILE_PATH = "../neural_category_predictions/data/dfoeufs_with_predictions_with_ground_truth_with_groq.jsonl"

# Process the file
# NOTE: if loading raises, the error is only logged, `code_ocr` stays
# undefined and the merge below fails with a NameError.
try:
    code_ocr = create_dataframe_from_jsonl(JSONL_FILE_PATH)
    
    if not code_ocr.empty:
        print(f"DataFrame created successfully with {len(code_ocr)} rows and {len(code_ocr.columns)} columns")
        display(code_ocr)
    else:
        print("Empty DataFrame created - check file path and content")
        
except Exception as e:
    logger.error(f"Failed to process file: {e}")

# Left join: every product row is kept, with OCR fields attached where the
# barcode matches.
print("Merge avec l'import eggs, renommé eggs")
eggs = eggs_from_parquet.merge(code_ocr, on='code', how = 'left')
eggs

In [None]:
# Flatten the OCR text onto one line: each line break (\n, \r\n or \r)
# becomes ' . ', then the text is lower-cased.
eggs['texte_ocr'] = eggs['texte_ocr'].str.replace(r'\n|\r\n|\r', ' . ', regex=True).str.lower()
# Display the normalized column.
eggs['texte_ocr']

# Analyse OCR

In [None]:
from app.enums.open_food_facts.breeding_type_enums import (
    COUNTRIES_WHERE_CAGES_ARE_FURNISHED,
    get_barn_regex,
    get_cage_regex,
    get_free_range_regex,
    BREEDING_PATTERNS_ALL_LANGUAGES,
    FREE_RANGE_BREEDINGS,
)
from app.enums.open_food_facts.enums import AnimalType, BreedingType, LayingHenBreedingType
from app.schemas.open_food_facts.external import ProductData
from app.schemas.open_food_facts.internal import ProductType
from app.business.open_food_facts.egg_weight_calculator import get_number_of_eggs
from app.business.open_food_facts.breeding_type_calculator import BreedingTypeCalculator
from app.business.open_food_facts.egg_weight_calculator import get_egg_weight_from_quantity


In [None]:
def get_regex(breeding_type) -> str:
    """
    Build a word-bounded alternation regex for one breeding type.

    For "free-range", the patterns of every entry of FREE_RANGE_BREEDINGS
    except "organic" are pooled. For any other value, the patterns stored
    under that key in BREEDING_PATTERNS_ALL_LANGUAGES are used in their own
    order. No exclusion patterns are applied.

    Args:
        breeding_type: "free-range", or a key of BREEDING_PATTERNS_ALL_LANGUAGES.

    Returns:
        str: A pattern of the form ``\\b(?:p1|p2|...)\\b``.
    """
    if breeding_type == "free-range":
        pooled_patterns = set()
        for breeding in FREE_RANGE_BREEDINGS:
            if breeding == "organic":
                continue
            pooled_patterns.update(BREEDING_PATTERNS_ALL_LANGUAGES[breeding])
        # A set has no stable iteration order across interpreter runs; sort so
        # the generated pattern (and therefore which alternative wins a
        # match) is reproducible.
        alternatives = sorted(pooled_patterns)
    else:
        alternatives = BREEDING_PATTERNS_ALL_LANGUAGES[breeding_type]

    return r"\b(?:" + "|".join(alternatives) + r")\b"


def clean(s: str | None) -> str:
    """
    Cleans a string by removing accents, replacing punctuation and digits,
    converting to lowercase, and replacing 'œ' with 'oe' before regex matching.
    Args:     s (str | None): The string to clean.

    Returns:  str: The cleaned string.
    """

    if pd.isna(s):
        return ''
    if not s:
        return ""
    s = s.lower().replace("œ", "oe").replace("\n", " ")
    s = unicodedata.normalize("NFD", s)
    s = "".join(c for c in s if unicodedata.category(c) != "Mn")
    s = re.sub(r"[^\w\s]|\d+", " ", s)
    return s

# Run each breeding-type regex over both OCR sources: the breeding-related
# spans (first column of each pair) and the full OCR text (column suffixed
# "_2"). Both sources are cleaned once and reused for every pattern.
cleaned_spans = eggs['breeding_type_related'].apply(clean)
cleaned_ocr_text = eggs['texte_ocr'].apply(clean)

for breeding_key, spans_column, text_column in (
    ('cage', 'cage_from_OCR', 'cage_from_OCR_2'),
    ('barn', 'barn_from_OCR', 'barn_from_OCR_2'),
    ('free-range', 'free_range_from_OCR', 'free_range_from_OCR_2'),
    ('organic', 'organic_from_OCR', 'organic_from_OCR_2'),
):
    breeding_pattern = get_regex(breeding_key)
    eggs[spans_column] = cleaned_spans.str.findall(breeding_pattern)
    eggs[text_column] = cleaned_ocr_text.str.findall(breeding_pattern)


eggs

In [None]:
# Derive a weight from the OCR weight span; rows where the span is missing get 0.
eggs['weight_from_OCR'] = eggs['weight_related'].apply(lambda x: 0 if pd.isna(x) else  get_egg_weight_from_quantity(x))
# NOTE(review): dividing by 50 presumably converts a weight into a number of
# eggs using an average egg weight of 50 (grams?) — confirm.
eggs['quantity_from_OCR'] = eggs['weight_related'].apply(lambda x: 0 if pd.isna(x) else  get_egg_weight_from_quantity(x)/50)
# Size column is created empty here.
eggs['size_from_OCR'] = ""

eggs


# Proportion d'oeufs identifiés

Nous récupérons les fonctions correspondantes dans le code principal, et définissons quelques fonctions utilitaires de conversion.

In [None]:
import app.business.open_food_facts.pain_report_calculator as prc
from app.schemas.open_food_facts.external import ProductData
from app.business.open_food_facts.egg_weight_calculator import calculate_egg_weight


def is_egg_packb(product_data: ProductData, strict=False) -> bool:
    """
    Tell whether a product's category tags describe a pack of chicken eggs.

    Args:
        product_data: Product whose ``categories_tags`` are inspected.
        strict: When True, the tags must contain both "en:eggs" and
            "en:chicken-eggs". When False, "en:eggs" is enough provided none
            of the excluded tags listed below is present.

    Returns:
        bool: False when the tags are missing or lack "en:eggs"; otherwise
        the outcome of the strict or exclusion check described above.
    """
    tags = product_data.categories_tags
    if tags is None or 'en:eggs' not in tags:
        return False

    if strict:
        return "en:chicken-eggs" in tags

    # Tags that rule a product out of the non-strict check.
    excluded_tags = {
        'en:chocolate-eggs',
        'en:duck-eggs',
        'en:easter-eggs',
        'en:fish-eggs',
        'en:free-range-duck-eggs',
        'en:quail-eggs',
        'en:raw-quail-eggs',
        'en:savoury-eggs',
        'en:scotch-eggs',
        'en:streamed-eggs',
        'en:meals',
        'en:snacks',
        'en:meats-and-their-products',
        'en:breads',
    }
    return excluded_tags.isdisjoint(tags)
        
def clean_value(val):
    """
    Replace a missing scalar with None.

    Lists and dicts (deserialized JSON objects) are returned untouched; any
    other value is returned as-is unless ``pd.isna`` reports it as missing.
    """
    if isinstance(val, (list, dict)):
        return val
    if pd.isna(val):
        return None
    return val


def _first_text(entries):
    """Return the "text" of the first entry, or "" when there is no entry."""
    if not entries:
        return ""
    return entries[0]["text"]


def row2productdata(row):
    """
    Convert one DataFrame row into a validated ``ProductData``.

    Missing scalar values become None (lists and dicts are kept as they
    are). ``product_name`` and ``generic_name`` are reduced to the "text" of
    their first entry, or to "" when they are empty or missing.

    Args:
        row: A row of the eggs DataFrame (anything exposing ``to_dict()``).

    Returns:
        The object built by ``ProductData.model_validate``.
    """
    drow = {key: clean_value(value) for key, value in row.to_dict().items()}

    # A missing name is None after cleaning; treat it like an empty list
    # rather than failing on len(None).
    drow["product_name"] = _first_text(drow["product_name"])
    drow["generic_name"] = _first_text(drow["generic_name"])

    return ProductData.model_validate(drow)

def row2number(row):
    """Return ``calculate_egg_weight`` applied to the ProductData built from ``row``."""
    return calculate_egg_weight(row2productdata(row))


def row2breedingtype(row):
    """
    Return the laying-hen breeding type that PainReportCalculator finds for ``row``.

    Returns:
        The ``value`` of the 'laying_hen' entry of the calculator's breeding
        types, or the string "None" when there is no such entry.
    """
    calculator = prc.PainReportCalculator(row2productdata(row))
    breeding_types = calculator._get_breeding_types()
    if 'laying_hen' not in breeding_types:
        return "None"
    return breeding_types['laying_hen'].value


def testrow(df, nrow):
    """
    Run the row helpers on a single row, for quick inspection.

    Args:
        df: DataFrame to take the row from.
        nrow: Positional index of the row (passed to ``df.iloc``).

    Returns:
        tuple: (result of row2number, result of row2breedingtype, the row).
    """
    row = df.iloc[nrow]
    # Call row2number on the row; returning the bare function object showed
    # nothing about the row.
    return row2number(row), row2breedingtype(row), row


# Try the row helpers on the first row of `eggs`.
testrow(eggs, 0)


In [None]:
# Compute, row by row, the egg weight and the laying-hen breeding type.
# Each apply rebuilds a ProductData for every row.
eggs['w_eggs'] = eggs.apply(row2number, axis=1)
eggs['breeding'] = eggs.apply(row2breedingtype, axis=1)
# Cast product_quantity to float.
eggs["product_quantity"]=eggs["product_quantity"].astype(float)


In [None]:
# Count products per computed breeding type, missing values included.
eggs.breeding.value_counts(dropna=False)

In [None]:
# `x in series` tests the Series index labels, not its values, so look inside
# each row's tags instead (missing tags never match).
eggs["countries_tags"].fillna("").apply(lambda tags: "en:france" in tags).any()

In [None]:
# For each distinct w_eggs value: up to 10 sample product codes, the first 10
# w_eggs values of the group, and the group size.
eggs.groupby('w_eggs').agg( sample=('code', lambda x: x.head(10).tolist()), w_eggs=('w_eggs', lambda x: x.head(10).tolist()),  total_count=('w_eggs', 'size') )

In [None]:
# Collapse the breeding label: barn and cage systems become "computed", the
# "None" placeholder becomes "Aucun", any other label is kept as it is.
eggs["has_breeding_type"] = eggs["breeding"].apply(
    lambda x: "computed" if x in ("barn", "furnished_cage", "conventional_cage")
    else "Aucun" if x == "None"
    else x
)
# Parentheses are required: `&` binds tighter than `>`, so the unparenthesized
# form compared w_eggs against `0 & ~isna()` instead of combining two masks.
eggs["has_egg_weight"] = (eggs["w_eggs"] > 0) & eggs["w_eggs"].notna()
eggs["has_egg_weight_s"] = eggs["has_egg_weight"].apply(lambda x: "has weight" if x else "no weight")
# A product counts as French when its country tags contain "en:france".
eggs["french"]=eggs["countries_tags"].fillna("").apply(lambda x:  len(x)>0 and "en:france" in x)
eggs["french_s"]=eggs["french"].apply(lambda x: "français" if x else "pas français")
# Cross-tabulate breeding-type status against weight availability (counts).
eggs[["has_breeding_type", "has_egg_weight"]].value_counts().to_frame().unstack().fillna(0).astype(int).style.background_gradient(axis=None)

In [None]:
# Share of products per (breeding, has_egg_weight) pair, formatted as
# percentages with a colour gradient over the whole table.
eggs[["breeding", "has_egg_weight"]].value_counts(normalize=True).to_frame().unstack().fillna(0).style.format('{:.1%}').background_gradient(axis=None)

In [None]:
# Keep the rows flagged as French, then export both the French subset and the
# full table to CSV (without the index).
eggs_fr = eggs[eggs["french"]]
eggs_fr.to_csv("../data/eggs_is_suffering_computed_fr.csv", index=False)
eggs.to_csv("../data/eggs_is_suffering_computed.csv", index=False)
eggs_fr


In [None]:
# Sunburst of all products, nested by French flag, weight availability and
# breeding-type status.
fig = px.sunburst(
    eggs,
    path=[px.Constant("all"), 'french_s', 'has_egg_weight_s', 'has_breeding_type']
)

# Label each sector with its name and its count.
fig.update_traces(texttemplate="%{label} : %{value}")

# Enlarge the figure
fig.update_layout(
    title = "All eggs : is french, has weight, has breeding type - World",
    width=600,   # Width in pixels
    height=600,   # Height in pixels
    margin=dict(t=40, l=10, r=10, b=10)  # Shrink the margins to maximize the usable area
)


fig.show()

In [None]:

# Sunburst of the French subset, nested by weight availability and
# breeding-type status.
fig = px.sunburst(
    eggs_fr,
    path=[px.Constant("all"), 'has_egg_weight_s', 'has_breeding_type']
)

# Label each sector with its name, its share of the root and its count.
fig.update_traces(
    texttemplate="%{label}<br>%{percentRoot:.1%}<br>%{value}",
    textfont=dict(size=12),
    insidetextorientation='horizontal'

)

fig.update_layout(
    title = "French eggs : has weight, has breeding type",
    width=500,
    height=500,
    margin=dict(t=40, l=10, r=10, b=10)
)

fig.show()


In [None]:
# Sunburst of all products, nested by weight availability and breeding-type
# status.
fig = px.sunburst(
    eggs,
    path=[px.Constant("all"), 'has_egg_weight_s', 'has_breeding_type']
)

# Label each sector with its name, its share of the root and its count.
fig.update_traces(
    texttemplate="%{label}<br>%{percentRoot:.1%}<br>%{value}",
    textfont=dict(size=12),
    insidetextorientation='horizontal'

)

fig.update_layout(
    title = "All eggs (World) : has weight, has breeding type",
    width=500,
    height=500,
    margin=dict(t=40, l=10, r=10, b=10)
)

fig.show()
