In [None]:
import re
import json
import pandas as pd

def extract_parameters(text):
    """Extract training parameters from the text of an article.

    Three pieces of information are searched for; in each case the first
    occurrence in ``text`` wins:

    * the compute budget, a number immediately followed by ``PF-days``.
      Plain decimals ("3640"), e-notation ("3.1e8") and power-of-ten
      notation ("3.1 x 10^8", "10^8", with "x", "*" or the multiplication
      sign) are understood;
    * the model size, e.g. "1.5 billion parameters";
    * the dataset size, e.g. "23 billion tokens".

    Args:
        text: The article text to scan.

    Returns:
        A dict with the keys ``compute_pf_days`` (float or None),
        ``model_size`` (matched phrase as str, or None) and
        ``dataset_size`` (matched phrase as str, or None). The phrases keep
        their magnitude word so that "23 million" and "23 billion" stay
        distinguishable.
    """
    # A decimal number. The optional fraction is a real group (not a
    # character class), so inputs such as "1.2.3" can no longer produce a
    # string that float() rejects.
    number = r'\d+(?:\.\d+)?'
    magnitude = r'(?:thousand|million|billion|trillion)'

    # The power-of-ten form is tried first; otherwise "3.1 x 10^8 PF-days"
    # would match only on the trailing "8".
    pf_days_match = re.search(
        r'(?:'
        r'(?:(?P<mantissa>' + number + r')\s*[\u00d7xX*]\s*)?'
        r'10\s*\^\s*(?P<exponent>[+-]?\d+)'
        r'|'
        r'(?P<plain>' + number + r'(?:[eE][+-]?\d+)?)'
        r')\s*PF-days',
        text,
    )
    model_size_match = re.search(
        number + r'\s*' + magnitude + r'\s*parameters', text, re.IGNORECASE
    )
    dataset_size_match = re.search(
        number + r'\s*' + magnitude + r'\s*tokens', text, re.IGNORECASE
    )

    compute_pf_days = None
    if pf_days_match:
        if pf_days_match.group('exponent') is not None:
            # A bare "10^8" has no mantissa and means 1 x 10^8. Going through
            # float("<m>e<exp>") gives a correctly rounded result.
            mantissa = pf_days_match.group('mantissa') or '1'
            compute_pf_days = float(
                mantissa + 'e' + pf_days_match.group('exponent')
            )
        else:
            compute_pf_days = float(pf_days_match.group('plain'))

    return {
        "compute_pf_days": compute_pf_days,
        "model_size": model_size_match.group(0) if model_size_match else None,
        "dataset_size": dataset_size_match.group(0) if dataset_size_match else None,
    }

def calculate_energy_consumption(compute_pf_days, kwh_per_pf_day=432):
    """Convert a compute budget in PF-days to energy in kWh.

    Args:
        compute_pf_days: Compute budget in PF-days, or None when unknown.
        kwh_per_pf_day: Conversion factor applied to each PF-day. The
            default of 432 is the value this code has always used
            (1 PF-day ~ 432 kWh); NOTE(review): the figure depends on
            hardware efficiency -- confirm it for your setup.

    Returns:
        ``compute_pf_days * kwh_per_pf_day``, or None when
        ``compute_pf_days`` is None.
    """
    # Compare against None explicitly: a budget of 0 is a valid input and
    # must yield 0 kWh rather than being reported as "unknown".
    if compute_pf_days is None:
        return None
    return compute_pf_days * kwh_per_pf_day

def calculate_carbon_footprint(energy_kwh, emission_factor=450):
    """Compute the carbon footprint in kgCO2 for a given energy use.

    Args:
        energy_kwh: Energy consumed in kWh, or None when unknown.
        emission_factor: Emissions per kWh. It is divided by 1000 before
            being applied, so it is presumably expressed in gCO2/kWh
            (the result is in kg) -- TODO confirm the unit and the default
            of 450 against the intended electricity mix.

    Returns:
        ``energy_kwh * (emission_factor / 1000)``, or None when
        ``energy_kwh`` is None.
    """
    # Compare against None explicitly: 0 kWh is a valid input and must
    # yield a footprint of 0 rather than being reported as "unknown".
    if energy_kwh is None:
        return None
    return energy_kwh * (emission_factor / 1000)

def main(article_text):
    """Run the whole pipeline on one article.

    The text is passed to ``extract_parameters``; the extracted PF-days
    value is converted to kWh with ``calculate_energy_consumption`` and
    that energy figure to kgCO2 with ``calculate_carbon_footprint``.

    Args:
        article_text: The article text to analyse.

    Returns:
        A dict with the keys "Compute PF-days", "Energy Consumption (kWh)",
        "Carbon Footprint (kgCO2)", "Model Size" and "Dataset Size".
    """
    params = extract_parameters(article_text)
    pf_days = params['compute_pf_days']
    kwh = calculate_energy_consumption(pf_days)
    kg_co2 = calculate_carbon_footprint(kwh)

    return {
        "Compute PF-days": pf_days,
        "Energy Consumption (kWh)": kwh,
        "Carbon Footprint (kgCO2)": kg_co2,
        "Model Size": params['model_size'],
        "Dataset Size": params['dataset_size'],
    }

# Example usage with a made-up article excerpt.
# The excerpt mentions a parameter count, a token count and a PF-days figure
# written in power-of-ten notation ("3.1 × 10^8").
article_text = """
This study used a neural network with 1.5 billion parameters, trained on 23 billion tokens,
with a compute budget of approximately 3.1 × 10^8 PF-days.
"""

# Run the pipeline on the excerpt and pretty-print the resulting dict as JSON.
result = main(article_text)
print(json.dumps(result, indent=4))
