# Extraction des données financières des rapports 10-K

In [14]:
import json
import boto3
from pathlib import Path
from datetime import datetime
import pandas as pd
import time
import re
from typing import Dict, List, Optional
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
import sys

sys.path.append(str(Path.cwd().parent))
from config import FILLINGS_DIR, PROJECT_DIR, PROCESSED_DIR, FILLINGS_DIR, AWS_REGION, MODEL_ID

# Visualisation
import plotly.express as px
import plotly.graph_objects as go

# Configuration
import warnings
warnings.filterwarnings('ignore')

In [15]:
# AWS Bedrock runtime client used for every model invocation in this notebook.
bedrock_client = boto3.client('bedrock-runtime', region_name=AWS_REGION)

# The extraction model (MODEL_ID) and the region (AWS_REGION) both come from config.
print("Client Bedrock configur√©")
print(f"   Mod√®le: {MODEL_ID}")
# Report the region actually passed to the client instead of a hard-coded value,
# so the log stays truthful when the configuration changes.
print(f"   R√©gion: {AWS_REGION}")

Client Bedrock configur√©
   Mod√®le: anthropic.claude-3-5-sonnet-20241022-v2:0
   R√©gion: us-west-2


In [16]:
# Expected JSON layout for the data extracted from a 10-K filing.
# Leaf values are type descriptions ("string", "integer", "number or null"),
# not data. Every nested group shares the same "number or null" leaf type, so
# each group is built from its ordered list of field names.
COMPANY_10K_SCHEMA = {
    # Identification
    "company_name": "string",
    "ticker": "string",
    "fiscal_year": "integer",
    "fiscal_period": "string",

    # Classification
    "business_segment": "string",
    "primary_industry": "string",

    # Financial groups (key order is preserved by dict.fromkeys)
    "revenue": dict.fromkeys(
        ("products", "services", "total"),
        "number or null",
    ),
    "revenue_by_geography": dict.fromkeys(
        ("americas", "europe", "greater_china", "japan", "asia_pacific", "other"),
        "number or null",
    ),
    "purchase_obligations": dict.fromkeys(
        ("manufacturing", "other", "total"),
        "number or null",
    ),
    "tax_info": dict.fromkeys(
        ("provision_for_income_taxes", "income_before_taxes", "effective_tax_rate"),
        "number or null",
    ),
    "other_metrics": dict.fromkeys(
        (
            "total_assets",
            "total_liabilities",
            "shareholders_equity",
            "net_income",
            "operating_income",
        ),
        "number or null",
    ),
}

# Show the schema so the reader can check it against the prompt template.
print("Sch√©ma JSON Company10K d√©fini")
print("\n Structure attendue:")
print(json.dumps(COMPANY_10K_SCHEMA, indent=2))

Sch√©ma JSON Company10K d√©fini

 Structure attendue:
{
  "company_name": "string",
  "ticker": "string",
  "fiscal_year": "integer",
  "fiscal_period": "string",
  "business_segment": "string",
  "primary_industry": "string",
  "revenue": {
    "products": "number or null",
    "services": "number or null",
    "total": "number or null"
  },
  "revenue_by_geography": {
    "americas": "number or null",
    "europe": "number or null",
    "greater_china": "number or null",
    "japan": "number or null",
    "asia_pacific": "number or null",
    "other": "number or null"
  },
  "purchase_obligations": {
    "manufacturing": "number or null",
    "other": "number or null",
    "total": "number or null"
  },
  "tax_info": {
    "provision_for_income_taxes": "number or null",
    "income_before_taxes": "number or null",
    "effective_tax_rate": "number or null"
  },
  "other_metrics": {
    "total_assets": "number or null",
    "total_liabilities": "number or null",
    "shareholders_equi

In [17]:
# Scan every 10-K report under FILLINGS_DIR.
# rglob() yields entries in an OS-dependent order; sorting makes the file list
# (and everything derived from it) reproducible from one run to the next.
all_10k_files = sorted(FILLINGS_DIR.rglob('*-10k-*.html'))

print(f"\n SCAN DES RAPPORTS 10-K")
print("="*60)
print(f"Fichiers trouv√©s: {len(all_10k_files)}")

# Index the files by ticker (the ticker is the name of the parent directory).
files_by_ticker = {}
for file_path in all_10k_files:
    ticker = file_path.parent.name
    if ticker == '.ipynb_checkpoints':  # Skip Jupyter system files
        continue
    # A ticker can have several filings (more files than tickers are found).
    # Keep the one with the greatest file name instead of whichever happens to
    # be visited last.
    # NOTE(review): this assumes file names start with an ISO date
    # (e.g. "2024-12-20-10k-A.html"), so the greatest name is the most recent
    # filing - confirm against the download naming scheme.
    current = files_by_ticker.get(ticker)
    if current is None or file_path.name > current.name:
        files_by_ticker[ticker] = file_path

print(f"Tickers uniques: {len(files_by_ticker)}")
print(f"\n Premiers tickers: {', '.join(list(files_by_ticker.keys())[:10])}")


 SCAN DES RAPPORTS 10-K
Fichiers trouv√©s: 504
Tickers uniques: 500

 Premiers tickers: A, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADBE, ADI, ADM


In [18]:
def extract_text_from_10k(file_path: Path, max_chars: Optional[int] = 500000) -> tuple[str, dict]:
    """
    Extract the plain text of a 10-K HTML report.

    Args:
        file_path: Path to the HTML file.
        max_chars: Maximum number of characters kept (guards against exceeding
            the model's token budget). ``None`` disables truncation.

    Returns:
        Tuple ``(text, metadata)``. ``metadata`` always contains the keys
        ``file_path``, ``file_size_kb``, ``text_length``, ``truncated`` and
        ``extraction_success``; on failure ``text`` is ``""``,
        ``file_size_kb`` is ``None`` and an ``error`` key holds the exception
        message.
    """
    # Start from a complete failure record so callers can read every key
    # without first checking whether the extraction succeeded.
    metadata = {
        'file_path': str(file_path),
        'file_size_kb': None,
        'text_length': 0,
        'truncated': False,
        'extraction_success': False
    }

    try:
        # Read the HTML file; undecodable bytes are dropped rather than raising.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            html_content = f.read()

        # Parse with BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Remove scripts and styles so only document text remains.
        # NOTE(review): the sample output of this notebook shows the extracted
        # text starting with inline-XBRL metadata (taxonomy URLs); consider
        # also dropping that header so it does not consume the max_chars budget.
        for script in soup(["script", "style"]):
            script.decompose()

        text = soup.get_text(separator='\n', strip=True)

        # Clean up: trim each line and drop the empty ones.
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        text = '\n'.join(lines)

        # Cap the size when a limit is set.
        truncated = max_chars is not None and len(text) > max_chars
        if truncated:
            text = text[:max_chars]

        metadata.update({
            'file_size_kb': file_path.stat().st_size / 1024,
            'text_length': len(text),
            'truncated': truncated,
            'extraction_success': True
        })
        return text, metadata

    except Exception as e:
        # Best-effort by design: report the failure in the metadata instead of
        # raising, so a batch run can continue with the remaining files.
        metadata['error'] = str(e)
        return "", metadata

# Smoke test of the text extraction on a single file
if all_10k_files:
    test_file = all_10k_files[0]
    test_text, test_meta = extract_text_from_10k(test_file)
    print(f"\n TEST D'EXTRACTION")
    print("="*60)
    print(f"Fichier: {test_file.name}")
    # The size/length/truncation details are only meaningful when the
    # extraction worked; on failure show the recorded error instead of
    # indexing keys that may be missing or empty.
    if test_meta['extraction_success']:
        print(f"Taille fichier: {test_meta['file_size_kb']:.1f} KB")
        print(f"Longueur texte: {test_meta['text_length']:,} caract√®res")
        print(f"Tronqu√©: {test_meta['truncated']}")
        print(f"\n Premiers 500 caract√®res:")
        print(test_text[:500])
        print("...")
    else:
        print(f"Erreur d'extraction: {test_meta.get('error', 'Unknown error')}")


 TEST D'EXTRACTION
Fichier: 2024-12-20-10k-A.html
Taille fichier: 3637.9 KB
Longueur texte: 500,000 caract√®res
Tronqu√©: True

 Premiers 500 caract√®res:
a-20241031
0001090872
false
2024
FY
http://fasb.org/us-gaap/2024#CostOfGoodsAndServicesSold
http://fasb.org/us-gaap/2024#CostOfGoodsAndServicesSold
http://fasb.org/us-gaap/2024#ResearchAndDevelopmentExpense
http://fasb.org/us-gaap/2024#ResearchAndDevelopmentExpense
http://fasb.org/us-gaap/2024#SellingGeneralAndAdministrativeExpense
http://fasb.org/us-gaap/2024#SellingGeneralAndAdministrativeExpense
http://fasb.org/us-gaap/2024#OtherAssetsNoncurrent
http://fasb.org/us-gaap/2024#OtherAssetsNoncurr
...


In [30]:
# System prompt sent with every Bedrock request (passed as the "system" field
# by call_bedrock_for_10k). It sets the analyst role, requires amounts to be
# converted to full USD integers, and tells the model to output null for values
# missing from the filing.
# NOTE(review): the "Note 3" wording for Products/Services revenue looks tied
# to one issuer's filing layout - confirm it suits every ticker processed.
SYSTEM_PROMPT = """You are an expert financial analyst. 
You will extract structured information from a company's annual Form 10-K filing. 
Output must strictly match the provided JSON schema (Company10K) and use only the official numbers from the filing. 
Do NOT combine lines unless the 10-K explicitly labels them as part of the same category.

CRITICAL - NUMBER FORMATTING:
- 10-K reports often show numbers in millions or thousands
- ALWAYS convert to actual USD amounts (no abbreviations)
- If you see "Revenue: 391,035 (in millions)" ‚Üí output 391035000000
- If you see "$41.95 billion" ‚Üí output 41950000000
- If you see "11,102 thousand" ‚Üí output 11102000
- Look for the unit indicator (usually at top of table: "in millions", "in thousands")
- Output as integer, no decimals, no commas

For Products revenue: use only the line labeled "Products" in Note 3 ‚Äì Revenue by Product and Services.
For Services revenue: use only the line labeled "Services" in Note 3 ‚Äì Revenue by Product and Services.
For region shares: use the table or disclosure that shows revenue by geography.
For Purchase obligations: separate manufacturing and other obligations exactly as disclosed.
Effective tax rate: compute as Provision for Income Taxes / Income Before Taxes if not explicitly given.
If a value is missing in the 10-K, output null for that field.
Ignore footnotes and subtotals unless explicitly part of the category.
"""


def create_10k_extraction_prompt(file_content: str, ticker: str) -> str:
    """
    Build the user message for a 10-K extraction request.

    The message is the filing text followed by the extraction instructions,
    unit-conversion examples and a JSON template whose "ticker" field is
    pre-filled with ``ticker``.

    Args:
        file_content: Text of the 10-K report.
        ticker: Ticker symbol (e.g. AAPL).

    Returns:
        The formatted prompt string.
    """
    # Doubled braces are literal braces of the JSON template; only
    # {file_content} and {ticker} are interpolated.
    # NOTE(review): the template pre-fills "fiscal_year": 2024 and
    # "fiscal_period": "FY" - confirm these are meant as examples only.
    return f"""{file_content}
    
    Return the result strictly in JSON format matching this schema.

    CRITICAL UNIT CONVERSION EXAMPLES:
    Example 1: If table says "Revenue: 391,035" with header "(in millions)"
    ‚Üí Output: "total": 391035000000
    
    Example 2: If text says "$6.51 billion in revenue"  
    ‚Üí Output: "total": 6510000000
    
    Example 3: If table shows "11,102" with note "amounts in thousands"
    ‚Üí Output: "total": 11102000
    
    ALWAYS look for unit indicators like "in millions", "in thousands", "in billions" at the top of financial tables!
    
    {{
      "company_name": "Full company name",
      "ticker": "{ticker}",
      "fiscal_year": 2024,
      "fiscal_period": "FY",
      
      "business_segment": "Primary business segment",
      "primary_industry": "Main industry",
      
      "revenue": {{
        "products": null,
        "services": null,
        "total": null
      }},
      
      "revenue_by_geography": {{
        "americas": null,
        "europe": null,
        "greater_china": null,
        "japan": null,
        "asia_pacific": null,
        "other": null
      }},
      
      "purchase_obligations": {{
        "manufacturing": null,
        "other": null,
        "total": null
      }},
      
      "tax_info": {{
        "provision_for_income_taxes": null,
        "income_before_taxes": null,
        "effective_tax_rate": null
      }},
      
      "other_metrics": {{
        "total_assets": null,
        "total_liabilities": null,
        "shareholders_equity": null,
        "net_income": null,
        "operating_income": null
      }}
    }}
    
    REMEMBER: 
    - Revenue in millions? Multiply by 1,000,000
    - Revenue in thousands? Multiply by 1,000
    - Revenue in billions? Multiply by 1,000,000,000
    - Output as integer with NO commas, NO decimals
    - Output ONLY the JSON, no markdown, no extra text.
    """


In [27]:
def _parse_json_object(response_text: str) -> dict:
    """
    Parse the JSON object contained in a model reply.

    Markdown code fences around the reply are removed first. If the remaining
    text is not valid JSON (for example the model added prose around the
    object), the span from the first ``{`` to the last ``}`` is tried instead.

    Raises:
        json.JSONDecodeError: when no JSON can be parsed, or when the parsed
            value is not a JSON object.
    """
    cleaned = response_text.strip()
    if cleaned.startswith('```json'):
        cleaned = cleaned[7:]
    if cleaned.startswith('```'):
        cleaned = cleaned[3:]
    if cleaned.endswith('```'):
        cleaned = cleaned[:-3]
    cleaned = cleaned.strip()

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        # Fallback: tolerate text before/after the JSON object.
        start, end = cleaned.find('{'), cleaned.rfind('}')
        if start == -1 or end <= start:
            raise
        parsed = json.loads(cleaned[start:end + 1])

    # Callers treat the result as a dict; a list or scalar is an extraction
    # failure, not a success.
    if not isinstance(parsed, dict):
        raise json.JSONDecodeError('expected a JSON object', cleaned, 0)
    return parsed


def call_bedrock_for_10k(file_content: str, ticker: str, max_tokens: int = 4096) -> tuple[dict, dict]:
    """
    Call AWS Bedrock to extract structured data from a 10-K report.

    Args:
        file_content: Text of the 10-K report.
        ticker: Ticker symbol.
        max_tokens: Maximum number of response tokens.

    Returns:
        Tuple ``(extracted_data, metadata)``. On success ``metadata`` holds
        ``extraction_success=True``, the call duration, token usage, the
        ``stop_reason`` field of the response and the model id. On failure
        ``extracted_data`` is ``{}`` and ``metadata`` holds
        ``extraction_success=False``, an ``error`` message and the duration
        (plus ``raw_response`` and ``stop_reason`` for JSON parsing errors).
        No exception is propagated to the caller.
    """
    start_time = time.time()
    # Initialised before the try block so the error handlers can report them
    # whichever step failed.
    response_text = None
    stop_reason = None

    try:
        # Build the prompt
        user_prompt = create_10k_extraction_prompt(file_content, ticker)

        # Bedrock request body
        request_body = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": max_tokens,
            "temperature": 0.0,  # Deterministic output for extraction
            "system": SYSTEM_PROMPT,
            "messages": [
                {
                    "role": "user",
                    "content": user_prompt
                }
            ]
        }

        # Call Bedrock
        response = bedrock_client.invoke_model(
            modelId=MODEL_ID,
            body=json.dumps(request_body)
        )

        # Decode the response envelope
        response_body = json.loads(response['body'].read())
        # Kept for diagnostics: lets a reply cut off by max_tokens be told
        # apart from a reply that is simply malformed.
        stop_reason = response_body.get('stop_reason')
        response_text = response_body['content'][0]['text']

        # Parse the JSON payload produced by the model
        extracted_data = _parse_json_object(response_text)

        usage = response_body.get('usage', {})
        metadata = {
            'extraction_success': True,
            'duration_seconds': time.time() - start_time,
            'input_tokens': usage.get('input_tokens', 0),
            'output_tokens': usage.get('output_tokens', 0),
            'stop_reason': stop_reason,
            'model_id': MODEL_ID
        }

        return extracted_data, metadata

    except json.JSONDecodeError as e:
        metadata = {
            'extraction_success': False,
            'error': f'JSON parsing error: {str(e)}',
            # First 500 characters of the model reply, when one was received.
            'raw_response': response_text[:500] if response_text is not None else 'N/A',
            'stop_reason': stop_reason,
            'duration_seconds': time.time() - start_time
        }
        return {}, metadata

    except Exception as e:
        # Best-effort by design: any other failure (API error, unexpected
        # response shape, ...) is reported in the metadata.
        metadata = {
            'extraction_success': False,
            'error': str(e),
            'duration_seconds': time.time() - start_time
        }
        return {}, metadata


In [28]:
def process_single_10k(ticker: str, file_path: Path) -> dict:
    """
    Run the full pipeline (text extraction, then Bedrock extraction) on one 10-K.

    Args:
        ticker: Ticker symbol.
        file_path: Path to the 10-K file.

    Returns:
        A flat dictionary holding the ticker, file path and timestamp, merged
        with the metadata of each stage that ran, an ``overall_success`` flag
        and, on success, the extracted values under ``data``.
    """
    record = {
        'ticker': ticker,
        'file_path': str(file_path),
        'timestamp': datetime.now().isoformat()
    }

    # Stage 1: HTML -> plain text. Stop here if the file could not be read.
    raw_text, text_info = extract_text_from_10k(file_path)
    record.update(text_info)
    if not text_info['extraction_success']:
        record['overall_success'] = False
        return record

    # Stage 2: plain text -> structured data through Bedrock.
    payload, llm_info = call_bedrock_for_10k(raw_text, ticker)
    record.update(llm_info)
    if not llm_info['extraction_success']:
        record['overall_success'] = False
        return record

    record['data'] = payload
    record['overall_success'] = True
    return record


In [31]:
# Try the pipeline on a small sample (3-5 files)
SAMPLE_SIZE = 5
sample_tickers = list(files_by_ticker.keys())[:SAMPLE_SIZE]
# Fewer than SAMPLE_SIZE tickers may be available; all counts and rates below
# use the number of files actually processed.
n_samples = len(sample_tickers)

print(f" TEST SUR √âCHANTILLON DE {n_samples} FICHIERS")
print("="*60)
print(f"Tickers: {', '.join(sample_tickers)}\n")

sample_results = []

for ticker in sample_tickers:
    print(f"\nüìä Traitement de {ticker}...")
    file_path = files_by_ticker[ticker]

    result = process_single_10k(ticker, file_path)
    sample_results.append(result)

    if result['overall_success']:
        print(f"   ‚úÖ Succ√®s ({result['duration_seconds']:.1f}s)")
        # Show a few of the extracted values
        data = result['data']
        print(f"      Entreprise: {data.get('company_name', 'N/A')}")
        print(f"      Ann√©e fiscale: {data.get('fiscal_year', 'N/A')}")
        # The model may return null (or a non-object) for "revenue", and a
        # non-numeric total cannot be formatted with ",.0f" - guard both.
        revenue = data.get('revenue')
        total_revenue = revenue.get('total') if isinstance(revenue, dict) else None
        if isinstance(total_revenue, (int, float)):
            print(f"      Revenue total: ${total_revenue:,.0f}")
    else:
        print(f"   ‚ùå √âchec: {result.get('error', 'Unknown error')}")

# Sample statistics
success_count = sum(1 for r in sample_results if r['overall_success'])
print(f"\n\nüìä R√âSULTATS DE L'√âCHANTILLON")
print("="*60)
print(f"Succ√®s: {success_count}/{n_samples}")
# Skip the rate when nothing was processed (avoids a division by zero).
if n_samples:
    print(f"Taux de succ√®s: {success_count/n_samples*100:.1f}%")

 TEST SUR √âCHANTILLON DE 5 FICHIERS
Tickers: A, AAPL, ABBV, ABNB, ABT


üìä Traitement de A...
   ‚úÖ Succ√®s (16.2s)
      Entreprise: Agilent Technologies, Inc.
      Ann√©e fiscale: 2024
      Revenue total: $6,510,000,000

üìä Traitement de AAPL...
   ‚úÖ Succ√®s (9.0s)
      Entreprise: Apple Inc.
      Ann√©e fiscale: 2024
      Revenue total: $391,035,000,000

üìä Traitement de ABBV...
   ‚úÖ Succ√®s (17.9s)
      Entreprise: AbbVie Inc.
      Ann√©e fiscale: 2024
      Revenue total: $56,334,000,000

üìä Traitement de ABNB...
   ‚úÖ Succ√®s (14.0s)
      Entreprise: Airbnb, Inc.
      Ann√©e fiscale: 2024
      Revenue total: $11,102,000,000

üìä Traitement de ABT...
   ‚úÖ Succ√®s (13.7s)
      Entreprise: Abbott Laboratories
      Ann√©e fiscale: 2024
      Revenue total: $41,950,000,000


üìä R√âSULTATS DE L'√âCHANTILLON
Succ√®s: 5/5
Taux de succ√®s: 100.0%


In [None]:
# Inspection cell: display the model identifier imported from config.
MODEL_ID