# OSE Dataset Extraction and Visualization - Agroalimentaire Sector

This notebook combines data extraction and visualization for companies in the **agroalimentaire** sector.

**Workflow:**
1. Load pre-processed `agroalimentaire.json` from `data/sector_slices/` (from sector_split notebook)
2. Convert to JSONL format for pipeline compatibility
3. Extract all 9 datasets using the fast pipeline
4. Export datasets as CSV files to `data/extracted_datasets/`
5. Load and visualize all datasets

**Key advantage:** Uses pre-processed sector data, so you don't need to run the sector_split notebook anymore!

**Input:** 
- Pre-processed `agroalimentaire.json` from `data/sector_slices/`
- Article and project files from `data/new_data/`

**Output:** 
- 9 CSV files in `data/extracted_datasets/`
- Visualizations for each dataset


In [18]:
# Configuration
import sys
from pathlib import Path
import os
import json
import tempfile
from typing import Dict, List, Any, Optional

# Add project root to path so we can import from src
cwd = Path(os.getcwd())

# Check if we're in the project root (has src/ and Notebooks/ or notebooks/ directories)
if (cwd / 'src').exists() and ((cwd / 'Notebooks').exists() or (cwd / 'notebooks').exists()):
    project_root = cwd
# Check if we're in Notebooks/ or notebooks/ directory
elif (cwd.parent / 'src').exists() and ((cwd.parent / 'Notebooks').exists() or (cwd.parent / 'notebooks').exists()):
    project_root = cwd.parent
# Fallback: try relative path from Notebooks/
else:
    project_root = Path('..').resolve()

sys.path.insert(0, str(project_root))
print(f"Project root: {project_root}")
print(f"src exists: {(project_root / 'src').exists()}")

# Configuration - Data Sources (all under src/ose_core/data)
SECTOR_SLICE_FILE = Path('../src/ose_core/data/sector_slices/agroalimentaire.json')
NEW_DATA_DIR = Path('../src/ose_core/data/new_data')
COMPANY_FALLBACK_PATH = NEW_DATA_DIR / 'app_company.json'
company_path = str(COMPANY_FALLBACK_PATH)
FILE_PATHS = {
    'company': company_path,
    'article': NEW_DATA_DIR / 'app_article-003.json',
    'project': NEW_DATA_DIR / 'app_project.json'
}
# Store outputs inside the package data directory used by config
OUTPUT_DIR = Path('../src/ose_core/data/extracted_datasets')
CHUNK_SIZE = 10000  # Number of lines to process per chunk

# Create output directory if it doesn't exist
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import extraction pipeline
from src.ose_core.pipelines.extract_pipeline_v3_fast import make_extract_pipeline_v3_fast

# Set random seeds for reproducibility
SEED = 42
np.random.seed(SEED)

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print("Configuration loaded successfully!")
print(f"Sector slice file: {SECTOR_SLICE_FILE}")
print(f"New data directory: {NEW_DATA_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Chunk size: {CHUNK_SIZE}")

# Verify files exist
print("\nVerifying input files:")
print(f"  {'✓' if SECTOR_SLICE_FILE.exists() else '✗'} sector_slice: {SECTOR_SLICE_FILE}")
for file_type, file_path in FILE_PATHS.items():
    # Convert to Path object if it's a string
    path_obj = Path(file_path) if isinstance(file_path, str) else file_path
    exists = path_obj.exists()
    status = "✓" if exists else "✗"
    print(f"  {status} {file_type}: {file_path}")


Project root: /Users/jlb/Documents/Python Course/git_OSE/ose-main
src exists: True
Configuration loaded successfully!
Sector slice file: ../src/ose_core/data/sector_slices/agroalimentaire.json
New data directory: ../src/ose_core/data/new_data
Output directory: ../src/ose_core/data/extracted_datasets
Chunk size: 10000

Verifying input files:
  ✓ sector_slice: ../src/ose_core/data/sector_slices/agroalimentaire.json
  ✓ company: ../src/ose_core/data/new_data/app_company.json
  ✓ article: ../src/ose_core/data/new_data/app_article-003.json
  ✓ project: ../src/ose_core/data/new_data/app_project.json


## Part 1: Load Pre-processed Sector Data

Load the pre-processed `agroalimentaire.json` file from sector_split and convert it to JSONL format for the extraction pipeline.


In [19]:
# Load pre-processed agroalimentaire.json and convert to JSONL format
# Note: The sector slice may contain article data, so we'll check and use the actual company file
print(f"Loading pre-processed sector data from {SECTOR_SLICE_FILE}...")

temp_company_path = None
use_sector_slice = False

if SECTOR_SLICE_FILE.exists():
    try:
        # Load JSON array from sector slice file
        with open(SECTOR_SLICE_FILE, 'r', encoding='utf-8') as f:
            sector_data = json.load(f)

        print(f"✓ Loaded {len(sector_data):,} records from sector slice")
        
        # Check if the sector slice contains company data (has socialName field)
        # If it contains article data (has title field), we'll use the company file instead
        if sector_data and isinstance(sector_data[0], dict):
            first_record = sector_data[0]
            has_company_fields = 'socialName' in first_record or 'siren' in first_record
            has_article_fields = 'title' in first_record or 'contentClean' in first_record
            
            if has_company_fields and not has_article_fields:
                # Sector slice contains company data - use it
                use_sector_slice = True
                print("  ✓ Sector slice contains company data - will use for extraction")
                
                # Convert to JSONL format (one JSON object per line)
                # Wrap in Elasticsearch format if needed
                print("Converting to JSONL format...")
                with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False, encoding='utf-8') as tmp_file:
                    for record in sector_data:
                        # Ensure record is in Elasticsearch format with _source
                        if '_source' not in record:
                            es_record = {
                                '_source': record,
                                '_id': str(record.get('id', '')),
                                '_index': 'company'
                            }
                        else:
                            es_record = record
                        json.dump(es_record, tmp_file, ensure_ascii=False)
                        tmp_file.write('\n')
                    temp_company_path = tmp_file.name

                print(f"✓ Temporary JSONL file created: {temp_company_path}")
                print(f"  Records written: {len(sector_data):,}")
            else:
                # Sector slice contains article data - use company file instead
                print("  ⚠ Sector slice contains article data, not company data")
                print("  → Will use full company file for extraction")
        else:
            print("  ⚠ Could not determine data type in sector slice")
            print("  → Will use full company file for extraction")

    except Exception as e:
        print(f"❌ Error loading sector slice: {e}")
        import traceback
        traceback.print_exc()
        temp_company_path = None
else:
    print(f"⚠ Sector slice file not found: {SECTOR_SLICE_FILE}")
    print("  → Will use full company file for extraction")

# Choose company path: prefer sector slice if it contains company data, otherwise use company file
if temp_company_path and use_sector_slice:
    company_path = temp_company_path
else:
    company_path = str(COMPANY_FALLBACK_PATH)

FILE_PATHS['company'] = company_path

print("\nCompany file selection:")
print(f"  Sector slice JSONL: {temp_company_path if temp_company_path else 'not created'}")
print(f"  Full company file: {COMPANY_FALLBACK_PATH}")
print(f"  Using: {company_path}")
if use_sector_slice:
    print("  → Using filtered sector slice data")
else:
    print("  → Using full company file (recommended for complete extraction)")


Loading pre-processed sector data from ../src/ose_core/data/sector_slices/agroalimentaire.json...
✓ Loaded 18,784 companies from sector slice
Converting to JSONL format...
✓ Temporary JSONL file created: /var/folders/5c/m3nm1tb53nj8kd1sy83fykk40000gn/T/tmp584edxvx.jsonl
  Records written: 18,784

Company file selection:
  Temporary JSONL: /var/folders/5c/m3nm1tb53nj8kd1sy83fykk40000gn/T/tmp584edxvx.jsonl
  Fallback path: ../src/ose_core/data/new_data/app_company.json
  Using: /var/folders/5c/m3nm1tb53nj8kd1sy83fykk40000gn/T/tmp584edxvx.jsonl


## Part 2: Extract Datasets

Run the extraction pipeline to generate all 9 datasets from the filtered company data and related articles/projects.


In [20]:
# Prepare file paths for extraction pipeline
extraction_file_paths = {
    'company': company_path,
    'article': str(FILE_PATHS['article']),
    'project': str(FILE_PATHS['project'])
}

print("File paths prepared for extraction:")
if temp_company_path:
    print(f"  Company (from sector slice): {extraction_file_paths['company']}")
else:
    print(f"  Company (fallback): {extraction_file_paths['company']}")
print(f"  Article: {extraction_file_paths['article']}")
print(f"  Project: {extraction_file_paths['project']}")

# Verify files exist
if not Path(extraction_file_paths['company']).exists():
    print(f"  ⚠ Warning: Company file not found: {extraction_file_paths['company']}")
if not Path(extraction_file_paths['article']).exists():
    print(f"  ⚠ Warning: Article file not found: {extraction_file_paths['article']}")
if not Path(extraction_file_paths['project']).exists():
    print(f"  ⚠ Warning: Project file not found: {extraction_file_paths['project']}")


File paths prepared for extraction:
  Company (from sector slice): /var/folders/5c/m3nm1tb53nj8kd1sy83fykk40000gn/T/tmp584edxvx.jsonl
  Article: ../src/ose_core/data/new_data/app_article-003.json
  Project: ../src/ose_core/data/new_data/app_project.json


In [21]:
# Run extraction pipeline
if extraction_file_paths:
    print(f"Processing data through extraction pipeline...")
    print(f"{'='*80}")

    try:
        # Run the fast extraction pipeline
        datasets = make_extract_pipeline_v3_fast(
            file_paths=extraction_file_paths,
            chunk_size=CHUNK_SIZE
        )

        print(f"\n✅ Extraction complete!")
        print(f"   Datasets extracted: {len(datasets)}")

        # Display dataset summary
        print(f"\nExtracted {len(datasets)} datasets:")
        for name in sorted(datasets.keys()):
            df = datasets[name]
            if not df.empty:
                print(f"  - {name}: {df.shape}")
            else:
                print(f"  - {name}: (empty)")

    except Exception as e:
        print(f"\n❌ Error during extraction: {e}")
        import traceback
        traceback.print_exc()
        datasets = {}
else:
    print("❌ Cannot run extraction - file paths not prepared")
    datasets = {}


Processing data through extraction pipeline...

Processing company file: /var/folders/5c/m3nm1tb53nj8kd1sy83fykk40000gn/T/tmp584edxvx.jsonl
Loading data from /var/folders/5c/m3nm1tb53nj8kd1sy83fykk40000gn/T/tmp584edxvx.jsonl in chunks of 10000...
Total lines loaded: 18784
Total chunks processed: 2, Total records: 18784

Processing article file: ../src/ose_core/data/new_data/app_article-003.json
Loading data from ../src/ose_core/data/new_data/app_article-003.json in chunks of 10000...
  Processed 100000 lines...
  Processed 10 chunks (100000 records)...
  Processed 200000 lines...
  Processed 20 chunks (200000 records)...
  Processed 300000 lines...
  Processed 30 chunks (300000 records)...
  Processed 400000 lines...
  Processed 40 chunks (400000 records)...
  Processed 500000 lines...
  Processed 50 chunks (500000 records)...
  Processed 600000 lines...
  Processed 60 chunks (600000 records)...
  Processed 700000 lines...
  Processed 70 chunks (700000 records)...
  Processed 794553 li

## Part 3: Export Datasets to CSV

Export all extracted datasets as CSV files for later use and visualization.


In [22]:
# Export 9 datasets to CSV files
if 'datasets' in locals() and datasets:
    print(f"Exporting {len(datasets)} datasets to CSV files...")
    print(f"{'='*80}")

    # Ensure output directory exists
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Expected dataset names in order
    expected_datasets = [
        '01_company_basic_info',
        '02_financial_data',
        '03_workforce_data',
        '04_company_structure',
        '05_classification_flags',
        '06_contact_metrics',
        '07_kpi_data',
        '08_signals',
        '09_articles'
    ]

    # Export each dataset
    exported_count = 0
    for name in expected_datasets:
        if name in datasets:
            df = datasets[name]
            if df.empty:
                print(f"  ⚠ Skipping {name}.csv (empty)")
                continue

            # Create filename: 01_company_basic_info.csv
            csv_path = OUTPUT_DIR / f"{name}.csv"

            # Export to CSV
            df.to_csv(csv_path, index=False, encoding='utf-8')

            print(f"  ✓ {name}.csv: {df.shape[0]:,} rows, {df.shape[1]} columns")
            exported_count += 1
        else:
            print(f"  ✗ {name}.csv: MISSING")

    print(f"\n✅ All datasets exported to: {OUTPUT_DIR}")
    print(f"   Total datasets exported: {exported_count}")
    print(f"   Files are ready for visualization")

    # Clean up temporary file
    if 'temp_company_path' in locals() and temp_company_path:
        try:
            os.unlink(temp_company_path)
            print(f"\n  Temporary file cleaned up: {temp_company_path}")
        except Exception as e:
            print(f"\n  ⚠ Could not clean up temporary file: {e}")
else:
    print("❌ No datasets available for export")


Exporting 9 datasets to CSV files...
  ✓ 01_company_basic_info.csv: 18,784 rows, 17 columns
  ✓ 02_financial_data.csv: 18,784 rows, 12 columns
  ✓ 03_workforce_data.csv: 18,784 rows, 8 columns
  ✓ 04_company_structure.csv: 18,784 rows, 10 columns
  ✓ 05_classification_flags.csv: 18,784 rows, 17 columns
  ✓ 06_contact_metrics.csv: 18,784 rows, 10 columns
  ⚠ Skipping 07_kpi_data.csv (empty)
  ✓ 08_signals.csv: 2,741,487 rows, 17 columns
  ✓ 09_articles.csv: 907,270 rows, 15 columns

✅ All datasets exported to: ../src/ose_core/data/extracted_datasets
   Total datasets exported: 8
   Files are ready for visualization

  Temporary file cleaned up: /var/folders/5c/m3nm1tb53nj8kd1sy83fykk40000gn/T/tmp584edxvx.jsonl


## Part 4: Load Datasets for Visualization

Load the exported CSV files for visualization and analysis.


In [23]:
# Load datasets from CSV files
print("Loading datasets from CSV files...")
print("="*80)

data = {}
dataset_files = {
    '01_company_basic_info': '01_company_basic_info.csv',
    '02_financial_data': '02_financial_data.csv',
    '03_workforce_data': '03_workforce_data.csv',
    '04_company_structure': '04_company_structure.csv',
    '05_classification_flags': '05_classification_flags.csv',
    '06_contact_metrics': '06_contact_metrics.csv',
    '07_kpi_data': '07_kpi_data.csv',
    '08_signals': '08_signals.csv',
    '09_articles': '09_articles.csv'
}

for name, filename in dataset_files.items():
    filepath = OUTPUT_DIR / filename
    if filepath.exists():
        try:
            df = pd.read_csv(filepath, low_memory=False)
            data[name] = df
            print(f"  ✓ {name}: {df.shape}")
        except Exception as e:
            print(f"  ✗ {name}: Error loading - {e}")
            data[name] = pd.DataFrame()
    else:
        # File not found - likely skipped during export because it was empty
        # Skip warning to avoid confusion (Part 3 already notified about skipped files)
        data[name] = pd.DataFrame()

print(f"\n✅ Loaded {len([k for k, v in data.items() if not v.empty])} datasets")
print(f"   Total datasets: {len(data)}")


Loading datasets from CSV files...
  ✓ 01_company_basic_info: (18784, 17)
  ✓ 02_financial_data: (18784, 12)
  ✓ 03_workforce_data: (18784, 8)
  ✓ 04_company_structure: (18784, 10)
  ✓ 05_classification_flags: (18784, 17)
  ✓ 06_contact_metrics: (18784, 10)
  ✓ 08_signals: (2741487, 17)
  ✓ 09_articles: (907270, 15)

✅ Loaded 8 datasets
   Total datasets: 9
