# 01 - Data Acquisition
## Section 1.1: Defining Mass Extinction

---

**Notebook Purpose**: Acquire, validate, and store raw data for extinction rate analysis.

**Author**: Dennis 'dnoice' Smaltz  
**AI Acknowledgement**: Claude Opus 4  
**Version**: 0.1 (Template)  
**Date**: 2025-12-12  
**Signature**: ︻デ═—··· 🎯 = Aim Twice, Shoot Once!

---

### Data Sources

| Source ID | Name | Access Method |
|-----------|------|---------------|
| DS-PA-001 | IUCN Red List | API |
| DS-PA-002 | IUCN Guidelines | PDF/Manual |
| DS-PR-001 | Barnosky 2011 | Supplementary Data |
| DS-PR-002 | Ceballos 2015 | Supplementary Data |

---

## 1. Environment Setup

In [None]:
# Standard library imports
import os
import json
import logging
from datetime import datetime
from pathlib import Path

# Data manipulation
import pandas as pd
import numpy as np

# API access
import requests

# Logging: INFO-level root configuration plus a logger named after this module
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Directory layout. The paths are relative, so they resolve against the
# current working directory the notebook is launched from.
PROJECT_ROOT = Path('../../../../').resolve()
SECTION_PATH = Path('../').resolve()
RAW_DATA_PATH = SECTION_PATH.joinpath('data', 'raw')
PROCESSED_DATA_PATH = SECTION_PATH.joinpath('data', 'processed')

# Create both data directories up front; existing ones are left untouched
for data_dir in (RAW_DATA_PATH, PROCESSED_DATA_PATH):
    data_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations and today's date as a record of the run
for label, location in (
    ("Project Root", PROJECT_ROOT),
    ("Section Path", SECTION_PATH),
    ("Raw Data Path", RAW_DATA_PATH),
):
    print(f"{label}: {location}")
print(f"Acquisition Date: {datetime.now():%Y-%m-%d}")

## 2. IUCN Red List Data Acquisition

### 2.1 API Configuration

**Note**: You need an IUCN API token. Apply at: https://apiv3.iucnredlist.org/api/v3/token

In [None]:
# IUCN API Configuration
# IMPORTANT: Store your token securely, not in code!
# Options: environment variable, .env file, or secrets manager
# NOTE(review): this targets the v3 API - confirm the endpoint is still served
# before relying on live requests.

IUCN_API_BASE = "https://apiv3.iucnredlist.org/api/v3"

# Load token from environment variable.
# A blank or whitespace-only value (e.g. `export IUCN_API_TOKEN=`) can never be
# a valid token, so it is normalised to None and treated as "not configured".
IUCN_TOKEN = os.environ.get('IUCN_API_TOKEN', '').strip() or None

if IUCN_TOKEN is None:
    logger.warning("IUCN_API_TOKEN not set. Set it using:")
    logger.warning("  export IUCN_API_TOKEN='your_token_here'")
    logger.warning("Proceeding with placeholder data for template...")

def iucn_api_request(endpoint: str, params: dict = None, timeout: float = 30.0) -> dict:
    """
    Make authenticated request to IUCN Red List API.

    Parameters
    ----------
    endpoint : str
        API endpoint (e.g., '/species/page/0'); appended to ``IUCN_API_BASE``
    params : dict, optional
        Additional query parameters. The mapping is copied before the token
        is added, so the caller's dict is never modified.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Defaults to 30.

    Returns
    -------
    dict
        JSON response from API

    Raises
    ------
    ValueError
        If no API token is configured.
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    # Falsy check also rejects an empty-string token, not just None
    if not IUCN_TOKEN:
        raise ValueError("IUCN_API_TOKEN not configured")

    url = f"{IUCN_API_BASE}{endpoint}"

    # Build a fresh dict: injecting the token into the caller's own mapping
    # would leak the secret into (and mutate) an object we do not own.
    query = {**(params or {}), 'token': IUCN_TOKEN}

    # Without an explicit timeout, requests may block indefinitely
    response = requests.get(url, params=query, timeout=timeout)
    response.raise_for_status()

    return response.json()

### 2.2 Fetch Extinction Statistics

In [None]:
def get_species_by_category(category: str) -> pd.DataFrame:
    """
    Return the species list for one IUCN Red List category (placeholder).

    The live API call is not wired in yet: the function only logs which
    category would have been requested and hands back an empty frame.

    Parameters
    ----------
    category : str
        IUCN category code (EX, EW, CR, EN, VU, NT, LC, DD)

    Returns
    -------
    pd.DataFrame
        Currently always an empty DataFrame
    """
    # TODO: Implement when API token available
    # endpoint = f"/species/category/{category}"
    # data = iucn_api_request(endpoint)
    # return pd.DataFrame(data['result'])

    placeholder = pd.DataFrame()
    logger.info(f"Placeholder: Would fetch category {category}")
    return placeholder

def get_threat_summary() -> dict:
    """
    Return hard-coded summary statistics of threatened species.

    The figures were entered manually from IUCN Red List version 2025-2 and
    act as a fallback when the API is unavailable.

    Returns
    -------
    dict
        Keys ``version``, ``assessed_species``, ``threatened_species``,
        ``by_category`` (count per Red List category) and ``by_taxon``
        (``assessed`` / ``threatened`` / ``pct_threatened`` per group)
    """
    # Count per Red List category; CR, EN and VU are approximate
    category_counts = {
        'EX': 943,    # Extinct
        'EW': 90,     # Extinct in Wild
        'CR': 9800,   # Critically Endangered
        'EN': 16500,  # Endangered
        'VU': 20900,  # Vulnerable
    }

    # (assessed, threatened, pct_threatened) for each taxonomic group
    taxon_rows = [
        ('Mammals', 6025, 1625, 27),
        ('Birds', 11185, 1286, 11.5),
        ('Reptiles', 10196, 2141, 21),
        ('Amphibians', 8009, 3284, 41),
        ('Fishes', 25000, 6250, 25),
        ('Sharks_Rays', 1200, 444, 37),
        ('Corals', 900, 396, 44),
    ]
    taxon_stats = {
        taxon: {
            'assessed': assessed,
            'threatened': threatened,
            'pct_threatened': pct,
        }
        for taxon, assessed, threatened, pct in taxon_rows
    }

    return {
        'version': '2025-2',
        'assessed_species': 169420,
        'threatened_species': 47187,
        'by_category': category_counts,
        'by_taxon': taxon_stats,
    }

# Get summary
# Build the hard-coded fallback statistics once and keep them in `iucn_summary`
iucn_summary = get_threat_summary()
# Pretty-print as indented JSON so the values can be checked in the cell output
print(json.dumps(iucn_summary, indent=2))

### 2.3 Save Raw Data

In [None]:
# Save IUCN summary to raw data
# The file name carries the Red List version the figures were taken from.
iucn_output_path = RAW_DATA_PATH.joinpath('iucn_summary_2025-2.json')
iucn_output_path.write_text(json.dumps(iucn_summary, indent=2))

logger.info(f"Saved IUCN summary to: {iucn_output_path}")

## 3. Paleontological Extinction Data

### 3.1 Big Five Mass Extinctions Reference Data

In [None]:
# Big Five Mass Extinctions - Reference Data
# Sources: Barnosky et al. 2011, Raup & Sepkoski 1982, various

# One row per event, in chronological order (oldest first)
big_five = pd.DataFrame(
    [
        ('End-Ordovician', 443, 1.5, 85, 57,
         'Glaciation, sea level drop', 'Barnosky_2011'),
        ('Late Devonian', 372, 10, 75, 50,
         'Anoxia, climate change', 'Barnosky_2011'),
        ('End-Permian', 252, 0.1, 96, 83,
         'Siberian Traps volcanism, warming', 'Barnosky_2011'),
        ('End-Triassic', 201, 0.3, 76, 48,
         'CAMP volcanism, CO2 increase', 'Barnosky_2011'),
        ('End-Cretaceous', 66, 0.05, 76, 40,
         'Chicxulub impact, Deccan volcanism', 'Barnosky_2011'),
    ],
    columns=[
        'event',
        'age_ma',
        'duration_my',
        'species_loss_pct',
        'genera_loss_pct',
        'primary_cause',
        'source',
    ],
)

# Calculate estimated E/MSY for each event
# This is a rough approximation: a pool of ~1M species is assumed, and the
# number lost is divided by the million-species-years the event spanned.
species_lost = big_five['species_loss_pct'] / 100 * 1e6
million_species_years = 1e6 * big_five['duration_my']
big_five['estimated_emsy'] = species_lost / million_species_years

print(big_five.to_string())

In [None]:
# Save Big Five data
# Written without the positional index so the CSV holds only the data columns.
big_five_path = RAW_DATA_PATH.joinpath('big_five_mass_extinctions.csv')
big_five.to_csv(path_or_buf=big_five_path, index=False)

logger.info(f"Saved Big Five data to: {big_five_path}")

## 4. Modern Extinction Records

### 4.1 Documented Extinctions Since 1500

In [None]:
# Modern extinction summary data
# Source: IUCN Red List, Ceballos et al. 2015

modern_extinctions = {
    # Observation window for the documented extinctions
    'time_period': {'start_year': 1500, 'end_year': 2025, 'duration_years': 525},
    # Per-group records expanded from (taxon, extinct, extinct_in_wild, species_count)
    'by_taxon': {
        taxon: {
            'extinct': extinct,
            'extinct_in_wild': extinct_in_wild,
            'species_count': species_count,
            'source': 'IUCN_2025',
        }
        for taxon, extinct, extinct_in_wild, species_count in [
            ('Mammals', 96, 2, 6500),
            ('Birds', 162, 5, 11200),
            ('Reptiles', 37, 1, 12000),
            ('Amphibians', 37, 2, 8500),
            ('Fishes', 90, 15, 35000),
            ('Invertebrates', 359, 45, 1500000),  # species_count very approximate
            ('Plants', 162, 20, 400000),
        ]
    },
    # Totals across all groups listed above
    'total': {'extinct': 943, 'extinct_in_wild': 90},
}

# Save to JSON
# Serialise once and reuse the same text for both the file and the cell output.
modern_ext_path = RAW_DATA_PATH.joinpath('modern_extinctions_since_1500.json')
modern_ext_json = json.dumps(modern_extinctions, indent=2)
modern_ext_path.write_text(modern_ext_json)

logger.info(f"Saved modern extinctions to: {modern_ext_path}")
print(modern_ext_json)

## 5. Background Extinction Rate References

### 5.1 Literature Estimates

In [None]:
# Background extinction rate estimates from literature
# Each row: one published estimate in E/MSY with its lower/upper bounds.

background_rates = pd.DataFrame(
    [
        ('Raup_1991', 'Marine species', 0.25, 0.1, 0.5,
         'Fossil record genus duration'),
        ('Pimm_1995', 'Vertebrates', 1.0, 0.5, 2.0,
         'Species duration in fossil record'),
        ('Barnosky_2011', 'All species', 0.72, 0.1, 2.0,
         'Review of multiple estimates'),
        ('De_Vos_2015', 'Vertebrates', 0.1, 0.05, 0.2,
         'Molecular phylogenies'),
        ('Ceballos_2015', 'Mammals', 2.0, 1.0, 3.0,
         'Conservative fossil-based'),
    ],
    columns=[
        'source',
        'taxa',
        'rate_emsy',
        'uncertainty_low',
        'uncertainty_high',
        'method',
    ],
)

# Save
# Written without the positional index so the CSV holds only the data columns.
bg_rates_path = RAW_DATA_PATH.joinpath('background_extinction_rates.csv')
background_rates.to_csv(path_or_buf=bg_rates_path, index=False)

logger.info(f"Saved background rates to: {bg_rates_path}")
print(background_rates.to_string())

## 6. Data Validation

### 6.1 Completeness Checks

In [None]:
# Validate all required data files exist

# Every file the earlier cells are expected to have written into the raw folder
required_files = [
    RAW_DATA_PATH / 'iucn_summary_2025-2.json',
    RAW_DATA_PATH / 'big_five_mass_extinctions.csv',
    RAW_DATA_PATH / 'modern_extinctions_since_1500.json',
    RAW_DATA_PATH / 'background_extinction_rates.csv'
]

print("Data Validation Report")
print("=" * 50)

# Report each file on its own line and remember whether any were absent.
# The status markers are written as proper check/cross glyphs.
all_present = True
for filepath in required_files:
    exists = filepath.exists()
    status = "✓ Present" if exists else "✗ MISSING"
    print(f"{status}: {filepath.name}")
    if not exists:
        all_present = False

print("=" * 50)
if all_present:
    print("All required data files present. Ready for analysis.")
else:
    print("WARNING: Some data files missing!")

## 7. Data Manifest

### Summary of Acquired Data

In [None]:
# Generate data manifest

manifest = {
    'acquisition_date': datetime.now().isoformat(),
    'section': '1.1 - Defining Mass Extinction',
    'files': []
}

# The manifest lives in the same folder it describes
manifest_path = RAW_DATA_PATH / 'DATA_MANIFEST.json'

# Sorted so the listing is deterministic (glob order is not guaranteed).
for filepath in sorted(RAW_DATA_PATH.glob('*')):
    # Skip sub-directories, and skip the manifest itself: on a re-run the old
    # manifest is still on disk and would otherwise be listed with a size and
    # mtime that are stale the moment the new manifest is written.
    if not filepath.is_file() or filepath.name == manifest_path.name:
        continue
    file_stat = filepath.stat()  # one stat call serves both size and mtime
    manifest['files'].append({
        'filename': filepath.name,
        'size_bytes': file_stat.st_size,
        'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat()
    })

# Save manifest
with open(manifest_path, 'w', encoding='utf-8') as f:
    json.dump(manifest, f, indent=2)

print("Data Manifest:")
print(json.dumps(manifest, indent=2))

---

## Next Steps

1. **02_analysis_core.ipynb**: Calculate extinction rates and perform comparisons
2. **03_visualization.ipynb**: Generate publication-quality figures

---

*︻デ═—··· 🎯 = Aim Twice, Shoot Once!*