# 📦 SECTION 1: DATA COLLECTION

This notebook collects retail electricity sales data from the EIA API and saves the raw result as a timestamped CSV file.

## Setup

In [167]:
# Import libraries
import sys
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
import importlib

# Add project root to path - try multiple methods
def find_project_root():
    """Locate the project root, recognised by a ``src`` entry inside it.

    Starting from the current working directory, candidates are tried in
    this fixed order and the first one containing ``src`` wins:

    1. the parent of cwd, when cwd itself is named ``notebooks``
    2. cwd
    3. the parent of cwd
    4. the directory just above the first ``notebooks`` component of cwd

    Returns:
        Path: the matching candidate (not resolved).

    Raises:
        FileNotFoundError: if none of the candidates contains ``src``.
    """
    here = Path.cwd()
    print(f"üîç Searching for project root from: {here}")

    def contains_src(directory):
        # A candidate qualifies as soon as a `src` entry exists inside it.
        return (directory / 'src').exists()

    above = here.parent

    # Method 1: launched from inside a directory called `notebooks`
    if here.name == 'notebooks' and contains_src(above):
        print(f"   ‚úÖ Found via Method 1: {above}")
        return above

    # Method 2: launched from the root itself
    if contains_src(here):
        print(f"   ‚úÖ Found via Method 2: {here}")
        return here

    # Method 3: launched from any direct child of the root
    if contains_src(above):
        print(f"   ‚úÖ Found via Method 3: {above}")
        return above

    # Method 4: launched from somewhere deeper below a `notebooks` directory
    segments = here.parts
    if 'notebooks' in segments:
        ancestor = Path(*segments[:segments.index('notebooks')])
        if contains_src(ancestor):
            print(f"   ‚úÖ Found via Method 4: {ancestor}")
            return ancestor

    print(f"   ‚ùå All methods failed. Current dir: {here}")
    raise FileNotFoundError(f"Cannot find project root. Current dir: {here}")

try:
    # Resolve so the sys.path entry and the printed paths are absolute.
    project_root = find_project_root().resolve()

    # Evict cached modules of the project's `src` package so the imports that
    # follow are loaded fresh from disk. Match the package name exactly
    # ('src' itself or anything under 'src.'): a bare startswith('src') would
    # also evict unrelated modules whose names merely begin with those letters.
    modules_to_remove = [
        m for m in sys.modules
        if m == 'src' or m.startswith('src.')
    ]
    for mod in modules_to_remove:
        del sys.modules[mod]

    # Put the project root on the import path (once) so `import src...` works
    if str(project_root) not in sys.path:
        sys.path.insert(0, str(project_root))

    # Diagnostics: show what was added and confirm the expected layout exists
    print(f"‚úÖ Project root added to path: {project_root}")
    print(f"   sys.path[0]: {sys.path[0]}")
    print(f"   src directory exists: {(project_root / 'src').exists()}")
    print(f"   src/data_collection exists: {(project_root / 'src' / 'data_collection').exists()}")

except Exception as e:
    # Report the failure in the notebook's own format, then let it propagate
    # so later cells do not run against a half-configured environment.
    print(f"‚ùå Error finding project root: {e}")
    raise

# Environment variables: use the project's own .env when it is present,
# otherwise fall back to load_dotenv() with its default lookup.
env_path = project_root / '.env'
if not env_path.exists():
    load_dotenv()
else:
    load_dotenv(dotenv_path=env_path)

# Bring in the project modules now that the root is on sys.path
print("\nüì¶ Attempting imports...")

# Config goes first: eia_api depends on it
try:
    from src.config import RAW_DATA_DIR
    print("   ‚úÖ Imported RAW_DATA_DIR (config)")
except Exception as config_error:
    print(f"   ‚ùå Failed to import config: {config_error}")
    raise

# Bind the eia_api module, then alias its fetch function for the cells below
try:
    from src.data_collection import eia_api
    fetch_all_data = eia_api.fetch_all_data
    print("   ‚úÖ Imported fetch_all_data")
except Exception as import_error:
    # Print the full traceback next to the short message before re-raising
    import traceback
    print(f"   ‚ùå Failed to import: {import_error}")
    traceback.print_exc()
    raise

print("\n‚úÖ Setup complete!")
print(f"   Project root: {project_root}")


üîç Searching for project root from: /Users/divyanshisachan/Desktop/Electricity-Consumption/notebooks
   ‚úÖ Found via Method 1: /Users/divyanshisachan/Desktop/Electricity-Consumption
‚úÖ Project root added to path: /Users/divyanshisachan/Desktop/Electricity-Consumption
   sys.path[0]: /Users/divyanshisachan/Desktop/Electricity-Consumption
   src directory exists: True
   src/data_collection exists: True

üì¶ Attempting imports...
   ‚úÖ Imported RAW_DATA_DIR (config)
   ‚úÖ Imported fetch_all_data

‚úÖ Setup complete!
   Project root: /Users/divyanshisachan/Desktop/Electricity-Consumption


## Fetch Data from EIA API

In [171]:
# Download the complete dataset in one call (pagination is handled by
# fetch_all_data). The shape report stays inside the try so that an
# unusable return value is surfaced through the same error message.
try:
    df_raw = fetch_all_data()
    print(f"\n‚úÖ Data loaded successfully! Shape: {df_raw.shape}")
except Exception as fetch_error:
    print(f"‚ùå Error fetching data: {fetch_error}")
    print("   Please check your API key and internet connection, then re-run this cell.")
    raise

üîÑ Fetching data from EIA API...
   Fetching batch 1 (offset: 0)... ‚úÖ Got 5000 records
   Fetching batch 2 (offset: 5000)... ‚úÖ Got 5000 records
   Fetching batch 3 (offset: 10000)... ‚úÖ Got 5000 records
   Fetching batch 4 (offset: 15000)... ‚úÖ Got 5000 records
   Fetching batch 5 (offset: 20000)... ‚úÖ Got 5000 records
   Fetching batch 6 (offset: 25000)... ‚úÖ Got 5000 records
   Fetching batch 7 (offset: 30000)... ‚úÖ Got 5000 records
   Fetching batch 8 (offset: 35000)... ‚úÖ Got 5000 records
   Fetching batch 9 (offset: 40000)... ‚úÖ Got 5000 records
   Fetching batch 10 (offset: 45000)... ‚úÖ Got 5000 records
   Fetching batch 11 (offset: 50000)... ‚úÖ Got 5000 records
   Fetching batch 12 (offset: 55000)... ‚úÖ Got 5000 records
   Fetching batch 13 (offset: 60000)... ‚úÖ Got 5000 records
   Fetching batch 14 (offset: 65000)... ‚úÖ Got 5000 records
   Fetching batch 15 (offset: 70000)... ‚úÖ Got 5000 records
   Fetching batch 16 (offset: 75000)... ‚úÖ Got 5000 records
   

In [172]:
# Write the raw pull to disk under a timestamped name, so each run produces
# its own CSV instead of overwriting an earlier one.
from datetime import datetime
from src.config import RAW_DATA_DIR

# Create the target directory (and any missing parents) if needed
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
filename = f"eia_retail_sales_raw_{timestamp}.csv"
file_path = RAW_DATA_DIR.joinpath(filename)

# index=False: leave the DataFrame index out of the file
df_raw.to_csv(file_path, index=False)
print(f"‚úÖ Saved raw data to: {file_path}")

‚úÖ Saved raw data to: /Users/divyanshisachan/Desktop/Electricity-Consumption/data/raw/eia_retail_sales_raw_20251204_100134.csv
