In [1]:
# Cell 1: Initialize Hybrid Data System
import sys
sys.path.append('../src')

from data_manager import SolarDataManager
from enphase_client import EnphaseClient
from pathlib import Path

print("=== HYBRID SOLAR DATA SYSTEM ===\n")


def _read_env_file(path):
    """Parse a dotenv-style ``KEY=value`` file into a dict of strings.

    Blank lines, comment lines (first non-blank character is ``#``) and
    lines without an ``=`` are ignored. Whitespace around keys and values
    is trimmed, and one pair of matching surrounding quotes is removed
    from a value. Inline ``#`` comments are NOT stripped, because a
    secret may legitimately contain that character.

    Args:
        path: Location of the env file (str or Path).

    Returns:
        dict mapping each key to its value; later duplicates win.

    Raises:
        OSError: if the file cannot be opened.
    """
    parsed = {}
    with open(path, 'r') as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            # Strip first so indented comments are recognised as comments.
            if not entry or entry.startswith('#') or '=' not in entry:
                continue
            key, value = entry.split('=', 1)
            key, value = key.strip(), value.strip()
            # KEY="value" / KEY='value' -> value
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            parsed[key] = value
    return parsed


# Load credentials and initialize client
env_file = Path("../.env")
credentials = _read_env_file(env_file)

# Fail early with an actionable message instead of a bare KeyError at the
# constructor call (the exception type is unchanged).
required_keys = ('ENPHASE_ACCESS_TOKEN', 'ENPHASE_API_KEY', 'ENPHASE_SYSTEM_ID')
missing_keys = [name for name in required_keys if not credentials.get(name)]
if missing_keys:
    raise KeyError(
        f"Missing required credentials in {env_file}: {', '.join(missing_keys)}"
    )

client = EnphaseClient(
    access_token=credentials['ENPHASE_ACCESS_TOKEN'],
    api_key=credentials['ENPHASE_API_KEY'],
    system_id=credentials['ENPHASE_SYSTEM_ID']
)

# Initialize data manager
data_manager = SolarDataManager(
    csv_path="../data/raw/4136754_custom_report.csv",
    enphase_client=client,
    cache_dir="../data/processed"
)

print("SolarDataManager initialized with hybrid CSV/API capabilities")

=== HYBRID SOLAR DATA SYSTEM ===

SolarDataManager initialized with hybrid CSV/API capabilities




In [2]:
# Cell 2: Data Source Analysis
# Prints a per-source summary from the data manager, then compares which
# calendar dates are covered by the CSV export versus the API.
print("=== DATA SOURCE ANALYSIS ===\n")

# Get summary of available data
summary = data_manager.get_data_summary()

print("Data Source Summary:")
for source, info in summary.items():
    print(f"\n{source.upper()} Data:")
    if info['available']:
        print(f"  Records: {info['records']:,}")
        print(f"  Date range: {info['date_range'][0]} to {info['date_range'][1]}")
        print(f"  Granularity: {info['granularity']}")
        print("  Status: Available")
    else:
        print("  Status: Not available")

# Load individual sources for comparison
print("\n=== DETAILED COMPARISON ===")
csv_data = data_manager.load_csv_data()
api_data = data_manager.load_api_data(days_back=60)

if not csv_data.empty:
    csv_daily = csv_data.resample('D').sum()
    print(f"CSV: {len(csv_data):,} 15-min intervals → {len(csv_daily):,} daily records")

if not api_data.empty:
    print(f"API: {len(api_data):,} daily records")

    # Check overlap
    if not csv_data.empty:
        # Build the CSV date set from the raw timestamps rather than from
        # csv_daily: resample('D').sum() emits a zero-sum row for every
        # calendar day in the span, so any day with no CSV readings would
        # otherwise be counted as covered by the CSV.
        csv_dates = set(csv_data.index.normalize().unique().date)
        api_dates = set(api_data.index.date)
        overlap = len(csv_dates & api_dates)
        csv_only = len(csv_dates - api_dates)
        api_only = len(api_dates - csv_dates)

        print("\nData overlap analysis:")
        print(f"  Dates in both sources: {overlap}")
        print(f"  CSV-only dates: {csv_only}")
        print(f"  API-only dates: {api_only}")

INFO:data_manager:Loading CSV data from ../data/raw/4136754_custom_report.csv
INFO:data_manager:Loaded 70272 CSV records from 2023-09-16 00:00:00 to 2025-09-16 23:45:00
INFO:data_manager:Loading API data for last 30 days


=== DATA SOURCE ANALYSIS ===



INFO:data_manager:Loaded 30 API records from 2025-08-20 00:00:00 to 2025-09-18 00:00:00


Data Source Summary:

CSV Data:
  Records: 70,272
  Date range: 2023-09-16 to 2025-09-16
  Granularity: 15min
  Status: Available

API Data:
  Records: 30
  Date range: 2025-08-20 to 2025-09-18
  Granularity: daily
  Status: Available

=== DETAILED COMPARISON ===
CSV: 70,272 15-min intervals → 732 daily records
API: 30 daily records

Data overlap analysis:
  Dates in both sources: 28
  CSV-only dates: 704
  API-only dates: 2


In [3]:
# Cell 3: Test Hybrid Data Retrieval
# Requests the same 30-day window under each source-priority strategy and
# prints record count, total and mean daily production for comparison.
print("=== HYBRID DATA RETRIEVAL ===\n")

from datetime import datetime, timedelta

# Test different source priorities
strategies = ['csv_first', 'api_first', 'csv_only']

# Get last 30 days. The window is computed once, outside the loop, so every
# strategy is queried over exactly the same interval (recomputing now() per
# iteration could shift the window between strategies).
end_date = datetime.now()
start_date = end_date - timedelta(days=30)

for strategy in strategies:
    print(f"{strategy.upper()} Strategy:")

    data = data_manager.get_daily_production(
        start_date=start_date,
        end_date=end_date,
        source_priority=strategy
    )

    if data.empty:
        print("  No data available")
    else:
        production = data['Production (kWh)']
        total_energy = production.sum()
        avg_daily = production.mean()
        first_day = data.index.min().strftime('%Y-%m-%d')
        last_day = data.index.max().strftime('%Y-%m-%d')

        print(f"  Records: {len(data)}")
        print(f"  Total energy: {total_energy:.1f} kWh")
        print(f"  Average daily: {avg_daily:.1f} kWh")
        print(f"  Date range: {first_day} to {last_day}")

    print()

# Show recommended strategy
print("RECOMMENDED: csv_first strategy provides:")
print("- Complete historical context from CSV")
print("- Latest updates from API")
print("- Best coverage for ML model training")

=== HYBRID DATA RETRIEVAL ===

CSV_FIRST Strategy:
  Records: 29
  Total energy: 1185.2 kWh
  Average daily: 40.9 kWh
  Date range: 2025-08-21 to 2025-09-18

API_FIRST Strategy:
  Records: 29
  Total energy: 1185.2 kWh
  Average daily: 40.9 kWh
  Date range: 2025-08-21 to 2025-09-18

CSV_ONLY Strategy:
  Records: 27
  Total energy: 1098.4 kWh
  Average daily: 40.7 kWh
  Date range: 2025-08-21 to 2025-09-16

RECOMMENDED: csv_first strategy provides:
- Complete historical context from CSV
- Latest updates from API
- Best coverage for ML model training


In [4]:
# Cell 4: Update System and Integration
# Runs the API update, exports the combined dataset, then reads the export
# back from disk to confirm what was written.
import pandas as pd

print("=== UPDATE SYSTEM TEST ===\n")

# Test API update process
print("Testing API update process...")
update_stats = data_manager.update_from_api(save_cache=True)

print("Update Results:")
for key, value in update_stats.items():
    print(f"  {key}: {value}")

# Test data export for ML integration
print("\n=== ML INTEGRATION EXPORT ===")

# Single source of truth for the export location, so the write and the
# verification read cannot drift apart.
export_path = "../data/processed/combined_solar_data.csv"

export_success = data_manager.export_combined_dataset(
    filename=export_path,
    source_priority="csv_first"
)

if export_success:
    # Verify export by reading the file back
    exported = pd.read_csv(export_path, index_col=0, parse_dates=True)
    production = exported['Production (kWh)']

    print("Export successful:")
    print(f"  Combined dataset: {len(exported):,} daily records")
    print(f"  Date range: {exported.index.min().strftime('%Y-%m-%d')} to {exported.index.max().strftime('%Y-%m-%d')}")
    print(f"  Total production: {production.sum():.0f} kWh")

    # Check for recent API data
    recent_data = exported.tail(10)
    print("\nMost recent 10 days:")
    for date, kwh in recent_data['Production (kWh)'].items():
        print(f"    {date.strftime('%Y-%m-%d')}: {kwh:.1f} kWh")

    print("\nReady for ML model integration!")
else:
    # Make a failed export visible instead of ending the cell silently.
    print(f"Export did not succeed for {export_path}; skipping verification.")

INFO:data_manager:Updating dataset from API
INFO:data_manager:Loading API data for last 8 days


=== UPDATE SYSTEM TEST ===

Testing API update process...


INFO:data_manager:Loaded 8 API records from 2025-09-11 00:00:00 to 2025-09-18 00:00:00
INFO:data_manager:Saved updated daily data to ../data/processed/daily_production_combined.csv
INFO:data_manager:Update complete: {'existing_records': 734, 'api_records': 8, 'new_records': 0, 'updated_records': 8}
INFO:data_manager:Exported 734 records to ../data/processed/combined_solar_data.csv


Update Results:
  existing_records: 734
  api_records: 8
  new_records: 0
  updated_records: 8

=== ML INTEGRATION EXPORT ===
Export successful:
  Combined dataset: 734 daily records
  Date range: 2023-09-16 to 2025-09-18
  Total production: 27485 kWh

Most recent 10 days:
    2025-09-09: 48.7 kWh
    2025-09-10: 23.2 kWh
    2025-09-11: 21.9 kWh
    2025-09-12: 50.9 kWh
    2025-09-13: 50.7 kWh
    2025-09-14: 26.6 kWh
    2025-09-15: 23.6 kWh
    2025-09-16: 16.7 kWh
    2025-09-17: 33.8 kWh
    2025-09-18: 53.1 kWh

Ready for ML model integration!
