In [1]:
import pandas as pd
import numpy as np
# Read the five cleaned CSV inputs. Each file's timestamp column is handed to
# `parse_dates` so pandas attempts to parse it while loading.
hourly_load = pd.read_csv(
    '../clean_data/forecasts/ercot_load_forecast_2025-01-01_to_2025-09_01_combined_latest.csv',
    parse_dates=['interval_start_local'],
)
solar = pd.read_csv(
    '../clean_data/ercot_solar_forecasts_allzones_2025.csv',
    parse_dates=['interval_start_local'],
)
wind = pd.read_csv(
    '../clean_data/ercot_wind_forecasts_hourly_2025.csv',
    parse_dates=['interval_start_local'],
)
# The LMP and weather files key their rows on a column named `datetime`.
lmp = pd.read_csv(
    '../clean_data/LMP_2025_Hubs.csv',
    parse_dates=['datetime'],
)
weather = pd.read_csv(
    '../clean_data/weather_hourly_2025.csv',
    parse_dates=['datetime'],
)

In [2]:
import pandas as pd
def ensure_localized(df, col, tz='America/Chicago', make_naive=True):
    """Normalize the timestamp column `col` of `df` to timezone `tz`, in place.

    Values are parsed with ``pd.to_datetime(..., errors='coerce', utc=True)``:

    - entries that carry a UTC offset (e.g. ``-06:00``) are converted to UTC
      using that offset;
    - tz-naive entries are interpreted as UTC -- they are NOT localized to `tz`;
    - entries that cannot be parsed become ``NaT``.

    The UTC series is then converted to `tz` and, when `make_naive` is true,
    the tz info is dropped so the column holds naive wall-clock times in `tz`.

    NOTE(review): because naive inputs are taken as UTC, a column that already
    stores naive *local* times is shifted by the UTC offset of `tz`. Confirm
    that every naive source column really is UTC before relying on the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; `df[col]` is overwritten.
    col : str
        Name of the timestamp column.
    tz : str
        Target timezone name passed to ``Series.dt.tz_convert``.
    make_naive : bool
        If true, strip the tz info after conversion.

    Returns
    -------
    None
        If `col` is not a column of `df`, a message is printed and `df` is
        left untouched.
    """
    if col not in df.columns:
        print(f"column not found: {col}")
        return

    # With utc=True the result is always tz-aware (UTC), including for empty or
    # all-NaT input, so no separate "naive" fallback parse is required.
    s = pd.to_datetime(df[col], errors='coerce', utc=True)

    try:
        s = s.dt.tz_convert(tz)
    except Exception as exc:
        # Best effort: keep the UTC values, but report it -- otherwise UTC
        # wall-clock times would silently pass for local ones.
        print(f"could not convert {col} to {tz}: {exc!r}; values left in UTC")

    # Optionally drop tz info to produce naive timestamps
    if make_naive:
        s = s.dt.tz_localize(None)
    df[col] = s

# Normalize the timestamp column of every frame loaded above to `tz`
tz = 'America/Chicago'
frames_to_check = (
    ('hourly_load', hourly_load, 'interval_start_local'),
    ('solar', solar, 'interval_start_local'),
    ('wind', wind, 'interval_start_local'),
    ('lmp', lmp, 'datetime'),
    ('weather', weather, 'datetime'),
)

for name, df, col in frames_to_check:
    ensure_localized(df, col, tz=tz, make_naive=True)

# Spot-check: dtype, null count and first rows of each converted column
for name, df, col in frames_to_check:
    if col not in df.columns:
        print(f"{name} missing column {col}")
        continue
    print(f"{name} -> {col} dtype:", df[col].dtype, "nulls:", df[col].isna().sum())
    display(df[[col]].head(3))

hourly_load -> interval_start_local dtype: datetime64[ns] nulls: 0


Unnamed: 0,interval_start_local
0,2025-01-01 00:00:00
1,2025-01-01 01:00:00
2,2025-01-01 02:00:00


solar -> interval_start_local dtype: datetime64[ns] nulls: 0


Unnamed: 0,interval_start_local
0,2025-01-01 00:00:00
1,2025-01-01 01:00:00
2,2025-01-01 02:00:00


wind -> interval_start_local dtype: datetime64[ns] nulls: 0


Unnamed: 0,interval_start_local
0,2025-01-01 00:00:00
1,2025-01-01 01:00:00
2,2025-01-01 02:00:00


lmp -> datetime dtype: datetime64[ns] nulls: 0


Unnamed: 0,datetime
0,2024-12-31 18:00:00
1,2024-12-31 19:00:00
2,2024-12-31 20:00:00


weather -> datetime dtype: datetime64[ns] nulls: 0


Unnamed: 0,datetime
0,2024-12-31 18:00:00
1,2024-12-31 19:00:00
2,2024-12-31 20:00:00


In [3]:
# Remove wind's extra interval columns so that only `interval_start_local`
# remains as its time key for the merge below.
# errors='ignore' keeps this cell re-runnable: without it, a second execution
# raises KeyError because the columns are already gone.
wind.drop(
    columns=['interval_start_utc', 'interval_end_utc', 'interval_end_local'],
    inplace=True,
    errors='ignore',
)

In [4]:
# Give weather and lmp the same time-key name as the other frames so all of
# them can be merged on `interval_start_local`.
weather.rename({'datetime': 'interval_start_local'}, axis='columns', inplace=True)
lmp.rename({'datetime': 'interval_start_local'}, axis='columns', inplace=True)


In [5]:
# Left-join every other source onto the hourly load rows, keyed on the shared
# timestamp. Column names that collide get pandas' default `_x` / `_y` suffixes.
merged = (
    hourly_load
    .merge(solar, how='left', on='interval_start_local')
    .merge(wind, how='left', on='interval_start_local')
    .merge(lmp, how='left', on='interval_start_local')
    .merge(weather, how='left', on='interval_start_local')
)

In [6]:
# Missing-value count per column of the merged frame
merged.isnull().sum()

interval_start_local      0
interval_start_utc        0
interval_end_local        0
interval_end_utc          0
publish_time_local_x      0
                       ... 
SLP_qc                  146
WIND_DIR_deg            407
WIND_DIR_qc             146
WIND_SPD_ms             148
WIND_SPD_qc             146
Length: 62, dtype: int64

In [7]:
# Keep only rows that have an HB_BUSAVG price. Take an explicit copy: the
# diagnostic cell that follows assigns a column on `merged`, and assigning into
# a boolean-mask slice is the pattern that triggers SettingWithCopyWarning.
merged = merged.loc[merged['HB_BUSAVG'].notna()].copy()

In [8]:
# Diagnostic: inspect the in-memory `merged` DataFrame -- which LMP/hub column
# is available, how many values it has, and whether its hourly timestamps are
# continuous.
import pandas as pd

# Locate the timestamp column; the first candidate present in `merged` wins.
dt_col = next(
    (c for c in ['interval_start_local', 'datetime', 'start', 'timestamp'] if c in merged.columns),
    None,
)
if dt_col is not None:
    # ensure the datetime column is parsed (unparseable values become NaT)
    merged[dt_col] = pd.to_datetime(merged[dt_col], errors='coerce')

print('Columns in merged (count={}):'.format(len(merged.columns)))
print(list(merged.columns))

# Find candidate HB/LMP columns by keyword
candidates = [c for c in merged.columns if any(k in c.lower() for k in ('hb', 'busavg', 'lmp', 'hub', 'bus'))]
print('LMP/HB candidate columns:', candidates)

# Prefer HB_BUSAVG; otherwise fall back to the first keyword match, if any
target = 'HB_BUSAVG' if 'HB_BUSAVG' in merged.columns else (candidates[0] if candidates else None)
if target is None:
    print('No HB/LMP-like column found in `merged`.')
else:
    print('Using column for LMP analysis:', target)
    ser = merged[target]
    has_lmp = ser.notna()
    print(ser.describe())
    print('nulls:', ser.isna().sum(), 'of', len(ser))
    # show sample values where not null
    print('Sample non-null values:')
    # Only include the timestamp column when one was found; indexing with a
    # None label would raise KeyError.
    sample_cols = [target] if dt_col is None else [dt_col, target]
    display(merged.loc[has_lmp, sample_cols].head(10))
    # continuity check on rows where this LMP is present
    if dt_col is not None:
        df_present = merged.loc[has_lmp].sort_values(dt_col)
        s = df_present[dt_col]  # already datetime64 from the parse above
        # hours between consecutive rows; anything above ~1h counts as a gap
        diffs = s.diff().dt.total_seconds() / 3600.0
        gaps = (diffs > 1.001).sum()
        print('Rows with LMP present:', len(df_present))
        print('Gaps (diff>1.001h) among those rows:', gaps)
        # per-day completeness: number of distinct clock hours seen on each date
        hours_per_day = s.dt.hour.groupby(s.dt.date).nunique()
        full_days = int((hours_per_day == 24).sum())
        print('Days with full 24 hours for rows where LMP present:', full_days, 'of', len(hours_per_day))
    else:
        print('Skipping LMP continuity check because no datetime column found in merged.')

Columns in merged (count=62):
['interval_start_local', 'interval_start_utc', 'interval_end_local', 'interval_end_utc', 'publish_time_local_x', 'publish_time_utc_x', 'north', 'south', 'west', 'houston', 'system_total', 'forecast_date', 'publish_time_used', 'pvgrpp_system_wide', 'stppf_system_wide', 'cop_hsl_system_wide_x', 'pvgrpp_centerwest', 'stppf_centerwest', 'cop_hsl_centerwest', 'pvgrpp_northwest', 'stppf_northwest', 'cop_hsl_northwest', 'pvgrpp_fareast', 'stppf_fareast', 'cop_hsl_fareast', 'pvgrpp_southeast', 'stppf_southeast', 'cop_hsl_southeast', 'pvgrpp_centereast', 'stppf_centereast', 'cop_hsl_centereast', 'publish_time_local_y', 'publish_time_utc_y', 'cop_hsl_system_wide_y', 'stwpf_system_wide', 'wgrpp_system_wide', 'cop_hsl_lz_south_houston', 'stwpf_lz_south_houston', 'wgrpp_lz_south_houston', 'cop_hsl_lz_west', 'stwpf_lz_west', 'wgrpp_lz_west', 'cop_hsl_lz_north', 'stwpf_lz_north', 'wgrpp_lz_north', 'HB_BUSAVG', 'HB_HOUSTON', 'HB_HUBAVG', 'HB_NORTH', 'HB_PAN', 'HB_SOUTH', 

Unnamed: 0,interval_start_local,HB_BUSAVG
0,2025-01-01 00:00:00,20.49
1,2025-01-01 01:00:00,20.94
2,2025-01-01 02:00:00,21.88
3,2025-01-01 03:00:00,14.78
4,2025-01-01 04:00:00,8.09
5,2025-01-01 05:00:00,13.29
6,2025-01-01 06:00:00,15.22
7,2025-01-01 07:00:00,15.17
8,2025-01-01 08:00:00,14.93
9,2025-01-01 09:00:00,6.75


Rows with LMP present: 5854
Gaps (diff>1.001h) among those rows: 2
Days with full 24 hours for rows where LMP present: 242 of 244


In [9]:
# Write the filtered, merged frame to disk without the row index
merged.to_csv(path_or_buf='../clean_data/test_data_forecast.csv', index=False)