In [4]:
import os
from dotenv import load_dotenv
import requests
import pandas as pd
from io import StringIO

In [5]:
# Load environment variables (from a local .env file, if one exists) and read
# the GridStatus API key used by the request cells further down.
load_dotenv()
api_key = os.getenv("GRIDSTATUS_API_KEY")

# Fail fast with an explicit exception instead of `assert`:
#   * `assert` statements are removed when Python runs with -O, and
#   * an empty/blank value (e.g. `GRIDSTATUS_API_KEY=` in .env) is not None,
#     so the old `is not None` check let it through to the API call.
if api_key is None or not api_key.strip():
    raise RuntimeError("GRIDSTATUS_API_KEY not found in environment!")

In [6]:
## For 2023-2024 ACTUALS
#
# Downloads the hourly ERCOT solar actual/forecast dataset from the GridStatus
# API and reduces it to one row per operating hour (ERCOT local time) holding
# the actual generation columns for the whole system and each region.
# Result: `df_actuals`.

# Base URL
url = "https://api.gridstatus.io/v1/datasets/ercot_solar_actual_and_forecast_by_geo_region_hourly/query"

# The request window ends one day AFTER the last day we want. If the API treats
# end_time as an exclusive bound (NOTE(review): confirm against the GridStatus
# docs), asking for "2024-12-31" would drop all of Dec 31. Any extra rows are
# trimmed by the local-time cutoff below, so the result is the same either way.
params = {
    "start_time": "2023-01-01",
    "end_time":   "2025-01-01",
    "publish_time": "latest",
    "timezone": "market",
    "return_format": "csv",
}

# Make request.
# * The API key goes in a header rather than the query string: requests puts
#   the full URL into the HTTPError message raised by raise_for_status(), which
#   would otherwise print the key into the notebook output on any failure.
# * timeout=(connect, read) in seconds keeps a stalled connection from hanging
#   the kernel forever; the read timeout is generous because the CSV is large.
resp = requests.get(
    url,
    params=params,
    headers={"x-api-key": api_key},
    timeout=(10, 300),
)
resp.raise_for_status()

df = pd.read_csv(StringIO(resp.text))

# timezone workaround to avoid error of timezones not matching

# 1. Parse timestamps as UTC (prevents mixed offset errors)
df["interval_start_local"] = pd.to_datetime(df["interval_start_local"], utc=True)

# 2. Apply cutoff (also tz-aware).
#    The cutoff is the last second of 2024 on the ERCOT wall clock, converted
#    to UTC for the comparison. Building it directly in UTC (as before) cut the
#    data off at 17:59:59 local time and silently dropped the final evening
#    hours of Dec 31.
cutoff_utc = pd.Timestamp("2024-12-31 23:59:59", tz="America/Chicago").tz_convert("UTC")
df = df[df["interval_start_local"] <= cutoff_utc].copy()

# 3. Convert UTC → ERCOT local time
df["interval_start_local"] = df["interval_start_local"].dt.tz_convert("America/Chicago")

# keep only rows containing actuals (at least one gen_* column is populated)
actual_cols = [c for c in df.columns if c.startswith("gen_")]
df = df.dropna(subset=actual_cols, how="all").copy()

# keep latest publish version per operating interval
if "publish_time_local" in df.columns:
    df["publish_time_local"] = pd.to_datetime(df["publish_time_local"], utc=True)
    df = (
        df.sort_values("publish_time_local")
          .drop_duplicates(subset=["interval_start_local"], keep="last")
    )

# select columns: timestamp + systemwide + all regions
regions = ["system_wide", "centerwest", "northwest", "fareast", "southeast", "centereast"]
solar_cols = ["interval_start_local"] + [f"gen_{r}" for r in regions]

df_actuals = (
    df[solar_cols]
      .sort_values("interval_start_local")
      .reset_index(drop=True)
)

df_actuals.head()


Unnamed: 0,interval_start_local,gen_system_wide,gen_centerwest,gen_northwest,gen_fareast,gen_southeast,gen_centereast
0,2023-01-01 00:00:00-06:00,0.45,0.01,0.0,0.36,0.0,0.07
1,2023-01-01 01:00:00-06:00,0.46,0.01,0.0,0.37,0.0,0.07
2,2023-01-01 02:00:00-06:00,0.45,0.01,0.0,0.36,0.0,0.07
3,2023-01-01 03:00:00-06:00,0.46,0.01,0.0,0.37,0.0,0.07
4,2023-01-01 04:00:00-06:00,0.45,0.01,0.0,0.36,0.0,0.07


In [7]:
# Save the cleaned 2023-2024 actuals as CSV.
# The output folder defaults to the project's clean_data directory; set the
# ERCOT_CLEAN_DATA_DIR environment variable (e.g. in .env) to write somewhere
# else without editing this cell.
clean_data_dir = os.getenv(
    "ERCOT_CLEAN_DATA_DIR",
    r"C:\Users\lemre\Documents\ERCOT_Peaker_Project\clean_data",
)

# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(clean_data_dir, exist_ok=True)

df_actuals.to_csv(
    os.path.join(clean_data_dir, "ercot_solar_actuals_allzones_2023_2024.csv"),
    index=False,
)

In [9]:
## For 2025 ACTUALS (for testing)
#
# Same pipeline as the 2023-2024 download, for Jan-Aug 2025: fetch the hourly
# ERCOT solar actual/forecast dataset from the GridStatus API and reduce it to
# one row per operating hour (ERCOT local time) with the actual generation
# columns for the whole system and each region. Result: `df_actuals_2025`.

# Base URL
url = "https://api.gridstatus.io/v1/datasets/ercot_solar_actual_and_forecast_by_geo_region_hourly/query"

# The request window ends one day AFTER the last day we want. If the API treats
# end_time as an exclusive bound (NOTE(review): confirm against the GridStatus
# docs), asking for "2025-08-31" would drop all of Aug 31. Any extra rows are
# trimmed by the local-time cutoff below, so the result is the same either way.
# "publish_time": "latest" matches the 2023-2024 query; the de-duplication step
# further down still guards against multiple publish versions per interval.
params = {
    "start_time": "2025-01-01",
    "end_time":   "2025-09-01",
    "publish_time": "latest",
    "timezone": "market",
    "return_format": "csv",
}

# Make request.
# * The API key goes in a header rather than the query string: requests puts
#   the full URL into the HTTPError message raised by raise_for_status(), which
#   would otherwise print the key into the notebook output on any failure.
# * timeout=(connect, read) in seconds keeps a stalled connection from hanging
#   the kernel forever; the read timeout is generous because the CSV is large.
resp = requests.get(
    url,
    params=params,
    headers={"x-api-key": api_key},
    timeout=(10, 300),
)
resp.raise_for_status()

df = pd.read_csv(StringIO(resp.text))

# timezone workaround to avoid error of timezones not matching

# 1. Parse timestamps as UTC (prevents mixed offset errors)
df["interval_start_local"] = pd.to_datetime(df["interval_start_local"], utc=True)

# 2. Apply cutoff (also tz-aware).
#    The cutoff is the last second of Aug 31, 2025 on the ERCOT wall clock,
#    converted to UTC for the comparison. Building it directly in UTC (as
#    before) cut the data off at 18:59:59 local (CDT) time and silently dropped
#    the final evening hours of Aug 31.
cutoff_utc = pd.Timestamp("2025-08-31 23:59:59", tz="America/Chicago").tz_convert("UTC")
df = df[df["interval_start_local"] <= cutoff_utc].copy()

# 3. Convert UTC → ERCOT local time
df["interval_start_local"] = df["interval_start_local"].dt.tz_convert("America/Chicago")

# keep only rows containing actuals (at least one gen_* column is populated)
actual_cols = [c for c in df.columns if c.startswith("gen_")]
df = df.dropna(subset=actual_cols, how="all").copy()

# keep latest publish version per operating interval
if "publish_time_local" in df.columns:
    df["publish_time_local"] = pd.to_datetime(df["publish_time_local"], utc=True)
    df = (
        df.sort_values("publish_time_local")
          .drop_duplicates(subset=["interval_start_local"], keep="last")
    )

# select columns: timestamp + systemwide + all regions
regions = ["system_wide", "centerwest", "northwest", "fareast", "southeast", "centereast"]
solar_cols = ["interval_start_local"] + [f"gen_{r}" for r in regions]

df_actuals_2025 = (
    df[solar_cols]
      .sort_values("interval_start_local")
      .reset_index(drop=True)
)

df_actuals_2025.head()


Unnamed: 0,interval_start_local,gen_system_wide,gen_centerwest,gen_northwest,gen_fareast,gen_southeast,gen_centereast
0,2025-01-01 00:00:00-06:00,0.72,0.0,0.0,0.35,0.0,0.36
1,2025-01-01 01:00:00-06:00,0.75,0.0,0.0,0.36,0.0,0.37
2,2025-01-01 02:00:00-06:00,0.73,0.0,0.0,0.35,0.0,0.36
3,2025-01-01 03:00:00-06:00,0.75,0.0,0.0,0.37,0.0,0.36
4,2025-01-01 04:00:00-06:00,0.76,0.0,0.0,0.37,0.0,0.36


In [10]:
# Save the cleaned 2025 actuals as CSV.
# The output folder defaults to the project's clean_data directory; set the
# ERCOT_CLEAN_DATA_DIR environment variable (e.g. in .env) to write somewhere
# else without editing this cell.
clean_data_dir = os.getenv(
    "ERCOT_CLEAN_DATA_DIR",
    r"C:\Users\lemre\Documents\ERCOT_Peaker_Project\clean_data",
)

# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(clean_data_dir, exist_ok=True)

df_actuals_2025.to_csv(
    os.path.join(clean_data_dir, "ercot_solar_actuals_allzones_2025.csv"),
    index=False,
)