In [1]:
import os
from dotenv import load_dotenv
import requests
import pandas as pd
from io import StringIO

In [2]:
# Load environment variables and get API key
load_dotenv()
api_key = os.getenv("GRIDSTATUS_API_KEY")
assert api_key is not None, "GRIDSTATUS_API_KEY not found in environment!"

In [5]:
## For 2023-2024 (training set)

# Base URL
url = "https://api.gridstatus.io/v1/datasets/ercot_solar_actual_and_forecast_by_geo_region_hourly/query"

# Query parameters
params = {
    "start_time": "2023-01-01",
    "end_time":   "2024-12-31",
    # for each operating time, grab the latest forecast with
    # publish_time <= operating_time - 24 hours
    "publish_time": "latest_before:-24 hours",
    "timezone": "market",
    "api_key": api_key,
    "return_format": "csv",
}

# Make request
resp = requests.get(url, params=params)
resp.raise_for_status()

df = pd.read_csv(StringIO(resp.text))

# drop actuals (they're always NaN in forecast-only subset)
actual_cols = [c for c in df.columns if c.startswith("gen_")]
df = df.drop(columns=actual_cols, errors="ignore")

# timezone workaround to avoid error of timezones not matching

# 1. Parse timestamps as UTC (prevents mixed offset errors)
df["interval_start_local"] = pd.to_datetime(df["interval_start_local"], utc=True)

# 2. Apply cutoff (also tz-aware)
cutoff_utc = pd.Timestamp("2024-12-31 23:59:59", tz="UTC")
df = df[df["interval_start_local"] <= cutoff_utc].copy()

# 3. Convert UTC → ERCOT local time
df["interval_start_local"] = df["interval_start_local"].dt.tz_convert("America/Chicago")

# include systemwide + all regions

regions = ["system_wide", "centerwest", "northwest", "fareast", "southeast", "centereast"]

solar_cols = ["interval_start_local"]

for r in regions:
    solar_cols += [
        f"pvgrpp_{r}",
        f"stppf_{r}",
        f"cop_hsl_{r}"
    ]

df_solar_all = (
    df[solar_cols]
    .sort_values("interval_start_local")
    .reset_index(drop=True)
)

print(df_solar_all.shape)
df_solar_all.head()


(17520, 19)


Unnamed: 0,interval_start_local,pvgrpp_system_wide,stppf_system_wide,cop_hsl_system_wide,pvgrpp_centerwest,stppf_centerwest,cop_hsl_centerwest,pvgrpp_northwest,stppf_northwest,cop_hsl_northwest,pvgrpp_fareast,stppf_fareast,cop_hsl_fareast,pvgrpp_southeast,stppf_southeast,cop_hsl_southeast,pvgrpp_centereast,stppf_centereast,cop_hsl_centereast
0,2023-01-01 00:00:00-06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-01-01 01:00:00-06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2023-01-01 02:00:00-06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2023-01-01 03:00:00-06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2023-01-01 04:00:00-06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# save csv to folder
df_solar_all.to_csv(
    r"C:\Users\lemre\Documents\ERCOT_Peaker_Project\clean_data\ercot_solar_forecasts_allzones_2023_2024.csv",
    index=False
)

In [8]:
## For 2025 for test set

# Base URL
url = "https://api.gridstatus.io/v1/datasets/ercot_solar_actual_and_forecast_by_geo_region_hourly/query"

# Query parameters
params = {
    "start_time": "2025-01-01",
    "end_time":   "2025-08-31",
    # for each operating time, grab the latest forecast with
    # publish_time <= operating_time - 24 hours
    "publish_time": "latest_before:-24 hours",
    "timezone": "market",
    "api_key": api_key,
    "return_format": "csv",
}

# Make request
resp = requests.get(url, params=params)
resp.raise_for_status()

df = pd.read_csv(StringIO(resp.text))

# drop actuals (they're always NaN in forecast-only subset)
actual_cols = [c for c in df.columns if c.startswith("gen_")]
df = df.drop(columns=actual_cols, errors="ignore")

# timezone workaround to avoid error of timezones not matching

# 1. Parse timestamps as UTC (prevents mixed offset errors)
df["interval_start_local"] = pd.to_datetime(df["interval_start_local"], utc=True)

# 2. Apply cutoff (also tz-aware)
cutoff_utc = pd.Timestamp("2025-08-31 23:59:59", tz="UTC")
df = df[df["interval_start_local"] <= cutoff_utc].copy()

# 3. Convert UTC → ERCOT local time
df["interval_start_local"] = df["interval_start_local"].dt.tz_convert("America/Chicago")

# include systemwide + all regions

regions = ["system_wide", "centerwest", "northwest", "fareast", "southeast", "centereast"]

solar_cols = ["interval_start_local"]

for r in regions:
    solar_cols += [
        f"pvgrpp_{r}",
        f"stppf_{r}",
        f"cop_hsl_{r}"
    ]

df_solar_all_2025 = (
    df[solar_cols]
    .sort_values("interval_start_local")
    .reset_index(drop=True)
)

print(df_solar_all_2025.shape)
df_solar_all_2025.head()


(5807, 19)


Unnamed: 0,interval_start_local,pvgrpp_system_wide,stppf_system_wide,cop_hsl_system_wide,pvgrpp_centerwest,stppf_centerwest,cop_hsl_centerwest,pvgrpp_northwest,stppf_northwest,cop_hsl_northwest,pvgrpp_fareast,stppf_fareast,cop_hsl_fareast,pvgrpp_southeast,stppf_southeast,cop_hsl_southeast,pvgrpp_centereast,stppf_centereast,cop_hsl_centereast
0,2025-01-01 00:00:00-06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2025-01-01 01:00:00-06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2025-01-01 02:00:00-06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2025-01-01 03:00:00-06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2025-01-01 04:00:00-06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# save csv to folder
df_solar_all_2025.to_csv(
    r"C:\Users\lemre\Documents\ERCOT_Peaker_Project\clean_data\ercot_solar_forecasts_allzones_2025.csv",
    index=False
)