# Verify 'ecogridaidata' s3 bucket is created

In [8]:
%%bash

aws s3 ls s3://${bucket}/

2025-03-21 10:15:58 ecogridaidata
2025-03-03 19:04:48 sagemaker-studio-wmhb0pxgu8
2025-03-03 19:04:50 sagemaker-us-east-1-571350132829


In [9]:
import boto3

# Create an S3 client
s3 = boto3.client('s3')

In [10]:
from botocore.client import ClientError

response = None

try:
    response = s3.head_bucket(Bucket='ecogridaidata')
    print(response)
    setup_s3_bucket_passed = True
except ClientError as e:
    print("[ERROR] Cannot find bucket {} in {} due to {}.".format(bucket, response, e))

{'ResponseMetadata': {'RequestId': 'J3YSSZJNCW1CDCBK', 'HostId': 'o+tQzMk17wQJOt4DKf15OoFXC3CN4AoyZeSLlDd9bGs7HinATaFqqiaNSqJl5k4t6mlRyvCPib0=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'o+tQzMk17wQJOt4DKf15OoFXC3CN4AoyZeSLlDd9bGs7HinATaFqqiaNSqJl5k4t6mlRyvCPib0=', 'x-amz-request-id': 'J3YSSZJNCW1CDCBK', 'date': 'Sun, 23 Mar 2025 13:45:51 GMT', 'x-amz-bucket-region': 'us-east-1', 'x-amz-access-point-alias': 'false', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'BucketRegion': 'us-east-1', 'AccessPointAlias': False}


In [11]:
%store setup_s3_bucket_passed

Stored 'setup_s3_bucket_passed' (bool)


In [12]:
%store

Stored variables and their in-db values:
setup_dependencies_passed               -> True
setup_iam_roles_passed                  -> True
setup_instance_check_passed             -> True
setup_s3_bucket_passed                  -> True


# Store data into 'ecogridaidata' s3 folder through API integration

In [3]:
import boto3
import requests
import pandas as pd
from pandas import json_normalize
from datetime import datetime
import time
import json

In [2]:
# === CONFIGURATION ===
api_key = "g8IFZafpLwkh0hRmYC8xE91RQ1qa0Rggf567XdZe"
base_url = "https://api.eia.gov/v2/electricity/rto/daily-region-data/data/"
bucket_name = "ecogridaidata"
s3_folder = "eia_electricity"
target_total = 1_038_828

# === API PARAMS TEMPLATE ===
params_template = {
    "frequency": "daily",
    "data": ["value"],
    "facets": {},
    "start": "2024-01-01",
    "end": "2024-12-31",
    "sort": [{"column": "period", "direction": "desc"}],
    "offset": 0,
    "length": 5000
}

headers = {
    "Content-Type": "application/json"
}

# === DATA COLLECTION LOOP ===
all_data = []
offset = 0
length = params_template["length"]

while len(all_data) < target_total:
    print(f"Fetching records {offset} to {offset + length}...")

    params = params_template.copy()
    params["offset"] = offset

    response = requests.get(
        base_url,
        headers=headers,
        params={"api_key": api_key},
        json=params
    )

    if response.status_code != 200:
        print(f"Error at offset {offset}. Status code: {response.status_code}")
        break

    response_json = response.json()
    records = response_json.get("response", {}).get("data", [])

    if not records:
        print("No more data returned by API.")
        break

    all_data.extend(records)
    offset += length
    time.sleep(1)

    # Trim if we fetched slightly over
    if len(all_data) > target_total:
        all_data = all_data[:target_total]
        print("Trimmed to exact target record count.")

print(f"✅ Total records fetched: {len(all_data)}")

# === SAVE TO JSON ===
json_path = "/tmp/eia_demandforecast.json"
with open(json_path, "w") as f:
    json.dump(all_data, f, indent=2)

# Upload to S3
s3 = boto3.client("s3")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
s3_key = f"{s3_folder}/eia_data_demand_{timestamp}.json"

s3.upload_file(json_path, bucket_name, s3_key)
print(f"✅ Uploaded to s3://{bucket_name}/{s3_key}")


Fetching records 0 to 5000...
Fetching records 5000 to 10000...
Fetching records 10000 to 15000...
Fetching records 15000 to 20000...
Fetching records 20000 to 25000...
Fetching records 25000 to 30000...
Fetching records 30000 to 35000...
Fetching records 35000 to 40000...
Fetching records 40000 to 45000...
Fetching records 45000 to 50000...
Fetching records 50000 to 55000...
Fetching records 55000 to 60000...
Fetching records 60000 to 65000...
Fetching records 65000 to 70000...
Fetching records 70000 to 75000...
Fetching records 75000 to 80000...
Fetching records 80000 to 85000...
Fetching records 85000 to 90000...
Fetching records 90000 to 95000...
Fetching records 95000 to 100000...
Fetching records 100000 to 105000...
Fetching records 105000 to 110000...
Fetching records 110000 to 115000...
Fetching records 115000 to 120000...
Fetching records 120000 to 125000...
Fetching records 125000 to 130000...
Fetching records 130000 to 135000...
Fetching records 135000 to 140000...
Fetching 

NameError: name 'json' is not defined

In [9]:
# API for subregion data, used chatgpt on 3/23/2025 to assist with API connection of data.

# === CONFIGURATION ===
api_key = "g8IFZafpLwkh0hRmYC8xE91RQ1qa0Rggf567XdZe"
base_url = "https://api.eia.gov/v2/electricity/rto/daily-region-sub-ba-data/data/"
bucket_name = "ecogridaidata"
s3_folder = "eia_electricity"
target_total = 302_719

headers = {
    "Content-Type": "application/json"
}

# === STATIC PARAMS TEMPLATE (no offset/length here) ===
params_template = {
    "api_key": api_key,
    "frequency": "daily",
    "start": "2023-01-01",
    "end": "2024-12-31",
    "data[0]": "value",
    "sort[0][column]": "period",
    "sort[0][direction]": "desc"
}

# === DATA COLLECTION LOOP ===
all_data = []
offset = 0
length = 5000

while len(all_data) < target_total:
    print(f"Fetching records {offset} to {offset + length}...")

    # Combine static params with dynamic offset and length
    params = params_template.copy()
    params["offset"] = offset
    params["length"] = length

    response = requests.get(
        base_url,
        headers=headers,
        params=params
    )

    if response.status_code != 200:
        print(f"Error at offset {offset}. Status code: {response.status_code}")
        break

    response_json = response.json()
    records = response_json.get("response", {}).get("data", [])

    if not records:
        print("No more data returned by API.")
        break

    # 👀 Inspect sample record on first batch
    if offset == 0 and records:
        print("🔍 Sample keys in record:", records[0].keys())
        print("📦 Sample record preview:\n", json.dumps(records[0], indent=2))

    all_data.extend(records)
    offset += length
    time.sleep(1)

    # Trim if slightly over
    if len(all_data) > target_total:
        all_data = all_data[:target_total]
        print("Trimmed to exact target record count.")

print(f"✅ Total records fetched: {len(all_data)}")

# === SAVE RAW JSON ===
json_path = "/tmp/eia_demandforecast_subregion.json"
with open(json_path, "w") as f:
    json.dump(all_data, f, indent=2)

# === FLATTEN AND SAVE PREVIEW CSV ===
df = json_normalize(all_data)
df.columns = [col.replace(".", "_") for col in df.columns]  # ✅ Safe column name cleanup

csv_path = "/tmp/eia_demandforecast_subregion_flat.csv"
df.to_csv(csv_path, index=False)
print(f"📄 Flattened CSV saved to {csv_path}")
print("🧾 Flattened columns:", df.columns.tolist())

# === UPLOAD TO S3 (JSON file) ===
s3 = boto3.client("s3")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
s3_key = f"{s3_folder}/eia_data_subregion_{timestamp}.json"

s3.upload_file(json_path, bucket_name, s3_key)
print(f"✅ Uploaded raw JSON to s3://{bucket_name}/{s3_key}")



Fetching records 0 to 5000...
🔍 Sample keys in record: dict_keys(['period', 'subba', 'subba-name', 'parent', 'parent-name', 'timezone', 'value', 'value-units'])
📦 Sample record preview:
 {
  "period": "2024-12-31",
  "subba": "PGAE",
  "subba-name": "Pacific Gas and Electric",
  "parent": "CISO",
  "parent-name": "California Independent System Operator",
  "timezone": "Arizona",
  "value": "247213",
  "value-units": "megawatthours"
}
Fetching records 5000 to 10000...
Fetching records 10000 to 15000...
Fetching records 15000 to 20000...
Fetching records 20000 to 25000...
Fetching records 25000 to 30000...
Fetching records 30000 to 35000...
Fetching records 35000 to 40000...
Fetching records 40000 to 45000...
Fetching records 45000 to 50000...
Fetching records 50000 to 55000...
Fetching records 55000 to 60000...
Fetching records 60000 to 65000...
Fetching records 65000 to 70000...
Fetching records 70000 to 75000...
Fetching records 75000 to 80000...
Fetching records 80000 to 85000...
Fet

In [10]:
# === CONFIGURATION ===
api_key = "g8IFZafpLwkh0hRmYC8xE91RQ1qa0Rggf567XdZe"
base_url = "https://api.eia.gov/v2/electricity/rto/daily-fuel-type-data/data/"
bucket_name = "ecogridaidata"
s3_folder = "eia_electricity"
target_total = 1_425_802

headers = {
    "Content-Type": "application/json"
}

# === STATIC PARAMS TEMPLATE (no offset/length here) ===
params_template = {
    "api_key": api_key,
    "frequency": "daily",
    "start": "2023-01-01",
    "end": "2024-12-31",
    "data[0]": "value",
    "sort[0][column]": "period",
    "sort[0][direction]": "desc"
}-

# === DATA COLLECTION LOOP ===
all_data = []
offset = 0
length = 5000

while len(all_data) < target_total:
    print(f"Fetching records {offset} to {offset + length}...")

    # Combine static params with dynamic offset and length
    params = params_template.copy()
    params["offset"] = offset
    params["length"] = length

    response = requests.get(
        base_url,
        headers=headers,
        params=params
    )

    if response.status_code != 200:
        print(f"Error at offset {offset}. Status code: {response.status_code}")
        try:
            print("❗ Error details:", response.json())
        except:
            print("❗ Could not parse error response.")
        break

    response_json = response.json()
    records = response_json.get("response", {}).get("data", [])

    if not records:
        print("No more data returned by API.")
        break

    # 👀 Inspect sample record on first batch
    if offset == 0 and records:
        print("🔍 Sample keys in record:", records[0].keys())
        print("📦 Sample record preview:\n", json.dumps(records[0], indent=2))

    all_data.extend(records)
    offset += length
    time.sleep(1)

    # Trim if slightly over
    if len(all_data) > target_total:
        all_data = all_data[:target_total]
        print("Trimmed to exact target record count.")

print(f"✅ Total records fetched: {len(all_data)}")

# === SAVE RAW JSON ===
json_path = "/tmp/eia_data_energy_source.json"
with open(json_path, "w") as f:
    json.dump(all_data, f, indent=2)

# === FLATTEN AND SAVE PREVIEW CSV ===
df = json_normalize(all_data)
df.columns = [col.replace(".", "_") for col in df.columns]

csv_path = "/tmp/eia_data_energy_source_flat.csv"
df.to_csv(csv_path, index=False)
print(f"📄 Flattened CSV saved to {csv_path}")
print("🧾 Flattened columns:", df.columns.tolist())

# === UPLOAD TO S3 (JSON file) ===
s3 = boto3.client("s3")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
s3_key = f"{s3_folder}/eia_data_energy_source_{timestamp}.json"

s3.upload_file(json_path, bucket_name, s3_key)
print(f"✅ Uploaded raw JSON to s3://{bucket_name}/{s3_key}")

Fetching records 0 to 5000...
🔍 Sample keys in record: dict_keys(['period', 'respondent', 'respondent-name', 'fueltype', 'type-name', 'timezone', 'timezone-description', 'value', 'value-units'])
📦 Sample record preview:
 {
  "period": "2024-12-31",
  "respondent": "AECI",
  "respondent-name": "Associated Electric Cooperative, Inc.",
  "fueltype": "COL",
  "type-name": "Coal",
  "timezone": "Arizona",
  "timezone-description": "Arizona",
  "value": "23782",
  "value-units": "megawatthours"
}
Fetching records 5000 to 10000...
Fetching records 10000 to 15000...
Fetching records 15000 to 20000...
Fetching records 20000 to 25000...
Fetching records 25000 to 30000...
Fetching records 30000 to 35000...
Fetching records 35000 to 40000...
Fetching records 40000 to 45000...
Fetching records 45000 to 50000...
Fetching records 50000 to 55000...
Fetching records 55000 to 60000...
Fetching records 60000 to 65000...
Fetching records 65000 to 70000...
Fetching records 70000 to 75000...
Fetching record

In [None]:
# === CONFIGURATION ===
api_key = "g8IFZafpLwkh0hRmYC8xE91RQ1qa0Rggf567XdZe"
base_url = "https://api.eia.gov/v2/electricity/rto/daily-interchange-data/data/"
bucket_name = "ecogridaidata"
s3_folder = "eia_electricity"
target_total = 1_233_232

headers = {
    "Content-Type": "application/json"
}

# === STATIC PARAMS TEMPLATE ===
params_template = {
    "api_key": api_key,
    "frequency": "daily",
    "start": "2023-01-01",
    "end": "2024-12-31",
    "data[0]": "value",
    "sort[0][column]": "period",
    "sort[0][direction]": "desc"
}

# === DATA COLLECTION LOOP ===
all_data = []
offset = 0
length = 5000

while len(all_data) < target_total:
    print(f"Fetching records {offset} to {offset + length}...")

    # Merge template with dynamic offset/length
    params = params_template.copy()
    params["offset"] = offset
    params["length"] = length

    response = requests.get(
        base_url,
        headers=headers,
        params=params
    )

    if response.status_code != 200:
        print(f"❌ Error at offset {offset}. Status code: {response.status_code}")
        try:
            print("Details:", response.json())
        except:
            print("Could not parse error response.")
        break

    response_json = response.json()
    records = response_json.get("response", {}).get("data", [])

    if not records:
        print("No more data returned by API.")
        break

    if offset == 0 and records:
        print("🔍 Sample keys in record:", records[0].keys())
        print("📦 Sample record preview:\n", json.dumps(records[0], indent=2))

    all_data.extend(records)
    offset += length
    time.sleep(1)

    if len(all_data) > target_total:
        all_data = all_data[:target_total]
        print("Trimmed to exact target record count.")

print(f"✅ Total records fetched: {len(all_data)}")

# === SAVE RAW JSON ===
json_path = "/tmp/eia_data_neighboring_bal.json"
with open(json_path, "w") as f:
    json.dump(all_data, f, indent=2)

# === FLATTEN AND SAVE PREVIEW CSV ===
df = json_normalize(all_data)
df.columns = [col.replace(".", "_") for col in df.columns]

csv_path = "/tmp/eia_data_neighboring_bal_flat.csv"
df.to_csv(csv_path, index=False)
print(f"📄 Flattened CSV saved to {csv_path}")
print("🧾 Flattened columns:", df.columns.tolist())

# === UPLOAD TO S3 ===
s3 = boto3.client("s3")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
s3_key = f"{s3_folder}/eia_data_neighboring_bal_{timestamp}.json"

s3.upload_file(json_path, bucket_name, s3_key)
print(f"✅ Uploaded raw JSON to s3://{bucket_name}/{s3_key}")


# Release Resources

In [17]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [1]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>