In [None]:
# import libraries
import requests
import pandas as pd
import time
import zipfile
from io import BytesIO
from pandas.errors import EmptyDataError
from skimpy import skim
import os

In [None]:
# DOE and NREL Laws and Incentives Data #
# Downloads the NREL transportation laws & incentives CSV export once, saves it
# to nrel_laws_incentives.csv, and loads the saved copy for a first look.

# Endpoint only; the filters and the API key are sent as query parameters.
csv_url = "https://developer.nrel.gov/api/transportation-incentives-laws/v1.csv"

# Do not commit a personal API key to the notebook: read it from the
# NREL_API_KEY environment variable. DEMO_KEY is the shared, heavily
# rate-limited key from the NREL developer docs and is only a fallback.
nrel_api_key = os.environ.get("NREL_API_KEY", "DEMO_KEY")

# Same filters the original hard-coded query string carried
# (requests percent-encodes the commas as %2C).
nrel_params = {
    "api_key": nrel_api_key,
    "expired": "false",
    "incentive_type": "GNT,TAX,LOANS,RBATE,EXEM,TOU,OTHER",
    "law_type": "INC,PROG,LAWREG,STATEINC",
    "regulation_type": "REQ,DREST,REGIS,EVFEE,FUEL,STD,RFS,AIRQEMISSIONS,CCEINIT,UTILITY,BUILD,RTC,OTHER",
    "technology": "BIOD,ETH,NG,LPG,HY,ELEC,PHEV,HEV,NEVS,RD,AFTMKTCONV,EFFEC,IR,AUTONOMOUS,OTHER",
    "user_type": "FLEET,GOV,TRIBAL,IND,STATION,AFP,PURCH,MAN,MUD,TRANS,OTHER",
}

# Single download (the file used to be fetched twice). The timeout stops a
# stalled connection from hanging the kernel.
response = requests.get(csv_url, params=nrel_params, timeout=60)
# Fail loudly here rather than saving an error page to disk as if it were CSV.
response.raise_for_status()

# save the csv file (the with-block closes the handle; no explicit close needed)
with open("nrel_laws_incentives.csv", "wb") as f:
    f.write(response.content)

print("CSV file downloaded successfully.")

# Load the saved CSV into a DataFrame
nrel_laws_incentives = pd.read_csv("nrel_laws_incentives.csv")
df = nrel_laws_incentives  # keeps the generic name this cell has always defined

# Show the first few rows
print(nrel_laws_incentives.head())
skim(nrel_laws_incentives)

## limits:
# some fields are long text or JSON-like strings

## initial data checks:
# need to drop any potentially redundant variables
# missingness is present in a lot of the variables, with some having NA percentages close to 100%, so need to look more deeply at whether these variables are important overall
# there are a lot of string variables, which will presumably be hard to parse through

In [None]:
# Open EI Data Scraping #
# Queries the OpenEI place-incentives endpoint once per state, flattens the
# returned bindings into one row per incentive, and saves the rows to
# all_state_incentives.csv.

# List of U.S. states (you can add more or switch to cities)
states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut",
    "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
    "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
    "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio",
    "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
    "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia",
    "Wisconsin", "Wyoming"
]

# Could also look at specific cities if wanted to via this program

# Output column -> field name inside each API binding.
INCENTIVE_FIELDS = {
    "name": "incentive_name",
    "type": "incentive_type",
    "description": "incentive_descr",
    "link": "incentive_page",
}
INCENTIVE_COLUMNS = ["location", *INCENTIVE_FIELDS]
REQUEST_TIMEOUT_SECONDS = 30  # stop a stalled request from hanging the whole loop
REQUEST_DELAY_SECONDS = 1     # pause between states to avoid overwhelming the server


def _extract_bindings(payload):
    """Return payload["results"]["bindings"] as a list, or [] if the shape differs."""
    results = payload.get("results") if isinstance(payload, dict) else None
    bindings = results.get("bindings") if isinstance(results, dict) else None
    return bindings if isinstance(bindings, list) else []


def _binding_value(item, field):
    """Return item[field]["value"], or None when the field is absent or malformed."""
    cell = item.get(field) if isinstance(item, dict) else None
    return cell.get("value") if isinstance(cell, dict) else None


# Store all results here
all_incentives = []

# Loop through each state and query the API
for state in states:
    print(f"Fetching: {state}")
    url = f"https://openei.org/services/api/place/{state}/incentives/v1"
    params = {
        "format": "json",
        "active": "yes"
    }

    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Best effort: report the failed state and carry on with the others.
        # ValueError covers a response body that is not valid JSON.
        print(f"Error fetching {state}: {e}")
    else:
        for item in _extract_bindings(data):
            # A binding that lacks one field used to raise KeyError and drop
            # every remaining record for the state; record None instead.
            row = {"location": state}
            for column, field in INCENTIVE_FIELDS.items():
                row[column] = _binding_value(item, field)
            all_incentives.append(row)

    time.sleep(REQUEST_DELAY_SECONDS)

# Convert to DataFrame. Explicit columns keep the header row even when nothing
# was fetched, so the read-back below cannot fail with EmptyDataError.
df = pd.DataFrame(all_incentives, columns=INCENTIVE_COLUMNS)

# Save to CSV
df.to_csv("all_state_incentives.csv", index=False)
print("Done! Saved all incentives.")

all_state_incentives = pd.read_csv("all_state_incentives.csv")
print(all_state_incentives.head())
skim(all_state_incentives)

# limitations
## there is a lot of text, making it hard to parse through this
## data takes a while to load since it is scraping through a lot of data and information

# initial check
## all of the data presented is string data, so need to look further to see if there is actually helpful information within it
## no NA data found though

In [None]:
# Section 48C Energy Communities Data #
# Downloads the zipped dataset from NETL EDX, saves every CSV it contains to the
# working directory, then loads the census-tract designation file.

# Use HTTPS so the archive is not fetched over an unencrypted connection.
url = "https://edx.netl.doe.gov/dataset/22944d5d-d063-4890-a995-064bc59b5a78/resource/3d01f2d6-1c1c-498d-8db2-3a51aa3c07f2/download"

# Spoof headers to mimic a browser
headers = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://edx.netl.doe.gov/",
}

# The timeout stops a stalled download from hanging the kernel.
response = requests.get(url, headers=headers, timeout=120)
response.raise_for_status()

# File the rest of the analysis depends on.
designation_csv = "48C_CensusTractDesignation.csv"
# Names of the CSVs written during THIS run (not leftovers from earlier runs).
extracted_csvs = []

# Unzip and preview
with zipfile.ZipFile(BytesIO(response.content)) as z:
    members = z.namelist()
    print("Files in ZIP:", members)
    for member in members:
        # Case-insensitive so "*.CSV" entries are not silently skipped.
        if not member.lower().endswith(".csv"):
            continue

        with z.open(member) as f:
            df = pd.read_csv(f)
        print(f"\nPreview of {member}:")
        print(df.head())
        skim(df)

        filename = os.path.basename(member)  # handles nested folders inside zip
        df.to_csv(filename, index=False)
        extracted_csvs.append(filename)
        print(f"Saved {filename}")

# initial check
# missingness is only present in one variable, but there is a lot of it, so need to see if it is valuable information to our mission
# good mix of numerical and string variables

# Without this guard, a stale copy left on disk by an earlier run would be
# loaded silently if the archive no longer contained the expected file.
if designation_csv not in extracted_csvs:
    raise FileNotFoundError(
        f"{designation_csv} was not found in the downloaded archive; "
        f"CSV files extracted: {extracted_csvs}"
    )

data_48c = pd.read_csv(designation_csv)
print(data_48c.head())
skim(data_48c)