In [1]:
import os
import re
import io
from io import BytesIO
import requests
import zipfile
import openpyxl
import csv
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime

# ========================
# Download RECS Data Files
# ========================
def _recs_find_latest_public_csv(year, timeout):
    """Probe versioned RECS public CSV URLs and return the newest one that exists.

    Probing starts at v5 and walks upward one version at a time; the first
    version that is missing (non-200 status) or that answers with an HTML page
    ends the search.

    Args:
        year: RECS survey year used to build the file name and URL.
        timeout: Per-request timeout in seconds.

    Returns:
        A ``(file_name, content_bytes)`` tuple for the last version that
        downloaded successfully, or ``None`` if not even v5 was available.

    Raises:
        requests.RequestException: If a probe request fails at the network level.
    """
    newest = None
    version = 5
    while True:
        file_name = f'recs{year}_public_v{version}.csv'
        url = f'https://www.eia.gov/consumption/residential/data/{year}/csv/{file_name}'
        probe = requests.get(url, timeout=timeout)
        if probe.status_code != 200 or b"<html" in probe.content[:500].lower():
            return newest
        newest = (file_name, probe.content)
        version += 1


def download_recs_files():
    """Download the most recent RECS microdata and its codebook from eia.gov.

    The function scrapes the RECS landing page for the newest survey year,
    opens that year's microdata page and looks for a ZIP, CSV and XLSX
    codebook link. Data is saved under
    ``STAT 390 Project/Energy Data/RECS/<year>``:

    * ZIP link found: every ``.csv``/``.txt`` member is extracted (flattened
      to its base name).
    * otherwise CSV link found: the CSV is saved; if the link answers with an
      HTML page, versioned file names are probed instead.
    * afterwards the codebook is downloaded if a link was found.

    Problems that are specific to page content are reported with a printed
    message and an early return rather than an exception.

    Returns:
        None

    Raises:
        requests.RequestException: On network errors, timeouts or HTTP error
            statuses from ``raise_for_status``.
    """
    # requests has no default timeout; without one a stalled server hangs the run.
    timeout = 60

    base_url_for_year = 'https://www.eia.gov/consumption/residential/data/'
    response = requests.get(base_url_for_year, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find available RECS years
    year_links = soup.find_all('a', href=re.compile(r'^/consumption/residential/data/\d{4}/$'))
    years = [int(re.search(r'\d{4}', link['href']).group()) for link in year_links]
    if not years:
        print("❌ No RECS years found.")
        return

    year = max(years)
    base_path = os.path.join('STAT 390 Project', 'Energy Data', 'RECS', str(year))
    os.makedirs(base_path, exist_ok=True)

    print(f"📂 Saving to {base_path}")

    # Navigate to RECS microdata page
    base_url = f'https://www.eia.gov/consumption/residential/data/{year}/'
    microdata_url = f'{base_url}index.php?view=microdata'
    response = requests.get(microdata_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The last matching link of each kind wins.
    zip_file = csv_file = codebook_file = None
    for link in soup.find_all('a', href=True):
        href = link['href']
        text = link.get_text(strip=True).lower()
        if 'zip' in text and href.endswith('.zip'):
            zip_file = href
        elif 'csv' in text and href.endswith('.csv'):
            csv_file = href
        elif ('xlsx' in text or 'codebook' in href.lower()) and href.endswith('.xlsx'):
            codebook_file = href

    if zip_file:
        # urljoin resolves relative, root-relative and absolute hrefs alike;
        # plain concatenation only works for the relative form.
        response = requests.get(urljoin(microdata_url, zip_file), timeout=timeout)
        response.raise_for_status()

        # Validate the payload itself: servers label ZIPs with several
        # different Content-Type values, so the header alone is unreliable.
        if not zipfile.is_zipfile(BytesIO(response.content)):
            print("❌ Downloaded file is not a ZIP archive.")
            return

        with zipfile.ZipFile(BytesIO(response.content)) as z:
            for file_name in z.namelist():
                if file_name.endswith(('.csv', '.txt')):
                    # basename() flattens the archive layout and keeps writes inside base_path
                    extracted_path = os.path.join(base_path, os.path.basename(file_name))
                    with open(extracted_path, 'wb') as f:
                        f.write(z.read(file_name))
                    print(f"✅ Downloaded {file_name}")

    elif csv_file:
        response = requests.get(urljoin(microdata_url, csv_file), timeout=timeout)
        response.raise_for_status()

        if b"<html" in response.content[:500].lower():
            # The link served a web page instead of data: search versioned names.
            newest = _recs_find_latest_public_csv(year, timeout)
            if newest is None:
                print("❌ Could not find any working RECS public CSV version.")
                return
            csv_name, csv_bytes = newest
        else:
            # Normal CSV download
            csv_name, csv_bytes = os.path.basename(csv_file), response.content

        with open(os.path.join(base_path, csv_name), 'wb') as f:
            f.write(csv_bytes)
        print(f"✅ Downloaded {csv_name}")

    else:
        print("❌ Could not find RECS microdata zip or CSV file.")
        return

    # Download codebook separately if found (for both the ZIP and the CSV path,
    # since only .csv/.txt members are extracted from a ZIP).
    if not codebook_file:
        print("❌ No codebook file found.")
        return

    response = requests.get(urljoin(microdata_url, codebook_file), timeout=timeout)
    response.raise_for_status()

    xlsx_mime = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    # An .xlsx file is a ZIP container, so a payload check backs up the header check.
    looks_like_xlsx = (
        xlsx_mime in response.headers.get('Content-Type', '')
        or zipfile.is_zipfile(BytesIO(response.content))
    )
    if looks_like_xlsx:
        codebook_name = os.path.basename(codebook_file)
        with open(os.path.join(base_path, codebook_name), 'wb') as f:
            f.write(response.content)
        print(f"✅ Downloaded {codebook_name}")
    else:
        print("❌ Codebook URL did not return an XLSX file.")

# ========================
# Download SEDS Data Files
# ========================
def download_seds_files():
    """Download the complete SEDS CSV and its codes/descriptions CSV from eia.gov.

    The latest data year is read from a ``1960-YYYY`` range in the text of the
    SEDS "complete" page, and both files are saved under
    ``STAT 390 Project/Energy Data/SEDS/<year>``.

    Returns:
        None

    Raises:
        ValueError: If the latest year or either CSV link cannot be found.
        requests.RequestException: On network errors, timeouts or HTTP error
            statuses.
    """
    # requests has no default timeout; without one a stalled server hangs the run.
    timeout = 60

    def fetch_soup(url):
        reply = requests.get(url, timeout=timeout)
        reply.raise_for_status()
        return BeautifulSoup(reply.text, 'html.parser')

    def save_url(url, path):
        # Fetch first and only then open the target, so a failed request never
        # leaves behind an empty file or an HTML error page saved as data.
        reply = requests.get(url, timeout=timeout)
        reply.raise_for_status()
        with open(path, 'wb') as f:
            f.write(reply.content)

    def find_csv_link(soup, *required_words):
        # First anchor labelled exactly "CSV" whose href contains every keyword.
        for a in soup.find_all('a', string='CSV'):
            href_lower = a['href'].lower()
            if all(word in href_lower for word in required_words):
                return urljoin('https://www.eia.gov', a['href'])
        return None

    seds_url = 'https://www.eia.gov/state/seds/seds-data-complete.php?sid=US'
    soup = fetch_soup(seds_url)

    # Extract latest year from heading
    match = re.search(r'1960-(\d{4})', soup.get_text())
    if not match:
        raise ValueError("❌ Could not find the latest SEDS year.")
    latest_year = match.group(1)
    print(f"Latest SEDS year: {latest_year}")
    print(f'📂 Saving to STAT 390 Project/Energy Data/SEDS/{latest_year}')

    base_path = os.path.join('STAT 390 Project', 'Energy Data', 'SEDS', latest_year)
    os.makedirs(base_path, exist_ok=True)

    # Download main CSV
    csv_url = find_csv_link(soup, 'complete', 'seds')
    if not csv_url:
        raise ValueError("❌ Could not find Complete_SEDS.csv")
    save_url(csv_url, os.path.join(base_path, os.path.basename(csv_url)))
    print("✅ Downloaded SEDS CSV file")

    # Download codes/descriptions
    notes_url = 'https://www.eia.gov/state/seds/seds-technical-notes-complete.php?sid=US'
    code_url = find_csv_link(fetch_soup(notes_url), 'codes', 'descriptions')
    if not code_url:
        raise ValueError("❌ Could not find codes/descriptions file.")
    save_url(code_url, os.path.join(base_path, os.path.basename(code_url)))
    print("✅ Downloaded SEDS Codes and Descriptions")

# ==============================
# Download Total Energy Data Set
# ==============================
def _total_energy_csv_name(label):
    """Turn a table title or sheet name into a ``total_energy_*.csv`` file name.

    Characters other than word characters, hyphens and spaces are dropped,
    spaces become underscores and the result is lower-cased. A leading
    ``table_<id>_`` prefix is stripped when something follows it.
    """
    cleaned = re.sub(r'[^\w\- ]+', '', str(label)).replace(' ', '_').lower()
    csv_filename = f"{cleaned}.csv"
    if csv_filename.startswith('table_'):
        # Split into at most 3 parts: "table", "<id>", "rest"; keep only "rest".
        parts = csv_filename.split('_', 2)
        if len(parts) == 3:
            csv_filename = parts[2]
    return f"total_energy_{csv_filename}"


def _total_energy_write_sheet(sheet, csv_path):
    """Write every row of an openpyxl worksheet to ``csv_path`` as UTF-8 CSV."""
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(sheet.iter_rows(values_only=True))


def download_total_energy_files():
    """Download EIA Monthly Energy Review tables as CSV, plus the glossary PDF.

    The release year is parsed from the "Release Date" text of the Monthly
    Energy Review page. The "Download all tables ZIP" archive is read in
    memory and each ``.xlsx`` member is converted:

    * if the workbook has an "Annual Data" sheet, only that sheet is written,
      named after the value of cell A7;
    * otherwise every sheet whose name does not contain "monthly" is written,
      named after the sheet.

    Output goes to ``STAT 390 Project/Energy Data/Total Energy/<year>``.
    Failures on individual workbooks, on the archive download and on the
    glossary download are printed instead of raised.

    Returns:
        None

    Raises:
        ValueError: If the release year, the ZIP link or the glossary link
            cannot be found on the page.
        requests.RequestException: If the index page itself cannot be fetched.
    """
    # requests has no default timeout; without one a stalled server hangs the run.
    timeout = 60

    base_url = 'https://www.eia.gov/totalenergy/data/monthly/index.php'
    page = requests.get(base_url, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    # Extract the release year from page text
    match = re.search(r'Release Date:\s+\w+\s+\d{1,2},\s+(\d{4})', soup.get_text())
    if not match:
        raise ValueError("❌ Could not find release year.")
    year = match.group(1)
    print(f"Latest Total Energy release year: {year}")
    # Report the directory that is actually used below (Total Energy, not ATB).
    print(f'📂 Saving to STAT 390 Project/Energy Data/Total Energy/{year}')

    base_path = os.path.join('STAT 390 Project', 'Energy Data', 'Total Energy', year)
    os.makedirs(base_path, exist_ok=True)

    # Download ZIP directly to memory
    zip_tag = soup.find('a', string=re.compile(r'Download all tables ZIP', re.IGNORECASE))
    if not zip_tag or not zip_tag.get('href'):
        raise ValueError("❌ ZIP file link not found.")
    zip_url = urljoin(base_url, zip_tag['href'])

    try:
        zip_response = requests.get(zip_url, timeout=timeout)
        zip_response.raise_for_status()

        with zipfile.ZipFile(io.BytesIO(zip_response.content)) as zip_ref:
            for member in zip_ref.infolist():
                if member.filename.endswith('/') or not member.filename.lower().endswith('.xlsx'):
                    continue  # Skip folders and non-Excel files

                try:
                    # Read Excel file from ZIP directly; data_only=True yields
                    # cached cell values instead of formulas.
                    wb = openpyxl.load_workbook(io.BytesIO(zip_ref.read(member)), data_only=True)

                    if "Annual Data" in wb.sheetnames:
                        # Only the annual sheet is exported; its title sits in cell A7.
                        sheet = wb["Annual Data"]
                        exports = [(sheet['A7'].value or 'unknown', sheet)]
                    else:
                        # No annual sheet: export every sheet except monthly ones.
                        exports = [
                            (sheet_name, wb[sheet_name])
                            for sheet_name in wb.sheetnames
                            if "monthly" not in sheet_name.lower()
                        ]

                    for label, sheet in exports:
                        csv_filename = _total_energy_csv_name(label)
                        _total_energy_write_sheet(sheet, os.path.join(base_path, csv_filename))
                        print(f"✅ Downloaded Total Energy Data: {csv_filename}")

                except Exception as e:
                    # Best effort: one unreadable workbook must not stop the rest.
                    print(f"❌ Failed to process {member.filename}: {e}")

    except (requests.RequestException, zipfile.BadZipFile) as e:
        # A corrupt archive is reported the same way as a failed transfer.
        print(f"❌ Failed to download ZIP archive: {e}")
        return

    # Download Glossary PDF
    pdf_tag = soup.find('a', href=re.compile(r'PDF', re.IGNORECASE), attrs={'title': 'Glossary'})
    if not pdf_tag or not pdf_tag.get('href'):
        raise ValueError("❌ Glossary PDF not found.")
    pdf_url = urljoin(base_url, pdf_tag['href'])
    pdf_path = os.path.join(base_path, os.path.basename(pdf_url))

    try:
        # Fetch before opening the target so a failed request leaves no empty
        # or HTML-error file behind.
        pdf_response = requests.get(pdf_url, timeout=timeout)
        pdf_response.raise_for_status()
        with open(pdf_path, 'wb') as f:
            f.write(pdf_response.content)
        print("✅ Downloaded Total Energy Glossary PDF")
    except Exception as e:
        print(f"❌ Failed to download Glossary PDF: {e}")

# ===========================================
# Download Latest ATB Summary CSV & Documentation
# ===========================================
def download_atb_files():
    """Download the newest NREL ATB summary CSV and the OpenEDI ATB documentation.

    Candidate years are tried from the current calendar year (never lower than
    2025) down to 2015. A year is accepted when its data page answers 200 and
    contains the "Download the <year> ATB Summary CSV Files" link, so a page
    that exists but does not offer the download is skipped in favour of the
    previous edition. Files are saved under
    ``STAT 390 Project/Energy Data/ATB/<year>``.

    Returns:
        None

    Raises:
        ValueError: If no year page is reachable, or none of the reachable
            pages contains the summary CSV link.
        requests.RequestException: If the CSV or documentation download fails.
    """
    # requests has no default timeout; without one a stalled server hangs the run.
    timeout = 60

    base_path_root = os.path.join('STAT 390 Project', 'Energy Data', 'ATB')
    base_url_template = 'https://atb.nrel.gov/electricity/{year}/data'
    doc_url = 'https://raw.githubusercontent.com/openEDI/documentation/main/ATB.md'

    # Derive the upper bound from the clock so editions after 2025 are found too.
    newest_candidate = max(datetime.now().year, 2025)

    latest_year = None
    csv_url = None
    any_page_reachable = False
    for year in range(newest_candidate, 2014, -1):
        url = base_url_template.format(year=year)
        try:
            # Cheap existence probe first; only reachable years get a full GET.
            if requests.head(url, allow_redirects=True, timeout=5).status_code != 200:
                continue
            page = requests.get(url, timeout=timeout)
            page.raise_for_status()
        except requests.RequestException:
            continue
        any_page_reachable = True

        soup = BeautifulSoup(page.text, 'html.parser')
        csv_tag = soup.find(
            'a',
            string=re.compile(rf'Download the {year} ATB Summary CSV Files', re.IGNORECASE),
        )
        if csv_tag and csv_tag.get('href'):
            latest_year = str(year)
            csv_url = urljoin(url, csv_tag['href'])
            break

    if not any_page_reachable:
        raise ValueError("❌ No valid ATB year found.")
    if not latest_year:
        raise ValueError("❌ ATB Summary CSV not found.")
    print(f"Latest ATB year: {latest_year}")
    print(f'📂 Saving to STAT 390 Project/Energy Data/ATB/{latest_year}')

    base_path = os.path.join(base_path_root, latest_year)
    os.makedirs(base_path, exist_ok=True)

    # Fetch before opening each target so a failed request leaves no empty
    # or HTML-error file behind.
    csv_response = requests.get(csv_url, timeout=timeout)
    csv_response.raise_for_status()
    csv_path = os.path.join(base_path, os.path.basename(csv_url))
    with open(csv_path, 'wb') as f:
        f.write(csv_response.content)
    print("✅ Downloaded ATB Summary CSV")

    doc_response = requests.get(doc_url, timeout=timeout)
    doc_response.raise_for_status()
    doc_path = os.path.join(base_path, 'ATB.md')
    with open(doc_path, 'wb') as f:
        f.write(doc_response.content)
    print("✅ Downloaded ATB Documentation")

# ============================
# Download RMI Dataset Files
# ============================
def download_rmi_files():
    """Download selected RMI Utility Transition Hub datasets.

    The download page is scraped for ``.csv``, ``.xlsx`` and ``.zip`` links
    whose file name contains one of the kept base names. ZIP archives are
    opened in memory and only matching members are extracted. Files are saved
    flat into ``STAT 390 Project/Energy Data/RMI``; when a name does not
    already end in four digits, the "Last updated" year of the surrounding
    page section (or ``unknown``) is appended.

    Page-level and per-file download failures are printed, not raised.

    Returns:
        None
    """
    # Local import: only the Tag type check below needs it.
    from bs4 import Tag

    # requests has no default timeout; without one a stalled server hangs the run.
    timeout = 60

    base_url = 'https://utilitytransitionhub.rmi.org/data-download/'
    base_dir = os.path.join('STAT 390 Project', 'Energy Data', 'RMI')
    os.makedirs(base_dir, exist_ok=True)

    # Files to keep (original base names)
    keep_basenames = [
        'employees',
        'operations_emissions_by_fuel',
        'revenue_by_tech',
        'utility_state_map'
    ]

    def is_wanted(stem):
        stem_lower = stem.lower()
        return any(target in stem_lower for target in keep_basenames)

    def with_year(stem, suffix, year_label):
        # Add year if not already present at the end of the name.
        if re.search(r'\d{4}$', stem):
            return f"{stem}{suffix}"
        return f"{stem}_{year_label}{suffix}"

    try:
        page = requests.get(base_url, timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')
    except requests.RequestException as e:
        print(f"❌ Failed to retrieve RMI page: {e}")
        return

    print('📂 Saving to STAT 390 Project/Energy Data/RMI')

    sections = soup.find_all('div', class_='container mb-16')
    if not sections:
        # Guard against a changed page layout instead of failing with IndexError.
        print("❌ Could not find the RMI download section.")
        return

    # Walk the direct children of the first matching container.
    # NOTE(review): presumably each child element is one dataset entry with its
    # own "Last updated" line; confirm against the live page.
    for container in sections[0]:
        if not isinstance(container, Tag):
            continue  # text nodes between elements carry no links and have no find_all()

        # Extract "Last updated" year
        match = re.search(r'Last updated:\s+\w+\s+\d{1,2},\s+(\d{4})', container.get_text())
        modified_year = match.group(1) if match else 'unknown'

        for a in container.find_all('a', href=True):
            file_url = urljoin(base_url, a['href'])
            file_name = os.path.basename(file_url)

            # Only process if .csv, .xlsx, or .zip
            if not re.search(r'\.(csv|xlsx|zip)$', file_name, re.IGNORECASE):
                continue

            base, ext = os.path.splitext(file_name)

            # Skip if not in keep list
            if not is_wanted(base):
                continue

            try:
                response = requests.get(file_url, timeout=timeout)
                response.raise_for_status()
            except requests.RequestException as e:
                print(f"❌ Failed to download RMI Data: {file_url}: {e}")
                continue

            if ext.lower() != '.zip':
                # Non-zip file
                file_name = with_year(base, ext, modified_year)
                with open(os.path.join(base_dir, file_name), 'wb') as f:
                    f.write(response.content)
                print(f"✅ Downloaded RMI Data: {file_name}")
                continue

            # Handle zip extraction in memory
            try:
                with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
                    for member in zip_ref.infolist():
                        if member.filename.endswith('/'):
                            continue  # Skip folders

                        # basename() flattens the archive layout and keeps writes inside base_dir
                        inner_base, inner_ext = os.path.splitext(os.path.basename(member.filename))

                        # Check again inside zip
                        if not is_wanted(inner_base):
                            continue

                        inner_name = with_year(inner_base, inner_ext, modified_year)
                        with zip_ref.open(member) as source, \
                                open(os.path.join(base_dir, inner_name), 'wb') as target:
                            target.write(source.read())
                        print(f"✅ Downloaded RMI Data: {inner_name}")
            except zipfile.BadZipFile:
                print(f"❌ Invalid ZIP: {file_name}")

def main():
    """Run every energy-data downloader in sequence.

    A separator line is printed before each downloader. The first exception
    raised by any of them is reported as ``An error occurred: ...`` and stops
    the remaining downloads.
    """
    separator = '--------------------------------------------------------'
    try:
        print('Downloading Energy Data ...')
        downloaders = (
            download_recs_files,
            download_seds_files,
            download_total_energy_files,
            download_atb_files,
            download_rmi_files,
        )
        for run_download in downloaders:
            print(separator)
            run_download()
    except Exception as e:
        print(f'An error occurred: {e}')

In [3]:
# Entry point: start all downloads when this code is executed directly
# (not when it is imported as a module).
if __name__ == '__main__':
    main()

Downloading Energy Data ...
--------------------------------------------------------
📂 Saving to STAT 390 Project\Energy Data\RECS\2020
✅ Downloaded recs2020_public_v7.csv
✅ Downloaded RECS 2020 Codebook for Public File - v7.xlsx
--------------------------------------------------------
Latest SEDS year: 2022
📂 Saving to STAT 390 Project/Energy Data/SEDS/2022
✅ Downloaded SEDS CSV file
✅ Downloaded SEDS Codes and Descriptions
--------------------------------------------------------
Latest Total Energy release year: 2025
📂 Saving to STAT 390 Project/Energy Data/ATB/2025
✅ Downloaded Total Energy Data: total_energy_primary_energy_overview.csv
✅ Downloaded Total Energy Data: total_energy_primary_energy_production_by_source.csv
✅ Downloaded Total Energy Data: total_energy_primary_energy_consumption_by_source.csv
✅ Downloaded Total Energy Data: total_energy_primary_energy_imports_by_source.csv
✅ Downloaded Total Energy Data: total_energy_primary_energy_exports.csv
✅ Downloaded Total Energy D