In [None]:
# %pip install icecream
# %pip install aiohttp
# %pip install httpx

In [1]:
import os
import requests
import httpx
import json
from urllib.parse import urlparse, parse_qs
from icecream import ic
import aiohttp
import asyncio
from bs4 import BeautifulSoup


In [2]:
# Title number used to build every path in this cell.
title_number = "12"
directory = f"{title_number}CFR"
toc_file = f"{title_number}CFR/title-{title_number}.json"

# Load the table-of-contents tree. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open until garbage
# collection.
with open(toc_file, "r") as toc_handle:
    json_toc = json.load(toc_handle)

# Base endpoint for rendered content; the date and the per-part path are
# appended when the URL list is written out.
# `url` is bound to the same string, exactly as in the original cell.
base_url = url = "https://www.ecfr.gov/api/renderer/v1/content/enhanced/"
base_date = "2023-09-28"

In [3]:
def gather_types(node, types_set=None):
    """Collect every distinct, truthy 'type' value found in a node tree.

    Args:
        node: Mapping that may carry a 'type' entry and a 'children' list of
            further mappings of the same shape.
        types_set: Optional set to accumulate into. It is mutated in place
            and returned; a fresh set is created when it is None.

    Returns:
        The set of type values seen in `node` and all of its descendants.
    """
    collected = set() if types_set is None else types_set

    # Explicit stack instead of recursion; visiting order does not matter
    # because the result is a set.
    pending = [node]
    while pending:
        current = pending.pop()
        kind = current.get('type')
        if kind:
            collected.add(kind)
        pending.extend(current.get('children', []))

    return collected

# Survey which node types occur anywhere in the loaded TOC tree.
types_found = gather_types(json_toc)
print(types_found)

{'subchapter', 'chapter', 'hed1', 'appendix', 'subject_group', 'title', 'part', 'section', 'subpart'}


In [5]:
# Build the per-part request URLs and the ordered table of contents (TOC)

def traverse_node(node, path=[], urls=[], toc_path=[], toc_full=[]):
    if node['reserved']:
        return urls, toc_full

    toc_level = node['label_level'].strip().split(' ')[-1]
    match node['type']:
        case 'title':
            label = f"title-{toc_level}?"
        case 'chapter':
            label = f"chapter={toc_level}"
        case 'subchapter':
            label = f"&subchapter={toc_level}"
        case 'part':
            label = f"&part={toc_level}"
        case 'appendix':
            label = f"&appendix={toc_level}"
        case 'section':
            label = f"&section={toc_level}"
        case 'subject_group':
            label = f"&subject_group={toc_level}"
        case other:
            label = f"ERROR!"

    path = path + [label]
    toc_path = toc_path + [toc_level]

    if node['type'] == 'part':
        toc_full.append(f"{'-'.join(toc_path)}")
        urls.append(f"{''.join(path)}")

    for child in node.get('children', []):
        traverse_node(child, path, urls, toc_path, toc_full)

    return urls, toc_full

# Walk the TOC tree once, then persist both result lists inside `directory`.
urls, toc_full = traverse_node(json_toc)

# One fully-qualified request URL per line.
with open(f'{directory}/{directory}urls.txt', 'w') as f:
    f.writelines(f'{base_url}{base_date}/{suffix}\n' for suffix in urls)

# One TOC entry per line, prefixed by its 1-based position and a tab.
with open(f'{directory}/{directory}toc.txt', 'w') as f:
    f.writelines(
        f"{position}\t{entry}\n"
        for position, entry in enumerate(toc_full, start=1)
    )

In [None]:
# Download the HTML for every URL listed in the urls.txt file

# Read the URL list back from disk, skipping blank lines. The context manager
# closes the file once it has been read.
with open(f"{directory}/{directory}urls.txt", "r") as urls_handle:
    urls = [line.strip() for line in urls_handle if line.strip()]

# Downloads go into the same directory the URL list was read from, so the
# target follows `title_number` instead of a second hard-coded "12CFR".
os.makedirs(directory, exist_ok=True)

MAX_RETRIES = 5  # Max retries per URL
DELAY_ON_ERROR = 10  # Delay for 10 seconds if an error is encountered

def generate_filename(url):
    """Derive the local HTML filename for a content URL.

    The name has the form ``title{title}_{chapter}_{subchapter}_{part}.html``;
    any component missing from the URL is left empty.

    Args:
        url: Request URL. Chapter, subchapter and part are read from the
            query string. The title is read from a ``title`` query parameter
            if present, otherwise from a trailing ``title-<n>`` path segment.

    Returns:
        The filename as a string (no directory component).
    """
    parsed = urlparse(url)
    query_params = parse_qs(parsed.query)

    def first(key):
        # parse_qs maps each key to a list of values; take the first one.
        return query_params.get(key, [""])[0]

    title = first("title")
    if not title:
        # The URLs written to urls.txt carry the title in the path
        # (".../title-12?chapter=..."), so a query-only lookup would always
        # produce an empty title.
        last_segment = parsed.path.rstrip("/").rsplit("/", 1)[-1]
        if last_segment.startswith("title-"):
            title = last_segment[len("title-"):]

    return f"title{title}_{first('chapter')}_{first('subchapter')}_{first('part')}.html"

async def download_file(url, retries=MAX_RETRIES):
    """Download `url` into `directory`, retrying on transient failures.

    The target filename comes from `generate_filename(url)`. If that file
    already exists nothing is fetched. A 429 response, a connection error or
    a timeout is retried up to `retries` times, sleeping `DELAY_ON_ERROR`
    seconds before each retry. Any other HTTP error status is reported and
    not retried. Failures are printed rather than raised.

    Args:
        url: Fully-qualified URL to fetch.
        retries: Number of retries allowed after the first attempt.

    Returns:
        None.
    """
    filepath = os.path.join(directory, generate_filename(url))

    # Skip files fetched by an earlier run.
    if os.path.exists(filepath):
        return

    # A single client serves every attempt. Retrying in a loop (rather than
    # by recursion) avoids opening a new client while the previous one is
    # still held open.
    async with httpx.AsyncClient() as client:
        attempts_left = retries
        while True:
            try:
                response = await client.get(url)
                response.raise_for_status()

            except httpx.HTTPStatusError as e:
                if e.response.status_code == 429 and attempts_left > 0:
                    print(f"429 Too Many Requests received for {url}. Retrying in {DELAY_ON_ERROR} seconds...")
                else:
                    print(f"Failed to download {url} due to HTTP status error.")
                    return

            # Timeouts are as transient as connection failures; left uncaught
            # they would propagate and abort the surrounding gather().
            except (httpx.ConnectError, httpx.TimeoutException) as e:
                if attempts_left > 0:
                    print(f"Network error ({type(e).__name__}) for {url}. Retrying in {DELAY_ON_ERROR} seconds...")
                else:
                    print(f"Failed to download {url} after {retries} retries due to network error ({type(e).__name__}).")
                    return

            else:
                with open(filepath, 'wb') as file:
                    file.write(response.content)
                return

            # Only reached when a retry was announced above.
            attempts_left -= 1
            await asyncio.sleep(DELAY_ON_ERROR)

# Gather tasks and run them concurrently
# One coroutine per URL, all started together; the gathered results are the
# cell's output value.
tasks = [download_file(url) for url in urls]
await asyncio.gather(*tasks)



In [None]:
# Validate URLs (disabled: the rest of this cell is commented out)

# urls = [url.strip() for url in open("urls.txt", "r").readlines()]
# async def check_url(url, session, delay=1):  # default delay of 1 second
#     try:
#         await asyncio.sleep(delay)
#         async with session.head(url) as response:
#             if response.status == 200:  # OK status
#                 return (url, True)
#             else:
#                 return (url, response.status)
#     except Exception as e:
#         return (url, e)
# async def check_urls_exist_async(urls, delay=1):
#     async with aiohttp.ClientSession() as session:
#         results = []
#         for url in urls:
#             result = await check_url(url, session, delay)
#             results.append(result)
#         return results
# responses = await check_urls_exist_async(toc_full)
# with open(f"{directory}/{directory}url_validate.txt", "w") as f:
#     for url, status in responses:
#         f.write(f"{status:8}\t{url} \n")