In [24]:
from bs4 import BeautifulSoup
import pandas as pd
import csv

In [25]:
# get the names of the CFR titles from the toc file
title_number = "12"
directory = f"{title_number}CFR"
# toc_file = f"{title_number}CFR/title-{title_number}.json"
# json_toc = json.load(open(toc_file, "r"))
# base_url = url = "https://www.ecfr.gov/api/renderer/v1/content/enhanced/"
# base_date = "2023-09-28"

# Each TOC line is tab-separated and the document identifier is its second field.
# The file is read inside a `with` block so the handle is closed as soon as the
# list is built; blank lines (e.g. a trailing newline at the end of the file)
# are skipped instead of raising IndexError on the missing second field.
with open(f"../{directory}/{directory}toc.txt", "r") as toc_handle:
    docs = [line.split('\t')[1].strip() for line in toc_handle if line.strip()]
print(docs[:5])

['12-I-1', '12-I-2', '12-I-3', '12-I-4', '12-I-5']


In [26]:
# convert the toc file into a list of filenames
def process_docs(s):
    """Build the path of the HTML file that belongs to one TOC identifier.

    Everything up to and including the first '-' is dropped, the remaining
    '-' characters become '_', and when exactly one '_' is left it is doubled.
    With ``directory == '12CFR'``, ``'12-I-1'`` becomes
    ``'../12CFR/title_I__1.html'``.

    Reads the module-level ``directory`` variable.
    """
    # split(..., 1)[-1] keeps the whole string when there is no '-' at all
    stem = s.split("-", 1)[-1].replace('-', '_')

    # a single separator is written as a double underscore in the file name
    if stem.count('_') == 1:
        stem = stem.replace('_', '__')

    return f"../{directory}/title_{stem}.html"

# One HTML path per TOC identifier, in the same order as `docs`
filenames = list(map(process_docs, docs))

# Preview the first five (path, identifier) pairs
list(zip(filenames, docs))[:5]

[('../12CFR/title_I__1.html', '12-I-1'),
 ('../12CFR/title_I__2.html', '12-I-2'),
 ('../12CFR/title_I__3.html', '12-I-3'),
 ('../12CFR/title_I__4.html', '12-I-4'),
 ('../12CFR/title_I__5.html', '12-I-5')]

In [27]:
# Read every file and collect its contents; the pieces are joined once at the
# end rather than growing one string with `+=` on every iteration.
html_parts = []
for file_path in filenames:
    with open(file_path, 'r', encoding='utf-8') as file:
        html_parts.append(file.read())

# All files concatenated in `filenames` order, parsed as a single document
combined_html = ''.join(html_parts)

soup = BeautifulSoup(combined_html, 'html.parser')

In [51]:
# put it in pandas

# Accumulator for the extracted records; every entry is a dictionary
data_records = list()

def extract_internal_id(href):
    """Return the part of ``href`` after its last '#', or None if it has no '#'.

    An href that ends with '#' yields the empty string.
    """
    if '#' not in href:
        return None
    # rpartition splits on the last '#'; index 2 is everything after it
    return href.rpartition('#')[2]

# Parse the HTML for divs, paragraphs, and links
for div in soup.find_all('div', id=True):
    # ignore divs with class "subject-group" as they are not used
    if 'subject-group' in div.get('class', []):
        continue

    div_id = div['id']

    # Look up the first <p> under this div once and reuse it for both the id
    # and the text instead of searching the subtree again for each use.
    # NOTE(review): find() searches all descendants by default, so an enclosing
    # div takes the id and text of the first paragraph of a nested div —
    # confirm this is intended.
    first_p = div.find('p')

    # Get the paragraph id; fall back to the div id when there is no <p>
    # or the <p> has no data-title attribute
    p_id = first_p.get('data-title') if first_p is not None else None
    if p_id is None:
        p_id = div_id
    p_id = p_id.removeprefix("p-")

    # Extract the paragraph text; assuming only one paragraph per div
    paragraph_text = first_p.get_text(strip=True) if first_p is not None else ''

    # Ids of the 'div' elements found inside the current 'div', with the "p-" prefix removed.
    # NOTE(review): find_all() is also descendant-wide by default, so this holds
    # every nested level, not only direct children — confirm this is intended.
    child_ids = [child_div.get('id').removeprefix("p-") for child_div in div.find_all('div', id=True)]

    # Initialize containers for the links
    cfr_links = []  # for links of class "cfr" (internal)
    other_links = []  # for links not having "cfr" class (considered external)
    link_targets = []  # for storing target IDs from internal links

    # Search for links and categorize them
    for a_tag in div.find_all('a', href=True):
        link_class = a_tag.get('class', [])
        href = a_tag['href']

        if "cfr" in link_class:
            cfr_links.append(href)

            # Extract target ID from internal link; hrefs without a '#'
            # fragment contribute no target
            target_id = extract_internal_id(href)
            if target_id:
                link_targets.append(target_id.removeprefix("p-"))  # Just store the target ID
        else:
            other_links.append(href)  # If not "cfr", considered as external link

    # Construct the record and add it to the list
    record = {
        'p_id': p_id,
        'text': paragraph_text,
        'child_ids': child_ids,
        'cfr_links': cfr_links,  # internal links
        'other_links': other_links,  # external links
        'link_targets': link_targets,  # target IDs from internal links
    }
    data_records.append(record)

# Create a pandas DataFrame from the list of records
df = pd.DataFrame(data_records)


# 'df' is now a pandas DataFrame, where each row corresponds to a paragraph, 
# containing the ID, text, internal links (cfr), other links (external), 
# and the target IDs from link relations.


In [52]:

# Preview ten random rows; the fixed random_state makes the preview identical
# on every run of the notebook.
df.sample(n=10, random_state=0)

Unnamed: 0,p_id,text,child_ids,cfr_links,other_links,link_targets
61994,353.3(a)(4)(iii),(iii)The transaction has no business or appare...,[],[],[],[]
47627,265.7(k)(6)(iii)(I),(I)Approve or disapprove under§ 217.205(c)of R...,[],[/on/2023-09-28/title-12/section-217.205#p-217...,[],"[217.205(c), 217.205(c)]"
35888,237.22(g),(g)Consultation.The Board shall consult with t...,[],[],[],[]
71102,620.5(f)(4),(4)For all banks (on a bank only basis) and fo...,"[620.5(f)(4)(i), 620.5(f)(4)(ii), 620.5(f)(4)(...",[/on/2023-09-28/title-12/section-620.5#p-620.5...,[],"[620.5(f)(2), 620.5(f)(3)]"
46037,261.18(a),(a)Resolving requests for confidential treatme...,[],[],[],[]
71005,620.4(b)(1),(1)A bank must provide its annual report to th...,[],[],[],[]
82359,797.13(b)(2),(2)The amount and basis of the debt; and,[],[],[],[]
68281,611.1155(a)(4)(i),(i)Is necessary or expedient to the System ins...,[],[],[],[]
112004,1269.3(a)(3),(3)To assist housing associates with asset/lia...,[],[],[],[]
95560,Supplement-I-to-Part-1026 3.,3.Credit accessed in connection with by a prep...,[],[],[],[]


In [31]:
# show every row whose p_id is '1.2(a)'
is_target_paragraph = df['p_id'].eq('1.2(a)')
selected_rows = df.loc[is_target_paragraph]
selected_rows


Unnamed: 0,p_id,text,child_ids,cfr_links,other_links,link_targets
6,1.2(a),(a)Capital and surplusmeans:,"[p-1.2(a), p-1.2(a)(1), p-1.2(a)(1)(i), p-1.2(...","[/on/2023-09-28/title-12/part-3, /on/2023-09-2...","[https://www.govinfo.gov/link/uscode/15/80a-8,...","[p-1.2(a)(2)(i), p-1.2(j), p-1.2(j), p-1.2(k)]"
7,1.2(a),(a)Capital and surplusmeans:,"[p-1.2(a)(1), p-1.2(a)(1)(i), p-1.2(a)(1)(ii),...","[/on/2023-09-28/title-12/part-3, /on/2023-09-2...",[],[p-1.2(a)(2)(i)]


In [34]:
# Output locations, named after the CFR title number
csv_path = f"../dataframe/{title_number}.csv"
parquet_path = f"../dataframe/{title_number}.parquet"

# write the dataframe to a csv: every field quoted, index column omitted
df.to_csv(
    csv_path,
    sep=',',
    quotechar='"',
    quoting=csv.QUOTE_ALL,
    index=False,
)

# write the dataframe to parquet
df.to_parquet(parquet_path)

In [35]:
# links to external sites, such as us code

# True for rows whose other_links list is non-empty
has_external_links = df['other_links'].astype(bool)
nodes_with_external = df.loc[has_external_links]
nodes_with_external.loc[:, 'other_links']

0         [https://www.govinfo.gov/link/uscode/12/1, htt...
1         [https://www.govinfo.gov/link/uscode/12/1, htt...
2         [https://www.govinfo.gov/link/uscode/12/1, htt...
3               [https://www.govinfo.gov/link/uscode/12/24]
4              [https://www.govinfo.gov/link/uscode/12/335]
                                ...                        
117194    [https://www.govinfo.gov/link/uscode/12/4701, ...
117195    [https://www.govinfo.gov/link/uscode/12/4701, ...
117196        [https://www.govinfo.gov/link/uscode/12/4701]
117203        [https://www.govinfo.gov/link/uscode/12/4703]
117204        [https://www.govinfo.gov/link/uscode/42/4321]
Name: other_links, Length: 14663, dtype: object