In [1]:
import pandas as pd
from bs4 import BeautifulSoup

# Relative location of the municipal-code HTML dump that this script parses
file_path = '../../raw_data/lamunicipalcode.html'

# Slurp the entire document into memory, decoding it as UTF-8 text
with open(file_path, mode='r', encoding='utf-8') as html_file:
    content = html_file.read()

# Build the parse tree using the lxml parser backend
soup = BeautifulSoup(content, features='lxml')

# Find chapters: every <div> matched by class 'rbox Chapter' becomes one row
# of (anchor id, flattened text).
chapters = soup.find_all('div', class_='rbox Chapter')
chapter_rows = []
for chapter in chapters:
    # The chapter identifier is the id of the first <a> inside the div.
    # Fall back to None when the anchor (or its id attribute) is missing,
    # instead of raising NameError/KeyError or silently reusing the id left
    # over from the previous iteration.
    a_tag = chapter.find('a')
    chapter_id = a_tag.get('id') if a_tag is not None else None

    # Join every text fragment in the div with " | ", then trim the ends.
    chapter_text = chapter.get_text(separator=" | ").strip()

    chapter_rows.append({
        'Chapter ID': chapter_id,
        'Chapter Text': chapter_text,
    })

# Explicit columns keep the CSV header intact even when no chapter matched.
chapter_data = pd.DataFrame(chapter_rows, columns=['Chapter ID', 'Chapter Text'])

# Directory for intermediate CSV outputs; the later export steps in this
# script reuse this variable.
output_directory = '../../intermediate_data/'
chapter_data.to_csv(output_directory + "chapter_data.csv", header=True, index=False)

# Find articles: every <div> matched by class 'Article toc-destination rbox'
# becomes one row of (anchor id, flattened text).
articles = soup.find_all('div', class_='Article toc-destination rbox')
article_rows = []
for article in articles:
    # The article identifier is the id of the first <a> inside the div.
    # Fall back to None when the anchor (or its id attribute) is missing,
    # instead of raising NameError/KeyError or silently reusing the id left
    # over from the previous iteration.
    a_tag = article.find('a')
    article_id = a_tag.get('id') if a_tag is not None else None

    # Join every text fragment in the div with " | ". Leading/trailing
    # whitespace is deliberately left untouched to keep the existing output.
    article_text = article.get_text(separator=" | ")

    article_rows.append({
        'Article ID': article_id,
        'Article Text': article_text,
    })

# Explicit columns keep the CSV header intact even when no article matched.
article_data = pd.DataFrame(article_rows, columns=['Article ID', 'Article Text'])

# Save article data alongside the other intermediate CSVs
article_data.to_csv(output_directory + "article_data.csv", header=True, index=False)

# Find Sections: every <div> matched by class 'Section toc-destination rbox'
# becomes one row of (anchor id, flattened text).
sections = soup.find_all('div', class_='Section toc-destination rbox')
section_rows = []
for section in sections:
    # The section identifier is the id of the first <a> inside the div.
    # Fall back to None when the anchor (or its id attribute) is missing,
    # instead of raising NameError/KeyError or silently reusing the id left
    # over from the previous iteration.
    a_tag = section.find('a')
    section_id = a_tag.get('id') if a_tag is not None else None

    # get_text() collects every text fragment nested inside the div and
    # inserts `separator` between adjacent fragments. Without a separator the
    # fragments are concatenated directly, so text coming from neighbouring
    # tags runs together; " | " keeps the fragment boundaries visible.
    section_text = section.get_text(separator=" | ")

    section_rows.append({
        'Section ID': section_id,
        'Section Text': section_text,
    })

# Explicit columns keep the CSV header intact even when no section matched.
section_data = pd.DataFrame(section_rows, columns=['Section ID', 'Section Text'])

# Save section data alongside the other intermediate CSVs
section_data.to_csv(output_directory + "section_data.csv", header=True, index=False)

# Find Subsections: for each section div, collect the text of the
# 'rbox Normal-Level' divs that follow it as siblings, one row per non-empty
# div, numbered with a running index across the whole document.
subsection_data = []
sections = soup.find_all('div', class_='Section toc-destination rbox')

for section in sections:
    # The section identifier is the id of the first <a> inside the div.
    # Fall back to None when the anchor (or its id attribute) is missing,
    # instead of raising NameError/KeyError or silently reusing the id left
    # over from the previous iteration.
    a_tag = section.find('a')
    section_id = a_tag.get('id') if a_tag is not None else None

    # NOTE(review): find_next_siblings() returns every later matching sibling
    # under the same parent; it does not stop at the next section div. This is
    # only correct if each section and its subsections live in their own
    # parent container -- confirm against the HTML structure.
    subsection_siblings = section.find_next_siblings('div', class_='rbox Normal-Level')

    for subsection in subsection_siblings:
        # Flatten the div to a single string: fragments are stripped and
        # joined with one space.
        subsection_text = subsection.get_text(separator=" ", strip=True)

        # Skip divs that contain no text at all.
        if not subsection_text:
            continue

        # A separate local name is used so the module-level `content`
        # (the raw HTML string) is not overwritten.
        subsection_data.append({
            'Section ID': section_id,
            # Running 0-based index: the number of rows collected so far.
            'Subsection ID': len(subsection_data),
            'Subsection Content': subsection_text,
        })

# Convert to DataFrame; explicit columns keep the CSV header intact even when
# no subsection was found.
subsection_data_df = pd.DataFrame(
    subsection_data,
    columns=['Section ID', 'Subsection ID', 'Subsection Content'],
)

# Save subsection data alongside the other intermediate CSVs
subsection_data_df.to_csv(output_directory + "subsection_data.csv", header=True, index=False)

# Print the DataFrame
print(subsection_data_df)


        Section ID  Subsection ID  \
0        JD_11.00.              0   
1        JD_11.00.              1   
2        JD_11.00.              2   
3        JD_11.00.              3   
4        JD_11.00.              4   
...            ...            ...   
40861  JD_200.126.          40861   
40862  JD_200.126.          40862   
40863  JD_200.127.          40863   
40864  JD_200.127.          40864   
40865  JD_200.201.          40865   

                                      Subsection Content  
0           (Amended by Ord. No. 175,676, Eff. 1/11/04.)  
1      (a) Short Title.  Reference to Code in Prosecu...  
2      (b) Existing Law Continued. The provisions of ...  
3      (c) Construction. The provisions of this Code ...  
4      (d) Effect of Code on Past Actions and Obligat...  
...                                                  ...  
40861     (Renumbered by Ord. No. 187,455, Eff. 4/1/22.)  
40862  Nothing in this article shall be interpreted s...  
40863     (Renumbered by