# In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import html

# Path to the raw HTML export of the municipal code
file_path = '../../raw_data/lamunicipalcode.html'

# Read the whole document into memory as UTF-8 text
with open(file_path, 'r', encoding='utf-8') as municipal_code:
    content = municipal_code.read()

# Parse with the lxml backend.
# NOTE: the markup is passed to the parser untouched. Running html.unescape()
# over the raw document first would turn escaped text such as &lt;, &gt; and
# &quot; into live markup (stray tags, prematurely closed attribute values)
# before the parser sees it. The parser already decodes character references
# in text nodes and attribute values, so get_text() still returns decoded text.
soup = BeautifulSoup(content, 'lxml')

# Helper function to clean up garbled characters and strip unnecessary formatting
def dynamic_clean_text(text):
    """Repair common mojibake / smart punctuation and normalize whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string: mis-decoded UTF-8 sequences and typographic quotes
        are mapped to plain equivalents, every run of whitespace (including
        non-breaking spaces) is collapsed to a single space, and leading and
        trailing whitespace is removed.
    """
    # Order matters: every three-character mojibake sequence starts with the
    # two-character prefix 'â€', so the longer sequences must be replaced
    # BEFORE the bare prefix, otherwise the prefix rule consumes them and the
    # en-dash / ellipsis rules can never match. Likewise the single-character
    # rules ('œ', '“', ...) must run after the sequences that contain them.
    replacements = (
        ('â€™', "'"),      # Misinterpreted apostrophe
        ('â€œ', '"'),      # Misinterpreted left double quote
        ('â€“', '–'),      # Misinterpreted en-dash
        ('â€¦', '...'),    # Misinterpreted ellipsis
        ('â€\x9d', '"'),   # Misinterpreted right double quote (with its control byte)
        ('â€', '"'),       # Misinterpreted right double quote (control byte lost)
        ('Â', ''),         # Stray byte left over from a mis-decoded non-breaking space
        ('œ', 'oe'),       # "œ" ligature
        ('“', '"'),        # Smart left double quote
        ('”', '"'),        # Smart right double quote
        ('‘', "'"),        # Smart left single quote
        ('’', "'"),        # Smart right single quote
    )
    for garbled, plain in replacements:
        text = text.replace(garbled, plain)

    # Collapse whitespace last so that gaps opened by removed characters do not
    # survive as double spaces. str.split() with no arguments treats the
    # non-breaking space (\xa0) as whitespace, and the join leaves no leading
    # or trailing space behind.
    return ' '.join(text.split())

# Find chapters: one row per chapter heading div, keyed by the id of its first anchor
chapters = soup.find_all('div', class_='rbox Chapter')
chapter_data = []
for chapter in chapters:
    a_tag = chapter.find('a')
    # Assign on every iteration: a chapter without an anchor (or without an id
    # on it) is recorded as missing instead of silently reusing the id of the
    # previous chapter.
    chapter_id = a_tag.get('id') if a_tag is not None else None
    # " | " marks the boundaries between the text nodes inside the heading
    chapter_text = dynamic_clean_text(chapter.get_text(separator=" | "))

    chapter_data.append({
        'Chapter ID': chapter_id,
        'Chapter Text': chapter_text
    })

# Explicit columns keep the CSV header present even when no chapter was found
chapter_data = pd.DataFrame(chapter_data, columns=['Chapter ID', 'Chapter Text'])

# Save chapter data to the intermediate data directory
output_directory = '../../intermediate_data/'
chapter_data.to_csv(output_directory + "chapter_data.csv", header=True, index=False)

# Find articles: one row per article heading div, keyed by the id of its first anchor
articles = soup.find_all('div', class_='Article toc-destination rbox')
article_data = []
for article in articles:
    a_tag = article.find('a')
    # Assign on every iteration: an article without an anchor (or without an
    # id on it) is recorded as missing instead of silently reusing the id of
    # the previous article.
    article_id = a_tag.get('id') if a_tag is not None else None
    # " | " marks the boundaries between the text nodes inside the heading
    article_text = dynamic_clean_text(article.get_text(separator=" | "))
    article_data.append({
        'Article ID': article_id,
        'Article Text': article_text
    })

# Explicit columns keep the CSV header present even when no article was found
article_data = pd.DataFrame(article_data, columns=['Article ID', 'Article Text'])

# Save article data to the intermediate data directory
article_data.to_csv(output_directory + "article_data.csv", header=True, index=False)

# Find sections and assign the correct Article ID and Chapter ID
sections = soup.find_all('div', class_='Section toc-destination rbox')
section_data = []
for section in sections:
    # All three ids are assigned on every iteration, so a missing anchor or a
    # missing ancestor heading yields None instead of a stale id carried over
    # from an earlier section (or from the article/chapter loops).
    a_tag = section.find('a')
    section_id = a_tag.get('id') if a_tag is not None else None

    # Closest article heading that precedes this section in document order
    parent_article = section.find_previous('div', class_='Article toc-destination rbox')
    article_anchor = parent_article.find('a') if parent_article is not None else None
    article_id = article_anchor.get('id') if article_anchor is not None else None

    # Closest chapter heading that precedes this section in document order
    parent_chapter = section.find_previous('div', class_='rbox Chapter')
    chapter_anchor = parent_chapter.find('a') if parent_chapter is not None else None
    chapter_id = chapter_anchor.get('id') if chapter_anchor is not None else None

    # " | " marks the boundaries between the text nodes inside the heading
    section_text = dynamic_clean_text(section.get_text(separator=" | "))
    section_data.append({
        'Chapter ID': chapter_id,
        'Article ID': article_id,
        'Section ID': section_id,
        'Section Text': section_text
    })

# Explicit columns keep the CSV header present even when no section was found
section_data = pd.DataFrame(
    section_data,
    columns=['Chapter ID', 'Article ID', 'Section ID', 'Section Text'],
)

# Save section data to the intermediate data directory
section_data.to_csv(output_directory + "section_data.csv", header=True, index=False)

# Find subsections: the 'rbox Normal-Level' divs that follow each section heading
subsection_data = []

# Class strings of the structural heading divs. Reaching one of these among a
# section's following siblings means the next section/article/chapter has
# started, so the divs after it no longer belong to the current section.
heading_classes = {
    'Section toc-destination rbox',
    'Article toc-destination rbox',
    'rbox Chapter',
}

for section in sections:
    # All three ids are assigned on every iteration, so a missing anchor or a
    # missing ancestor heading yields None instead of a stale id carried over
    # from an earlier iteration.
    a_tag = section.find('a')
    section_id = a_tag.get('id') if a_tag is not None else None

    # Closest article heading that precedes this section in document order
    parent_article = section.find_previous('div', class_='Article toc-destination rbox')
    article_anchor = parent_article.find('a') if parent_article is not None else None
    article_id = article_anchor.get('id') if article_anchor is not None else None

    # Closest chapter heading that precedes this section in document order
    parent_chapter = section.find_previous('div', class_='rbox Chapter')
    chapter_anchor = parent_chapter.find('a') if parent_chapter is not None else None
    chapter_id = chapter_anchor.get('id') if chapter_anchor is not None else None

    # Walk the following siblings only up to the next structural heading.
    # Collecting *all* later 'rbox Normal-Level' siblings would attribute the
    # subsections of every later section to this one as well, duplicating rows
    # and making the total work quadratic in the number of sections.
    for sibling in section.next_siblings:
        # Text nodes and comments between the divs have no tag name
        if getattr(sibling, 'name', None) != 'div':
            continue
        sibling_class = ' '.join(sibling.get('class') or [])
        if sibling_class in heading_classes:
            break
        if sibling_class != 'rbox Normal-Level':
            continue

        # The div holds both the subsection title and its content
        subsection_text = dynamic_clean_text(sibling.get_text(separator=" ", strip=True))
        if not subsection_text:
            continue  # skip divs that are empty after cleaning

        subsection_data.append({
            'Chapter ID': chapter_id,
            'Article ID': article_id,
            'Section ID': section_id,
            'Subsection Content': subsection_text
        })

# Number of subsection rows collected (kept for anything that still reads `i`)
i = len(subsection_data)

# Convert to DataFrame; explicit columns keep the CSV header present even
# when no subsection was found
subsection_data_df = pd.DataFrame(
    subsection_data,
    columns=['Chapter ID', 'Article ID', 'Section ID', 'Subsection Content'],
)

# Save subsection data to the intermediate data directory
subsection_data_df.to_csv(output_directory + "subsection_data.csv", header=True, index=False)


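# KeyboardInterrupt:  (notebook output captured with this cell; the run appears to have been interrupted, so verify the CSV outputs are complete)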