In [1]:
import pandas as pd
import re
from unidecode import unidecode
from bs4 import BeautifulSoup

# --- Notebook configuration -------------------------------------------------

# Short identifier stamped on every exported record as its 'doc_id'.
DOCID = "lamunicipalcode"

# Location of the raw HTML export of the code, relative to this notebook.
FILENAME = "../../raw_data/lamunicipalcode.html"

# Human-readable labels for the source document.
# NOTE(review): neither label is referenced by the cells visible here.
DOCUMENT_NAME = "City of Los Angeles Municipal Code"
MUNICIPALITY = "City of Los Angeles"


In [2]:
# Load the raw HTML export and parse it.
# The whole file is read into `content` first, then handed to BeautifulSoup
# using the lxml parser.
with open(FILENAME, encoding='utf-8') as html_file:
    content = html_file.read()

soup = BeautifulSoup(content, features='lxml')

In [3]:
# Split the document into passages and export four CSV tables.
#
# Every <div class="rbox"> is visited in document order. A div counts as a
# heading when its class list contains 'Chapter', 'Article' or 'Section' AND
# it contains an <a> element; the id of that anchor becomes the chapter /
# article / section id. Each heading closes the passage accumulated so far and
# starts a new one, so a passage is the heading text followed by the text of
# all divs up to (not including) the next heading.
divs = soup.find_all('div', class_='rbox')

chapter_records = []
article_records = []
section_records = []
passage_records = []

# Current position in the chapter > article > section hierarchy.
chapter_id = ''
article_id = ''
section_id = ''
curr_text = ''
# Id of the most recently processed div. Initialised explicitly so that a
# heading in first position (or an empty `divs`) cannot raise NameError on a
# fresh kernel, and so a re-run never picks up a stale id left behind by a
# previous execution of this cell.
div_id = None


def _passage_record(passage_id, chapter_id, article_id, section_id, text):
    """Build one row of the passage table for the given heading context."""
    return {
        'passage_id': passage_id,
        'doc_id': DOCID,
        'chapter_id': chapter_id,
        'article_id': article_id,
        'section_id': section_id,
        'doc_type': 'passage',
        'text': text
    }


for div in divs:
    classes = div.attrs.get('class') or []
    # Look the anchor up once; it is needed both for the heading test and for
    # the heading id.
    anchor = div.find('a')
    has_anchor = anchor is not None
    is_chapter = has_anchor and 'Chapter' in classes
    is_article = has_anchor and 'Article' in classes
    is_section = has_anchor and 'Section' in classes

    # A heading closes the passage collected so far. Nothing is emitted when
    # no text has been collected yet (i.e. the very first div is a heading),
    # which would otherwise produce an empty row.
    # NOTE(review): `div_id` still holds the id of the PREVIOUS div here, so
    # passage_id is the id of the last div inside the passage, not the first.
    # Kept as-is for compatibility with existing output; confirm it is intended.
    if (is_chapter or is_article or is_section) and curr_text:
        passage_records.append(
            _passage_record(div_id, chapter_id, article_id, section_id, curr_text)
        )
        curr_text = ''

    div_id = div.attrs.get('id')
    # Normalise the div text: collapse all whitespace runs (including
    # non-breaking spaces) to single spaces and transliterate to ASCII.
    text = div.get_text('\n').replace(u'\xa0', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    text = unidecode(text)

    if is_chapter:
        # A new chapter resets the lower levels of the hierarchy.
        chapter_id = anchor.attrs.get('id')
        article_id = ''
        section_id = ''
        chapter_records.append({
            'chapter_id': chapter_id,
            'doc_id': DOCID,
            'chapter_title': text
        })
    elif is_article:
        article_id = anchor.attrs.get('id')
        section_id = ''
        article_records.append({
            'chapter_id': chapter_id,
            'article_id': article_id,
            'doc_id': DOCID,
            'article_title': text
        })
    elif is_section:
        section_id = anchor.attrs.get('id')
        section_records.append({
            'chapter_id': chapter_id,
            'article_id': article_id,
            'section_id': section_id,
            'doc_id': DOCID,
            'section_title': text
        })
    # Heading text is part of the passage it opens; each div's text is
    # followed by a single space separator.
    curr_text += text + ' '

# Emit the final passage, which no later heading will close.
if curr_text:
    passage_records.append(
        _passage_record(div_id, chapter_id, article_id, section_id, curr_text)
    )

# Explicit column lists keep the CSV schema stable even when a table is empty.
chapter_data = pd.DataFrame(
    chapter_records,
    columns=['chapter_id', 'doc_id', 'chapter_title'],
)
article_data = pd.DataFrame(
    article_records,
    columns=['chapter_id', 'article_id', 'doc_id', 'article_title'],
)
section_data = pd.DataFrame(
    section_records,
    columns=['chapter_id', 'article_id', 'section_id', 'doc_id', 'section_title'],
)
text_data = pd.DataFrame(
    passage_records,
    columns=['passage_id', 'doc_id', 'chapter_id', 'article_id', 'section_id',
             'doc_type', 'text'],
)

chapter_data.to_csv("../../intermediate_data/los_angeles_chapter_data.csv", header=True, index=False)
article_data.to_csv("../../intermediate_data/los_angeles_article_data.csv", header=True, index=False)
section_data.to_csv("../../intermediate_data/los_angeles_section_data.csv", header=True, index=False)
text_data.to_csv("../../intermediate_data/los_angeles_text_data.csv", header=True, index=False)
