In [1]:
import pandas as pd
import re
import yaml

from unidecode import unidecode
from bs4 import BeautifulSoup
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the shared settings; the chunking parameters below are read from them.
with open('../../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Input document, and the identifier that prefixes every record id built below.
FILENAME = "../../raw_data/lamunicipalcode.html"
DOCID = "lamunicipalcode"

# Chunking parameters passed to the text splitter.
CHUNK_SIZE = config['CHUNK_SIZE']
CHUNK_OVERLAP = config['CHUNK_OVERLAP']

import re

# Regexes for the enumeration markers a passage can start with.
# Order matters: is_bullet() reports the first pattern that matches, so the
# roman-numeral forms are listed ahead of the single-letter forms that would
# otherwise also accept "(i)" or "v.".
BULLET_TYPES = [
    r'^\([ivx]+\)',   # (i), (iv), ...
    r'^[ivx]+\.',     # i., iv., ...
    r'^[0-9\.]+',     # any leading run of digits and dots: 1, 1., 12.21
    r'^\([0-9]+\)',   # (1), (12), ...
    r'^[A-Z]\.',      # A.
    r'^[a-z]\.',      # a.
    r'^\([A-Z]\)',    # (A)
    r'^\([a-z]\)',    # (a)
]


def is_bullet(text):
    """Return the first pattern in BULLET_TYPES that matches the start of `text`.

    The pattern string itself is returned (not a match object); the result is
    None when no pattern matches.
    """
    return next(
        (pattern for pattern in BULLET_TYPES if re.match(pattern, text)),
        None,
    )

In [2]:
# Read the raw HTML file and parse it into the tree the cells below walk.

with open(FILENAME, 'r', encoding='utf-8') as html_file:
    content = html_file.read()

soup = BeautifulSoup(content, 'lxml')

In [3]:
# Label every 'rbox' div as a chapter / article / section title or as passage
# text, and record on each row the ids of the levels it sits under.

divs = soup.find_all('div', class_='rbox')

# Ids of the enclosing levels; '' means "not inside one".
chapter_id = article_id = section_id = passage_id = ''

data = []
for div in divs:
    class_ = div.attrs.get('class')
    anchor = div.find('a')
    # A div only counts as a title when it carries the level's class AND an <a>.
    is_chapter = ('Chapter' in class_) and anchor
    is_article = ('Article' in class_) and anchor
    is_section = ('Section' in class_) and anchor

    # unidecode first, then collapse every whitespace run to a single space.
    text = re.sub(r'\s+', ' ', unidecode(div.get_text('\n'))).strip()

    div_id = div.attrs.get('id')
    myid = DOCID + '_' + div_id

    # Entering a level clears the ids of every level nested beneath it.
    if is_chapter:
        item_type = 'chapter_title'
        chapter_id, article_id, section_id, passage_id = myid, '', '', ''
    elif is_article:
        item_type = 'article_title'
        article_id, section_id, passage_id = myid, '', ''
    elif is_section:
        item_type = 'section_title'
        section_id, passage_id = myid, ''
    else:
        item_type = 'passage_text'
        passage_id = myid

    # Divs with no text still update the ids above but produce no row.
    if not text:
        continue

    data.append({
        'id': myid,
        'doc_id': DOCID,
        'chapter_id': chapter_id,
        'article_id': article_id,
        'section_id': section_id,
        'passage_id': passage_id,
        'item_type': item_type,
        'text': text,
    })

data = pd.DataFrame.from_dict(data)

In [428]:
import re

# Spot-check: print the text of one randomly chosen row and the bullet pattern
# (if any) that it starts with.
#
# BULLET_TYPES and is_bullet are defined once, in the setup cell at the top of
# the notebook. They are deliberately not repeated here: a second copy would
# silently shadow the first and the two could drift apart.
mytext = data['text'].sample(1).iloc[0]
print(mytext)
print("")
print(is_bullet(mytext))

(b) In any case in which a person is arrested for the violation of any provision of this Code, or any ordinance of this City, by any officer or employee of this City who is not a peace officer, but who has been authorized by ordinance pursuant to Penal Code Section 836.5 to make such arrests, and such person does not demand to be taken before a magistrate, such arresting officer or employee shall prepare a written notice to appear and release the person on the person's promise to appear as prescribed by Chapter 5C (commencing with Section 853.6) of the Penal Code. The provisions of such code shall thereafter apply with reference to any proceeding based upon the issuance of a written notice to appear pursuant to this authority.

^\([a-z]\)


In [189]:
# Print the text of one randomly chosen row for a quick visual check.
sampled_row = data.sample(1)
mytext = sampled_row['text'].iloc[0]
print(mytext)

" Responsible Person " means the owner and/or person in charge or control of the Vacant Structure.


In [5]:
# Parse the data into chapters, articles, sections, and passages
#
# Every 'rbox' div is visited in document order. A div whose class list
# contains 'Chapter', 'Article' or 'Section' and which holds an <a> tag is a
# title; its id is read from that anchor. Every other div is body text, which
# is accumulated until the next title (or the end of the document) and then
# emitted as one passage row identified by the id of the last div it contains.


def _passage_record(last_div_id, chapter_id, article_id, section_id, text):
    """Build the record for one accumulated passage.

    Args:
        last_div_id: `id` attribute of the last div that contributed text.
        chapter_id: id of the enclosing chapter title, or '' if none.
        article_id: id of the enclosing article title, or '' if none.
        section_id: id of the enclosing section title, or '' if none.
        text: the accumulated passage text.

    Returns:
        dict with keys id, doc_id, chapter_id, article_id, section_id,
        passage_id, item_type and text.
    """
    # NOTE(review): assumes every body div has an `id` attribute; a missing one
    # makes the concatenation below raise TypeError - confirm against the HTML.
    return {
        'id': DOCID + '_' + last_div_id,
        'doc_id': DOCID,
        'chapter_id': chapter_id,
        'article_id': article_id,
        'section_id': section_id,
        'passage_id': last_div_id,
        'item_type': 'passage',
        'text': text,
    }


divs = soup.find_all('div', class_='rbox')

chapter_rows = []
article_rows = []
section_rows = []
passage_rows = []

chapter_id = ''
article_id = ''
section_id = ''
curr_text = ''
# Id of the most recently visited div. Initialised here so this cell never
# depends on a value left behind by another cell.
div_id = None

for div in divs:
    class_ = div.attrs.get('class')
    anchor = div.find('a')  # looked up once; reused for the tests and the id
    is_chapter = ('Chapter' in class_) and anchor
    is_article = ('Article' in class_) and anchor
    is_section = ('Section' in class_) and anchor

    if is_chapter or is_article or is_section:
        # A title closes the passage collected so far. Nothing is emitted when
        # no body text was collected (start of document, or a title directly
        # followed by another title), so no empty-text rows are produced.
        # At this point div_id still refers to the previous div, i.e. the last
        # body div of the passage being closed.
        if curr_text.strip():
            passage_rows.append(
                _passage_record(div_id, chapter_id, article_id, section_id, curr_text)
            )
        curr_text = ''

    div_id = div.attrs.get('id')
    text = div.get_text('\n').replace(u'\xa0', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    text = unidecode(text)

    if is_chapter:
        # A new chapter resets the article and section context.
        chapter_id = anchor.attrs.get('id')
        article_id = ''
        section_id = ''
        chapter_rows.append({
            'id': DOCID + '_' + chapter_id,
            'doc_id': DOCID,
            'chapter_id': chapter_id,
            'item_type': 'chapter_title',
            'text': text
        })
    elif is_article:
        # A new article resets the section context.
        article_id = anchor.attrs.get('id')
        section_id = ''
        article_rows.append({
            'id': DOCID + '_' + article_id,
            'doc_id': DOCID,
            'chapter_id': chapter_id,
            'article_id': article_id,
            'item_type': 'article_title',
            'text': text
        })
    elif is_section:
        section_id = anchor.attrs.get('id')
        section_rows.append({
            'id': DOCID + '_' + section_id,
            'doc_id': DOCID,
            'chapter_id': chapter_id,
            'article_id': article_id,
            'section_id': section_id,
            'item_type': 'section_title',
            'text': text
        })
    else:
        curr_text += text + ' '

# Flush the passage that runs to the end of the document, if there is one.
if curr_text.strip():
    passage_rows.append(
        _passage_record(div_id, chapter_id, article_id, section_id, curr_text)
    )

chapter_data = pd.DataFrame(chapter_rows)
article_data = pd.DataFrame(article_rows)
section_data = pd.DataFrame(section_rows)
passage_data = pd.DataFrame(passage_rows)

chapter_data.to_csv("../../intermediate_data/los_angeles_chapter_data.csv", header=True, index=False)
article_data.to_csv("../../intermediate_data/los_angeles_article_data.csv", header=True, index=False)
section_data.to_csv("../../intermediate_data/los_angeles_section_data.csv", header=True, index=False)
passage_data.to_csv("../../intermediate_data/los_angeles_passage_data.csv", header=True, index=False)


In [6]:
# Split every passage into overlapping chunks and write one row per chunk.

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

chunk_rows = []
for idx, row in passage_data.iterrows():
    # Progress tick once every thousand passages.
    if not idx % 1000:
        print(f"{idx}... ", end='')

    pieces = text_splitter.create_documents([row['text']])
    for chunk_id, piece in enumerate(pieces):
        chunk_row = dict(row)
        # Remove and later re-add 'item_type' and 'text' so they end up as the
        # last two columns, after the new 'chunk_id'.
        del chunk_row['text']
        del chunk_row['item_type']
        # The chunk id is the passage id with the chunk's position appended.
        chunk_row['id'] = chunk_row['id'] + f'_{chunk_id}'
        chunk_row.update({
            'chunk_id': chunk_id,
            'item_type': 'passage_chunk',
            'text': piece.page_content,
        })
        chunk_rows.append(chunk_row)

chunk_data = pd.DataFrame.from_dict(chunk_rows)

chunk_data.to_csv("../../intermediate_data/los_angeles_passage_chunk_data.csv", header=True, index=False)

0... 1000... 2000... 3000... 4000... 5000... 