In [1]:
import os
import sys
import time
import pandas as pd
import re
import yaml

from unidecode import unidecode
from bs4 import BeautifulSoup
from langchain_text_splitters import RecursiveCharacterTextSplitter

sys.path.append('../python')

import text_tools as tt

with open('../../config.yaml', 'r') as f:
    config = yaml.safe_load(f)

DOCID = "lamunicipalcode"
FILENAME = f"../../raw_data/{DOCID}.html"
CHUNK_SIZE = config['CHUNK_SIZE']
CHUNK_OVERLAP = config['CHUNK_OVERLAP']


In [2]:
# Import the HTML file

with open(FILENAME, 'r', encoding='utf-8') as f:
    content = f.read()

soup = BeautifulSoup(content, 'lxml')

In [3]:
# Parse the data into chapters, articles, sections, and passages

divs = soup.find_all('div', class_='rbox')
data = []
text_order = 0
for div in divs:
    class_ = div.attrs.get('class')
    div_id = div.attrs.get('id')
    myid = DOCID + '_' + div_id
    text = div.get_text('\n')
    text = unidecode(text)
    text = re.sub(r'\s+',' ',text).strip()

    if len(text)==0:
        continue

    if (('Chapter' in class_) and div.find('a')):  # div is a chapter title
        item_type = 'chapter_title'
        level = -2
    elif (('Article' in class_) and div.find('a')): # div is an article title
        item_type = 'article_title'
        level = -1
    elif (('Section' in class_) and div.find('a')): # div is a section title
        item_type = 'section_title'
        level = 0
    else:
        item_type = 'passage'
        level = 1
        for l in range(2,8):
            subdivs = div.find_all('div', class_=f'L{l}')
            if (subdivs is not None) and (len(subdivs)>0):
                level = l
    data.append({
        'id': myid,
        'parent_id': '',
        'doc_id': DOCID, 
        'item_type': item_type,
        'item_level': level,
        'text_order': text_order,
        'text': text,
        'sibling_left_id': '',
        'sibling_right_id': '',
        'first_child_id': '',
        'next_id': '',
        'prev_id': ''
    })
    text_order+=1

data = pd.DataFrame.from_dict(data)


In [4]:
# Find parents by looking backwards to the first previous item with lower level
# Find siblings by looking left and right at same level
t0 = time.time()

for idx, row in data.iterrows():
    if (idx%128)==0:
        print(f"{idx}... ", end='')
    
    parent_found = False
    my_level = row['item_level']
    j = idx-1
    while (not parent_found and j>=0):
        if data.loc[j,'item_level'] < my_level:
            parent_found = True
            data.loc[idx,'parent_id'] = data.loc[j,'id']
        j=j-1

    done = False
    my_level = row['item_level']
    j = idx+1
    while (not done and j<len(data)):
        if data.loc[j,'item_level'] < my_level:
            done = True
        elif data.loc[j,'item_level'] > my_level:
            done = True
            data.loc[idx,'first_child_id'] = data.loc[j,'id']
        j=j+1

    if idx>0:
        j = idx-1
        data.loc[idx,'prev_id'] = data.loc[j,'id']
        if data.loc[j,'item_level']==my_level:
            data.loc[idx,'sibling_left_id'] = data.loc[j,'id']

    if idx<len(data)-1:
        j = idx+1
        data.loc[idx,'next_id'] = data.loc[j,'id']
        if data.loc[j,'item_level']==my_level:
            data.loc[idx,'sibling_right_id'] = data.loc[j,'id']

t1 = time.time()
print(f"Elapsed time: {(t1-t0)/60:.2f} minutes")
            
data = data.set_index('id')
data.to_csv(f"../../intermediate_data/{DOCID}_main.csv")
data.to_pickle(f"../../intermediate_data/{DOCID}_main.pkl")

0... 128... 256... 384... 512... 640... 768... 896... 1024... 1152... 1280... 1408... 1536... 1664... 1792... 1920... 2048... 2176... 2304... 2432... 2560... 2688... 2816... 2944... 3072... 3200... 3328... 3456... 3584... 3712... 3840... 3968... 4096... 4224... 4352... 4480... 4608... 4736... 4864... 4992... 5120... 5248... 5376... 5504... 5632... 5760... 5888... 6016... 6144... 6272... 6400... 6528... 6656... 6784... 6912... 7040... 7168... 7296... 7424... 7552... 7680... 7808... 7936... 8064... 8192... 8320... 8448... 8576... 8704... 8832... 8960... 9088... 9216... 9344... 9472... 9600... 9728... 9856... 9984... 10112... 10240... 10368... 10496... 10624... 10752... 10880... 11008... 11136... 11264... 11392... 11520... 11648... 11776... 11904... 12032... 12160... 12288... 12416... 12544... 12672... 12800... 12928... 13056... 13184... 13312... 13440... 13568... 13696... 13824... 13952... 14080... 14208... 14336... 14464... 14592... 14720... 14848... 14976... 15104... 15232... 15360... 