In [1]:
import pandas as pd
import re
import yaml

from unidecode import unidecode
from bs4 import BeautifulSoup
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the shared YAML settings (path is relative to this notebook's directory)
with open('../../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Input HTML file and the prefix used when building item ids from it
FILENAME = "../../raw_data/lamunicipalcode.html"
DOCID = "lamunicipalcode"

# Values taken from config.yaml; a missing key raises KeyError here
CHUNK_SIZE = config['CHUNK_SIZE']
CHUNK_OVERLAP = config['CHUNK_OVERLAP']


In [2]:
# Read the raw HTML file as UTF-8 text and parse it with the lxml backend

with open(FILENAME, 'r', encoding='utf-8') as html_file:
    content = html_file.read()

soup = BeautifulSoup(content, 'lxml')

In [3]:
# Classify every 'rbox' div as a chapter, article, or section title, or as a passage

# (CSS class token, item_type, item_level) for each heading kind. A div only
# counts as a heading when it carries the token AND contains a link.
heading_kinds = [
    ('Chapter', 'chapter_title', -2),
    ('Article', 'article_title', -1),
    ('Section', 'section_title', 0),
]

divs = soup.find_all('div', class_='rbox')
records = []
for div in divs:
    css_classes = div.attrs.get('class')
    record_id = DOCID + '_' + div.attrs.get('id')

    # Transliterate to ASCII, then collapse every whitespace run to one space.
    text = re.sub(r'\s+', ' ', unidecode(div.get_text('\n'))).strip()
    if not text:
        continue

    for token, heading_type, heading_level in heading_kinds:
        if token in css_classes and div.find('a'):
            item_type, level = heading_type, heading_level
            break
    else:
        # Not a heading: a passage whose level is the largest N in 2..7 for
        # which the div contains a nested <div class="LN">, or 1 if none do.
        item_type = 'passage'
        level = 1
        for candidate in range(7, 1, -1):
            if div.find_all('div', class_=f'L{candidate}'):
                level = candidate
                break

    # Relationship columns start empty; they are filled in by a later cell.
    records.append({
        'id': record_id,
        'parent_id': '',
        'doc_id': DOCID, 
        'item_type': item_type,
        'item_level': level,
        'text': text,
        'sibling_left_id': '',
        'sibling_right_id': '',
        'first_child_id': '',
        'next_id': '',
        'prev_id': ''
    })

data = pd.DataFrame(records)


In [4]:
# Link every row to its parent, siblings, first child, and document-order neighbours.
#
#   parent_id        nearest earlier row with a strictly lower item_level
#   sibling_left_id  nearest earlier row at the same level, skipping over deeper
#                    rows (the left sibling's descendants); none once a
#                    shallower row is reached
#   sibling_right_id mirror image of sibling_left_id, scanning forwards
#   first_child_id   the row immediately after this one, but only if it is deeper
#   prev_id/next_id  adjacent rows in document order, regardless of level
#
# Limitation: siblings are matched by equal level, so when levels under one
# parent are uneven (e.g. 1, 3, 2) the level-2 row has the level-1 row as its
# parent but is not part of the level-3 row's sibling chain.

# After a first run `data` is indexed by id; restore the positional layout so
# the cell can be re-run without restarting from the parsing cell.
if data.index.name == 'id':
    data = data.reset_index()

# Plain lists avoid a label lookup through .loc for every comparison.
levels = data['item_level'].tolist()
ids = data['id'].tolist()
n_rows = len(ids)

link_columns = ('parent_id', 'sibling_left_id', 'sibling_right_id',
                'first_child_id', 'next_id', 'prev_id')
links = {column: [''] * n_rows for column in link_columns}

for pos in range(n_rows):
    my_level = levels[pos]

    # Backward scan: deeper rows belong to an earlier sibling's subtree and are
    # skipped. The first same-level row is the left sibling; it necessarily has
    # the same parent, which was already resolved when that row was processed.
    for j in range(pos - 1, -1, -1):
        if levels[j] < my_level:
            links['parent_id'][pos] = ids[j]
            break
        if levels[j] == my_level:
            links['sibling_left_id'][pos] = ids[j]
            links['parent_id'][pos] = links['parent_id'][j]
            break

    # Forward scan: skip this row's own descendants; stop at the next sibling,
    # or give up when a shallower row closes the enclosing subtree.
    for j in range(pos + 1, n_rows):
        if levels[j] < my_level:
            break
        if levels[j] == my_level:
            links['sibling_right_id'][pos] = ids[j]
            break

    if pos > 0:
        links['prev_id'][pos] = ids[pos - 1]

    if pos + 1 < n_rows:
        links['next_id'][pos] = ids[pos + 1]
        # Only the directly following row can be a child: if it is at the same
        # or a shallower level, this row is a leaf.
        if levels[pos + 1] > my_level:
            links['first_child_id'][pos] = ids[pos + 1]

for column in link_columns:
    data[column] = links[column]

data = data.set_index('id')
data.to_csv("../../intermediate_data/lamunicode_all.csv")
data.to_pickle("../../intermediate_data/lamunicode_all.pkl")
    

In [5]:
def get_parents(id):
    """Return the ids of all ancestors of `id`, outermost first.

    Follows the 'parent_id' column of the module-level `data` frame (indexed
    by item id) until an empty string is reached. The immediate parent is the
    last element; a row with no parent yields an empty list.
    """
    ancestors = []
    current = data.loc[id, 'parent_id']
    while current != '':
        ancestors.append(current)
        current = data.loc[current, 'parent_id']
    # Collected nearest-first; callers expect root-first order.
    ancestors.reverse()
    return ancestors

def get_siblings_left(id):
    """Return the ids of every sibling to the left of `id`, leftmost first.

    Follows the 'sibling_left_id' column of the module-level `data` frame
    (indexed by item id) until an empty string is reached. The nearest left
    sibling is the last element; a row with no left sibling yields [].

    Implemented as a loop rather than recursion so that a sibling chain longer
    than the interpreter's recursion limit does not raise RecursionError.
    """
    siblings = []
    current = data.loc[id, 'sibling_left_id']
    while current != '':
        siblings.append(current)
        current = data.loc[current, 'sibling_left_id']
    # Collected nearest-first; the contract is leftmost-first.
    siblings.reverse()
    return siblings

def get_siblings_right(id):
    """Return the ids of every sibling to the right of `id`, nearest first.

    Follows the 'sibling_right_id' column of the module-level `data` frame
    (indexed by item id) until an empty string is reached. A row with no
    right sibling yields [].

    Implemented as a loop rather than recursion so that a sibling chain longer
    than the interpreter's recursion limit does not raise RecursionError.
    """
    siblings = []
    current = data.loc[id, 'sibling_right_id']
    while current != '':
        siblings.append(current)
        current = data.loc[current, 'sibling_right_id']
    return siblings

def get_descendants(id):
    """Return the ids of all descendants of `id` in depth-first, pre-order.

    The children of a row are its 'first_child_id' (looked up in the
    module-level `data` frame) followed by that child's right siblings as
    returned by get_siblings_right. Each child is emitted before its own
    descendants. A row with an empty 'first_child_id' yields [].
    """
    first_child_id = data.loc[id, 'first_child_id']
    if first_child_id == '':
        return []

    collected = []
    for child in [first_child_id, *get_siblings_right(first_child_id)]:
        collected.append(child)
        collected.extend(get_descendants(child))
    return collected

In [8]:
# Build the rows to embed.
#   - Headings (chapter/article/section titles) are embedded as their own text.
#   - Rows without a first child get the text of every ancestor, outermost
#     first, one per line, followed by their own text.
#   - Any other row is left out.

heading_types = ['chapter_title', 'article_title', 'section_title']
rows_to_embed = []
for idx, row in data.iterrows():
    if row['item_type'] in heading_types:
        text_to_embed = row['text']
    elif row['first_child_id'] == '':
        ancestor_texts = [data.loc[parent, 'text'] for parent in get_parents(idx)]
        text_to_embed = '\n'.join(ancestor_texts + [row['text']])
    else:
        continue

    # Copy so the extra column is not written back into `data`.
    new_row = row.copy()
    new_row['text_to_embed'] = text_to_embed
    rows_to_embed.append(new_row)

new_data = pd.DataFrame(rows_to_embed)
new_data.to_csv("../../intermediate_data/lamunicipalcode_to_embed.csv")
new_data.to_pickle("../../intermediate_data/lamunicipalcode_to_embed.pkl")