In [1]:
import os
import sys
import pandas as pd
import re
import yaml
import time

from unidecode import unidecode
from bs4 import BeautifulSoup
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Make the project's helper modules under ../python importable from this
# notebook. The membership guard keeps the cell idempotent: re-running it does
# not append the same entry to sys.path again.
if '../python' not in sys.path:
    sys.path.append('../python')

import text_tools as tt

# Read the shared project configuration. The encoding is passed explicitly so
# the file is decoded the same way regardless of the platform's locale default.
with open('../../config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Document identifier; it is embedded in the data file names and chunk ids below.
DOCID = "lamunicipalcode"
FILENAME = f"../../raw_data/{DOCID}.html"
# Chunking parameters come from the config file (a KeyError here means the
# key is missing from config.yaml).
CHUNK_SIZE = config['CHUNK_SIZE']
CHUNK_OVERLAP = config['CHUNK_OVERLAP']


In [2]:
# Load the parsed document table. The loop below relies on it being indexed by
# item id and having 'item_type' and 'first_child_id' columns.
# NOTE(review): read_pickle can execute arbitrary code; this is presumably a
# file written by an earlier step of this same pipeline -- confirm before
# pointing it at data from anywhere else.
data = pd.read_pickle(f"../../intermediate_data/{DOCID}_main.pkl")

# Word budget for a single passage chunk (whitespace-separated tokens).
MAX_CHUNK_WORDS = 6000
# Column order of the resulting table; also lets an empty result still have
# an 'id' column to index on.
CHUNK_COLUMNS = ['id', 'ids', 'chapter_id', 'article_id', 'section_id',
                 'item_type', 'text']

t0 = time.time()

chunk_rows = []
curr_chapter_id = ''
curr_article_id = ''
curr_section_id = ''
all_ids = set()  # every item id that has been placed into some chunk so far
i = 0
for idx, row in data.iterrows():
    if row['item_type'] == 'chapter_title':
        # A new chapter invalidates the article/section we were inside.
        curr_chapter_id = idx
        curr_article_id = ''
        curr_section_id = ''
    elif row['item_type'] == 'article_title':
        curr_article_id = idx
        curr_section_id = ''
    elif row['item_type'] == 'section_title':
        curr_section_id = idx
    else:
        # First attempt: the item's chain together with its siblings.
        myids = tt.get_vertical_chain_with_siblings(idx, data)
        mytext = tt.get_text_from_ids(myids, data)

        # If the text has over MAX_CHUNK_WORDS words, try not fetching the siblings.
        if len(mytext.split()) > MAX_CHUNK_WORDS:
            myids = tt.get_vertical_chain(idx, data)
            mytext = tt.get_text_from_ids(myids, data)

        # If the text is still too long but the row has a child, skip this row
        # and let the child be added instead. A too-long row without a child
        # is kept as-is, so a few chunks may exceed the budget.
        if (len(mytext.split()) > MAX_CHUNK_WORDS) and (len(row['first_child_id']) > 0):
            continue

        # Only emit a chunk if it contributes at least one id not yet covered.
        if not all_ids.issuperset(myids):
            # In-place update avoids copying the whole set on every chunk.
            all_ids.update(myids)
            # `mytext` already holds the text for `myids`, so it is reused
            # rather than fetched a second time (assumes get_text_from_ids
            # returns the same text for the same arguments).
            chunk_rows.append({
                'id': DOCID + f'_passage_chunk{i}',
                'ids': repr(myids),
                'chapter_id': curr_chapter_id,
                'article_id': curr_article_id,
                'section_id': curr_section_id,
                'item_type': 'passage_chunk',
                'text': mytext
            })
            i += 1
            if i % 128 == 0:
                print(f"{i}... ", end='')

t1 = time.time()
print(f"Elapsed time: {(t1-t0)/60:.2f} minutes")

# Build the frame once from the collected records; explicit columns keep
# set_index from failing when no chunk was produced.
new_data = pd.DataFrame(chunk_rows, columns=CHUNK_COLUMNS)
new_data = new_data.set_index('id')


128... 256... 384... 512... 640... 768... 896... 1024... 1152... 1280... 1408... 1536... 1664... 1792... 1920... 2048... 2176... 2304... 2432... 2560... 2688... 2816... 2944... 3072... 3200... 3328... 3456... 3584... 3712... 3840... 3968... 4096... 4224... 4352... 4480... 4608... 4736... 4864... 4992... 5120... 5248... 5376... 5504... 5632... 5760... 5888... 6016... 6144... 6272... 6400... 6528... 6656... Elapsed time: 4.23 minutes


In [3]:
# Sanity check: the only ids allowed to be absent from all_ids are
# non-passage rows; any row of type 'passage' left out counts as a failure.

missing_ids = list(set(data.index) - all_ids)
bad = 0
for idx in missing_ids:
    item_type = data.loc[idx, 'item_type']
    if item_type != 'passage':
        continue
    bad += 1
assert bad == 0

In [4]:
# Whitespace-token count of every chunk, then the ten largest counts.
num_words = new_data['text'].map(lambda chunk_text: len(chunk_text.split()))
sorted(num_words)[-10:]

[5918, 5920, 5923, 5975, 5987, 5992, 5992, 5998, 5998, 6782]

In [5]:
# Persist the chunk table twice: as CSV and as a pickle.
csv_path = f"../../intermediate_data/{DOCID}_passagechunks.csv"
pkl_path = f"../../intermediate_data/{DOCID}_passagechunks.pkl"
new_data.to_csv(csv_path)
new_data.to_pickle(pkl_path)

In [31]:
# Spot-check: print the text of the chunk at position 199.
sample_chunk = new_data.iloc[199]
print(sample_chunk['text'])

CHAPTER I GENERAL PROVISIONS AND ZONING
  ARTICLE 2 SPECIFIC PLANNING - ZONING - COMPREHENSIVE ZONING PLAN
    SEC. 12.21. GENERAL PROVISIONS.
      COMPACT PARKING STALLS
        6. Automobile Parking and Sales Area - Improvement. Every public or private parking area or automobile, manufactured home or trailer sales area other than those lawfully in existence on August 21, 1969, shall be arranged, improved and maintained in accordance with the following regulations: (Amended by Ord. No. 161,716, Eff. 12/6/86.)
          (a) Yard Areas. (Title and Par. (a) amended by Ord. No. 152,949, Eff. 9/21/79.) Where a public parking area is the principal use of land in the A or R zones, or in any combination of an A or R zone with a P zone, the public parking area shall not extend into the portion of the lot within 10 feet of the front lot line.
          Where parking is an accessory use of land in the A and R zones, the parking area may occupy the remainder of the lot, except for the required A