In [1]:
import os
import sys
import pandas as pd
import re
import yaml
import time

from unidecode import unidecode
from bs4 import BeautifulSoup
from langchain_text_splitters import RecursiveCharacterTextSplitter

sys.path.append('../python')

import text_tools as tt

# Pipeline-wide settings are read from the YAML config two directories up.
with open('../../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Document identifier; every input/output path in this notebook is derived from it.
DOCID = "lamunicipalcode"
FILENAME = "../../raw_data/" + DOCID + ".html"

# Chunking parameters taken from the config file.
CHUNK_SIZE = config['CHUNK_SIZE']
CHUNK_OVERLAP = config['CHUNK_OVERLAP']


In [2]:
data = pd.read_pickle(f"../../intermediate_data/{DOCID}_main.pkl")

# Soft word budget per chunk: accumulated blocks are flushed before adding a
# block that would push the rendered text past this many words. A single block
# that is larger than the budget on its own is still emitted as one chunk.
MAX_CHUNK_WORDS = 500
# Print a progress marker every this many chunks.
PROGRESS_EVERY = 128
# Item types that start a new chapter/article/section and force a flush.
TITLE_TYPES = ('chapter_title', 'article_title', 'section_title')
# Column layout of the resulting frame (also keeps an empty result well-formed).
CHUNK_COLUMNS = ['id', 'doc_id', 'ids', 'chapter_id', 'article_id',
                 'section_id', 'item_type', 'text']


def _flush_chunk(records, seen_ids, chunk_ids, text,
                 chapter_id, article_id, section_id, report_progress=True):
    """Append one passage-chunk record to ``records`` and mark its ids as seen.

    Parameters
    ----------
    records : list of dict
        Output list; the new chunk is numbered by its position in this list.
    seen_ids : set
        Ids already written to some chunk; updated in place.
    chunk_ids : iterable
        Ids belonging to the chunk, in the order they should be stored.
    text : str
        Rendered text of the chunk.
    chapter_id, article_id, section_id
        Enclosing title ids ('' when not inside one).
    report_progress : bool
        When True, print a marker each time the chunk count reaches a
        multiple of ``PROGRESS_EVERY``.
    """
    ordered_ids = list(chunk_ids)
    records.append({
        'id': DOCID + f'_passage_chunk{len(records)}',
        'doc_id': DOCID,
        'ids': repr(ordered_ids),
        'chapter_id': chapter_id,
        'article_id': article_id,
        'section_id': section_id,
        'item_type': 'passage_chunk',
        'text': text
    })
    # In-place update: avoids copying the ever-growing set on every flush.
    seen_ids.update(ordered_ids)
    if report_progress and len(records) % PROGRESS_EVERY == 0:
        print(f"{len(records)}... ", end='')


t0 = time.time()

chunk_records = []
curr_chapter_id = ''
curr_article_id = ''
curr_section_id = ''
all_ids = set()
# Ids of the chunk being assembled. A dict is used as an insertion-ordered set:
# a plain set of string ids iterates in an order that changes between kernel
# runs (string hash randomisation), which made the stored 'ids' column
# irreproducible.
curr_ids = {}

for idx, row in data.iterrows():
    item_type = row['item_type']

    # A new chapter/article/section closes whatever chunk is in progress.
    # The chunk is tagged with the title ids that were current *before* this row.
    if curr_ids and item_type in TITLE_TYPES:
        _flush_chunk(chunk_records, all_ids, curr_ids,
                     tt.get_text_from_ids(list(curr_ids), data),
                     curr_chapter_id, curr_article_id, curr_section_id)
        curr_ids = {}

    if item_type == 'chapter_title':
        curr_chapter_id = idx
        curr_article_id = ''
        curr_section_id = ''
    elif item_type == 'article_title':
        curr_article_id = idx
        curr_section_id = ''
    elif item_type == 'section_title':
        curr_section_id = idx
    elif row['first_child_id'] == '':
        # Row has no children: collect it together with its vertical chain.
        myids = tt.get_vertical_chain(idx, data)

        # If every id in the chain has already been flushed, skip.
        if all_ids.issuperset(myids):
            continue

        merged_ids = {**curr_ids, **dict.fromkeys(myids)}
        merged_text = tt.get_text_from_ids(list(merged_ids), data)

        if len(merged_text.split()) > MAX_CHUNK_WORDS:
            # Only render the in-progress chunk when a flush is actually on the
            # table (assumes tt.get_text_from_ids has no side effects).
            curr_text = tt.get_text_from_ids(list(curr_ids), data)
            if len(curr_text) > 0:
                # Adding this block would exceed the budget: emit what we have
                # and start the next chunk from the current block.
                _flush_chunk(chunk_records, all_ids, curr_ids, curr_text,
                             curr_chapter_id, curr_article_id, curr_section_id)
                curr_ids = dict.fromkeys(myids)
                continue

        curr_ids = merged_ids

# Last flush: emit whatever is still pending after the final row.
if curr_ids:
    _flush_chunk(chunk_records, all_ids, curr_ids,
                 tt.get_text_from_ids(list(curr_ids), data),
                 curr_chapter_id, curr_article_id, curr_section_id,
                 report_progress=False)
    curr_ids = {}

t1 = time.time()
print(f"Elapsed time: {(t1-t0)/60:.2f} minutes")

# Explicit columns keep set_index('id') from failing when no chunks were produced.
new_data = pd.DataFrame(chunk_records, columns=CHUNK_COLUMNS).set_index('id')


128... 256... 384... 512... 640... 768... 896... 1024... 1152... 1280... 1408... 1536... 1664... 1792... 1920... 2048... 2176... 2304... 2432... 2560... 2688... 2816... 2944... 3072... 3200... 3328... 3456... 3584... 3712... 3840... 3968... 4096... 4224... 4352... 4480... 4608... 4736... 4864... 4992... 5120... 5248... 5376... 5504... 5632... 5760... 5888... 6016... 6144... 6272... 6400... 6528... 6656... 6784... 6912... 7040... 7168... 7296... 7424... 7552... 7680... 7808... 7936... Elapsed time: 1.36 minutes


In [3]:
# Sanity check: no chunk may have zero-length text.
text_lengths = new_data['text'].str.len()
assert not text_lengths.le(0).any()

In [4]:
# Coverage check: ids that never made it into a chunk may only be non-passage
# rows (e.g. section/article/chapter titles). Any leftover 'passage' id is
# printed and makes the assertion fail.
missing_ids = list(set(data.index).difference(all_ids))
unchunked_passages = [
    missing_id for missing_id in missing_ids
    if data.loc[missing_id, 'item_type'] == 'passage'
]
for missing_id in unchunked_passages:
    print(missing_id)
bad = len(unchunked_passages)
assert bad==0

In [5]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


# Word count of every chunk; display the ten largest to spot oversized chunks.
num_words = new_data['text'].map(_count_words)
sorted(num_words)[-10:]

[797, 862, 911, 919, 928, 1245, 1339, 1490, 3634, 6782]

In [6]:
# Persist the chunks twice: CSV for inspection, pickle for downstream notebooks.
passagechunks_base = f"../../intermediate_data/{DOCID}_passagechunks"
new_data.to_csv(passagechunks_base + ".csv")
new_data.to_pickle(passagechunks_base + ".pkl")

In [7]:
# Spot check: print the rendered text of the chunk at position 71.
print(new_data['text'].iloc[71])

CHAPTER I GENERAL PROVISIONS AND ZONING
  ARTICLE 1.5 PLANNING - COMPREHENSIVE PLANNING PROGRAM
    SEC. 11.5.11. AFFORDABLE HOUSING.
      (j) Definitions.
        " Project " shall mean the construction, erection, alteration of, or addition to a structure. The term Project shall not include interior or exterior improvements that do not increase the floor area over that of an existing structure, and shall not mean any construction for which a building permit or demolition permit is required to comply with an order issued by the Department of Building and Safety to repair, remove, or demolish an unsafe or substandard condition, or to rebuild as a result of destruction by fire, earthquake or natural disaster, provided that the development is not prohibited by any provision of the Los Angeles Municipal Code and the development does not increase the square footage beyond what previously existed on the site.
        " Replacement Unit " shall mean any unit that would need to be replaced pu

In [8]:
# Spot check: show the stored id list (a repr string) of the same chunk.
new_data['ids'].iloc[71]

"['lamunicipalcode_rid-0-0-0-364096', 'lamunicipalcode_rid-0-0-0-364102', 'lamunicipalcode_rid-0-0-0-107689', 'lamunicipalcode_rid-0-0-0-364080', 'lamunicipalcode_rid-0-0-0-364098', 'lamunicipalcode_rid-0-0-0-297069', 'lamunicipalcode_rid-0-0-0-364100', 'lamunicipalcode_rid-0-0-0-107408']"