In [1]:
from bs4 import BeautifulSoup
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
with open('constitution.html', 'rb') as f:
    soup = BeautifulSoup(f, 'html.parser')

In [3]:
legis = soup.find(id='legis')

In [17]:
legis.select('.part')

[<td class="part" id="P11-"><div class="partNo">PART 1</div><table width="100%"><tbody><tr><td class="partHdr" id="P11-he-">PRELIMINARY</td></tr></tbody></table><div class="prov1"><table width="100%"><tbody><tr><td class="prov1Hdr" id="pr1-"><span class="noBold"></span><span class="">Citation</span></td></tr></tbody></table><table width="100%"><tbody><tr><td class="prov1Txt"><strong>1.</strong>  This Constitution may be cited as the Constitution of the Republic of Singapore.</td></tr></tbody></table></div><div class="prov1"><table width="100%"><tbody><tr><td class="prov1Hdr" id="pr2-"><span class="noBold"></span><span class="">Interpretation</span></td></tr></tbody></table><table width="100%"><tbody><tr><td class="prov1Txt"><strong>2.</strong><a name="pr2-ps1-"></a><span class="prov2TxtIL">—(1)  In this Constitution, unless it is otherwise provided or the context otherwise requires —<table width="100%"><tbody><tr><td class="def" style="font-size:13pt">“Cabinet” means the Cabinet consti

In [61]:
documents = []
for html_part in legis.select('.part'):
    res = ''
    part_number = html_part.select('.partNo')[0].text
    part_header = html_part.select('.partHdr')[0].text
    prov_header = html_part.select('.prov1Hdr')[0].text
    prov_text = html_part.select('.prov1Txt')

    res += '# ' + part_number + '<br>'
    res += '# ' + part_header + '<br>'
    res += '## ' + prov_header
    for each in prov_text:
        temp_dict = {}
        # res += '<br>' + each.text
        meta_data = {}
        meta_data['Legislation'] = 'Constitution of the Republic of Singapore'
        meta_data['Part No.'] = part_number
        meta_data['Part Title'] = part_header
        meta_data['Section Title'] = prov_header
        meta_data['Subsection No.'] = each.select('strong')[0].text
        temp_dict['metadata'] = meta_data
        temp_dict['text'] = f'''{meta_data['Legislation']} {meta_data['Part No.']} {meta_data['Part Title']} {meta_data['Section Title']} {meta_data['Subsection No.']}<br><br>''' + each.text.replace('\xa0', '')

        documents.append(temp_dict)


In [63]:
embed = OllamaEmbeddings(
    model="mxbai-embed-large"
)

In [64]:
documents2 = [Document(page_content=doc['text'], metadata=doc['metadata']) for doc in documents]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=128,
    length_function=len
)
documents2 = text_splitter.split_documents(documents2)
for idx, doc in enumerate(documents2):
    doc.metadata["id"] = str(idx)

In [65]:
documents2

[Document(metadata={'Legislation': 'Constitution of the Republic of Singapore', 'Part No.': 'PART 1', 'Part Title': 'PRELIMINARY', 'Section Title': 'Citation', 'Subsection No.': '1.', 'id': '0'}, page_content='Constitution of the Republic of Singapore PART 1 PRELIMINARY Citation 1.<br><br>1.This Constitution may be cited as the Constitution of the Republic of Singapore.'),
 Document(metadata={'Legislation': 'Constitution of the Republic of Singapore', 'Part No.': 'PART 1', 'Part Title': 'PRELIMINARY', 'Section Title': 'Citation', 'Subsection No.': '2.', 'id': '1'}, page_content='Constitution of the Republic of Singapore PART 1 PRELIMINARY Citation 2.<br><br>2.—(1)In this Constitution, unless it is otherwise provided or the context otherwise requires—“Cabinet” means the Cabinet constituted under this Constitution;“Civil List” means the provision made under Article22J for the maintenance of the President;“citizen of Singapore” means any person who, under the provisions of this Constituti

In [68]:
import pickle

with open("documents.pkl", "wb") as f:
    pickle.dump(documents2, f)

In [69]:
vectorstore = FAISS.from_documents(documents2, embed)

In [70]:
vectorstore.save_local("faiss_index_constitution")