In [1]:
from bs4 import BeautifulSoup
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.chains import RetrievalQA
from langchain_core.embeddings import Embeddings


In [2]:
with open('constitution.html', 'rb') as f:
    soup = BeautifulSoup(f, 'html.parser')

In [3]:
legis = soup.find(id='legis')

In [4]:
documents = []
for html_part in legis.select('.part'):
    temp_dict = {}
    res = ''
    part_number = html_part.select('.partNo')[0].text
    part_header = html_part.select('.partHdr')[0].text
    prov_header = html_part.select('.prov1Hdr')[0].text
    prov_text = html_part.select('.prov1Txt')


    res += '# ' + part_number + '<br>'
    res += '# ' + part_header + '<br>'
    res += '## ' + prov_header
    for each in prov_text:
        # res += '<br>' + each.text
        temp_dict['text'] = each.text.replace('\xa0', '')
        meta_data = {}
        meta_data['Legislation'] = 'Constitution of the Republic of Singapore'
        meta_data['Part No.'] = part_number
        meta_data['Part Title'] = part_header
        meta_data['Section Title'] = prov_header
        meta_data['Subsection No.'] = each.select('strong')[0].text
        temp_dict['metadata'] = meta_data

    documents.append(temp_dict)

In [5]:
documents

[{'text': '2.—(1)In this Constitution, unless it is otherwise provided or the context otherwise requires—“Cabinet” means the Cabinet constituted under this Constitution;“Civil List” means the provision made under Article22J for the maintenance of the President;“citizen of Singapore” means any person who, under the provisions of this Constitution, has the status of a citizen of Singapore;“commencement”, used with reference to this Constitution, means 9 August 1965;“Consolidated Fund” means the Consolidated Fund established by this Constitution;“Council of Presidential Advisers” means the Council of Presidential Advisers constituted under Part5A;“existing law” means any law having effect as part of the law of Singapore immediately before the commencement of this Constitution;“Government” means the Government of Singapore;“high judicial office” means the office of the Chief Justice, a Justice of the Court of Appeal, a Judge of the Appellate Division, a Judge of the High Court, a Judicial 

In [6]:
embed = OllamaEmbeddings(
    model="mxbai-embed-large"
)

In [7]:
class ollamaEmbeddings(Embeddings):
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        """Embed search docs."""
        return [self.model.embed_query(text) for text in texts]

    def embed_query(self, text):
        """Embed query text."""
        return self.model.embed_query(text)

In [8]:
documents2 = [Document(page_content=doc['text'], metadata=doc['metadata']) for doc in documents]
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=128,
    length_function=len
)
documents2 = text_splitter.split_documents(documents2)

In [9]:
vectorstore = FAISS.from_documents(documents2, embed)

In [10]:
vectorstore.save_local("faiss_index_constitution")