In [1]:
import os
import pprint

from dotenv import load_dotenv
from langchain_core.documents import Document

# Load variables from the local .env file; override=True lets values from the
# file replace ones that are already present in the process environment.
load_dotenv(dotenv_path="./.env", override=True)

True

In [2]:
# Set the LangSmith project name for this notebook's runs.
os.environ.update({"LANGSMITH_PROJECT": "lc_semantic_search"})

In [3]:
# documents
# Two hand-written sample documents (Wikipedia excerpts), each with an explicit id.
# NOTE: the PDF-loading cell further down rebinds `docs`, so these samples are
# only available until that cell runs.
docs: list[Document] = [
    Document(
        page_content="Trichogenes claviger, the Caetés catfish, is a critically endangered species of pencil catfish native to the Atlantic Forest of Brazil. It was discovered early in 2010 and scientifically described later in the same year. One of three species within the genus Trichogenes, it is restricted to an area of 16 km² in the Caetés forest, a mountainous area in the Brazilian state of Espírito Santo. When discovered, the rainforest in which it occurs was unprotected and threatened by deforestation. A private nature reserve has since been established, allowing visitors to see the fish in its habitat.",
        metadata={"source": "wikipedia"},
        id="fish",
    ),
    Document(
        page_content="Deforestation or forest clearance is the removal and destruction of a forest or stand of trees from land that is then converted to non-forest use.[1] Deforestation can involve conversion of forest land to farms, ranches, or urban use. About 31% of Earth's land surface is covered by forests at present.[2] This is one-third less than the forest cover before the expansion of agriculture, with half of that loss occurring in the last century.[3] Between 15 million to 18 million hectares of forest, an area the size of Bangladesh, are destroyed every year. On average 2,400 trees are cut down each minute.[4] Estimates vary widely as to the extent of deforestation in the tropics.[5][6] In 2019, nearly a third of the overall tree cover loss, or 3.8 million hectares, occurred within humid tropical primary forests. These are areas of mature rainforest that are especially important for biodiversity and carbon storage.",
        metadata={"source": "wikipedia"},
        # Spelling corrected from "deforestration".
        id="deforestation",
    ),
]

In [4]:
# load pdf
from langchain_community.document_loaders import PyPDFLoader

# The PDF location was previously fixed to one machine's absolute path. It can
# now be overridden with the PDF_PATH environment variable (for example from
# ./.env); the original location stays the default so existing setups still work.
pdf_path = os.environ.get("PDF_PATH", "/media/gova/Data/others/amma/name-change-affidavit.pdf")

# NOTE(review): extract_images=True presumably also processes images embedded
# in the PDF and may need an extra OCR dependency; confirm against the loader docs.
loader = PyPDFLoader(pdf_path, extract_images=True)

# Rebinds `docs`: from here on it holds the Documents loaded from the PDF, not
# the sample documents defined in the earlier cell.
docs = loader.load()

# Show the first loaded Document in its serialised form.
pprint.pp(docs[0].to_json())

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'metadata': {'producer': 'Skia/PDF m136 Google Docs Renderer',
                         'creator': 'PyPDF',
                         'creationdate': '',
                         'title': 'name-change-affidavit',
                         'source': '/media/gova/Data/others/amma/name-change-affidavit.pdf',
                         'total_pages': 1,
                         'page': 0,
                         'page_label': '1'},
            'page_content': 'AFFIDAVIT  FOR  CHANGE  OF  NAME  AFTER  '
                            'MARRIAGE   I,  Thuropatha,  daughter  of  Mr.  '
                            'K.  Somanathan  and  wife  of  Mr.  K.  '
                            'Rajadurai  aged  60  \n'
                            'residing\n'
                            ' \n'
                            'at\n'
                            ' \n'
                            '65/1A,\n'
           

In [5]:
# split text
from langchain_text_splitters import RecursiveCharacterTextSplitter

# chunk_size=1000 with chunk_overlap=200; add_start_index=True stores each
# chunk's offset in its metadata under "start_index".
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
splits = text_splitter.split_documents(docs)

# Inspect the second chunk in its serialised form.
second_chunk = splits[1]
pprint.pp(second_chunk.to_json())

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'metadata': {'producer': 'Skia/PDF m136 Google Docs Renderer',
                         'creator': 'PyPDF',
                         'creationdate': '',
                         'title': 'name-change-affidavit',
                         'source': '/media/gova/Data/others/amma/name-change-affidavit.pdf',
                         'total_pages': 1,
                         'page': 0,
                         'page_label': '1',
                         'start_index': 641},
            'page_content': 'person,\n'
                            ' \n'
                            'and\n'
                            ' \n'
                            'that\n'
                            ' \n'
                            'is\n'
                            ' \n'
                            'myself.\n'
                            '  I’m  executing  this  declaration  to  be  '
                        

In [6]:
# Display the class of a chunk produced by the splitter (the second one).
type(splits[1])

langchain_core.documents.base.Document

In [7]:
# get embeddings for each split
from langchain_openai import OpenAIEmbeddings

# NOTE: the spelling `embeddding_model` (three d's) is kept on purpose because
# the vector-store cell further down refers to this exact name.
embeddding_model = OpenAIEmbeddings(model="text-embedding-3-large")

# Embed all chunks with one batched embed_documents call instead of calling
# embed_query once per chunk: embed_documents is the document-side API and
# avoids one request per chunk. The result is still one vector per split, in
# the same order as `splits`.
generated_embeddings = embeddding_model.embed_documents([split.page_content for split in splits])

# Vector dimensionality, followed by the first ten components of the first vector.
pprint.pp(len(generated_embeddings[0]))
pprint.pp(generated_embeddings[0][:10])

3072
[0.020266450941562653,
 -0.050126660615205765,
 -0.007206254173070192,
 -0.006685012485831976,
 -0.009287575259804726,
 0.020222710445523262,
 -0.024480123072862625,
 0.026725471019744873,
 -0.025282032787799835,
 0.014186515472829342]


In [8]:
# construct vector store
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="affidavit_letter", embedding_function=embeddding_model, persist_directory="./.langchain_chroma_db"
)

# The collection is persisted on disk, so adding the chunks without explicit ids
# would store a fresh copy of every chunk each time this cell is re-run.
# Deterministic ids built from each chunk's metadata and position let a re-run
# overwrite the existing entries instead of duplicating them.
split_ids = [
    "{source}:{page}:{start}:{position}".format(
        source=split.metadata.get("source", "unknown"),
        page=split.metadata.get("page", 0),
        start=split.metadata.get("start_index", -1),
        position=position,
    )
    for position, split in enumerate(splits)
]
ids = vector_store.add_documents(documents=splits, ids=split_ids)

In [15]:
# Query the store; each hit is a (document, score) pair.
# NOTE(review): scores in the recorded output increase down the list, which
# suggests a distance (lower = closer); confirm against the vector store docs.
results = vector_store.similarity_search_with_score("Name of husband")
for hit_doc, hit_score in results:
    print(hit_score, repr(hit_doc.page_content))

1.2274361848831177 'AFFIDAVIT  FOR  CHANGE  OF  NAME  AFTER  MARRIAGE   I,  Thuropatha,  daughter  of  Mr.  K.  Somanathan  and  wife  of  Mr.  K.  Rajadurai  aged  60  \nresiding\n \nat\n \n65/1A,\n \nThisaweerasingam\n \nSquare\n \nWest,\n \nBatticaloa,\n \ndo\n \nhereby\n \nsolemnly\n \naffirm\n \nand\n \ndeclare\n \nas\n \nunder:\n  1.  That  my  maiden  name  is  Miss.  Somanathan  Thuropatha.  2.  That  I  got  married  to  Mr.  K.  Rajadurai  on  21/10/1998  at  Palugamam  2.  3.  After  marriage,  my  name  is  Mrs.  Thuropatha  Rajadurai.  4.  I  state  that  Miss.  Somanathan  Thruopatha  and  Mrs.  Thruropatha  Rajadurai  are  \nthe\n \nnames\n \nof\n \none\n \nand\n \nthe\n \nsame\n \nperson,\n \nand\n \nthat\n \nis\n \nmyself.\n  I’m  executing  this  declaration  to  be  submitted  to  the  concerned  authorities  for  the  \nchange\n \nof\n \nmy\n \nname\n \nin\n \nthe\n \nmembership\n \ndatabase.'
1.4031236171722412 'person,\n \nand\n \nthat\n \nis\n \nmyself.\n  I’m  e