

# Just before starting 

A complete, step-by-step explanation of how this notebook works is available here: https://medium.com/@muhammad.bese23seecs/building-a-rag-powered-pinecone-database-using-ocr-a-practical-guide-with-pakistani-law-d83e869e1458

## Issues and Considerations

This notebook requires a few installations before it will run. The first is Tesseract:

For windows: https://stackoverflow.com/questions/46140485/tesseract-installation-in-windows

For Linux: run `sudo apt-get install tesseract-ocr` on the command line

For Mac: https://www.oreilly.com/library/view/building-computer-vision/9781838644673/95de5b35-436b-4668-8ca2-44970a6e2924.xhtml


Next, install pytesseract in your environment:

For uv: uv pip install pytesseract

For pip: pip install pytesseract


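You will need an OpenAI API key and a Pinecone API key in your .env file. The code reads the Pinecone key from the `PINECONE_API_KEY` environment variable (and prompts for it if it is not set), so make sure the values in your .env file are loaded into the environment before running the cells.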


In [134]:
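# Import the required libraries (LangChain, Pinecone and unstructured must already be installed).
# Note: `Pinecone` is imported twice below; the later import from the `pinecone` package is the one in effect.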

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import getpass
from pinecone import Pinecone
import os
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from unstructured.partition.pdf import partition_pdf
import glob

In [135]:
# Initialize the OpenAI embedding model used to vectorize the documents.
# OpenAIEmbeddings picks its key up from the OPENAI_API_KEY environment variable,
# so prompt for it when it is missing (the same pattern used for the Pinecone key).
# NOTE: the Pinecone index is created with dimension=1536, so the embedding model
# must produce 1536-dimensional vectors.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
embeddings = OpenAIEmbeddings()

In [136]:

# Pinecone client: use the API key from the environment, prompting for it
# (and storing it back into the environment) when it is unset or empty.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    pinecone_api_key = getpass.getpass("Enter your Pinecone API key: ")
    os.environ["PINECONE_API_KEY"] = pinecone_api_key
pc = Pinecone(api_key=pinecone_api_key)

In [137]:
# Create the serverless index the first time the notebook runs; later runs reuse it.
index_name = "lahore-cases" # Replace the name with anything you like 
index_already_exists = pc.has_index(index_name)
if not index_already_exists:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(name=index_name, dimension=1536, metric="cosine", spec=serverless_spec)


In [138]:
# Open a handle to the Pinecone index and wrap it in a LangChain vector store
# that embeds documents with `embeddings`.
index = pc.Index(index_name)
vector_store = PineconeVectorStore(embedding=embeddings, index=index)


In [None]:
# Collect every PDF below the current directory; recursive=True lets "**" descend into subdirectories too.
pdf_pattern = './**/*.pdf'
files = glob.glob(pdf_pattern, recursive=True)
print(len(files)) # confirm that you have all of the pdfs here with the correct path

In [140]:
def _split_into_sections(elements):
    """Group the parsed elements of ONE file into sections.

    A section starts at each "ListItem" element (treated as the section
    heading) and continues through the "NarrativeText" elements that follow
    it. Elements of any other category are ignored.

    Returns a list of dicts with keys "page_content" (the section text) and
    "metadata" (the metadata of the element that opened the section).
    """
    sections = []
    parts = []        # text pieces of the section currently being built
    metadata = None   # metadata of the element that opened the current section

    def _flush():
        # Store the section built so far; skip it when it has no text at all,
        # so no empty chunk is produced before the first heading.
        if parts:
            sections.append({"page_content": " ".join(parts), "metadata": metadata})

    for element in elements:
        if element.category == "ListItem":
            _flush()  # a new section has started, so close the previous one
            parts = [element.text]  # the section starts with its own heading text
            metadata = element.metadata
        elif element.category == "NarrativeText":  # plain text belonging to the current section
            if metadata is None:
                # Text that appears before the first heading opens a section of its own.
                metadata = element.metadata
            parts.append(element.text)
    _flush()  # the last section of the file has no following heading to close it
    return sections


chunks = [] # The list to store the sections of every file in
# `file_number` is used instead of `index` so the Pinecone index handle is not overwritten.
for file_number, file_path in enumerate(files, start=1):
    elements = partition_pdf(file_path, languages=["eng"], strategy="fast")
    # Sections are built per file, so text never leaks from one file into the next
    # and every chunk carries the metadata (filename) of the file it came from.
    chunks.extend(_split_into_sections(elements))
    print(f"File Number {file_number} completed:",file_path) # To keep track of files


In [None]:
# Number of distinct source files represented in `chunks`; compare it with the file count printed earlier.
len({chunk['metadata'].filename for chunk in chunks})

In [None]:
chunks # Display the collected chunks to inspect what each one looks like

In [143]:
from langchain_core.documents import Document
# Wrap each chunk in a LangChain Document (the form passed to the vector store),
# keeping only the source filename as metadata.
docs = []
for chunk in chunks:
    source_file = chunk['metadata'].filename
    docs.append(Document(page_content=chunk['page_content'], metadata={"source": source_file}))

In [None]:
# Print the text of every document for a quick visual check
for doc in docs:
    text = doc.page_content
    print(text)

In [None]:
# Upload all of the docs to the Pinecone index in fixed-size batches,
# giving every document its own random UUID as its id.
from uuid import uuid4

batch_size = 200
uuids = [str(uuid4()) for _ in docs]
for i in range(0, len(docs), batch_size):
    print("Current Batch Index is:",i)
    stop = i + batch_size
    batch, batch_ids = docs[i:stop], uuids[i:stop]
    vector_store.add_documents(batch, ids=batch_ids)

In [147]:
# Retrieve the 10 stored sections most similar to a sample question
query = "Which act contains the words 'nothing from this act can be removed from the railways .."
res = vector_store.similarity_search(query=query, k=10)

In [None]:
res # Display the documents returned by the similarity search