In [1]:
print("ok")

ok


In [2]:
%pwd

'/home/efulo/repos/ml/ragion/research'

In [3]:
import os
os.chdir("../")
%pwd

'/home/efulo/repos/ml/ragion'

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
#load_pdf_file function extracts data from the pdf file
def  load_pdf_file(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)

    documents = loader.load()

    return documents

In [6]:
extracted_data = load_pdf_file(data="data/")

In [7]:
# extracted_data

In [8]:
#text_split function splits the data into text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_data)

    return text_chunks

In [9]:
text_chunks = text_split(extracted_data)
print("length of text chunks", len(text_chunks))

length of text chunks 165


In [10]:
from langchain_huggingface import HuggingFaceEmbeddings

In [11]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [12]:
embeddings = download_hugging_face_embeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
query_result = embeddings.embed_query("hello world")
print("length", len(query_result))

length 384


In [14]:
# query_result

In [15]:
from dotenv import load_dotenv
load_dotenv()

True

In [16]:
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")


In [18]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "ragion"

pc.create_index(
    name=index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

{
    "name": "ragion",
    "metric": "cosine",
    "host": "ragion-unkq2vm.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [19]:
import os
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY


In [20]:
from langchain_pinecone import PineconeVectorStore

#embedding each chunk and upsert the embeddings into pinecone index
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings
)

In [21]:
#loading existing index

from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

In [22]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [23]:
retrieved_docs = retriever.invoke("what is the minimum clearance from road base material to SHW?")

In [25]:
retrieved_docs

[Document(id='3b10967c-ef48-4fbe-a1af-4ac233323b6b', metadata={'author': 'LEA', 'creationdate': '2015-05-26T13:06:23-04:00', 'creator': 'Microsoft® Word 2013', 'moddate': '2015-05-29T10:41:16-04:00', 'page': 16.0, 'page_label': '17', 'producer': 'Microsoft® Word 2013', 'source': 'data/Stormwater_Design_Standards.pdf', 'total_pages': 54.0}, page_content='2.4.2.  MI NIM UM  GRO U ND WA TE R AN D HIG H-WA TE R CL E A RA NC E S \nAll roadways shall be designed to provide a minimum of one (1) foot between the bottom of the base course of \nthe roadway and the seasonal high groundwater table. Roadside underdrains may be used in lieu of  meeting \nthis standard, provided that the underdrains will result in the seasonal high water table under the roadway being \nlowered to the above cited level.'),
 Document(id='a08b33de-9a5c-45db-8ee3-e0fe2682401a', metadata={'author': 'LEA', 'creationdate': '2015-05-26T13:06:23-04:00', 'creator': 'Microsoft® Word 2013', 'moddate': '2015-05-29T10:41:16-04:00'

In [24]:
from langchain_openai import OpenAI
from langchain_google_genai import GoogleGenerativeAI

# llm = OpenAI(temperature=0.4, max_tokens=500)
llm = GoogleGenerativeAI(model="models/gemini-2.0-flash", google_api_key=GOOGLE_API_KEY)

In [25]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you do not know the answer, say that you "
    " do not know. Directly state the code, manual and the data source for each answer and quote the text only. always state the page number and link to the source "
    "do not add anything extra."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [26]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [27]:
response = rag_chain.invoke({"input": "what is the minimum clearance from road base material to SHW?"})
print(response["answer"])

The minimum clearance from the bottom of the road base to the seasonal high groundwater table (SHW) is one foot.

"All roadways shall be designed to provide a minimum of one (1) foot between the bottom of the base course of the roadway and the seasonal high groundwater table." (Page 1)


In [59]:
response = rag_chain.invoke({"input": 'what is the minimum drainage measurement for a 24 inch pipe in a residential development?'})
print(response["answer"])

This question cannot be answered from the given source.


In [28]:
response = rag_chain.invoke({"input": 'what design storm is required for a local road if it is not within a floodplain?'})
print(response["answer"])

For local streets, bridges and culverts not in the published one hundred (100) year floodplain, the design storm shall be twenty-five (25) year frequency. (MANATEE COUNTY PUBLIC WORKS STANDARDS PART 2 - STORMWATER MANAGEMENT DESIGN MANUAL, [05/15] Page SW-4)
