In [None]:
import hashlib
import os

from langchain_community.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader

# Notebook configuration: the PDF to ingest and the name of the Ollama chat
# model that the question-answering cell passes to ChatOllama.
pdfPath = "./data/file.pdf"
model = "llama3.2"

# Fail fast when the PDF is missing. The following cells need `data`, so merely
# printing a hint and carrying on would only resurface later as a NameError
# far away from the real cause. `isfile` also rejects a directory at that path.
if not os.path.isfile(pdfPath):
    raise FileNotFoundError(f"Upload a pdf file: {pdfPath!r} was not found")

loader = UnstructuredPDFLoader(file_path=pdfPath)
data = loader.load()

# Preview the first 100 characters of the first loaded document.
print(data[0].page_content[:100])

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings, named so they are easy to find and tune.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 300

# Break the loaded document(s) into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(data)

# Report how many chunks were produced and show the first one as a sample.
chunk_count = len(chunks)
first_chunk = chunks[0]
print(f"chunks: {chunk_count}, \nexample: {first_chunk}")

In [None]:
from pinecone import Pinecone, ServerlessSpec
from langchain_ollama import OllamaEmbeddings
from langchain_pinecone import PineconeVectorStore

# Pinecone client; the API key is read from the PINECONE_API_KEY environment
# variable (a missing variable raises KeyError here).
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index_name = "simple-rag"

# Collect the names of the indexes that already exist for this client.
existing_indexes = []
for index_info in pc.list_indexes():
    existing_indexes.append(index_info["name"])

# Create the serverless index only when it is not there yet, so re-running
# the cell does not attempt to create it a second time.
if index_name not in existing_indexes:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(name=index_name, dimension=768, spec=serverless_spec)

# Embedding model served by Ollama; used for both indexing and querying.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Probe embedding: confirms the Ollama model is reachable and that its vector
# width matches the Pinecone index before any chunk is uploaded. A mismatch
# would otherwise only show up as an opaque error from the upsert call.
vectors = embeddings.embed_documents(texts=["my name is chandan"])
index_dimension = pc.describe_index(index_name).dimension
if len(vectors[0]) != index_dimension:
    raise ValueError(
        f"Embedding dimension {len(vectors[0])} does not match "
        f"index '{index_name}' dimension {index_dimension}"
    )

# Deterministic ids (source path + chunk position) make this cell idempotent:
# re-running it overwrites the same vectors instead of adding duplicates under
# freshly generated random ids. Hashing keeps the ids short and ASCII-only.
chunk_ids = [
    hashlib.sha256(f"{pdfPath}:{position}".encode("utf-8")).hexdigest()
    for position in range(len(chunks))
]

# from_documents is a classmethod that embeds and upserts the chunks and
# returns the store connected to the index, so a single call is sufficient.
vector_db = PineconeVectorStore.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name=index_name,
    ids=chunk_ids,
)

In [None]:
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

# Chat model: used here to rewrite the question and later to answer it.
llm = ChatOllama(model=model)

# Instruction that asks the model for five rephrasings of the user question,
# one per line, to be issued as separate vector-store queries.
query_template = """You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}"""

QUERY_PROMPT = PromptTemplate(
    template=query_template,
    input_variables=["question"],
)

# Wrap the plain vector-store retriever so that every question is expanded
# into several queries by the chat model before searching.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)


# Answer prompt: the model must rely only on the retrieved context.
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt would contain the repr of a list of Document
    objects (metadata included) instead of the passages themselves.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The incoming question goes to the retriever (whose hits are flattened to
# text) and is also passed through unchanged to fill {question}.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
)

# Pass the question as a plain string. Wrapping it in a tuple made both the
# query-expansion prompt and the answer prompt receive "('...',)".
question = "what is the linked lists?"
res = chain.invoke(question)
print(res.content)