In [42]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain.embeddings.openai import OpenAIEmbeddings

import pinecone
from pinecone import ServerlessSpec
import time

load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY") #langchain searches for this key and loads it automatically

In [43]:
# Load PDF File
loader = PyPDFLoader('./assets/srp-covid-19-6month.pdf')
pdf = loader.load()

In [44]:
# Transform pdf texst into chunks and create document
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(pdf)

In [45]:
# Vector Embedding and Querying with ChromaDB
from langchain_community.vectorstores import Chroma
db = Chroma.from_documents(documents, OpenAIEmbeddings())

# Vector database
query = "When was the first COVID case discovered in the united states?" 
result = db.similarity_search(query)

In [46]:
# epub_loader = UnstructuredEPubLoader(
#     file_path="./RAG/assets/dokumen.pub_beginning-python-from-novice-to-professional-3rd-edition.epub", 
#     mode="elements", 
#     strategy="fast")


In [47]:
# Vector Embedding and Querying with Pinecone

#Setting up pinecone
pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

    
# Check if index exsits before creation
index_name = "rag-project"
if index_name not in [index.name for index in pc.list_indexes()]:
    pc.create_index(
        name=index_name, 
        dimension=1536, # Must match ada-002 output
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws", 
            region="us-east-1"
        ) 
    )
    time.sleep(5) # Wait for index to be ready
    
# Connect to the index
index = pc.Index(index_name)


In [48]:
# Create embeddings
from langchain.embeddings.openai import OpenAIEmbeddings

# Use "text-embedding-ada-002"
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Generate embeddings for documents
docs = [doc.page_content for doc in documents]
embeddings = embed_model.embed_documents(docs)

In [None]:
# Create The Retriever

# Wrap Pinecone index in a LangChain retriever
from langchain_pinecone import Pinecone
from langchain_community.retrievers import PineconeHybridSearchRetriever

# vector_store = pinecone.Pinecone.from_existing_index(
#     index_name=index_name,
#     embedding=embed_model, 
#     namespace="all-users"
# )

# retriever = vector_store.as_retriever(search_kwargs={"k": 3})  # Retrieve top 3 results
retriever = PineconeHybridSearchRetriever(
    index=index,
    embeddings=OpenAIEmbeddings(),  # Provide the embedding function for the query# Retrieve top 3 similar documents
    text_key="context",
)

ValidationError: 1 validation error for PineconeHybridSearchRetriever
search_kwargs
  Extra inputs are not permitted [type=extra_forbidden, input_value={'k': 3}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/extra_forbidden

In [7]:
# Prepare data for insertion
vectors = [(str(i), embeddings[i], {"text": docs[i]}) for i in range(len(docs))]

# Insert into Pinecone
index.upsert(vectors = vectors, namespace = "all-users")

{'upserted_count': 202}

In [10]:
# Define your query
query = "When was the first COVID case discovered in the United States?"

# Convert the query into an embedding using the same model as the documents
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
query_embedding = embeddings.embed_query(query)

# Search the Pinecone index
results = index.query(
    namespace="all-users",
    vector=query_embedding,
    top_k=3,
    include_values=False,
    include_metadata=True
)

#print(results)


#### Recievers

In [None]:
# Import Ollama LLM
from langchain_ollama import OllamaLLM

# Load Ollama LAMA2 LLM Model
llm = OllamaLLM(model='llama2 ')

# Design Chat Prompt Template
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_template("""
    Answer the following question based only on the provided context.
    Think step by step before providing a detailed answer.
    I will tip you $1000 if the user finds the answer helpful.
    <context>
        {context}
    </context>
    Question: {input}""")

# Chain Introduction

# Create Stuff Document Chain
from langchain.chains.combine_documents import create_stuff_documents_chain
 
document_chain = create_stuff_documents_chain(llm, prompt)

# Retrievers
from langchain.chains import create_retrieval_chain

retrieval_chain = create_retrieval_chain(retriever, document_chain)
retrieval_chain.invoke({"input": query})