In [27]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredEPubLoader

import pinecone
from pinecone import ServerlessSpec
import time

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# LangChain looks up OPENAI_API_KEY in the environment on its own, so the key
# does not need to be copied anywhere. Fail early with an actionable message
# when it is missing: the previous self-assignment
# `os.environ[...] = os.getenv(...)` raised an opaque
# "TypeError: str expected, not NoneType" in that case.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

In [4]:
# Read the source PDF from the local assets folder.
pdf_path = './RAG/assets/srp-covid-19-6month.pdf'
loader = PyPDFLoader(pdf_path)
# `pdf` is whatever PyPDFLoader.load() returns; it is handed to the text
# splitter in the next cell.
pdf = loader.load()

In [5]:
# Split the loaded PDF text into overlapping chunks ("documents") for embedding.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,  # characters shared between neighbouring chunks
    chunk_size=1000,    # upper bound on chunk length
)
documents = text_splitter.split_documents(pdf)

In [13]:
# Vector embedding and querying with ChromaDB
#
# OpenAIEmbeddings is imported in this cell because it is used here; it was
# previously only imported in a later cell, so running the notebook top to
# bottom on a fresh kernel failed with a NameError at this point.
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Embed every chunk and store the vectors in a Chroma vector store.
db = Chroma.from_documents(documents, OpenAIEmbeddings())

# Run a similarity search for the question against the Chroma store.
query = "When was the first COVID case discovered in the united states?"
result = db.similarity_search(query)

In [21]:
# epub_loader = UnstructuredEPubLoader(
#     file_path="./RAG/assets/dokumen.pub_beginning-python-from-novice-to-professional-3rd-edition.epub", 
#     mode="elements", 
#     strategy="fast")


In [35]:
# Vector embedding and querying with Pinecone

# Set up the Pinecone client; the API key is read from the environment.
pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the index only if it does not exist yet, so the cell can be re-run.
index_name = "rag-project"
existing_index_names = [existing.name for existing in pc.list_indexes()]
if index_name not in existing_index_names:
    pc.create_index(
        name=index_name,
        dimension=1536,  # Must match ada-002 output
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Wait until Pinecone reports the index as ready. A fixed sleep gives no
# guarantee that creation has finished; polling the reported status does.
# This also covers an index that a previous run created and that is still
# initialising. The timeout keeps the cell from hanging forever.
ready_deadline = time.monotonic() + 120
while not pc.describe_index(index_name).status["ready"]:
    if time.monotonic() > ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after 120 seconds"
        )
    time.sleep(1)

# Connect to the index
index = pc.Index(index_name)


In [36]:
# Build the embedding client and embed every chunk.
from langchain.embeddings.openai import OpenAIEmbeddings

# The model is named explicitly: "text-embedding-ada-002".
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Extract the raw text of each chunk, then embed all texts in one call.
# `embeddings` is indexed in parallel with `docs` in the next cell.
docs = list(map(lambda chunk: chunk.page_content, documents))
embeddings = embed_model.embed_documents(docs)

In [37]:
# Build one (id, vector, metadata) tuple per chunk. The id is the chunk's
# position as a string, and the chunk text is stored as metadata under "text".
vectors = [
    (str(position), embeddings[position], {"text": chunk_text})
    for position, chunk_text in enumerate(docs)
]

# Write all vectors to the "all-users" namespace. The call is the last
# expression so the upsert response is shown as the cell output.
index.upsert(namespace="all-users", vectors=vectors)

{'upserted_count': 202}

In [42]:
# Question to answer from the indexed chunks.
query = "When was the first COVID case discovered in the United States?"

# Embed the question with the same model name used for the documents.
# NOTE(review): this rebinds `embeddings`, which the embedding cell bound to
# the list of document vectors. Re-running the "Prepare data for insertion"
# cell after this one would index into the model object instead.
# Consider a distinct name here.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
query_embedding = embeddings.embed_query(query)

# Ask Pinecone for the three nearest chunks in the same namespace that was
# written to, returning the stored metadata but not the raw vector values.
search_kwargs = dict(
    vector=query_embedding,
    namespace="all-users",
    top_k=3,
    include_metadata=True,
    include_values=False,
)
results = index.query(**search_kwargs)

print(results)


{'matches': [{'id': '101',
              'metadata': {'text': 'represent WHO at the ad hoc Working Group '
                                   'for the COVID-19 response. The Regional '
                                   'office also regularly \n'
                                   'engages in deep-dive calls with WCOs to '
                                   'discuss in-depth epidemiological analysis, '
                                   'transmission scenarios \n'
                                   'and strategic priorities across the nine '
                                   'pillars.\n'
                                   'Region of the Americas\n'
                                   'The first case of COVID-19 in the Americas '
                                   'was confirmed in the USA on 20 \n'
                                   'January 2020, followed by Brazil on 26 '
                                   'February 2020. Since then, COVID-19 \n'
                                   'h