In [1]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_openai import OpenAIEmbeddings

import pinecone
from pinecone import ServerlessSpec
import time

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# langchain looks up OPENAI_API_KEY in the environment automatically, so nothing
# needs to be re-assigned here. Fail early with a readable message instead of the
# opaque "TypeError: str expected, not NoneType" that assigning a missing key
# back into os.environ would raise.
if not os.getenv("OPENAI_API_KEY"):
    raise EnvironmentError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [2]:
# Read the source PDF from the assets folder into langchain documents.
pdf_path = './assets/srp-covid-19-6month.pdf'
loader = PyPDFLoader(pdf_path)
pdf = loader.load()

In [3]:
# Break the loaded PDF into overlapping text chunks suitable for embedding.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk length and the overlap shared by neighbouring chunks, passed to the
# splitter as chunk_size / chunk_overlap.
chunk_size = 1000
chunk_overlap = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
documents = text_splitter.split_documents(pdf)

In [45]:
# Vector Embedding and Querying with ChromaDB
from langchain_community.vectorstores import Chroma

# Embed every chunk with OpenAI and store the vectors in a Chroma collection.
db = Chroma.from_documents(documents, embedding=OpenAIEmbeddings())

# Ask the vector database for the chunks most similar to the question.
query = "When was the first COVID case discovered in the united states?"
result = db.similarity_search(query=query)

In [46]:
# epub_loader = UnstructuredEPubLoader(
#     file_path="./RAG/assets/dokumen.pub_beginning-python-from-novice-to-professional-3rd-edition.epub", 
#     mode="elements", 
#     strategy="fast")


In [47]:
# Vector Embedding and Querying with Pinecone

# Set up the Pinecone client; the API key is read from the environment.
pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the index only if it does not exist yet, so re-running this cell is safe.
index_name = "rag-project"
existing_index_names = [existing.name for existing in pc.list_indexes()]
if index_name not in existing_index_names:
    pc.create_index(
        name=index_name,
        dimension=1536,  # Must match ada-002 output
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Index creation is asynchronous: poll its status instead of sleeping a fixed
# amount of time. The check also runs when the index already existed, which
# covers a re-run while a freshly created index is still initializing.
ready_timeout_seconds = 120
ready_deadline = time.monotonic() + ready_timeout_seconds
while not pc.describe_index(index_name).status["ready"]:
    if time.monotonic() > ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after "
            f"{ready_timeout_seconds} seconds."
        )
    time.sleep(1)

# Connect to the index
index = pc.Index(index_name)


In [None]:
# Embedding model: OpenAI "text-embedding-ada-002".
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Pull the raw text out of every chunk, then embed all texts in one call.
docs = [chunk.page_content for chunk in documents]
embeddings = embed_model.embed_documents(texts=docs)

In [7]:
# Prepare data for insertion: one (id, values, metadata) tuple per chunk.
# The id is the chunk's position, so re-running the cell overwrites the same
# records rather than duplicating them.
if len(embeddings) != len(docs):
    raise ValueError(
        f"Expected one embedding per chunk, got {len(embeddings)} embeddings "
        f"for {len(docs)} chunks."
    )
vectors = [
    (str(i), embedding, {"text": text})
    for i, (embedding, text) in enumerate(zip(embeddings, docs))
]

# Insert into Pinecone in batches: a single request carrying every vector can
# exceed the service's per-request size/record limits as the corpus grows.
upsert_batch_size = 100
index.upsert(vectors=vectors, namespace="all-users", batch_size=upsert_batch_size)

{'upserted_count': 202}