### Import libraries

In [1]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage 


  from tqdm.autonotebook import tqdm


#### Load the Pinecone API key

In [2]:
# Let python-dotenv populate the environment, then read the Pinecone key from it.
# The value is None when the variable is not set.
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

### Load Documents


In [3]:
directory = 'Data'  # this folder has the pdf file


def load_docs(directory):
    """Load the PDF content found in ``directory``.

    Args:
        directory: Path of the folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader.load()`` returns for that folder; the
        cell below reports its length as the number of pages loaded.
    """
    return PyPDFDirectoryLoader(directory).load()


documents = load_docs(directory)
print(f"Number of pages loaded: {len(documents)}")

Number of pages loaded: 483


### Split documents into smaller chunks


In [4]:
def split_docs(documents, chunk=500, chunk_overlap=40):
    """Break the loaded documents into smaller chunks.

    Args:
        documents: Documents to split (the output of ``load_docs``).
        chunk: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)


docs = split_docs(documents)
print(f"Number of document chunks: {len(docs)}")

Number of document chunks: 5826


In [5]:
# Step 3: Hugging Face embeddings.
# The model is run on CPU; the Pinecone index in this notebook is created with
# dimension=384, so the vectors produced here must have that length.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cpu'},
)



#### Initialize Pinecone, create the index if needed, and connect to it


In [6]:
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "my-index"

# list_indexes() yields index description objects rather than plain strings, so
# testing `index_name not in pc.list_indexes()` is always true and create_index
# would be attempted again (and fail) when the index already exists.
# Compare against the list of index names instead so this cell can be re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the length of the embedding vectors
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Connect to the index
index = pc.Index(index_name)

print(f"Successfully connected to Pinecone index: {index_name}")

Successfully connected to Pinecone index: my-index


#### Embed and Upsert document chunks to Pinecone


In [7]:
# Embed and upsert in batches: one embedding call and one Pinecone request per
# batch, instead of one of each for every single chunk.
batch_size = 100

# NOTE(review): ids are positional ("doc_<n>"), so vectors written by an earlier
# run that produced more chunks are overwritten only up to len(docs) and the
# rest stay in the index. The stats cell reporting more vectors than chunks
# suggests this has happened -- confirm and clear the index if needed.
for start in range(0, len(docs), batch_size):
    batch = docs[start:start + batch_size]
    vectors = embeddings.embed_documents([doc.page_content for doc in batch])

    index.upsert([
        (
            f"doc_{start + offset}",  # same id scheme as a per-chunk enumerate()
            vector,
            {"source": doc.metadata["source"], "page": doc.metadata["page"]},
        )
        for offset, (doc, vector) in enumerate(zip(batch, vectors))
    ])

print("All document chunks have been embedded and upserted to Pinecone.")

All document chunks have been embedded and upserted to Pinecone.


In [8]:
# Display the index statistics (dimension, per-namespace and total vector
# counts) as a check after the upsert.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5903}},
 'total_vector_count': 5903}