# Vector Database | PineconeDB

Installations

In [None]:
!pip install pinecone

In [None]:
!pip install pypdf

In [None]:
!pip install langchain-community langchain-mistralai

In [None]:
!pip install langchain-pinecone

In [None]:
!pip install langchain_pinecone

In [None]:
!pip install --upgrade langchain_community

Imports

In [55]:
import os
import time
from typing import List
from uuid import uuid4
from google.colab import userdata
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_core.documents import Document
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

Setup Variables

In [7]:
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
MISTRAL_API_KEY = userdata.get('MISTRAL_API_KEY')

## By Using LangChain

In [18]:
loader = PyPDFLoader("/content/data/Accenture-Terms-Conditions-2022.pdf")
pages = loader.load()
len(pages)

25

In [21]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

In [34]:
docs = text_splitter.split_documents(pages)
len(docs)

156

In [19]:
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = 'test-index'

In [None]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

*It may take some time as model will be downloaded locally*

In [27]:
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
existing_indexes

['articles-embeddings', 'test-index']

In [28]:
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=768,  # sentence-transformer embedding dimension
        metric='cosine',
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

In [32]:
index = pc.Index(index_name)
index.config

Config(api_key='pcsk_NutMZ_Qg9vdjC3XdEph1zW8bavyLn2Um8GboUH688bXph3X8MDGYXUeHBWoZdC66gmMYs', host='https://test-index-ci7scyq.svc.aped-4627-b74a.pinecone.io', proxy_url=None, proxy_headers=None, ssl_ca_certs=None, ssl_verify=None, additional_headers={}, source_tag=None)

In [52]:
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(index=index, embedding=embeddings)

ModuleNotFoundError: No module named 'langchain_pinecone'

In [None]:
uuids = [str(uuid4()) for _ in range(len(docs))]


In [35]:
pc.from_documents(
            docs,
            embeddings,
            index_name=index_name
        )

AttributeError: from_documents is not a top-level attribute of the Pinecone class provided by pinecone's official python package developed at https://github.com/pinecone-io/pinecone-python-client. You may have a name collision with an export from another dependency in your project that wraps Pinecone functionality and exports a similarly named class. Please refer to the following knowledge base article for more information: https://docs.pinecone.io/troubleshooting/pinecone-attribute-errors-with-langchain


In [None]:
class PDFPineconeQA:
    def __init__(self, mistral_api_key: str, pinecone_api_key: str, index_name: str):
        """
        Initialize the PDF to Pinecone QA system

        Args:
            mistral_api_key (str): Mistral AI API key
            pinecone_api_key (str): Pinecone API key
            index_name (str): Name of the Pinecone index to use
        """
        self.mistral_api_key = mistral_api_key
        self.pinecone_api_key = pinecone_api_key
        self.index_name = index_name

        # Initialize embeddings using sentence-transformers
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        )

        # Initialize Pinecone
        self.pinecone = Pinecone(api_key=pinecone_api_key)

        # Check if index exists, if not create it
        if index_name not in self.pinecone.list_indexes():
            self.pinecone.create_index(
                name=index_name,
                dimension=768,  # sentence-transformer embedding dimension
                metric='cosine'
            )

        self.index = self.pinecone.Index(index_name)

    def load_pdf(self, pdf_path: str) -> List:
        """
        Load PDF and split into documents

        Args:
            pdf_path (str): Path to PDF file

        Returns:
            List: List of document chunks
        """
        # Load PDF
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        # Split into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100,
            length_function=len
        )

        return text_splitter.split_documents(pages)

    def upload_to_pinecone(self, documents: List) -> None:
        """
        Upload documents to Pinecone

        Args:
            documents (List): List of document chunks to upload
        """
        # Create vector store
        Pinecone.from_documents(
            documents,
            self.embeddings,
            index_name=self.index_name
        )

    def query_documents(self, query: str, k: int = 3) -> List:
        """
        Query documents from Pinecone

        Args:
            query (str): Query string
            k (int): Number of documents to retrieve

        Returns:
            List: List of relevant documents
        """
        # Create vector store for querying
        vectorstore = Pinecone.from_existing_index(
            index_name=self.index_name,
            embedding=self.embeddings
        )

        # Create retrieval chain with Mistral
        qa_chain = RetrievalQA.from_chain_type(
            llm=ChatMistralAI(
                mistral_api_key=self.mistral_api_key,
                model="mistral-tiny"  # or "mistral-small" or "mistral-medium"
            ),
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": k})
        )

        return qa_chain.run(query)

# Example usage
def main():
    # Initialize environment variables
    mistral_api_key = os.getenv("MISTRAL_API_KEY")
    pinecone_api_key = os.getenv("PINECONE_API_KEY")

    # Create QA system
    qa_system = PDFPineconeQA(
        mistral_api_key=mistral_api_key,
        pinecone_api_key=pinecone_api_key,
        index_name="pdf-qa-index"
    )

    # Load and process PDF
    pdf_path = "path/to/your/pdf"
    documents = qa_system.load_pdf(pdf_path)

    # Upload to Pinecone
    qa_system.upload_to_pinecone(documents)

    # Query documents
    query = "What is the main topic of the document?"
    result = qa_system.query_documents(query)
    print(f"Query: {query}")
    print(f"Answer: {result}")

if __name__ == "__main__":
    main()