# Vector Database | PineconeDB

Installations

In [56]:
!pip install pinecone

In [None]:
!pip install pypdf

In [None]:
!pip install langchain-community langchain-mistralai

In [None]:
!pip install langchain-pinecone

In [None]:
!pip install langchain_pinecone

In [None]:
!pip install --upgrade langchain_community

Imports

In [67]:
import os
import time
from typing import List
from uuid import uuid4
from google.colab import userdata
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_core.documents import Document
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

Setup Variables

In [68]:
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
MISTRAL_API_KEY = userdata.get('MISTRAL_API_KEY')

## By Using LangChain

In [None]:
loader = PyPDFLoader("/content/data/Accenture-Terms-Conditions-2022.pdf")
pages = loader.load()
len(pages)

25

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

In [None]:
docs = text_splitter.split_documents(pages)
len(docs)

156

In [None]:
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = 'test-index'

In [None]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

*It may take some time as model will be downloaded locally*

In [None]:
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
existing_indexes

['articles-embeddings', 'test-index']

In [None]:
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=768,  # sentence-transformer embedding dimension
        metric='cosine',
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

In [None]:
index = pc.Index(index_name)
index.config

Config(api_key='pcsk_NutMZ_Qg9vdjC3XdEph1zW8bavyLn2Um8GboUH688bXph3X8MDGYXUeHBWoZdC66gmMYs', host='https://test-index-ci7scyq.svc.aped-4627-b74a.pinecone.io', proxy_url=None, proxy_headers=None, ssl_ca_certs=None, ssl_verify=None, additional_headers={}, source_tag=None)

In [57]:
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [58]:
uuids = [str(uuid4()) for _ in range(len(docs))]

In [59]:
vector_store.add_documents(documents=docs, ids=uuids)

['a7c47eaa-8318-4ea6-85b5-8c6120595db2',
 '9a4d4478-0673-4829-9bed-9383401363a2',
 '57affa9a-dad6-4ff4-a2d9-a0e777c8566b',
 '56e20fa5-27ee-4db5-877b-3f3b5eeee79a',
 '86dce88a-850e-4c4a-b640-d6b57129e8c9',
 '9e8b8b99-1685-4e71-956d-e07474018c67',
 'af1d0f60-8c1a-45bd-8bc0-fe1e74d32a8b',
 '9344fa70-e35a-4f6f-861f-88c2dd79589b',
 '2dbcfe66-6595-4b3a-bd37-fbe4a21d0cf4',
 'a1d2debb-7b0f-406d-81e3-32fe0e9bc1a9',
 '18973ec4-7c1d-415c-b778-775e84cdcd83',
 '3126feb9-9cfd-4a12-a7f7-a06282b7196b',
 '7066e95f-41f4-4652-9ff6-47351699e7eb',
 '6a6197f0-58ee-4ba9-bf91-c9f91e8b26ae',
 'ca943795-1a61-4ef7-9952-c109008e571c',
 'c962ea78-2b2e-4862-a205-89d1bbcc5d48',
 'f957eb18-712a-4a26-9511-d84ecfe95f2a',
 '8e0f306a-6e09-4ade-9c93-72f564d8c69d',
 'd3f547fd-4590-4a7f-9f78-885fa1237563',
 '7691bce6-aaa1-4e38-9d9e-6294da346899',
 '05b04f76-29e9-448b-a0e9-532feac6d93e',
 '7703e409-240c-4986-bfd5-91d8af08cfa4',
 '7db1ff78-5419-4cc8-bce0-28d4ab9ee0ac',
 'd2b29bdd-6360-44ca-9c53-7aa8943eb143',
 '99ba7703-6b25-

In [69]:
qa_chain = RetrievalQA.from_chain_type(
            llm=ChatMistralAI(
                mistral_api_key=MISTRAL_API_KEY,
                model="mistral-tiny"  # or "mistral-small" or "mistral-medium"
            ),
            chain_type="stuff",
            retriever=vector_store.as_retriever(search_kwargs={"k": 2})
        )

In [70]:
qa_chain.run("What is the main topic of the document?")

"The main topic of the document, based on the provided context, appears to be the amendment of the Terms and Conditions of an agreement. The amendments cover various sections, including governance laws, jurisdiction, intellectual property, export controls, and indemnification. However, without more specific context, it's difficult to pinpoint a single, overarching topic. The document seems to be focused on modifying the terms of a legal agreement between two parties."

In [72]:
results = vector_store.similarity_search_with_score(
    "Is there any confidential section?", k=2
)
for res, score in results:
    print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

* [SIM=0.467038] not disclose the Confidential Information to any third party. A Recipient will protect it by using the same degree of care, but no less 
than a reasonable degree of care, to prevent the unauthorized use, dissemination or publication as the Recipient uses to protect its 
own confidential information of a like nature. The Recipient may disclose the Confidential Information to its affiliates, agents and 
subcontractors with a need to know in order to fulfill the purpose of the Agreement, under a nondisclosure agreement at least as 
protective of the Discloser’s rights as this Agreement. [{'page': 3.0, 'source': '/content/data/Accenture-Terms-Conditions-2022.pdf'}]
* [SIM=0.393984] the Confidential Information as is required.     
8.4 Each party will retain all right, title and interest to such party’s Confidential Information.  The parties acknowledge that a 
violation of the Recipient’s obligations with respect to Confidential Information may cause irreparable harm to th

*Instead of using `PineconeVectorStore()` we can directly pass the vectors into database by upserting them.*