# **Vector Database Pinecone:**

## **Install Pinecone:**

In [None]:
!pip install "pinecone[grpc]" -qU

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.3/1.3 MB[0m [31m53.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m419.8/419.8 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following depe

## **Create Index (Serverless):**

In [None]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

import os

INDEX_NAME = "test"

# Read the key from the environment so that no secret is stored in the notebook.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Creating an index that already exists is rejected by Pinecone, so only create
# it when it is missing. This keeps the cell safe under Restart & Run All.
if INDEX_NAME not in pc.list_indexes().names():
  pc.create_index(
    name=INDEX_NAME,
    dimension=1536,  # must match the length of the embedding vectors stored later
    metric="cosine",
    spec=ServerlessSpec(
      cloud="aws",
      region="us-east-1"
    ),
    deletion_protection="disabled" # "enabled" blocks deletion of the index; "disabled" allows it.
  )

## **Load the Documents:**

In [5]:
!pip install langchain -qU
!pip install langchain_community -qU
!pip install unstructured[pdf] -qU
!pip install pypdfium2 -qU

### **Load 2 PDFs:**

In [None]:
from langchain_community.document_loaders import PDFPlumberLoader, PyPDFium2Loader
from pathlib import Path
from IPython.display import display, Markdown

# load the pdf documents:

def load_pdf(file_path:Path):
  """Load a PDF with PDFPlumberLoader and return its documents with their count.

  Args:
    file_path: Location of the PDF file on disk.

  Returns:
    A ``(documents, count)`` tuple: the list produced by ``loader.load()`` and
    its length.

  Raises:
    FileNotFoundError: If ``file_path`` does not point to an existing file.
    RuntimeError: If the loader fails; the underlying exception is chained.
  """
  pdf_path = Path(file_path)
  if not pdf_path.is_file():
    raise FileNotFoundError(f"PDF not found: {pdf_path}")

  try:
    # Hand the loader a plain string path, which every loader version accepts.
    documents = PDFPlumberLoader(str(pdf_path)).load()
  except Exception as ex:
    # Returning the exception object would only fail later, at the caller's
    # `[0]` lookup, with an unrelated TypeError. Raise here instead.
    raise RuntimeError(f"Failed to load PDF '{pdf_path}'") from ex

  return documents, len(documents)

In [None]:
# load_pdf returns (documents, count); only the documents are kept.
transformer_docs = load_pdf(Path("Research Paper Transformer.pdf"))[0]

In [None]:
# load_pdf returns (documents, count); only the documents are kept.
seq2seq_docs = load_pdf(Path("Research Paper Sequence to Sequence Learning.pdf"))[0]

### **Pre-process the Docs:**

In [None]:
import re

def clean_text(text):
    """Return ``text`` with whitespace runs collapsed to one space and both ends trimmed."""
    # First pass turns newline/tab/carriage-return runs into a space,
    # second pass squeezes any remaining whitespace run.
    for pattern in (r'[\n\t\r]+', r'\s+'):
        text = re.sub(pattern, ' ', text)
    return text.strip()


def preprocess_documents(documents, subject):
    """Convert loaded documents into flat dicts of cleaned text plus selected metadata.

    Args:
        documents: Iterable of objects exposing ``page_content`` and a
            ``metadata`` mapping.
        subject: Label stored under the "subject" key of every record.

    Returns:
        A list with one dict per document, holding the keys "subject",
        "source", "page", "total_pages" and "content".
    """
    def _to_record(doc):
        # Clean the text first, then pull the metadata fields (with defaults).
        content = clean_text(doc.page_content)
        meta = doc.metadata
        return {
            "subject": subject,
            "source": meta.get('source', ''),
            "page": meta.get('page', 0),
            "total_pages": meta.get('total_pages', 0),
            "content": content,
        }

    return [_to_record(doc) for doc in documents]

In [None]:
# Tag each paper's pages with its subject while cleaning the text.
transformer_pre_processed_docs = preprocess_documents(
    documents=transformer_docs,
    subject="Transformer",
)
seq2seq_pre_processed_docs = preprocess_documents(
    documents=seq2seq_docs,
    subject="Sequence to Sequence Learning",
)

In [None]:
# Preview only the first records; printing every page floods the notebook output.
seq2seq_pre_processed_docs[:2]

## **Load Embedding Model:**

In [1]:
!pip install boto3 -qU

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/139.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/12.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/12.6 MB[0m [31m130.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m10.1/12.6 MB[0m [31m141.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m12.6/12.6 MB[0m [31m155.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.6/12.6 MB[0m [31m87.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/82.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   

In [6]:
import boto3
from langchain.llms.bedrock import Bedrock
from langchain.embeddings import BedrockEmbeddings

import os

# AWS settings are taken from the environment so that no credential is saved
# with the notebook. Each one falls back to an empty string when unset.
AWS_REGION = os.environ.get("AWS_DEFAULT_REGION", '')
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", '')
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", '')

def get_embeddings():
  """Create the Bedrock embeddings object used throughout the notebook.

  Returns:
    A BedrockEmbeddings instance for model "amazon.titan-embed-text-v1",
    bound to a boto3 "bedrock-runtime" client.

  Raises:
    RuntimeError: If the boto3 client or the embeddings object cannot be
      created; the underlying exception is chained.
  """
  try:
    # Blank placeholders are passed as None so that boto3 resolves the region
    # and credentials from its default chain (environment, shared config,
    # instance role) instead of trying to use empty strings.
    bedrock_client = boto3.client(
          service_name = "bedrock-runtime",
          region_name = AWS_REGION or None,
          aws_access_key_id = AWS_ACCESS_KEY or None,
          aws_secret_access_key = AWS_SECRET_KEY or None,
      )
    # Titan text-embedding model id.
    return BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)

  except Exception as ex:
    # Returning the exception would let the failure resurface later as an
    # AttributeError on `embed_query`. Fail here with the real cause attached.
    raise RuntimeError("Could not initialise the Bedrock embedding model") from ex

embedding_model = get_embeddings()

  bedrock_embedding = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)


In [None]:
# Sanity-check the embedding model before indexing anything: the vector length
# has to equal the dimension the Pinecone index was created with (1536).
hh = embedding_model.embed_query("Hello Wrold")
assert len(hh) == 1536, f"Expected 1536-dimensional embeddings, got {len(hh)}"
len(hh)

## **Store Embedding & Metadata into Pinecone:**

In [None]:
from pinecone.grpc import PineconeGRPC as Pinecone
import os

# Take the key from the environment rather than embedding it in the notebook.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("test")

In [None]:
# Store embeddings;

def store_embeddings(docs, namespace, embeddings=embedding_model, batch_size=100):
  """Embed pre-processed documents and upsert them into the Pinecone index.

  Args:
    docs: Sequence of dicts holding the keys "subject", "source", "page",
      "total_pages" and "content".
    namespace: Pinecone namespace that receives the vectors.
    embeddings: Object exposing ``embed_query(text)``; defaults to the
      notebook's embedding model.
    batch_size: Maximum number of vectors sent per upsert request.

  Raises:
    ValueError: If ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be a positive integer")

  vector_list = []
  for i, doc in enumerate(docs):
    vector_list.append({
        # The id combines the subject with the document's position in `docs`.
        'id': doc["subject"] + "_" + str(i),
        'values': embeddings.embed_query(doc["content"]),
        # The cleaned text is stored as metadata so it comes back with query matches.
        'metadata': {
            "subject": doc["subject"],
            "source": doc["source"],
            "page": doc["page"],
            "total_pages": doc["total_pages"],
            'content': doc['content']
        }
    })

  # Send bounded batches so a large document set does not go out as a single
  # oversized request. With no documents, no request is made at all.
  for start in range(0, len(vector_list), batch_size):
    index.upsert(vectors=vector_list[start:start + batch_size], namespace=namespace)



# Index each paper under its own namespace so the two can be queried separately.
store_embeddings(docs=transformer_pre_processed_docs, namespace="Transformer")
store_embeddings(docs=seq2seq_pre_processed_docs, namespace="Seq2Seq")

## **Create Retriever (Using LangChain):**

In [7]:
!pip install pinecone-client langchain -qU

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [14]:
# Configure Pinecone:

import os
from pinecone import Pinecone

# Initialize the Pinecone client
# The key is read from the PINECONE_API_KEY environment variable instead of
# being written into the notebook. The `environment` argument belonged to the
# legacy client setup and is not needed to open an index by name.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])


index = pc.Index("test")
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'Seq2Seq': {'vector_count': 9},
                'Transformer': {'vector_count': 58}},
 'total_vector_count': 67}

In [16]:
# Create retriever:

def create_retriever(query, namespace, top_k=5):
  """Embed ``query`` and return the closest matches from one namespace of the index.

  Args:
    query: Text to search for.
    namespace: Pinecone namespace to search in.
    top_k: Number of matches requested.

  Returns:
    The 'matches' entry of the query response, metadata included.
  """
  response = index.query(
      vector=embedding_model.embed_query(query),
      top_k=top_k,
      namespace=namespace,
      include_metadata=True,
  )
  return response['matches']


# Smoke test: fetch the single closest match for the query from the "Transformer" namespace.
create_retriever(query='what is Transformer', namespace='Transformer', top_k=1)

[{'id': 'Transformer_12',
  'metadata': {'content': 'Figure4: '
                          'ProportionoftransformerapplicationinTop-5fields '
                          'analysisidentifiedseveralhighlyimpactfulandsignificanttransformer-basedmodelsthathavebeensuccessfullyappliedina '
                          'varietyoffields. '
                          'Wethenorganizedthesemodelsintofivedifferentapplicationareas: '
                          'NaturalLanguageProcessing(NLP), '
                          'ComputerVision,Multi-modality,AudioandSpeech,andSignalProcessing. '
                          'TheproposedtaxonomyinFigure5providesa '
                          'morenuancedandcomprehensiveframeworkforunderstandingthediverseapplicationsoftransformers. '
                          'Webelievethatthis '
                          'taxonomywouldbebeneficialforresearchersandpractitionersworkingontransformer-basedmodels,asitwouldhelpthem '
                          'toidentifythemostrelevantmodels

## **Integrate with LLM:**

### **Create 2 Separate Retrievers:**

In [30]:
import os

# Keep a key that is already exported; only fall back to the placeholder when
# the variable is missing, so a real key is never overwritten.
os.environ.setdefault("PINECONE_API_KEY", "pinecone-api-key")

In [33]:
from langchain.vectorstores import Pinecone as PineconeVectorStore

def _vectorstore_for(namespace):
    """Open the existing 'test' index as a vector store limited to ``namespace``."""
    return PineconeVectorStore.from_existing_index(
        index_name='test',
        embedding=embedding_model,
        text_key="content",  # metadata field that holds the page text
        namespace=namespace,
    )


# One store per paper, matching the namespaces used when the vectors were upserted.
vectorstore_transformer = _vectorstore_for("Transformer")
vectorstore_seq2seq = _vectorstore_for("Seq2Seq")

In [None]:
# Quick check of the LangChain store: the two nearest chunks for the query.
vectorstore_transformer.similarity_search("what is Transformer", k=2)

### **LLM Generation:**

In [35]:
!pip install langchain_google_genai -qU

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/160.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.8/160.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/760.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m760.0/760.0 kB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [38]:
from google.colab import userdata
from langchain_google_genai import ChatGoogleGenerativeAI


# The Gemini key is fetched through google.colab's `userdata` rather than being
# written into the notebook.
GEMINI_API_KEY = userdata.get("GEMINI_API_KEY")

# Chat model used for answer generation.
# NOTE(review): both `max_tokens` and `max_length` are passed; confirm which of
# them ChatGoogleGenerativeAI actually uses to cap the response length.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    google_api_key=GEMINI_API_KEY,
    temperature=0.5,
    max_tokens=1024,
    max_length=1024,
)

In [39]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from IPython.display import display, Markdown


prompt = """
You are an AI-powered virtual assistant, your name is 'Dibyendu', designed by Dibyendu Biswas, who is an AI Engineer.
Your task is to answer based on user's query in detailed way.
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that 'I don't have enough information to answer this question'.
Provide only the helpful answer. Do not include any other information.

Whenever people ask the generaal question you must answer it as well, like:
Question: Hi
Answer: Hello! How can I assist you with your studies today?

Question: What is your name?
Answer: I am Dibyendu, your virtual assistant designed by Dibyendu Biswas, an AI Engineer.

Context: `{context}`
Question: `{question}`
"""

# Wrap the raw prompt text, declaring its two placeholders: context and question.
prompt_template = PromptTemplate(input_variables=['context', 'question'], template=prompt)


def generation(vectorstore, query, llm=llm):
  """Answer ``query`` from ``vectorstore`` with a RetrievalQA chain and render the answer.

  Args:
    vectorstore: Vector store whose ``as_retriever()`` supplies the context.
    query: The user's question.
    llm: Chat model that writes the answer; defaults to the notebook's ``llm``.

  Returns:
    Whatever ``display`` returns for the rendered Markdown answer.
  """
  chain = RetrievalQA.from_chain_type(
      llm=llm,
      chain_type="stuff",
      retriever=vectorstore.as_retriever(),
      chain_type_kwargs={"prompt": prompt_template},
  )

  # The chain's output mapping carries the answer text under 'result'.
  answer = chain.invoke(query)['result']
  return display(Markdown(answer))

In [41]:
# Ask about Transformers, using the Transformer vector store for context.
generation(vectorstore=vectorstore_transformer, query="Tell me something about Transformer")

Transformers are a type of deep neural network (DNNs) that offer a solution to the limitations of sequence-to-sequence (seq-2-seq) architectures, including short-term dependency of sequence inputs and the sequential processing of input, which hinders parallel training of networks. Transformers leverage the multi-head self-attention mechanism to extract features, and they exhibit great potential for application in NLP.

In [43]:
# Ask about sequence-to-sequence learning, using the Seq2Seq vector store for context.
generation(vectorstore=vectorstore_seq2seq, query="Tell me something about Sequence to Sequence Learning")

Sequence to Sequence Learning is a general end-to-end approach to sequence learning that makes minimal assumptions on the sequence structure. It uses a multilayered Long Short-Term Memory (LSTM) to map the input sequence to a vector of a fixed dimensionality, and then another deep LSTM to decode the target sequence from the vector.

In [46]:
# A broader question against the Seq2Seq vector store.
generation(vectorstore=vectorstore_seq2seq, query="What kind of information you have about Sequence to Sequence Learning")

I have the following information about Sequence to Sequence Learning:

1. Sequence to Sequence Learning is a type of machine learning task that involves learning to map an input sequence to an output sequence.

2. The input and output sequences can be of different lengths, and the mapping can be complex and non-monotonic.

3. Sequence to Sequence Learning is used in a variety of applications, including machine translation, speech recognition, and question answering.

4. The most common approach to Sequence to Sequence Learning is to use a recurrent neural network (RNN), such as a Long Short-Term Memory (LSTM) network.

5. RNNs are able to learn long-term dependencies between elements in the input and output sequences.

6. Sequence to Sequence Learning is a challenging task, but it has made significant progress in recent years.

7. The best performing Sequence to Sequence models are now able to achieve state-of-the-art results on a variety of tasks.