<a href="https://colab.research.google.com/github/dietmarja/LLM-Elements/blob/main/vector_db/pinecone_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Query-Document Matching Using Pinecone  
Given a query, the code identifies the most similar pages of the indexed documents. To compute the embeddings the match is based on, the code uses the publicly available `sentence-transformers/all-MiniLM-L6-v2` model via Hugging Face's `transformers` library.

In [143]:
!pip qU install transformers pinecone-client torch langchain PyMuPDF

ERROR: unknown command "qU"


In [144]:
import fitz  # PyMuPDF
from langchain import LLMChain
from transformers import AutoModel, AutoTokenizer
import pinecone
import torch


In [145]:
# Read the Pinecone API key through google.colab's `userdata` helper so the
# key never appears in the notebook itself.
# NOTE(review): `os` is imported here but not used in this cell.
import os
from google.colab import userdata
pinecone_api_key = userdata.get('PINECONE_API_KEY')

# Initialize Pinecone
from pinecone import Pinecone, ServerlessSpec

# Create a Pinecone client
pc = Pinecone(api_key=pinecone_api_key)

# Name of the index this notebook (re)creates
index_name = "docs-quickstart-index"

# Start from a clean slate: if an index with this name already exists,
# delete it (together with every vector it holds) before recreating it.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
    print("Existing index deleted")

# Create the index
pc.create_index(
    name=index_name,
    dimension=384,  # must equal the embedding length that embed_text enforces
    metric="cosine",  # Pinecone's documented metrics: "cosine", "euclidean", "dotproduct"
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)

print("Index created successfully")

# Connect pc to the index; `index` is the handle the later cells upsert to and query
index = pc.Index(index_name)
print("Connected to the index")

Existing index deleted
Index created successfully
Connected to the index


In [146]:
from transformers import AutoModel, AutoTokenizer
import pinecone
import torch

# Load the all-MiniLM-L6-v2 tokenizer and model from the same checkpoint
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


# Define a simple embedding function using Hugging Face model
def embed_text(text, expected_dim=384):
    """Embed a single text into a flat list of floats.

    The text is tokenized with truncation enabled, so content beyond the
    tokenizer's maximum length does not influence the vector. The model's
    final hidden states are then mean-pooled over the real tokens, using
    the attention mask so that padding positions never dilute the average.

    Args:
        text: The string to embed.
        expected_dim: Required length of the embedding. Defaults to 384.

    Returns:
        A flat list of ``expected_dim`` floats.

    Raises:
        ValueError: If more than one text is passed at once, or if the
            embedding length differs from ``expected_dim``.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state

    # Mask-weighted mean over the sequence axis; the clamp guards against a
    # division by zero should a row contain no attended tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    pooled = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

    # This helper returns exactly one vector; reject batches explicitly
    # instead of concatenating their embeddings into one over-long list.
    if pooled.shape[0] != 1:
        raise ValueError(
            f"embed_text expects a single text, got a batch of {pooled.shape[0]}"
        )

    embedding = pooled[0].tolist()
    if len(embedding) != expected_dim:
        raise ValueError(f"Embedding dimension mismatch: Expected {expected_dim}, got {len(embedding)}")
    return embedding


In [147]:
import fitz  # PyMuPDF
from transformers import AutoModel, AutoTokenizer
import pinecone
import torch

# Load the all-MiniLM-L6-v2 tokenizer and model from the same checkpoint
# (this repeats the load already performed in the previous cell)
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Define a simple embedding function using Hugging Face model
def embed_text(text, expected_dim=384):
    """Embed a single text into a flat list of floats.

    The text is tokenized with truncation enabled, so content beyond the
    tokenizer's maximum length does not influence the vector. The model's
    final hidden states are then mean-pooled over the real tokens, using
    the attention mask so that padding positions never dilute the average.

    Args:
        text: The string to embed.
        expected_dim: Required length of the embedding. Defaults to 384.

    Returns:
        A flat list of ``expected_dim`` floats.

    Raises:
        ValueError: If more than one text is passed at once, or if the
            embedding length differs from ``expected_dim``.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state

    # Mask-weighted mean over the sequence axis; the clamp guards against a
    # division by zero should a row contain no attended tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    pooled = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

    # This helper returns exactly one vector; reject batches explicitly
    # instead of concatenating their embeddings into one over-long list.
    if pooled.shape[0] != 1:
        raise ValueError(
            f"embed_text expects a single text, got a batch of {pooled.shape[0]}"
        )

    embedding = pooled[0].tolist()
    if len(embedding) != expected_dim:
        raise ValueError(f"Embedding dimension mismatch: Expected {expected_dim}, got {len(embedding)}")
    return embedding

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as ``(page_index, text)`` tuples.

    Page indices are zero-based and follow the document's page order. The
    file is opened with PyMuPDF and closed again before returning.
    """
    with fitz.open(pdf_path) as pdf:
        return [(idx, pdf_page.get_text()) for idx, pdf_page in enumerate(pdf)]

# Function to store document in Pinecone
def store_document(doc_id, text):
    """Embed ``text`` and upsert the vector into the Pinecone index under ``doc_id``."""
    vector_record = (doc_id, embed_text(text))
    index.upsert(vectors=[vector_record])

# Paths to the PDF files to index
pdf_path_1 = "Attention_Paper.pdf"
pdf_path_2 = "Lora_Paper.pdf"

# Extract the per-page text of each PDF as (page_num, text) tuples
texts_1 = extract_text_from_pdf(pdf_path_1)
texts_2 = extract_text_from_pdf(pdf_path_2)


# Store each page as a separate vector in Pinecone.
# Vector IDs follow "doc_<n>_page_<page_num>", where page_num is zero-based.
for page_num, text in texts_1:    # 11 pages -> 11 vectors
    store_document(f"doc_1_page_{page_num}", text)

for page_num, text in texts_2:    # 26 pages -> 26 vectors
    store_document(f"doc_2_page_{page_num}", text)


In [148]:
import fitz  # PyMuPDF
from transformers import AutoModel, AutoTokenizer
import pinecone
import torch

# Load the all-MiniLM-L6-v2 tokenizer and model from the same checkpoint
# (this repeats the load already performed in the earlier cells)
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Define a simple embedding function using Hugging Face model
def embed_text(text, expected_dim=384):
    """Embed a single text into a flat list of floats.

    The text is tokenized with truncation enabled, so content beyond the
    tokenizer's maximum length does not influence the vector. The model's
    final hidden states are then mean-pooled over the real tokens, using
    the attention mask so that padding positions never dilute the average.

    Args:
        text: The string to embed.
        expected_dim: Required length of the embedding. Defaults to 384.

    Returns:
        A flat list of ``expected_dim`` floats.

    Raises:
        ValueError: If more than one text is passed at once, or if the
            embedding length differs from ``expected_dim``.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state

    # Mask-weighted mean over the sequence axis; the clamp guards against a
    # division by zero should a row contain no attended tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    pooled = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

    # This helper returns exactly one vector; reject batches explicitly
    # instead of concatenating their embeddings into one over-long list.
    if pooled.shape[0] != 1:
        raise ValueError(
            f"embed_text expects a single text, got a batch of {pooled.shape[0]}"
        )

    embedding = pooled[0].tolist()
    if len(embedding) != expected_dim:
        raise ValueError(f"Embedding dimension mismatch: Expected {expected_dim}, got {len(embedding)}")
    return embedding

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as ``(page_index, text)`` tuples.

    Page indices are zero-based and follow the document's page order. The
    file is opened with PyMuPDF and closed again before returning.
    """
    with fitz.open(pdf_path) as pdf:
        return [(idx, pdf_page.get_text()) for idx, pdf_page in enumerate(pdf)]

# Function to store document in Pinecone
def store_document(doc_id, text):
    """Embed ``text`` and upsert the vector into the Pinecone index under ``doc_id``."""
    vector_record = (doc_id, embed_text(text))
    index.upsert(vectors=[vector_record])


# Function to fetch document embeddings from Pinecone
def fetch_document_embedding(doc_id):
    """Fetch the vector stored under ``doc_id`` from the Pinecone index and return its values."""
    fetched = index.fetch(ids=[doc_id])
    stored_vector = fetched['vectors'][doc_id]
    return stored_vector['values']


# Paths to the PDF files to index
# NOTE(review): this cell repeats the extraction and upserts of the previous
# cell with the same vector IDs; presumably the upserts just overwrite the
# existing vectors — confirm, or drop the duplicate work.
pdf_path_1 = "Attention_Paper.pdf"
pdf_path_2 = "Lora_Paper.pdf"

# Extract the per-page text of each PDF as (page_num, text) tuples
texts_1 = extract_text_from_pdf(pdf_path_1)
texts_2 = extract_text_from_pdf(pdf_path_2)

# Store each page as a separate document in Pinecone.
# Vector IDs follow "doc_<n>_page_<page_num>", where page_num is zero-based.
for page_num, text in texts_1:
    store_document(f"doc_1_page_{page_num}", text)

for page_num, text in texts_2:
    store_document(f"doc_2_page_{page_num}", text)



In [149]:
# Fetch the stored embedding for one specific page (document 1, page index 0)
doc_1_page_0_embedding = fetch_document_embedding("doc_1_page_0")
# Prints the complete vector, so expect one very long output line
print(f"Embedding for doc_1_page_0: {doc_1_page_0_embedding}")


Embedding for doc_1_page_0: [-0.206853062, -0.223999053, 0.046813827, -0.0230643358, 0.0230432414, 0.071560055, -0.113816455, 0.0820734054, 0.179481059, -0.0645837858, 0.000451715663, -0.0589559264, -0.0238105282, 0.0359089114, -0.148885295, -0.0010653967, 0.00790819712, 0.158959687, -0.219155401, -0.0994952619, 0.145848, 0.0338694155, 0.0657733828, 0.0922754109, -0.0230305381, 0.00614938326, -0.0647270307, -0.0973173678, -0.166990086, -0.0636097863, 0.0896595865, 0.0611081757, -0.0194371808, 0.15370205, -0.0951550752, 0.122550845, -0.20127055, -0.0460345298, -0.054356724, -0.137010694, -0.016428465, 0.0209778473, -0.0663719326, 0.0116982833, 0.252382189, -0.0417954028, 0.0444649272, -0.0279237218, 0.0591082424, -0.00949567, -0.169647872, 0.0829335749, -0.111427121, 0.205787525, -0.100707196, -0.0886689276, -0.0747392923, 0.012416536, -0.0711478591, -0.0783602595, -0.144999057, -0.0780568421, -0.024562031, -0.157820642, 0.0198680833, -0.0756786317, 0.0277286582, 0.0444582179, -0.178535

In [150]:
# Pinecone query related to the attention paper
query_text = "Where can I find key aspects of the attention mechanisms"
# Embed the query with the same function used for the pages so the vectors are comparable
query_embedding = embed_text(query_text)
# Retrieve the 3 best-matching stored page vectors
results = index.query(vector=query_embedding, top_k=3)

# Report the ID and similarity score of each match
print("Results:")
for match in results['matches']:
    doc_id = match['id']
    score = match['score']
    print(f"Document ID: {doc_id}, Score: {score}")


Results:
Document ID: doc_1_page_4, Score: 0.399698615
Document ID: doc_1_page_3, Score: 0.383235663
Document ID: doc_1_page_6, Score: 0.34568134


In [151]:
# Pinecone query related to the LoRA paper
query_text = "Where can I find key aspects of low-rank adaptation"
# Embed the query with the same function used for the pages so the vectors are comparable
query_embedding = embed_text(query_text)
# Retrieve the 3 best-matching stored page vectors
results = index.query(vector=query_embedding, top_k=3)

# Report the ID and similarity score of each match
print("Results:")
for match in results['matches']:
    doc_id = match['id']
    score = match['score']
    print(f"Document ID: {doc_id}, Score: {score}")

Results:
Document ID: doc_2_page_23, Score: 0.488661855
Document ID: doc_2_page_12, Score: 0.488027722
Document ID: doc_2_page_1, Score: 0.447766632
