### Try the Pinecone vector DB

In [1]:
from pinecone import Pinecone, ServerlessSpec
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
import openai

load_dotenv()

True

In [3]:
# Connect to Pinecone, make sure the target index exists, and try to clear
# one namespace before ingesting.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = os.getenv("PINECONE_INDEX_NAME")
if not index_name:
    # Fail fast with a clear message instead of passing None to the Pinecone
    # client and getting an unrelated-looking error from the API call below.
    raise RuntimeError(
        "PINECONE_INDEX_NAME is not set; define it in the environment or the .env file."
    )
# Vector size the index is created with; the embeddings upserted into it
# must have this same length.
dimension = 1536

# Create index if it doesn't exist
if index_name not in [i.name for i in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",  # or "gcp", "azure" depending on your Pinecone environment
            region="us-east-1"  # match your Pinecone environment
        )
    )
index = pc.Index(index_name)

# Clear all vectors in one namespace of the index.
# NOTE(review): the upsert and query calls elsewhere in this notebook pass no
# namespace, so they presumably use the index's default namespace rather than
# "Default". If so, this delete never touches the vectors that are actually
# written -- confirm which namespace is intended and use it consistently.
namespace = "Default"  # or your custom namespace if you use one

try:
    # describe_namespace is used as an existence probe: it raises when the
    # namespace is missing, which skips the delete below.
    index.describe_namespace(namespace=namespace)
    index.delete(delete_all=True, namespace=namespace)
    print(f"All vectors deleted from the '{namespace}' namespace.")
except Exception as e:
    # Best-effort cleanup: any failure (not only "namespace not found") is
    # reported and ingestion proceeds; the details are printed for diagnosis.
    print(f"Namespace '{namespace}' not found or already empty. Skipping delete. Details: {e}")

# Helper: extract text from PDF
def pdf_to_text(pdf_path):
    """Return the extractable text of every page in a PDF as one string.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        The page texts concatenated in page order with no separator between
        pages. Pages whose extraction yields nothing are skipped, so the
        result is ``""`` when no page has extractable text.
    """
    document = PdfReader(pdf_path)
    extracted = (page.extract_text() for page in document.pages)
    # Drop None / empty results so only real text is joined.
    return "".join(piece for piece in extracted if piece)

# Helper: create embeddings for a list of texts
def create_embeddings(texts, batch_size=10, max_characters=2000):
    """Embed a list of texts with OpenAI's ``text-embedding-ada-002`` model.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of input texts sent per API request.
        max_characters: Each text is truncated to this many characters
            before it is sent.

    Returns:
        A flat list of embeddings in request order. Empty or whitespace-only
        texts are dropped before the request, so the result can be shorter
        than ``texts``.
    """
    vectors = []
    for start in range(0, len(texts), batch_size):
        window = texts[start:start + batch_size]
        payload = [item[:max_characters] for item in window if item and item.strip()]
        if payload:
            # Only call the API when the batch has something to embed.
            reply = openai.embeddings.create(
                input=payload,
                model="text-embedding-ada-002",
            )
            vectors.extend(row.embedding for row in reply.data)
    return vectors

# Ingest every PDF in the docs folder: extract text, split it into chunks,
# embed the chunks and upsert them into the Pinecone index.
#
# CharacterTextSplitter splits on "\n\n" by default. Text that contains no
# blank lines is then returned as a single oversized chunk, which the
# length filter below discards -- leaving nothing to upsert. Splitting on
# single newlines keeps chunks near chunk_size for such text.
# NOTE(review): presumably this is why the earlier run reported no vectors;
# a scanned PDF without a text layer would produce the same message.
splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100)
docs_folder = "/workspaces/gdpr_chat/docs"
# Send vectors in bounded batches so a large document does not exceed the
# size limit of a single upsert request.
upsert_batch_size = 100

for filename in os.listdir(docs_folder):
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(docs_folder, filename)
    text = pdf_to_text(pdf_path)
    # Keep only non-blank chunks below 2000 characters. This is the same
    # blank-text rule create_embeddings applies, so chunks and embeddings
    # stay aligned one-to-one for the zip below.
    chunks = [c for c in splitter.split_text(text) if c and len(c.strip()) > 0 and len(c) < 2000]
    embeddings = create_embeddings(chunks)

    vectors = [
        {
            "id": f"{filename}-{i}",
            "values": list(emb),
            "metadata": {"text": chunk, "source": filename},
        }
        for i, (emb, chunk) in enumerate(zip(embeddings, chunks))
    ]

    if not vectors:
        print(f"No vectors to upsert for {filename}. Check if the file is empty or unreadable.")
        continue

    for start in range(0, len(vectors), upsert_batch_size):
        index.upsert(vectors=vectors[start:start + upsert_batch_size])
    # One summary line instead of dumping whole embedding vectors.
    print(f"Upserted {len(vectors)} vectors for {filename} (dimension {len(vectors[0]['values'])}).")

Namespace 'Default' not found or already empty. Skipping delete. Details: (404)
Reason: Not Found
HTTP response headers: HTTPHeaderDict({'Date': 'Sat, 23 Aug 2025 00:10:46 GMT', 'Content-Type': 'application/json', 'Content-Length': '90', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '2', 'x-pinecone-request-id': '3240774743712451062', 'x-envoy-upstream-service-time': '4', 'server': 'envoy'})
HTTP response body: {"code":5,"message":"No namespace: 'Default' found in index gdpr-chat-index","details":[]}

No vectors to upsert for GDPR_Guidelines.pdf. Check if the file is empty or unreadable.


In [None]:
def search_docs(query, top_k=5):
    """Return the stored text chunks most similar to a query.

    The query is embedded with OpenAI's ``text-embedding-ada-002`` model and
    the resulting vector is used to query the module-level Pinecone
    ``index``.

    Args:
        query: Search text. An empty or whitespace-only query returns an
            empty list without calling either service.
        top_k: Maximum number of matches requested from the index.

    Returns:
        A list of dicts with the keys ``"score"``, ``"text"`` and
        ``"source"``, one per match, in the order the index returned them.
    """
    # The embeddings endpoint rejects empty input, so answer blank queries
    # locally instead of failing on a remote call.
    if not query or not query.strip():
        return []

    # Create embedding for the query
    query_emb = openai.embeddings.create(
        input=[query],
        model="text-embedding-ada-002"
    ).data[0].embedding

    # Query Pinecone index
    results = index.query(
        vector=query_emb,
        top_k=top_k,
        include_metadata=True
    )

    # Return matched text chunks and sources
    return [
        {
            "score": match["score"],
            "text": match["metadata"]["text"],
            "source": match["metadata"]["source"]
        }
        for match in results["matches"]
    ]

# Example usage: run one sample query and print every hit with its
# similarity score, source file and matched text.
results = search_docs("What is GDPR?")
for r in results:
    entry = f"Score: {r['score']:.2f} | Source: {r['source']}\n{r['text']}\n"
    print(entry)