In [None]:

import os, sys, glob, uuid, time, itertools
from typing import Iterable, Tuple, List, Dict, Generator
from dotenv import load_dotenv
from pypdf import PdfReader
import tiktoken
from openai import OpenAI
from pinecone.grpc import PineconeGRPC as Pinecone

# ----------------- Config (defaults; can be overridden via CLI/env) -----------------
# NOTE(review): no CLI or environment override of these values is visible in
# this notebook — confirm whether the header above is still accurate.
# NOTE(review): the two *_DEFAULT constants are not referenced by the visible
# cells; presumably they are the default chunk size and chunk overlap in tokens.
CHUNK_TOKENS_DEFAULT = 700
OVERLAP_TOKENS_DEFAULT = 100
EMBED_BATCH = 64          # texts per OpenAI embeddings request
UPSERT_BATCH = 200        # vectors per Pinecone gRPC upsert

load_dotenv()  # load .env file if present

In [2]:
# Note: UPSERT_BATCH (set in the config cell above) is the number of vectors sent per Pinecone gRPC upsert.

# ----------------- Helpers -----------------
def chunks(iterable: Iterable, batch_size: int) -> Iterable[tuple]:
    """Lazily split ``iterable`` into consecutive tuples of up to ``batch_size`` items.

    Items are pulled from the source only when the next batch is requested, so
    generators are consumed incrementally. The last tuple may be shorter than
    ``batch_size``; nothing is yielded once no more items can be taken.
    """
    source = iter(iterable)
    while True:
        group = tuple(itertools.islice(source, batch_size))
        if not group:
            # Nothing more could be taken from the source: stop iterating.
            return
        yield group

def read_pdf_pages(pdf_path: str) -> List[Tuple[int, str]]:
    """Extract whitespace-normalized text from each page of a PDF.

    Reading is best-effort: a file that cannot be opened yields an empty list,
    and a page whose text extraction fails is skipped, so a single bad page
    does not abort the rest of the document. Both cases are reported on stderr.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``(page_number, text)`` tuples with 1-based page numbers.
        Runs of whitespace in the text are collapsed to single spaces, and
        pages with no text are omitted.
    """
    try:
        reader = PdfReader(pdf_path)
    except Exception as e:
        print(f"!! Failed to open {pdf_path}: {e}", file=sys.stderr)
        return []

    pages_out = []
    for page_no, page in enumerate(reader.pages, start=1):
        try:
            raw = page.extract_text() or ""
        except Exception as e:
            # Keep going: one unreadable page should not lose the whole file.
            print(f"!! Failed to extract text from {pdf_path} page {page_no}: {e}", file=sys.stderr)
            continue
        # split()/join both trims the ends and collapses inner whitespace.
        normalized = " ".join(raw.split())
        if normalized:
            pages_out.append((page_no, normalized))
    return pages_out

def token_chunks(text: str, enc, chunk_tokens: int, overlap: int) -> List[str]:
    """Split ``text`` into overlapping windows measured in tokens.

    Args:
        text: Text to split.
        enc: Tokenizer object providing ``encode(str)`` (returning a sequence
            of token ids) and ``decode(ids)`` (returning a string).
        chunk_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens repeated at the start of each following
            chunk; must satisfy ``0 <= overlap < chunk_tokens``.

    Returns:
        The decoded chunks in order. Empty text produces an empty list.

    Raises:
        ValueError: if ``chunk_tokens`` or ``overlap`` is out of range. An
            overlap that is not smaller than the chunk size would keep the
            window from ever advancing, so it is rejected up front.
    """
    if chunk_tokens <= 0:
        raise ValueError(f"chunk_tokens must be positive, got {chunk_tokens}")
    if not 0 <= overlap < chunk_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_tokens, got overlap={overlap}, chunk_tokens={chunk_tokens}"
        )

    ids = enc.encode(text)
    total = len(ids)
    stride = chunk_tokens - overlap  # how far each window start moves forward
    pieces = []
    for start in range(0, total, stride):
        end = min(start + chunk_tokens, total)
        pieces.append(enc.decode(ids[start:end]))
        if end == total:
            # This window already reached the last token; any further window
            # would only repeat the overlapping tail.
            break
    return pieces

def iter_pdf_chunks(pdf_path: str, chunk_tokens: int, overlap: int) -> Generator[Tuple[str,int,int,str], None, None]:
    """Yield ``(pdf_path, page_number, chunk_index, chunk_text)`` for one PDF.

    Each page returned by ``read_pdf_pages`` is split independently with
    ``token_chunks`` using the ``cl100k_base`` tiktoken encoding, so the chunk
    index restarts at 0 on every page.
    """
    encoder = tiktoken.get_encoding("cl100k_base")
    for page_no, page_text in read_pdf_pages(pdf_path):
        pieces = token_chunks(page_text, encoder, chunk_tokens, overlap)
        yield from (
            (pdf_path, page_no, piece_idx, piece)
            for piece_idx, piece in enumerate(pieces)
        )

def _is_transient_error(exc: Exception) -> bool:
    """Return True when ``exc`` looks like a rate-limit or server-side failure worth retrying."""
    status = getattr(exc, "status_code", None)
    if isinstance(status, int):
        # Trust the HTTP status when the exception exposes one; matching digits
        # in the message text would misclassify e.g. a 400 that mentions "8500 tokens".
        return status == 429 or status >= 500
    # No status available (e.g. connection/timeout errors): fall back to the message text.
    msg = str(exc).lower()
    return any(x in msg for x in ("429", "temporar", "timeout", "overload", "503", "500"))


def embed_texts_openai(client: OpenAI, model: str, texts: List[str], max_retries: int = 6) -> List[List[float]]:
    """Embed ``texts`` with one embeddings request, retrying transient failures.

    Args:
        client: Client object whose ``embeddings.create(model=..., input=...)``
            returns a response with a ``data`` list of items that each expose
            an ``embedding`` attribute.
        model: Name of the embedding model to request.
        texts: Texts to embed in a single request.
        max_retries: Maximum number of retries after the first attempt.

    Returns:
        One embedding per input text, in the order of ``resp.data``. An empty
        ``texts`` list returns ``[]`` without calling the API.

    Raises:
        Exception: the error raised by the client, re-raised unchanged when it
            is not transient or when ``max_retries`` retries have been used up.
    """
    if not texts:
        return []

    backoff = 1.0
    failures = 0
    while True:
        try:
            resp = client.embeddings.create(model=model, input=texts)
            return [d.embedding for d in resp.data]
        except Exception as e:
            failures += 1
            if failures > max_retries or not _is_transient_error(e):
                raise
            # Exponential backoff, capped at 30 seconds between attempts.
            time.sleep(backoff)
            backoff = min(backoff * 2, 30)

# ----------------- Main ingest -----------------
def ingest_folder(folder: str,
                  index_host: str,
                  namespace: str,
                  embed_model: str,
                  chunk_tokens: int,
                  overlap_tokens: int,
                  snippet_chars: int = 2000):
    """Embed every PDF under ``folder`` and upsert the vectors into a Pinecone index.

    PDFs are discovered recursively, split into per-page token chunks with
    ``iter_pdf_chunks``, embedded ``EMBED_BATCH`` texts at a time with
    ``embed_texts_openai``, and streamed to the index ``UPSERT_BATCH`` vectors
    per upsert call. Progress and a final summary are printed to stdout.

    Vector IDs have the form ``<basename>#p<page>-<chunk>-<tag>``, where
    ``<tag>`` is derived from the file's path relative to ``folder``. IDs are
    therefore stable across runs: re-ingesting the same files reuses the same
    IDs (an upsert with an existing ID replaces that record rather than adding
    a duplicate), while equally named files in different sub-folders still get
    distinct IDs.

    Args:
        folder: Root directory searched recursively for ``*.pdf`` files.
        index_host: Host of the Pinecone index to connect to.
        namespace: Pinecone namespace that receives the vectors.
        embed_model: Embedding model name; also stored in each vector's metadata.
        chunk_tokens: Maximum number of tokens per chunk.
        overlap_tokens: Number of tokens shared by consecutive chunks.
        snippet_chars: Maximum number of characters of chunk text stored in
            the ``text`` metadata field.

    Returns:
        None.

    Raises:
        KeyError: if PDFs were found but ``PINECONE_API_KEY`` or
            ``OPENAI_API_KEY`` is not set in the environment.
        RuntimeError: if an embedding call returns a different number of
            vectors than texts sent.
    """
    # Gather PDFs
    pdfs = sorted(glob.glob(os.path.join(folder, "**/*.pdf"), recursive=True))
    if not pdfs:
        print(f"No PDFs found under: {folder}")
        return

    # Init clients
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    index = pc.Index(host=index_host)   # gRPC connects by host
    oai = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

    total_files = 0
    total_chunks = 0

    def gen_upserts() -> Iterable[Dict]:
        """Yield one upsert record per embedded chunk, file by file."""
        nonlocal total_files, total_chunks
        for pdf_path in pdfs:
            rel_name = os.path.relpath(pdf_path, folder)
            source = rel_name.replace("\\", "/")
            file_id_prefix = os.path.basename(pdf_path)
            # Deterministic tag from the relative path: keeps IDs stable across
            # runs and tells apart files that share a basename.
            path_tag = uuid.uuid5(uuid.NAMESPACE_URL, source).hex[:6]

            file_chunks = list(iter_pdf_chunks(pdf_path, chunk_tokens, overlap_tokens))
            if not file_chunks:
                print(f"-- Skipping (no extractable text): {rel_name}")
                continue

            # Embed in batches for this file
            for i in range(0, len(file_chunks), EMBED_BATCH):
                batch = file_chunks[i:i + EMBED_BATCH]
                texts = [t for (_, _, _, t) in batch]
                vecs = embed_texts_openai(oai, embed_model, texts)
                if len(vecs) != len(batch):
                    # zip() below would silently drop the unmatched chunks.
                    raise RuntimeError(
                        f"Expected {len(batch)} embeddings for {rel_name}, got {len(vecs)}"
                    )
                for (_, page, ci, txt), v in zip(batch, vecs):
                    total_chunks += 1
                    yield {
                        "id": f"{file_id_prefix}#p{page}-{ci}-{path_tag}",
                        "values": v,
                        "metadata": {
                            "source": source,
                            "page": page,
                            "chunk": ci,
                            "model": embed_model,
                            "text": txt[:snippet_chars]  # helpful snippet; keep metadata reasonable
                        }
                    }
            total_files += 1

    # Stream upserts in UPSERT_BATCH chunks
    streamed = 0
    for upsert_chunk in chunks(gen_upserts(), UPSERT_BATCH):
        index.upsert(vectors=list(upsert_chunk), namespace=namespace)
        streamed += len(upsert_chunk)
        print(f"Upserted {streamed} vectors so far...")

    print(f"✅ Done. Files ingested: {total_files}, chunks embedded & upserted: {total_chunks}, namespace: {namespace}")

In [3]:
# Load variables from a .env file (if present) before the API keys are read.
load_dotenv()
import argparse  # NOTE(review): not used in the cells visible here

# Settings for this ingestion run.
# NOTE(review): the docs folder and index host are hard-coded for one
# environment; consider reading them from configuration.
folder = os.path.abspath("/workspaces/gdpr_chat/docs")
index_host = "https://gdpr-chat-index-bw6rwbq.svc.aped-4627-b74a.pinecone.io"
namespace = "__default__"
embed_model = "text-embedding-3-small"
# NOTE(review): these repeat the values of CHUNK_TOKENS_DEFAULT and
# OVERLAP_TOKENS_DEFAULT from the config cell.
chunk_tokens = 700
overlap_tokens = 100

# Embed every PDF under `folder` and upsert the vectors into the index/namespace above.
ingest_folder(
    folder=folder,
    index_host=index_host,
    namespace=namespace,
    embed_model=embed_model,
    chunk_tokens=chunk_tokens,
    overlap_tokens=overlap_tokens
)


Upserted 175 vectors so far...
✅ Done. Files ingested: 2, chunks embedded & upserted: 175, namespace: __default__


In [3]:
# Query Pinecone index with a text query
from pinecone.grpc import PineconeGRPC as Pinecone
from openai import OpenAI

# 1. Prepare clients (API keys are read from the environment)
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index(host="https://gdpr-chat-index-bw6rwbq.svc.aped-4627-b74a.pinecone.io")
oai = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# 2. Embed your query with the same model that was used for ingestion,
#    so the query vector is comparable with the stored vectors
query_text = "Why do we need GDPR"
embed_model = "text-embedding-3-small"
query_vec = oai.embeddings.create(model=embed_model, input=[query_text]).data[0].embedding

# 3. Query Pinecone
results = index.query(
    vector=query_vec,
    top_k=1,
    namespace="__default__",
    include_metadata=True
)

# 4. Print results
for match in results.matches:
    # A stored vector may carry no metadata; fall back to an empty mapping so
    # the lookups below print None instead of raising AttributeError.
    match_meta = match.metadata or {}
    print(f"Score: {match.score:.3f}")
    print(f"Source: {match_meta.get('source')}")
    print(f"Text: {match_meta.get('text')}...\n")

Score: 0.722
Source: GDPR_Guidelines.pdf
Text: GDPR Guidelines – Plain Text Version 1. What is GDPR? The General Data Protection Regulation (GDPR) is a comprehensive European Union law designed to protect individuals' personal data and privacy. It came into effect on 25 May 2018, replacing outdated data protection rules and harmonizing data privacy regulations across EU member states. 2. To Whom Does GDPR Apply? - Applies to organizations within the EU that process personal data as part of their operations, no matter where the processing takes place. - Also applies to entities outside the EU if they offer goods or services to individuals in the EU or monitor their behavior. 3. Key Principles of GDPR Organizations handling personal data must adhere to these core principles: 1. Lawfulness, fairness & transparency – Data must be processed legally, fairly, and transparently. 2. Purpose limitation – Data must be collected for specified, explicit purposes. 3. Data minimization – Only data es