In [1]:
pip install pinecone


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/home/student/.virtualenvs/final-project/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import re
import json
import zipfile
from pathlib import Path
import pdfplumber
import docx
from pdfminer.high_level import extract_text
import nltk
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from tqdm import tqdm
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    SpacyTextSplitter
)
from pinecone import Pinecone, ServerlessSpec
import spacy
import subprocess
from nltk.tokenize import sent_tokenize
nltk.download('punkt')


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/student/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def ensure_spacy_model(model_name="en_core_web_sm"):
    """Ensure that the given spaCy model is installed."""
    try:
        spacy.load(model_name)
    except OSError:
        print(f"⚠️ spaCy model '{model_name}' not found. Downloading...")
        subprocess.run(["python", "-m", "spacy", "download", model_name], check=True)
        print(f"✅ Downloaded spaCy model: {model_name}")

In [4]:
def recursive_chunking(texts: list[str], chunk_size: int, overlap: int) -> list[str]:
    char_size = chunk_size * AVG_WORD_LEN
    char_overlap = overlap * AVG_WORD_LEN
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    chunks = []
    for text in texts:
        chunks.extend(splitter.split_text(text))
    return chunks

In [5]:
def overlapping_chunking(texts: list[str], chunk_size: int, overlap: int) -> list[str]:
    chunks = []
    step = chunk_size - overlap
    for text in texts:
        words = text.split()
        for i in range(0, len(words), step):
            chunk = " ".join(words[i:i + chunk_size])
            if chunk:
                chunks.append(chunk.strip())
    return chunks

In [6]:
def spacy_chunking(texts: list[str], chunk_size: int, overlap: int) -> list[str]:
    char_size = chunk_size * AVG_WORD_LEN
    char_overlap = overlap * AVG_WORD_LEN
    splitter = SpacyTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    chunks = []
    for text in texts:
        chunks.extend(splitter.split_text(text))
    return chunks

In [7]:
def select_chunking_strategy(strategy_name: str):
    strategies = {
        "recursive": recursive_chunking,
        "overlapping": overlapping_chunking,
        "spacy": spacy_chunking,
    }
    return strategies.get(strategy_name)

In [8]:
def extract_zip_file(zip_path: Path, extract_to: Path):
    if not extract_to.exists():
        extract_to.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Extracted {zip_path} to {extract_to}")

def remove_cid_artifacts(text):
    return re.sub(r'\(cid:\d+\)', '', text)

def read_pdf(path: Path) -> str:
    try:
        text = extract_text(str(path))
    except Exception as e:
        print(f"⚠️ Error reading {path.name}: {e}")
        text = ""
    return remove_cid_artifacts(text)

def read_docx(path: Path) -> str:
    doc = docx.Document(str(path))
    return "\n".join(p.text for p in doc.paragraphs)

def clean_text(text: str) -> str:
    text = re.sub(r"[^a-zA-Z0-9\.\,\;\:\?\!\-\s]", " ", text)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n+", "\n", text)
    return text.strip()

def embed_chunks(texts: list[str], model: SentenceTransformer) -> np.ndarray:
    if not texts:
        return np.empty((0, model.get_sentence_embedding_dimension()), dtype="float32")
    embs = model.encode(texts, convert_to_numpy=True)
    return normalize(embs, axis=1).astype("float32")

In [9]:
def create_index(pc, index_name, dimension, metric):
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric = metric,
            spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )
        print(f"Created new Pinecone index: {index_name}")
    else:
        print(f"Using existing Pinecone index: {index_name}")

def save_to_pinecone(records, embeddings, pc, index_name, namespace="default"):
    index = pc.Index(index_name)
    vectors = []
    for i, record in enumerate(records):
        vectors.append({
            "id": f"{Path(record['file']).stem}_chunk_{record['chunk_id']}",
            "values": embeddings[i].tolist(),
            "metadata": {
                "file": Path(record['file']).name,
                "chunk_id": record['chunk_id'],
                "text": record['text'],
                "length": record['length'],
            }
        })
    for i in range(0, len(vectors), 100):
        index.upsert(vectors=vectors[i:i+100], namespace=namespace)
    print(f"📌 Saved {len(vectors)} vectors to Pinecone namespace '{namespace}'.")

In [10]:
MODEL_NAME = "multi-qa-mpnet-base-dot-v1"
ZIP_PATH = Path("Eng_data.zip")
DATA_DIR = ZIP_PATH.with_suffix("")
WORDS_PER_CHUNK = 500
WORDS_OVERLAP = 100
AVG_WORD_LEN = 6  # adjust if needed
OVERLAP = 200
PINECONE_INDEX_NAME = 'dotproduct'
CHUNK_STRATEGY = "spacy"  # Options: recursive / overlapping / spacy
NAMESPACE = f"ENG-{CHUNK_STRATEGY}"

with open(r"../src/api_keys.json") as f:
    api_keys = json.load(f)
pc = Pinecone(api_key=api_keys["pinecone_anna"])

In [11]:
# Loading model
print(f"[1/4] Loading model '{MODEL_NAME}'...")
model = SentenceTransformer(MODEL_NAME)
dimension = model.get_sentence_embedding_dimension()
create_index(pc, PINECONE_INDEX_NAME, dimension, PINECONE_INDEX_NAME)

[1/4] Loading model 'multi-qa-mpnet-base-dot-v1'...
Created new Pinecone index: dotproduct


In [12]:
print("[2/4] Extracting zip and reading documents...")
extract_zip_file(ZIP_PATH, DATA_DIR)
raw_texts = []
sources = []
all_files = list(DATA_DIR.rglob("*"))
with tqdm(all_files, desc="Reading files") as pbar:
    for path in pbar:
        if path.suffix.lower() == ".pdf":
            text = read_pdf(path)
        elif path.suffix.lower() == ".docx":
            text = read_docx(path)
        else:
            continue
        cleaned = clean_text(text)
        raw_texts.append(cleaned)
        sources.append(str(path))

Extracted Eng_data.zip to Eng_data


Reading files:   0%|          | 0/7 [00:00<?, ?it/s]

[2/4] Extracting zip and reading documents...


Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Reading files:  14%|█▍        | 1/7 [00:13<01:22, 13.80s/it]Cannot set gray non-stroke color because /'P117' is an invalid float value
Cannot set gray non-stroke color because /'P125' is an invalid float value
Cannot set gray non-stroke color because /'P133' is an inval

KeyboardInterrupt: 

In [38]:
print("[3/4] Chunking text...")
if CHUNK_STRATEGY == "spacy":
  ensure_spacy_model("en_core_web_sm")

chunk_func = select_chunking_strategy(CHUNK_STRATEGY)
#chunks = chunk_func(raw_texts, chunk_size=CHUNK_SIZE, overlap=OVERLAP)

records = []
for file_path, text in zip(sources, raw_texts):
    file_chunks = chunk_func([text], chunk_size=WORDS_PER_CHUNK, overlap=WORDS_OVERLAP)
    file_name = Path(file_path).stem
    for i, chunk in enumerate(file_chunks):
        records.append({
            "file": file_name,
            "chunk_id": i,
            "text": chunk,
            "length": len(chunk.split()),
        })

print(f"→ Created {len(records)} chunks")

[3/5] Chunking text...


Created a chunk of size 1163, which is longer than the specified 1000
Created a chunk of size 1163, which is longer than the specified 1000
Created a chunk of size 1170, which is longer than the specified 1000
Created a chunk of size 1170, which is longer than the specified 1000
Created a chunk of size 2562, which is longer than the specified 1000
Created a chunk of size 1289, which is longer than the specified 1000
Created a chunk of size 1212, which is longer than the specified 1000
Created a chunk of size 2017, which is longer than the specified 1000
Created a chunk of size 1006, which is longer than the specified 1000
Created a chunk of size 1196, which is longer than the specified 1000
Created a chunk of size 2190, which is longer than the specified 1000
Created a chunk of size 1161, which is longer than the specified 1000
Created a chunk of size 1136, which is longer than the specified 1000
Created a chunk of size 1055, which is longer than the specified 1000
Created a chunk of s

→ Created 1250 chunks


In [39]:
print("[4/4] Embedding and uploading to Pinecone...")
embs = embed_chunks([r["text"] for r in records], model)
save_to_pinecone(records, embs, pc, PINECONE_INDEX_NAME, namespace=NAMESPACE)
print(f"\n✅ Done! Uploaded {len(records)} chunks to Pinecone namespace '{NAMESPACE}'.")

[4/5] Embedding and uploading to Pinecone...
📌 Saved 1250 vectors to Pinecone namespace 'ENG-spacy'.

✅ Done! Uploaded 1250 chunks to Pinecone namespace 'ENG-spacy'.
