In [1]:
from dotenv import load_dotenv
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from uuid import uuid4
import pypdf

  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [2]:
load_dotenv()

True

### Initialize Pinecone and Create and Index

In [3]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

# Load .env
load_dotenv()

api_key = os.getenv("PINECONE_API_KEY")
if api_key is None:
    raise ValueError("Missing Pinecone API key. Did you set it in your .env?")

pc = Pinecone(api_key=api_key)

index_name = "cv2-index"

# Create index if it doesn't exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Match SentenceTransformer embedding size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    #pc.describe_index(index_name).wait_until_ready()

# Connect to the index
index = pc.Index(index_name)
print(f"Connected to Pinecone index: {index_name}")

Connected to Pinecone index: cv2-index


### Load and Chunk Your CV

In [13]:
import os
print(os.getcwd())

c:\Users\Hp\Desktop\RAG_PROJECT\notebooks


In [15]:
# src/components/cv_ingestion_pdf.py
import fitz  # PyMuPDF
import re
import json
from typing import List, Dict
import os


def load_pdf_text(pdf_path: str) -> str:
    """Extract text from a PDF file."""
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"{pdf_path} not found")
    
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text.strip()


def split_cv_by_sections(text: str) -> Dict[str, str]:
    """Split CV text into sections using headings."""
    sections = ["Summary", "Skills", "Experience", "Education", "Certifications"]
    pattern = "(" + "|".join(sections) + ")"
    
    matches = list(re.finditer(pattern, text, re.IGNORECASE))
    section_dict = {}
    
    for i, match in enumerate(matches):
        start = match.end()
        end = matches[i+1].start() if i+1 < len(matches) else len(text)
        section_name = match.group().title()
        section_text = text[start:end].strip()
        section_dict[section_name] = section_text
    
    return section_dict


def chunk_section_text(section_text: str, chunk_size: int = 300, overlap: int = 50) -> List[str]:
    """Split section text into word-based chunks (optional overlap)."""
    words = section_text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + chunk_size, len(words))
        chunk = " ".join(words[start:end])
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks


def chunk_cv_sections(cv_text: str) -> List[Dict]:
    """Split CV into section-based chunks with metadata."""
    sections = split_cv_by_sections(cv_text)
    chunks_list = []

    for section_name, section_text in sections.items():
        section_chunks = chunk_section_text(section_text)
        for i, chunk in enumerate(section_chunks):
            chunks_list.append({
                "id": f"{section_name}_{i}",
                "section": section_name,
                "chunk_index": i,
                "content": chunk
            })
    
    return chunks_list


if __name__ == "__main__":
    pdf_path = "../data/csv/resume_project.pdf"
    output_path = "../data/csv/cv_chunks.json"

    # 1. Load PDF text
    cv_text = load_pdf_text(pdf_path)

    # 2. Chunk by sections
    chunks = chunk_cv_sections(cv_text)

    # 3. Save chunks to JSON
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(chunks, f, indent=2, ensure_ascii=False)

    print(f"✅ Total chunks created: {len(chunks)}")
    print(f"📂 Saved to {output_path}")



✅ Total chunks created: 5
📂 Saved to ../data/csv/cv_chunks.json


In [7]:
# Load Sentence Transformer model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [8]:
# src/pipeline/cv_embeddings.py
from sentence_transformers import SentenceTransformer
import json

def embed_cv_chunks(input_json="data/cv_chunks.json", output_json="data/cv_embeddings.json"):
    model = SentenceTransformer("all-MiniLM-L6-v2")

    with open(input_json, "r", encoding="utf-8") as f:
        chunks = json.load(f)

    embeddings = []
    for chunk in chunks:
        emb = model.encode(chunk["content"]).tolist()
        embeddings.append({
            "id": chunk["id"],
            "section": chunk["section"],
            "content": chunk["content"],
            "embedding": emb
        })

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(embeddings, f, indent=2)

    print(f"Saved {len(embeddings)} CV embeddings to {output_json}")


In [17]:
embed_cv_chunks(input_json="../data/csv/cv_chunks.json", output_json="../data/csv/cv_embeddings.json")

Saved 5 CV embeddings to ../data/csv/cv_embeddings.json


In [18]:
with open("../data/csv/cv_embeddings.json", "r", encoding="utf-8") as f:
    data = json.load(f)

to_upsert = [(d["id"], d["embedding"], {"section": d["section"], "content": d["content"]}) for d in data]
index.upsert(vectors=to_upsert)

print(f"Upserted {len(to_upsert)} CV chunks into {index_name}")

Upserted 5 CV chunks into cv2-index


In [21]:
# src/pipeline/query_cv.py
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
import os
from dotenv import load_dotenv

load_dotenv()

def query_cv(query, index_name="cv-index", top_k=3):
    model = SentenceTransformer("all-MiniLM-L6-v2")
    query_vector = model.encode(query).tolist()

    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
    index = pc.Index(index_name)

    results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True
    )

    print("🔍 Query Results:")
    for match in results["matches"]:
        print(f"Score: {match['score']:.3f}")
        print(f"Section: {match['metadata']['text']}")
        print(f"Content: {match['metadata']['text'][:200]}...\n")

if __name__ == "__main__":
    query_cv("What are Cleave’s top machine learning skills?")


🔍 Query Results:
Score: 0.501
Section: Soft Skills: Analytical Thinking, Problem-Solving, Communication, Self-Directed Learning 
Experience      
 
Zindi Africa – Data Science Competitions April 2025 – Present 
 Participated in multiple machine learning competitions tackling real-world problems in healthcare and agriculture.
Content: Soft Skills: Analytical Thinking, Problem-Solving, Communication, Self-Directed Learning 
Experience      
 
Zindi Africa – Data Science Competitions April 2025 – Present 
 Participated in multiple m...

Score: 0.492
Section: data into actionable insights and delivering production-ready ML solutions. Competitive ML participant (top 
30% Zindi), passionate about solving high-impact problems with data. 
Skills      
Programming & Analytics: Python (pandas, NumPy, scikit-learn, matplotlib, seaborn), SQL, Excel 
Machine Learning & AI: Classification, Regression, NLP, Deep Learning, Feature Engineering, Model Evaluation 
LLMs & Conversational AI: LangChain, R