In [None]:
# UJ Advisor - Chunk Evaluation Pipeline (Cleaned)
# NOTE: Removed all OpenAI stuff we don't need anymore


!pip install chromadb sentence-transformers tqdm numpy

import os
import json
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings


# Cleaned JSON files that together contain every chunk.
# If a file listed here loads 0 chunks, its format is probably wrong.
RAW_FILES = [
    "/content/" + file_name
    for file_name in (
        "CS_normalized.json",
        "SWE_normalized.json",
        "STUDENT_RULES.json",
        "AI_normalized.json",
        "DS_normalized.json",
        "CNE_normalized.json",
        "CY_normalized.json",
    )
]



In [None]:

# Read every file in RAW_FILES and pool the extracted chunks into one list.

all_chunks = []
for file in RAW_FILES:
    if os.path.exists(file):
        extracted = extract_chunks_from_file(file)
        print(f" {file}: {len(extracted)} chunks loaded")
        all_chunks += extracted
    else:
        # Missing inputs are reported and skipped instead of aborting the run.
        print(f"File not found: {file}")

print("TOTAL CHUNKS LOADED:", len(all_chunks))



 /content/CS_normalized.json: 94 chunks loaded
 /content/SWE_normalized.json: 94 chunks loaded
 /content/STUDENT_RULES.json: 158 chunks loaded
 /content/AI_normalized.json: 98 chunks loaded
 /content/DS_normalized.json: 94 chunks loaded
 /content/CNE_normalized.json: 88 chunks loaded
 /content/CY_normalized.json: 92 chunks loaded
TOTAL CHUNKS LOADED: 718


In [18]:

from chromadb import PersistentClient

# Embedding model used for both the chunks and the queries.
model = SentenceTransformer("intfloat/multilingual-e5-large")

# On-disk ChromaDB client; fetch the collection, creating it on first use.
chroma = PersistentClient(path="/content/chroma_store")
collection = chroma.get_or_create_collection("rag_collection")


In [19]:

# Build the parallel lists derived from all_chunks: document texts and their ids.

texts = [chunk["text"] for chunk in all_chunks]

# Use the chunk's own "id" when present, otherwise fall back to a positional id.
ids = [
    chunk.get("id", f"chunk_{position}")
    for position, chunk in enumerate(all_chunks)
]

# Metadata cleanup to fit chromaDB needs
def sanitize_metadata(md):
    """Flatten a chunk's metadata into a dict of scalar values.

    Conversions applied per value:
      * ``None``              -> ``""``
      * str / int / float / bool are kept unchanged
      * list / tuple          -> items joined with ``", "``
      * dict                  -> JSON string (non-ASCII characters preserved)
      * anything else         -> ``str(value)``

    Args:
        md: Mapping of metadata keys to values. ``None`` is treated as empty,
            which happens when a chunk stores ``"metadata": null``.

    Returns:
        A new dict. If the input yields no entries, a placeholder
        ``{"source": "chunk"}`` is returned so the result is never empty.
    """
    clean = {}
    for key, value in (md or {}).items():
        if value is None:
            clean[key] = ""
        elif isinstance(value, (str, int, float, bool)):
            clean[key] = value
        elif isinstance(value, (list, tuple)):
            clean[key] = ", ".join(str(item) for item in value)
        elif isinstance(value, dict):
            # default=str keeps serialization from failing on nested non-JSON values.
            clean[key] = json.dumps(value, ensure_ascii=False, default=str)
        else:
            # Last resort: store a readable string rather than an unsupported object.
            clean[key] = str(value)

    if not clean:
        clean["source"] = "chunk"

    return clean

# One sanitized metadata dict per chunk, iterating all_chunks in order.
metadatas = [
    sanitize_metadata(chunk.get("metadata", {}))
    for chunk in all_chunks
]




In [20]:

# Embed all chunks and store them in the ChromaDB collection.

print("Embedding", len(texts), "chunks…")
embeddings = model.encode(texts, show_progress_bar=True)

# upsert instead of add: the collection lives in a persistent store and is
# obtained with get_or_create_collection, so re-running this cell would
# otherwise try to insert ids that already exist. upsert inserts new ids and
# overwrites existing ones, which makes the cell safe to re-run.
collection.upsert(
    embeddings=embeddings.tolist(),
    documents=texts,
    metadatas=metadatas,
    ids=ids
)

print(" ChromaDB Collection Ready")



Embedding 718 chunks…


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

 ChromaDB Collection Ready


In [23]:
# Simple retrieve(): returns the top-k most relevant chunks for a query

def retrieve(query, k=3):
    """Embed `query` and return the `k` nearest entries from `collection`.

    Args:
        query: The query text to embed with the module-level `model`.
        k: Number of results to request (default 3).

    Returns:
        The raw result object returned by `collection.query`.
    """
    # NOTE(review): the multilingual-E5 model card recommends prefixing inputs
    # with "query: " / "passage: ". No prefix is added here — confirm whether
    # that is intentional and consistent with how the chunks were embedded.
    query_vector = model.encode([query])[0].tolist()
    return collection.query(query_embeddings=[query_vector], n_results=k)


In [24]:

# Semantic scoring + keyword scoring for a retrieved chunk.

from sentence_transformers import util
import torch

def embed(text):
    """Encode `text` with the module-level `model`, requesting a tensor result."""
    encoded = model.encode(text, convert_to_tensor=True)
    return encoded

# Semantic similarity using cosine similarity
def semantic_score(query, retrieved, ground_truth):
    """Average two cosine similarities involving the retrieved text.

    The first compares the retrieved text with the query, the second compares
    it with the ground-truth answer; the function returns their mean.

    Args:
        query: The query text.
        retrieved: The retrieved chunk text.
        ground_truth: The reference answer text.

    Returns:
        The mean of the two cosine similarities as a Python float.
    """
    query_vec, retrieved_vec, truth_vec = (
        embed(text) for text in (query, retrieved, ground_truth)
    )

    # How close the retrieved text is to the query.
    query_similarity = util.cos_sim(query_vec, retrieved_vec).item()
    # How close the retrieved text is to the ground truth.
    truth_similarity = util.cos_sim(retrieved_vec, truth_vec).item()

    return (query_similarity + truth_similarity) / 2

# Simple keyword overlap measure
def keyword_score(retrieved, keywords):
    """Return the fraction of keywords that occur as substrings of `retrieved`.

    Keywords are stripped of surrounding whitespace before matching. Entries
    that are empty after stripping are ignored: an empty string is a substring
    of every text, so counting it would inflate the score.

    Args:
        retrieved: The retrieved chunk text. Empty or ``None`` scores 0.
        keywords: Iterable of keyword strings. Empty or ``None`` scores 0.

    Returns:
        0 when there is nothing to match, otherwise a float in [0, 1].
    """
    stripped = [kw.strip() for kw in (keywords or []) if kw]
    usable = [kw for kw in stripped if kw]
    if not usable or not retrieved:
        return 0

    hits = sum(kw in retrieved for kw in usable)
    return hits / len(usable)

# Final hybrid score: semantic (80%) + keywords (20%)
def hybrid_score(query, retrieved, keywords, ground_truth):
    """Combine semantic and keyword scores into a single 0–100 score.

    The semantic score is rescaled with (s + 1) / 2 and clamped to [0, 1],
    then weighted 80%; the keyword score is weighted 20%.

    Args:
        query: The query text.
        retrieved: The retrieved chunk text.
        keywords: Keywords expected to appear in the retrieved text.
        ground_truth: The reference answer text.

    Returns:
        The weighted score multiplied by 100 and rounded to two decimals.
    """
    semantic_weight, keyword_weight = 0.8, 0.2

    raw_semantic = semantic_score(query, retrieved, ground_truth)
    keyword_part = keyword_score(retrieved, keywords)

    # Rescale the cosine-based score and clamp it into [0, 1].
    semantic_part = max(0, min((raw_semantic + 1) / 2, 1))

    combined = (semantic_part * semantic_weight) + (keyword_part * keyword_weight)
    return round(combined * 100, 2)



In [25]:

# Example evaluation set (to be expanded as needed).
# Each entry holds the query, keywords expected in the retrieved text,
# and a reference answer used for semantic comparison.
# NOTE(review): the second entry expects the keyword "75" while its ground
# truth mentions "25٪" — confirm that is the intended keyword.

EVAL_SET = [
    dict(
        query="ما هي شروط التخرج؟",
        keywords=["المعدل", "التخرج"],
        ground_truth="يجب أن يكون المعدل التراكمي لا يقل عن المقبول …",
    ),
    dict(
        query="متى يحرم الطالب من الاختبار النهائي؟",
        keywords=["75", "حرمان"],
        ground_truth="يُحرم الطالب إذا تجاوز غيابه 25٪ …",
    ),
]


In [26]:

# Main evaluation loop: score the top-ranked retrieved chunk for every
# EVAL_SET entry, then save and display the results as a table.

import pandas as pd

results = []

for item in tqdm(EVAL_SET):
    q = item["query"]
    kw = item["keywords"]
    gt = item["ground_truth"]

    retrieved = retrieve(q, k=3)
    docs = retrieved["documents"]

    # The documents field may be a list of lists or a flat list; both are
    # handled. Guard against an empty result so one query with no hits does
    # not abort the whole evaluation with an IndexError.
    first = docs[0] if docs else []
    if isinstance(first, list):
        top_text = first[0] if first else None
    else:
        top_text = first

    if not top_text:
        # Nothing retrieved: record the miss with the lowest score.
        results.append({
            "query": q,
            "retrieved": None,
            "score": 0.0
        })
        continue

    score = hybrid_score(q, top_text, kw, gt)

    results.append({
        "query": q,
        "retrieved": top_text,
        "score": score
    })

df = pd.DataFrame(results)
df.to_csv("retrieval_eval_results.csv", index=False)

df


100%|██████████| 2/2 [00:06<00:00,  3.04s/it]


Unnamed: 0,query,retrieved,score
0,ما هي شروط التخرج؟,يتخرج الطالب بعد إنهاء متطلبات التخرج بنجاح حس...,93.48
1,متى يحرم الطالب من الاختبار النهائي؟,على الطالب المنتظم حضور المحاضرات والدروس العم...,84.16
