In [None]:

# UJ Advisor - Chunk Evaluation Pipeline (Cleaned)
# NOTE: Removed all OpenAI stuff we don't need anymore


!pip install chromadb sentence-transformers tqdm numpy

import os
import json
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings


# Cleaned JSON files that together contain every chunk to index.
# A file that reports 0 loaded chunks most likely has an unexpected format.
RAW_FILES = [
    f"/content/{stem}.json"
    for stem in (
        "CS_normalized",
        "SWE_normalized",
        "STUDENT_RULES",
        "AI_normalized",
        "DS_normalized",
        "CNE_normalized",
        "CY_normalized",
    )
]


# Helper function:
# Reads a JSON file and extracts meaningful text chunks from it.
# Handles rule lists, study plans, elective lists and, as a fallback,
# arbitrarily nested structures.

def extract_chunks_from_file(path):
    """Load the JSON file at *path* and return its content as chunk dicts.

    Four passes are applied, in this order:

    1. Top-level list: every dict item with a string ``"text"`` is kept as-is.
    2. Top-level dict with ``"levels"``: one chunk per course
       (program -> levels -> courses).
    3. Top-level dict with ``"electiveList"``: one chunk per elective.
    4. Deep scan: any string stored under ``"text"``, ``"description"``,
       ``"rule"``, ``"name"`` or ``"title"`` becomes a chunk with empty
       metadata. Objects already turned into a chunk by passes 1-3 are
       skipped so they are not emitted a second time.

    Entries of an unexpected type (e.g. a level that is not a dict) are
    skipped by the structured passes instead of raising.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    chunks = []
    # id() of every JSON object already converted into a chunk. All of these
    # objects stay alive through `data`, so their ids cannot be reused.
    consumed = set()

    # 1) Simple rule list files
    if isinstance(data, list):
        for item in data:
            if isinstance(item, dict) and isinstance(item.get("text"), str):
                chunks.append(item)
                consumed.add(id(item))

    if isinstance(data, dict):
        # 2) Study plan files (program -> levels -> courses)
        program = data.get("program")
        levels = data.get("levels", [])

        if isinstance(levels, list):
            for lvl in levels:
                if not isinstance(lvl, dict):
                    continue
                level_id = lvl.get("level_id") or lvl.get("level")
                courses = lvl.get("courses", [])
                if not isinstance(courses, list):
                    continue
                for c in courses:
                    if not isinstance(c, dict):
                        continue
                    code = c.get("code", "")
                    n_credits = c.get("credits", 0)
                    chunks.append({
                        "text": f"{code} - {c.get('name', '')} | Credits: {n_credits}",
                        "metadata": {
                            "program": program,
                            "level": level_id,
                            "dept": c.get("dept", ""),
                            "code": code,
                            "credits": n_credits,
                            "prerequisites": c.get("prerequisites", []),
                        },
                    })
                    consumed.add(id(c))

        # 3) Electives (same logic, different structure)
        electives = data.get("electiveList", [])
        if isinstance(electives, list):
            for item in electives:
                if not isinstance(item, dict):
                    continue
                chunks.append({
                    "text": f"{item.get('course_code', '')} - {item.get('course_name', '')}",
                    "metadata": {
                        "category": item.get("category", ""),
                        "credits": item.get("credits", 0),
                        "prerequisites": item.get("prerequisites", []),
                    },
                })
                consumed.add(id(item))

    # 4) Deep fallback scan for anything the structured passes did not cover
    def deep_scan(obj):
        if id(obj) in consumed:
            return  # already represented by a structured chunk
        if isinstance(obj, dict):
            for key in ("text", "description", "rule", "name", "title"):
                value = obj.get(key)
                if isinstance(value, str):
                    chunks.append({"text": value, "metadata": {}})
            for value in obj.values():
                deep_scan(value)
        elif isinstance(obj, list):
            for element in obj:
                deep_scan(element)

    deep_scan(data)
    return chunks


# Gather the chunks of every input file into one flat list.
# Missing files are reported and skipped.

all_chunks = []
for file in RAW_FILES:
    if os.path.exists(file):
        extracted = extract_chunks_from_file(file)
        print(f"üì• {file}: {len(extracted)} chunks loaded")
        all_chunks += extracted
    else:
        print(f"File not found: {file}")

print("TOTAL CHUNKS LOADED:", len(all_chunks))


# Load multilingual-E5-large, the embedding model used for indexing,
# querying and scoring.
model = SentenceTransformer("intfloat/multilingual-e5-large")

# Initialize ChromaDB (persistent folder)
from chromadb import PersistentClient
chroma = PersistentClient(path="/content/chroma_store")
collection = chroma.get_or_create_collection("rag_collection")

# Prepare all components (texts, ids, metadata)

texts = [c["text"] for c in all_chunks]

# One id per chunk: the chunk's own "id" when it has one, otherwise a
# positional fallback. The chunk has to be bound by the loop itself; the
# previous comprehension iterated only over indices and referenced an
# unbound name. Ids are coerced to str and made unique because ChromaDB
# requires unique string ids.
ids = []
_seen_ids = set()
for i, chunk in enumerate(all_chunks):
    raw_id = chunk.get("id")
    chunk_id = f"chunk_{i}" if raw_id is None else str(raw_id)
    if chunk_id in _seen_ids:
        # Same id appears more than once; disambiguate with the chunk's position.
        chunk_id = f"{chunk_id}_{i}"
    _seen_ids.add(chunk_id)
    ids.append(chunk_id)

# Metadata cleanup to fit chromaDB needs
def sanitize_metadata(md):
    """Return a flat copy of *md* whose values are scalars.

    Conversions applied per value:
      * list  -> its items joined with ", " (each item passed through str())
      * dict  -> JSON string (non-ASCII characters kept as-is)
      * None  -> empty string
      * other -> unchanged

    A missing or non-dict *md* (for example ``"metadata": null`` in the
    source JSON) is treated as empty. An empty result gets a placeholder
    ``{"source": "chunk"}`` entry so the returned dict is never empty.
    """
    if not isinstance(md, dict):
        md = {}

    clean = {}
    for k, v in md.items():
        if isinstance(v, list):
            clean[k] = ", ".join(str(x) for x in v)
        elif isinstance(v, dict):
            clean[k] = json.dumps(v, ensure_ascii=False)
        elif v is None:
            clean[k] = ""
        else:
            clean[k] = v

    if not clean:
        clean["source"] = "chunk"

    return clean

# One sanitized metadata dict per chunk, aligned with `texts` and `ids`.
metadatas = [sanitize_metadata(c.get("metadata", {})) for c in all_chunks]


# Embed all chunks

print("Embedding", len(texts), "chunks‚Ä¶")

if texts:
    embeddings = model.encode(texts, show_progress_bar=True)

    # upsert instead of add: the collection lives in a persistent store and
    # is obtained with get_or_create, so re-running this cell would otherwise
    # keep the previously stored records for ids that already exist.
    collection.upsert(
        embeddings=embeddings.tolist(),
        documents=texts,
        metadatas=metadatas,
        ids=ids
    )

    print(" ChromaDB Collection Ready")
else:
    # Nothing was loaded; writing an empty batch to the collection would fail.
    embeddings = []
    print("No chunks to index - ChromaDB collection left unchanged")


# retrieve(): embed the query and fetch the k nearest chunks from ChromaDB

def retrieve(query, k=5):
    """Return the raw ChromaDB query result for the *k* chunks nearest to *query*.

    The query string is encoded with the global embedding model and the
    resulting vector is passed to the global collection.
    """
    # NOTE(review): the E5 model card recommends "query: " / "passage: "
    # input prefixes; no prefix is applied here - confirm this is intended.
    query_vector = model.encode([query])[0]
    return collection.query(
        query_embeddings=[query_vector.tolist()],
        n_results=k,
    )




# Scoring helpers: semantic similarity + keyword overlap.

from sentence_transformers import util
import torch

def embed(text):
    """Encode *text* with the global embedding model and return it as a tensor."""
    vector = model.encode(text, convert_to_tensor=True)
    return vector

# Semantic score: mean of two cosine similarities
def semantic_score(query, retrieved, ground_truth):
    """Return the mean of cos(query, retrieved) and cos(retrieved, ground_truth).

    All three texts are embedded with embed(); the result is a plain float.
    """
    query_vec, retrieved_vec, truth_vec = (
        embed(text) for text in (query, retrieved, ground_truth)
    )

    # How close the retrieved chunk is to the question ...
    query_similarity = util.cos_sim(query_vec, retrieved_vec).item()
    # ... and how close it is to the expected answer.
    truth_similarity = util.cos_sim(retrieved_vec, truth_vec).item()

    return (query_similarity + truth_similarity) / 2

# Simple keyword overlap measure
def keyword_score(retrieved, keywords):
    """Return the fraction of *keywords* that occur as substrings of *retrieved*.

    Keywords are stripped of surrounding whitespace before matching. Keywords
    that are empty after stripping are ignored: an empty string is a
    substring of every text and would otherwise count as a match.

    Returns 0 when there is no usable keyword (None, empty list, or only
    blank entries).
    """
    usable = [kw.strip() for kw in (keywords or [])]
    usable = [kw for kw in usable if kw]
    if not usable:
        return 0
    return sum(kw in retrieved for kw in usable) / len(usable)

# Hybrid score: 80% semantic similarity + 20% keyword overlap
def hybrid_score(query, retrieved, keywords, ground_truth):
    """Combine semantic_score() and keyword_score() into a 0-100 score.

    The semantic score is shifted from the cosine range [-1, 1] to [0, 1]
    and clamped, then weighted 0.8 against 0.2 for the keyword score. The
    result is scaled to 0-100 and rounded to two decimals.
    """
    semantic = semantic_score(query, retrieved, ground_truth)
    keyword = keyword_score(retrieved, keywords)

    # Shift cosine similarity into [0, 1] and clamp both ends.
    rescaled = (semantic + 1) / 2
    rescaled = min(rescaled, 1)
    rescaled = max(0, rescaled)

    combined = (rescaled * 0.8) + (keyword * 0.2)
    return round(combined * 100, 2)



# Example evaluation set (to be expanded as needed).
# Each entry provides:
#   "query"        - the question passed to retrieve()
#   "keywords"     - substrings looked up in the retrieved text by keyword_score()
#   "ground_truth" - reference answer compared against the retrieved text
# NOTE(review): the string values below look mis-encoded (multi-byte text
# decoded with a single-byte codec) - verify them against the original
# notebook before trusting any scores.

EVAL_SET = [
    {
        "query": "ŸÖÿß ŸáŸä ÿ¥ÿ±Ÿàÿ∑ ÿßŸÑÿ™ÿÆÿ±ÿ¨ÿü",
        "keywords": ["ÿßŸÑŸÖÿπÿØŸÑ", "ÿßŸÑÿ™ÿÆÿ±ÿ¨"],
        "ground_truth": "Ÿäÿ¨ÿ® ÿ£ŸÜ ŸäŸÉŸàŸÜ ÿßŸÑŸÖÿπÿØŸÑ ÿßŸÑÿ™ÿ±ÿßŸÉŸÖŸä ŸÑÿß ŸäŸÇŸÑ ÿπŸÜ ÿßŸÑŸÖŸÇÿ®ŸàŸÑ ‚Ä¶"
    },
    {
        "query": "ŸÖÿ™Ÿâ Ÿäÿ≠ÿ±ŸÖ ÿßŸÑÿ∑ÿßŸÑÿ® ŸÖŸÜ ÿßŸÑÿßÿÆÿ™ÿ®ÿßÿ± ÿßŸÑŸÜŸáÿßÿ¶Ÿäÿü",
        "keywords": ["75", "ÿ≠ÿ±ŸÖÿßŸÜ"],
        "ground_truth": "ŸäŸèÿ≠ÿ±ŸÖ ÿßŸÑÿ∑ÿßŸÑÿ® ÿ•ÿ∞ÿß ÿ™ÿ¨ÿßŸàÿ≤ ÿ∫Ÿäÿßÿ®Ÿá 25Ÿ™ ‚Ä¶"
    }
]

# Main evaluation loop: for every evaluation query, retrieve 3 chunks and
# score the top-ranked one with hybrid_score().

results = []

for item in tqdm(EVAL_SET):
    q = item["query"]
    kw = item["keywords"]
    gt = item["ground_truth"]

    retrieved = retrieve(q, k=3)
    docs = retrieved["documents"]

    # The result holds one list of documents per query; that list is empty
    # when nothing was retrieved (e.g. an empty collection), in which case
    # indexing [0][0] would raise IndexError.
    first = docs[0] if docs else []
    if isinstance(first, list):
        top_text = first[0] if first else None
    else:
        top_text = first

    if top_text:
        score = hybrid_score(q, top_text, kw, gt)
    else:
        # Nothing retrieved: record the miss instead of aborting the run.
        top_text = ""
        score = 0.0

    results.append({
        "query": q,
        "retrieved": top_text,
        "score": score
    })

import pandas as pd

df = pd.DataFrame(results)
df.to_csv("retrieval_eval_results.csv", index=False)

df
