In [2]:
import json
import numpy as np
from collections import Counter
from difflib import get_close_matches

# Load the preprocessed corpus and the precomputed TF-IDF index from disk.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default (which is not UTF-8 everywhere).
with open("clean_documents.json", "r", encoding="utf-8") as f:
    CLEAN_DOCS = json.load(f)

with open("tfidf_index.json", "r", encoding="utf-8") as f:
    TFIDF = json.load(f)

# Unpack the index: the document/term weight matrix, the vocabulary list,
# the per-term IDF weights and the filename associated with each matrix row.
tfidf_matrix = np.array(TFIDF["matrix"])
vocab = TFIDF["vocab"]
# Reverse lookup: vocabulary word -> its position in `vocab`.
vocab_index = {w: i for i, w in enumerate(vocab)}
idf = np.array(TFIDF["idf"])
filenames = TFIDF["filenames"]

# Title -> document record. If two records share a title, the later one wins.
title_map = {d["title"]: d for d in CLEAN_DOCS}


def preprocess_query(q):
    """Lower-case the query string and split it on whitespace.

    Returns the list of resulting tokens (empty for a blank query).
    """
    lowered = q.lower()
    tokens = lowered.split()
    return tokens

def compute_tf(tokens):
    """Build a raw term-frequency vector for *tokens*.

    The vector has one slot per entry of the module-level ``vocab_index``;
    each slot holds how many times that vocabulary word occurs in *tokens*.
    Tokens that are not in the vocabulary are ignored.
    """
    counts = Counter(tokens)
    tf_vector = np.zeros(len(vocab_index))
    for term, freq in counts.items():
        column = vocab_index.get(term)
        if column is not None:
            tf_vector[column] = freq
    return tf_vector

def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors *a* and *b* as a float.

    If either vector has zero length the similarity is defined as 0.0, which
    avoids dividing by zero.
    """
    norm_a = np.linalg.norm(a)
    if norm_a == 0:
        return 0.0
    norm_b = np.linalg.norm(b)
    if norm_b == 0:
        return 0.0
    denominator = norm_a * norm_b
    return float(np.dot(a, b) / denominator)

def autocorrect_safe(tokens):
    """Suggest spelling corrections without modifying the query tokens.

    For each token the closest word in the module-level ``vocab`` is looked
    up (at most one match, similarity cutoff 0.75). When that word differs
    from the token, an ``(original, suggestion)`` pair is recorded.

    Returns a ``(tokens, suggestions)`` tuple: the first element is a new
    list holding the tokens unchanged, the second the list of suggestion
    pairs.
    """
    kept_tokens = []
    suggestions = []
    for token in tokens:
        # The token itself is always kept; a suggestion is only recorded.
        kept_tokens.append(token)
        candidates = get_close_matches(token, vocab, n=1, cutoff=0.75)
        if not candidates:
            continue
        best = candidates[0]
        if best != token:
            suggestions.append((token, best))
    return kept_tokens, suggestions

def jaccard_similarity(q_tokens, doc_tokens, title_tokens):
    """Score a document against a query using Jaccard overlap plus a title bonus.

    The base score is ``|Q ∩ D| / |Q ∪ D|`` over the sets of query and
    document tokens. A flat 0.3 is added when at least one query token also
    appears in *title_tokens*. An empty query always scores 0.0.
    """
    if not len(q_tokens):
        return 0.0

    query_terms = set(q_tokens)
    doc_terms = set(doc_tokens)
    shared = query_terms & doc_terms
    combined = query_terms | doc_terms

    score = len(shared) / len(combined) if len(combined) > 0 else 0.0

    title_hits = query_terms & set(title_tokens)
    if title_hits:
        score += 0.3
    return score

def search_tfidf(q_tokens):
    """Rank documents by cosine similarity between TF-IDF vectors.

    The query's term frequencies are weighted by the module-level ``idf``
    and compared against every row of ``tfidf_matrix``. Rows with a
    similarity that is not strictly positive are dropped.

    Returns the matching entries of ``filenames``, best match first.
    """
    query_vector = compute_tf(q_tokens) * idf

    ranked = []
    for row, doc_vector in enumerate(tfidf_matrix):
        similarity = cosine_similarity(query_vector, doc_vector)
        if similarity > 0:
            ranked.append((row, similarity))

    # Stable sort: rows with equal similarity keep their matrix order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return [filenames[row] for row, _ in ranked]

def search_jaccard(q_tokens):
    """Rank the documents in ``CLEAN_DOCS`` by ``jaccard_similarity``.

    Each document is scored from its ``"tokens"`` and from its lower-cased,
    whitespace-split ``"title"``. Documents with a score that is not
    strictly positive are dropped.

    Returns the titles of the remaining documents, best match first.
    """
    scored_titles = []
    for document in CLEAN_DOCS:
        title = document["title"]
        title_tokens = title.lower().split()
        similarity = jaccard_similarity(q_tokens, document["tokens"], title_tokens)
        if similarity > 0:
            scored_titles.append((title, similarity))

    # Stable sort: equally scored documents keep their corpus order.
    scored_titles.sort(key=lambda pair: pair[1], reverse=True)
    return [title for title, _ in scored_titles]


def precision_at_k(results, relevant, k):
    """Return the fraction of the first *k* results that are relevant.

    The denominator is the number of results actually inspected (at most
    *k*), so a ranking shorter than *k* is not penalised. An empty ranking
    yields 0.0.
    """
    top_k = results[:k]
    hit_count = 0
    for item in top_k:
        if item in relevant:
            hit_count += 1
    # Fall back to 1 so an empty ranking does not divide by zero.
    inspected = len(top_k) or 1
    return hit_count / inspected

def recall_at_k(results, relevant, k):
    """Return the fraction of relevant items found in the first *k* results.

    Args:
        results: Ranked list of retrieved items, best first.
        relevant: Collection of items considered relevant for the query.
        k: Number of top results to inspect.

    Returns:
        ``hits / len(relevant)`` as a float, or 0.0 when *relevant* is empty
        (there is nothing to recall, and dividing would raise
        ``ZeroDivisionError``).
    """
    if not relevant:
        return 0.0
    hits = sum(1 for r in results[:k] if r in relevant)
    return hits / len(relevant)

def average_precision(results, relevant):
    """Return the average precision of a ranked result list.

    Walks the full ranking; at every position holding a relevant item, the
    precision up to and including that position is accumulated. The sum is
    then divided by the total number of relevant items.

    Args:
        results: Ranked list of retrieved items, best first.
        relevant: Collection of items considered relevant for the query.

    Returns:
        The average precision as a number, or 0.0 when *relevant* is empty
        (dividing by ``len(relevant)`` would raise ``ZeroDivisionError``).
    """
    if not relevant:
        return 0.0
    score = 0
    hits = 0
    for rank, doc in enumerate(results, start=1):
        if doc in relevant:
            hits += 1
            # Precision at this rank: relevant items seen so far / rank.
            score += hits / rank
    return score / len(relevant)


def evaluate_query(query, relevant_docs, k=10):
    """Run *query* through both rankers and score each ranking.

    The query is tokenised with ``preprocess_query`` and passed through
    ``autocorrect_safe`` (whose suggestions are discarded), then ranked with
    ``search_tfidf`` and ``search_jaccard``.

    Returns a dict with the original ``"query"``, the ``"relevant"`` list,
    and for each of ``"tfidf"`` and ``"jaccard"`` a dict with
    ``"precision"`` and ``"recall"`` at *k* plus ``"AP"`` computed over the
    full ranking.
    """
    tokens, _suggestions = autocorrect_safe(preprocess_query(query))

    def _score(ranking):
        # Metrics for one ranking against the same relevant set.
        return {
            "precision": precision_at_k(ranking, relevant_docs, k),
            "recall": recall_at_k(ranking, relevant_docs, k),
            "AP": average_precision(ranking, relevant_docs),
        }

    tfidf_ranking = search_tfidf(tokens)
    jaccard_ranking = search_jaccard(tokens)

    report = {"query": query, "relevant": relevant_docs}
    report["tfidf"] = _score(tfidf_ranking)
    report["jaccard"] = _score(jaccard_ranking)
    return report


# Evaluation queries: each entry pairs a query string with the list of
# results that count as relevant for it.
TEST_SET = [
    ("interstellar", ["Interstellar"]),
    ("i am legend", ["I Am Legend"]),
    (
        "avengers",
        [
            "Avengers Endgame",
            "Avengers Age of Ultron",
            "The Avengers",
        ],
    ),
    (
        "spider-man",
        [
            "Spider-Man No Way Home",
            "Spider-Man Homecoming",
            "Spider-Man Far From Home",
            "Spider-Man Across the Spider-Verse",
        ],
    ),
    ("dune", ["Dune", "Dune Part Two"]),
    ("the batman", ["The Batman", "The Batman Part II"]),
]


# Evaluate every test query with the default cutoff and keep the reports
# in the same order as TEST_SET.
results = [evaluate_query(q, rel) for q, rel in TEST_SET]


# Print one section per query. A ranker whose three metrics are all zero is
# omitted, and a query for which both rankers are all-zero is skipped.
_METRIC_KEYS = ("precision", "recall", "AP")

for r in results:
    tf, jc = r["tfidf"], r["jaccard"]

    tf_zero = all(tf[key] == 0 for key in _METRIC_KEYS)
    jc_zero = all(jc[key] == 0 for key in _METRIC_KEYS)

    if tf_zero and jc_zero:
        continue

    print("\n==============================")
    print("QUERY:", r["query"])
    print("Relevant:", r["relevant"])

    for heading, metrics, is_zero in (
        ("TF-IDF → Precision:", tf, tf_zero),
        ("Jaccard → Precision:", jc, jc_zero),
    ):
        if is_zero:
            continue
        print(heading, round(metrics["precision"], 3),
              "Recall:", round(metrics["recall"], 3),
              "AP:", round(metrics["AP"], 3))



QUERY: interstellar
Relevant: ['Interstellar']
TF-IDF → Precision: 1.0 Recall: 1.0 AP: 1.0
Jaccard → Precision: 1.0 Recall: 1.0 AP: 1.0

QUERY: i am legend
Relevant: ['I Am Legend']
TF-IDF → Precision: 0.25 Recall: 1.0 AP: 0.25
Jaccard → Precision: 0.143 Recall: 1.0 AP: 1.0

QUERY: avengers
Relevant: ['Avengers Endgame', 'Avengers Age of Ultron', 'The Avengers']
TF-IDF → Precision: 0.1 Recall: 0.333 AP: 0.108
Jaccard → Precision: 0.3 Recall: 1.0 AP: 1.0

QUERY: spider-man
Relevant: ['Spider-Man No Way Home', 'Spider-Man Homecoming', 'Spider-Man Far From Home', 'Spider-Man Across the Spider-Verse']
Jaccard → Precision: 0.4 Recall: 1.0 AP: 0.317

QUERY: dune
Relevant: ['Dune', 'Dune Part Two']
Jaccard → Precision: 1.0 Recall: 1.0 AP: 1.0

QUERY: the batman
Relevant: ['The Batman', 'The Batman Part II']
TF-IDF → Precision: 0.1 Recall: 0.5 AP: 0.056
Jaccard → Precision: 0.1 Recall: 0.5 AP: 0.071
