In [12]:
import json
import numpy as np
from difflib import get_close_matches
from collections import Counter


# ============================================================
#  LOAD CLEAN DOCUMENTS & TF-IDF INDEX
# ============================================================

# Both artifacts are JSON; read them explicitly as UTF-8 so that loading does
# not depend on the platform's default text encoding.
with open("clean_documents.json", "r", encoding="utf-8") as f:
    CLEAN_DOCS = json.load(f)

with open("tfidf_index.json", "r", encoding="utf-8") as f:
    TFIDF = json.load(f)

# Rows of the matrix are paired with `filenames` by position (see `search`).
tfidf_matrix = np.array(TFIDF["matrix"])
vocab = TFIDF["vocab"]
# term -> position of that term in `vocab`
vocab_index = {w: i for i, w in enumerate(vocab)}
idf = np.array(TFIDF["idf"])
filenames = TFIDF["filenames"]

print("Loaded", len(CLEAN_DOCS), "documents.")


# ============================================================
#  PREPROCESSING UTILS
# ============================================================

def preprocess_query(q):
    """Turn a raw query string into a list of lower-case tokens.

    Hyphens are treated as word separators; the text is then split on
    whitespace, so an empty or blank query yields an empty list.
    """
    lowered = q.lower()
    dehyphenated = lowered.replace("-", " ")
    return dehyphenated.split()


def compute_tf(tokens):
    """Build a raw term-frequency vector for `tokens` over the vocabulary.

    The vector has one slot per entry of `vocab_index`; each slot holds how
    many times that term occurs in `tokens`. Tokens that are not in the
    vocabulary are ignored.
    """
    tf_vector = np.zeros(len(vocab_index))
    for term, occurrences in Counter(tokens).items():
        column = vocab_index.get(term)
        if column is not None:
            tf_vector[column] = occurrences
    return tf_vector


def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors `a` and `b` as a float.

    If either vector has zero length the similarity is defined as 0.0
    instead of dividing by zero.
    """
    norm_a = np.linalg.norm(a)
    if norm_a == 0:
        return 0.0

    norm_b = np.linalg.norm(b)
    if norm_b == 0:
        return 0.0

    return float(np.dot(a, b) / (norm_a * norm_b))



# ============================================================
#  SAFE AUTOCORRECT
# ============================================================

def autocorrect_safe(tokens):
    """Collect spelling suggestions for query tokens without rewriting them.

    "Safe" means the query itself is never altered: the returned token list
    always has the same contents as `tokens`. For every token that is not
    already a vocabulary word, the closest vocabulary entry (difflib
    similarity >= 0.75) is recorded as a suggestion.

    Args:
        tokens: List of query tokens.

    Returns:
        (corrected, suggestions) where `corrected` is a copy of `tokens` and
        `suggestions` is a list of (token, closest_vocab_word) pairs.
    """
    suggestions = []

    for t in tokens:
        # An exact vocabulary hit is always its own best match, so it can
        # never produce a suggestion; skip the costly scan over `vocab`.
        if t in vocab_index:
            continue

        matches = get_close_matches(t, vocab, n=1, cutoff=0.75)
        if matches:
            suggestions.append((t, matches[0]))

    return list(tokens), suggestions



# ============================================================
#  TITLE → DOCUMENT MAP
# ============================================================

# Lookup table from a document's title to its full record. If two documents
# share a title, the one that appears later in CLEAN_DOCS is kept.
title_map = {doc["title"]: doc for doc in CLEAN_DOCS}


# ============================================================
#  JACCARD SIMILARITY
# ============================================================

def jaccard_similarity(q_tokens, doc_tokens, title_tokens, title_boost=0.3):
    """Score a document against a query with title-boosted Jaccard similarity.

    The base score is |Q ∩ D| / |Q ∪ D| over the sets of query and document
    tokens. When at least one query token also appears in the title,
    `title_boost` is added on top, so the result may exceed 1.0.

    Args:
        q_tokens: Query tokens.
        doc_tokens: Tokens of the document body.
        title_tokens: Tokens of the document title.
        title_boost: Bonus added when the query and title share a token.

    Returns:
        The boosted similarity, or 0.0 for an empty query.
    """
    if len(q_tokens) == 0:
        return 0.0

    q_set = set(q_tokens)
    d_set = set(doc_tokens)

    union = q_set | d_set
    score = len(q_set & d_set) / len(union) if union else 0.0

    # Title boost: reward documents whose title mentions a query term.
    if not q_set.isdisjoint(title_tokens):
        score += title_boost

    return score



# ============================================================
#  SEARCH FUNCTION
# ============================================================

def _format_hit(doc, score):
    """Build the result record returned to callers for one matched document."""
    return {
        "title": doc["title"],
        "poster": doc["poster"],
        "description": doc["description"],
        "score": score,
    }


def search(query, top_k=10):
    """Rank documents against `query` with TF-IDF cosine and Jaccard scoring.

    The query is tokenised by `preprocess_query`. Spelling suggestions are
    collected by `autocorrect_safe`, which leaves the tokens used for scoring
    unchanged. Both rankings add a flat 0.3 bonus when a query token appears
    in the document title, and both drop documents whose final score is not
    positive. The original and processed query are printed as a side effect.

    Args:
        query: Raw query string.
        top_k: Maximum number of ranked entries taken from each ranking.
            The TF-IDF list can be shorter, because entries whose filename
            is not a key of `title_map` are skipped.

    Returns:
        dict with keys:
          "tfidf"       - list of hit dicts (title, poster, description, score)
          "jaccard"     - list of hit dicts ranked by boosted Jaccard score
          "suggestions" - list of (token, suggested_word) pairs
          "message"     - "no_result"; present only when both lists are empty
    """
    print("\nOriginal Query:", query)

    # --- Preprocess ---
    q_tokens = preprocess_query(query)
    print("Processed Query:", q_tokens)

    # --- Safe autocorrect (suggestions only) ---
    q_tokens, suggestions = autocorrect_safe(q_tokens)
    q_token_set = set(q_tokens)

    # --- TF-IDF ranking with title boost ---
    q_vec = compute_tf(q_tokens) * idf

    tfidf_scores = []
    for i, doc_vec in enumerate(tfidf_matrix):
        sim = cosine_similarity(q_vec, doc_vec)

        # The filename doubles as the title; tokenise it like the query.
        title_tokens = filenames[i].lower().replace("-", " ").split()
        if not q_token_set.isdisjoint(title_tokens):
            sim += 0.3

        tfidf_scores.append((i, sim))

    tfidf_ranked = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)
    tfidf_ranked = [(i, s) for (i, s) in tfidf_ranked if s > 0]

    # --- Jaccard ranking ---
    jaccard_scores = []
    for doc in CLEAN_DOCS:
        sim = jaccard_similarity(
            q_tokens,
            doc["tokens"],
            doc["title"].lower().replace("-", " ").split()
        )
        jaccard_scores.append((doc["title"], sim))

    jaccard_ranked = sorted(jaccard_scores, key=lambda x: x[1], reverse=True)
    jaccard_ranked = [(t, s) for (t, s) in jaccard_ranked if s > 0]

    # --- Build results ---
    result_tfidf = [
        _format_hit(title_map[filenames[idx]], score)
        for idx, score in tfidf_ranked[:top_k]
        if filenames[idx] in title_map
    ]
    result_jaccard = [
        _format_hit(title_map[title], score)
        for title, score in jaccard_ranked[:top_k]
    ]

    # --- Fallback: nothing relevant in either ranking ---
    if not result_tfidf and not result_jaccard:
        print("\nTidak ada hasil relevan.")
        if suggestions:
            print("Apakah maksud kamu:")
            for orig, corr in suggestions:
                print(f"  • {orig} → {corr}")
        # Keep the same keys as the normal return so callers can read
        # result["suggestions"] without a KeyError.
        return {
            "tfidf": [],
            "jaccard": [],
            "suggestions": suggestions,
            "message": "no_result"
        }

    return {
        "tfidf": result_tfidf,
        "jaccard": result_jaccard,
        "suggestions": suggestions
    }


Loaded 412 documents.


In [13]:
def display_results(result):
    """Print the TF-IDF and Jaccard hit lists of a search result.

    `result` must provide the keys "tfidf" and "jaccard", each a list of
    hit dicts with "title", "score", "poster" and "description". Every
    section is printed under a banner; descriptions are cut to their first
    200 characters and followed by "...".
    """
    banner = "=" * 60
    sections = (
        ("                  TF-IDF RESULTS ", "tfidf"),
        ("                  JACCARD RESULTS ", "jaccard"),
    )

    for heading, key in sections:
        print("\n" + banner)
        print(heading)
        print(banner)

        hits = result[key]
        if len(hits) == 0:
            print("Tidak ada hasil.")
            continue

        for rank, hit in enumerate(hits, 1):
            print(f"\n[{rank}] {hit['title']}  (score: {hit['score']:.4f})")
            print(f"Poster : {hit['poster']}")
            print(f"Desc   : {hit['description'][:200]}...")


In [14]:
# Example: run one sample query and print both rankings.
res = search("zombie")
display_results(res)



Original Query: zombie
Processed Query: ['zombie']

                  TF-IDF RESULTS 

[1] 28 Days Later  (score: 0.5861)
Poster : https://image.idntimes.com/post/20250618/upload_f14befb9635dc629507b51ada0113bac_0e3c7e70-3dc8-48ae-9b3e-ec54b5e60933.jpg
Desc   : Saat mendengar kata “zombie”, kebanyakan orang langsung membayangkan mayat hidup yang berjalan pelan, mengerang, dan hanya bisa dihentikan dengan tembakan di kepala.

Tapi kalau kamu pernah menonton 2...

[2] The Girl with All the Gifts  (score: 0.3692)
Poster : https://image.idntimes.com/post/20250617/i-am-legend-scaled_c6a07aaf-4af1-402a-bebc-ba17cb1c3d6a.jpg
Desc   : Film bertema zombie memang tidak pernah sepi peminat. Umumnya, genre ini menggabungkan elemenhorror,thriller, dan aksi untuk menciptakan ketegangan yang intens dan mencekam. Gaya ini berbeda dengan fi...

[3] Cargo (2017)  (score: 0.1863)
Poster : https://image.idntimes.com/post/20170324/cargo-1-1-e5097c11e50882aa1e5cac6928dfcb2f.jpg
Desc   : Film Cargo ini rupa