In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# read and written through ordinary filesystem paths in this notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import json
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

# === Project paths ===
base_path = "/content/drive/My Drive/CBR_Project"
cases_path, queries_path, output_path = (
    os.path.join(base_path, relative)
    for relative in (
        "data/processed/cases_all.json",
        "data/eval/queries.json",
        "data/results/predictions.csv",
    )
)
# Make sure the results folder exists before anything is written into it.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# === Load the case base and the evaluation queries ===
with open(cases_path, encoding="utf-8") as cases_file:
    cases_data = json.load(cases_file)
with open(queries_path, encoding="utf-8") as queries_file:
    queries = json.load(queries_file)

# === Extract the main text and the solution of every case
def _first_truthy(record, fields, fallback):
    """Return the first truthy value found under `fields` in `record`.

    Fields are tried in the given order; `fallback` is returned when every
    field is missing or holds a falsy value (None, empty string, ...).
    """
    for field in fields:
        value = record.get(field)
        if value:
            return value
    return fallback


case_ids = [record["case_id"] for record in cases_data]

# Text indexed for retrieval: "ringkasan_fakta" when present, otherwise
# "text_full", otherwise an empty string.
case_texts = [
    _first_truthy(record, ("ringkasan_fakta", "text_full"), "")
    for record in cases_data
]

# Text reported as the case's solution, keyed by case id. Falls back from
# "argumen_hukum" to the retrieval text fields, then to "Tidak Diketahui".
case_solutions = {
    record["case_id"]: _first_truthy(
        record,
        ("argumen_hukum", "ringkasan_fakta", "text_full"),
        "Tidak Diketahui",
    )
    for record in cases_data
}

# === TF-IDF vectorizer
# Fit on the case texts; row i of tfidf_matrix corresponds to case_ids[i].
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(case_texts)

# === Retrieve Function
def retrieve(query: str, k: int = 5):
    """Find the stored cases whose text is most similar to `query`.

    The query is transformed with the module-level TF-IDF `vectorizer` and
    compared against every row of `tfidf_matrix` using cosine similarity.

    Args:
        query: Free-text query to match against the case base.
        k: Maximum number of cases to return.

    Returns:
        A `(top_ids, top_scores)` tuple of two parallel lists, ordered from
        the highest similarity score to the lowest.
    """
    similarities = cosine_similarity(
        vectorizer.transform([query]), tfidf_matrix
    ).flatten()

    # argsort is ascending, so reverse it before keeping the first k entries.
    ranked = similarities.argsort()[::-1][:k]

    top_ids, top_scores = [], []
    for idx in ranked:
        top_ids.append(case_ids[idx])
        top_scores.append(similarities[idx])
    return top_ids, top_scores

# === Predict Function
def predict_outcome(query: str, strategy: str = "majority", k: int = 5) -> dict:
    """Predict a solution for `query` from its most similar stored cases.

    Args:
        query: Free-text description of the new case.
        strategy: How the solutions of the retrieved cases are combined.
            "majority" picks the solution text that occurs most often among
            the retrieved cases; "weighted" picks the solution whose
            similarity scores add up to the largest total; any other value
            uses the solution of the top-ranked case. Ties are resolved in
            favour of the solution that appears first in the ranking.
        k: Number of similar cases to retrieve and vote over.

    Returns:
        A dict with two keys:
            "predicted_solution": the chosen solution text, or
                "Tidak Diketahui" when no case was retrieved.
            "top_case_ids": ids of the retrieved cases, in the order
                returned by `retrieve`.
    """
    top_ids, scores = retrieve(query, k)
    solutions = [case_solutions[cid] for cid in top_ids]

    if not solutions:
        # Nothing retrieved: every strategy reports the same placeholder.
        # Previously the "weighted" branch raised ValueError here because
        # max() was called on an empty mapping.
        predicted = "Tidak Diketahui"
    elif strategy == "majority":
        predicted = Counter(solutions).most_common(1)[0][0]
    elif strategy == "weighted":
        totals = {}
        for solution, score in zip(solutions, scores):
            totals[solution] = totals.get(solution, 0) + score
        # max() returns the first key with the highest total, so ties go to
        # the solution seen earliest in the ranking.
        predicted = max(totals, key=totals.get)
    else:
        predicted = solutions[0]

    return {
        "predicted_solution": predicted,
        "top_case_ids": top_ids,
    }

# === Predict every query and save the results to CSV
csv_columns = ["query_id", "predicted_solution", "top_5_case_ids"]
with open(output_path, "w", newline="", encoding="utf-8") as out_file:
    writer = csv.DictWriter(out_file, fieldnames=csv_columns)
    writer.writeheader()
    for query in queries:
        outcome = predict_outcome(query["query_text"], strategy="majority")
        # Case ids are flattened into one comma-separated cell.
        joined_ids = ", ".join(str(cid) for cid in outcome["top_case_ids"])
        row = {
            "query_id": query["query_id"],
            "predicted_solution": outcome["predicted_solution"],
            "top_5_case_ids": joined_ids,
        }
        writer.writerow(row)

print(f"✅ Semua prediksi selesai. Hasil disimpan di: {output_path}")


✅ Semua prediksi selesai. Hasil disimpan di: /content/drive/My Drive/CBR_Project/data/results/predictions.csv
