In [2]:
import os, json, numpy as np, math
from collections import Counter

def load_clean_docs(folder="dataset_clean"):
    """Load the token list of every ``*.json`` file found in *folder*.

    Each matching file is parsed as JSON and the value stored under its
    ``"tokens"`` key is collected. Files are visited in sorted name order so
    the position of each document is reproducible between runs
    (``os.listdir`` itself returns entries in arbitrary order).

    Args:
        folder: Directory to scan. Only its direct entries are considered.

    Returns:
        A ``(docs, filenames)`` tuple of two parallel lists: ``docs[i]`` is
        the ``"tokens"`` value of the i-th file and ``filenames[i]`` is that
        file's name with the trailing ``.json`` removed.

    Raises:
        FileNotFoundError: If *folder* does not exist.
        json.JSONDecodeError: If a file does not contain valid JSON.
        KeyError: If a file's JSON object has no ``"tokens"`` key.
    """
    suffix = ".json"
    docs = []
    filenames = []
    for fname in sorted(os.listdir(folder)):
        if not fname.endswith(suffix):
            continue
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(os.path.join(folder, fname), "r", encoding="utf-8") as f:
            tokens = json.load(f)["tokens"]
        docs.append(tokens)
        # Strip only the trailing extension: str.replace would also remove
        # any ".json" that happens to occur in the middle of the name.
        filenames.append(fname[: -len(suffix)])
    return docs, filenames

# Build a TF-IDF index over the cleaned documents and save it as JSON.
docs, filenames = load_clean_docs()

# Sorted vocabulary: every distinct token gets a stable column index.
vocab = sorted({word for doc in docs for word in doc})
vocab_index = {w: i for i, w in enumerate(vocab)}

N = len(docs)

# Document frequency: in how many documents each token occurs. Each document
# contributes its *set* of tokens, so a token is counted once per document.
# One pass over the corpus replaces a list-membership scan per (word, doc).
doc_freq = Counter()
for doc in docs:
    doc_freq.update(set(doc))

# Smoothed inverse document frequency: ln((N + 1) / (df + 1)) + 1.
idf = np.array(
    [math.log((N + 1) / (doc_freq[w] + 1)) + 1 for w in vocab],
    dtype=float,
)

# Raw term counts, one row per document and one column per vocabulary entry.
matrix = np.zeros((N, len(vocab)))
for row, tokens in enumerate(docs):
    for w, c in Counter(tokens).items():
        # Every token is in vocab_index because vocab was built from docs.
        matrix[row, vocab_index[w]] = c

# Weight every row by the IDF vector (broadcast across rows).
matrix *= idf

output = {
    "vocab": vocab,
    "idf": idf.tolist(),
    "matrix": matrix.tolist(),
    "filenames": filenames
}

with open("tfidf_index.json", "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)

print("SELESAI — tfidf_index.json disimpan!")

SELESAI — tfidf_index.json disimpan!
