In [3]:
import os
import re
import json
from tqdm import tqdm

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# STOPWORDS + STEMMER
stopwords = set(StopWordRemoverFactory().get_stop_words())
stemmer = StemmerFactory().create_stemmer()

# ===== CLEANING LEVEL TINGGI =====
def clean_text(text):
    text = text.lower()

    # buang URL
    text = re.sub(r'http\S+|www\S+|pic\S+', ' ', text)

    # buang file gambar: jpg png jpeg webp
    text = re.sub(r'\b\w+\.(jpg|jpeg|png|webp|gif)\b', ' ', text)

    # buang kata scrap random: superjumbo, dccb, fde, aaab, dll
    text = re.sub(r'\b[a-z]{1,4}\b', ' ', text)  # buang kata 1–4 huruf (noise)

    # pisahkan kata sambung kacau: "virusseolah" → "virus seolah"
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    text = re.sub(r'([a-z]+)([A-Z][a-z]+)', r'\1 \2', text)

    # hanya huruf
    text = re.sub(r'[^a-z\s]', ' ', text)

    # collapse spasi
    text = re.sub(r'\s+', ' ', text).strip()
    return text


# ===== TOKENIZER FINAL =====
def preprocess_text(text):
    text = clean_text(text)
    tokens = text.split()

    # remove stopwords
    tokens = [t for t in tokens if t not in stopwords]

    # stemming
    tokens = [stemmer.stem(t) for t in tokens]

    # remove leftover noise
    tokens = [t for t in tokens if len(t) > 2]

    return tokens


# ===== PROCESS ALL FILES =====
def process_corpus(raw_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    all_tokens = []
    files = [f for f in os.listdir(raw_folder) if f.endswith(".txt")]

    print("Total file ditemukan:", len(files))
    progress = tqdm(files, desc="Processing Documents")

    for fname in progress:
        path = os.path.join(raw_folder, fname)
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        tokens = preprocess_text(text)
        all_tokens.extend(tokens)

        out_path = os.path.join(output_folder, fname.replace(".txt", ".json"))
        with open(out_path, "w", encoding="utf-8") as o:
            json.dump({"tokens": tokens}, o, indent=2)

    vocabulary = sorted(set(all_tokens))
    json.dump({"vocabulary": vocabulary}, open("dictionary.json","w"), indent=2)

    print("\n=== CLEANING DONE ===")
    print("Dokumen :", len(files))
    print("Vocab   :", len(vocabulary))

In [4]:
process_corpus("dataset_raw", "dataset_clean")

Total file ditemukan: 412


Processing Documents: 100%|██████████| 412/412 [08:46<00:00,  1.28s/it]


=== CLEANING DONE ===
Dokumen : 412
Vocab   : 7661



