In [1]:
import json
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import word_tokenize

In [2]:
# Inisialisasi Sastrawi
stop_factory = StopWordRemoverFactory()
stem_factory = StemmerFactory()
stopwords = set(stop_factory.get_stop_words())
stemmer = stem_factory.create_stemmer()

In [3]:
# Fungsi preprocessing untuk Bahasa Indonesia
def preprocess_text_indonesian(text):
    # Case folding
    text = text.lower()
    
    # Hapus karakter non-alfabet
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenisasi
    tokens = word_tokenize(text)
    
    # Hapus stopwords
    tokens = [word for word in tokens if word not in stopwords]
    
    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Gabung kembali
    return ' '.join(tokens)

In [5]:
# Baca file JSON
with open('../datasets/merge_national_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Terapkan preprocessing ke setiap item

for idx, item in enumerate(data, start=1):
    if 'lyric' in item:
        item['preprocessed_lyric'] = preprocess_text_indonesian(item['lyric'])
        
        # Ambil 5 kata pertama untuk preview
        preview = ' '.join(item['preprocessed_lyric'].split()[:5])
        
        print(f"lyric-{idx}: {preview}...")


# Simpan hasil ke file baru
with open('../datasets/preprocessed_national_dataset.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"Preprocessing selesai. Total lagu: {len(data)}")


lyric-1: sini kau aku biasa sama...
lyric-2: embun pagi buta tebar bau...
lyric-3: verse bagaimana mesti buat jatuh...
lyric-4: betapa rasa kasih pada slalu...
lyric-5: verse lekuk indah hadir pesona...
lyric-6: malam hadir cinta sambut jiwa...
lyric-7: raih bintang ingin bintang cahyanya...
lyric-8: manjakau damba selalu usik hati...
lyric-9: verse asa indah tak lupa...
lyric-10: rasa cinta dulu tlah hilang...
lyric-11: angin malam hembus lirih dingin...
lyric-12: tak habis pikir jadi kau...
lyric-13: senandung lagu cinta cipta untuk...
lyric-14: verse tegun ku pandang kau...
lyric-15: diam diam dingin serang kujur...
lyric-16: ingat masa kecil kau peluk...
lyric-17: jauh nian harap cita sirna...
lyric-18: kali sadar aku jatuh cinta...
lyric-19: akhir akhir aku temu wajah...
lyric-20: terlalu sadis cara jadi diri...
lyric-21: verse sadar sepi jauh lang...
lyric-22: bilang bila kau mau bilang...
lyric-23: intro tak diri jadi kamu...
lyric-24: verse tetes air mata beri...
lyric-25: baga