In [1]:
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [2]:
# Pastikan resource NLTK sudah terunduh
nltk.download('punkt')
nltk.download('stopwords')

# Inisialisasi
english_stopwords = set(stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Fungsi preprocessing
def preprocess_text_english(text):
    # Case folding
    text = text.lower()
    
    # Hapus karakter non-alfabet
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenisasi
    tokens = word_tokenize(text)
    
    # Hapus stopwords
    tokens = [word for word in tokens if word not in english_stopwords]
    
    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Gabungkan kembali
    return ' '.join(tokens)

In [6]:
# Baca file JSON
with open('../datasets/merge_international_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Terapkan preprocessing
for idx, item in enumerate(data, start=1):
    if 'lyric' in item:
        item['preprocessed_lyric'] = preprocess_text_english(item['lyric'])
        
        # Ambil 5 kata pertama untuk preview
        preview = ' '.join(item['preprocessed_lyric'].split()[:5])
        
        print(f"lyric-{idx}: {preview}...")


# Simpan ke file baru
with open('../datasets/preprocessed_international_dataset.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"Preprocessing selesai. Total lagu: {len(data)}")


lyric-1: thought id end sean wasnt...
lyric-2: yeah breakfast tiffani bottl bubbl...
lyric-3: love move love touch one...
lyric-4: ariana grand nicki minaj ive...
lyric-5: right im state mind wan...
lyric-6: lacigam gnihtemo od ot thgin...
lyric-7: got type way hmm aint...
lyric-8: heaven sent im hopin dont...
lyric-9: hmm might think im crazi...
lyric-10: step two us nobodi know...
lyric-11: take long hit back cant...
lyric-12: like got superpow turn minut...
lyric-13: know hear cri tri hold...
lyric-14: mmm yeah yuh thought life...
lyric-15: doug middlebrook here thing your...
lyric-16: pre ariana grand pharrel life...
lyric-17: ariana grand mmm last night...
lyric-18: shangela laquifa wadley one small...
lyric-19: oh yeah dont need permiss...
lyric-20: ariana grand broke ex im...
lyric-21: marjori grand im tri best...
lyric-22: im bare breath wan na...
lyric-23: wendi rene laughter come tear...
lyric-24: tell got ta look way...
lyric-25: wouldnt let anybodi speak instead...
lyric-26