#1. Membaca dataset yang sudah di casefolding

In [25]:
import pandas as pd
from google.colab import files

# Read the case-folded reviews produced by the previous stage.
try:
    data = pd.read_excel("reviews_casefolded.xlsx")
except FileNotFoundError:
    print(" File 'reviews_casefolded.xlsx' tidak ditemukan. Pastikan sudah diunggah ke sesi Colab!")
    # Stop the cell here. Carrying on would either raise a confusing NameError
    # on `data` (fresh kernel) or silently reuse a stale `data` left over from
    # an earlier run of the notebook.
    raise
else:
    print(" File 'reviews_casefolded.xlsx' berhasil dibaca!")
    print(f"Jumlah data: {len(data)}")
    print("Kolom tersedia:", list(data.columns))

# Keep only the review-text column when it is present; otherwise report the
# columns that are available and leave the frame untouched.
if 'cleaned_review' in data.columns:
    data = data[['cleaned_review']]
    print("\n Kolom 'cleaned_review' berhasil diambil!")
else:
    print("\n Kolom 'cleaned_review' tidak ditemukan di file ini. Kolom yang tersedia:", list(data.columns))

# Show the first 20 rows of the case-folded data.
print("\n 20 Baris Pertama Hasil Case Folding:")
display(data.head(20))


 File 'reviews_casefolded.xlsx' berhasil dibaca!
Jumlah data: 1103
Kolom tersedia: ['cleaned_review']

 Kolom 'cleaned_review' berhasil diambil!

 20 Baris Pertama Hasil Case Folding:


Unnamed: 0,cleaned_review
0,"makanan laut segar, harga terjangkau, cocok un..."
1,langsung di sambut dengan ramah n megah
2,salah satu tempat makan seafood di seputaran g...
3,"lokasi strategis, parkir mobil cuma bisa di te..."
4,"enak, ga terlalu pricey, worth it"
5,"enak, lumayan murah karena porsinya besar. keh..."
6,gak perlu diteriakin gapa mas mbak.. kalo saya...
7,pelayanan yang ramah. bau bakaran yang menggug...
8,menu seafood istimewa rasa istimewa harga terj...
9,pelayanannya bagus sekali! 🤗🤩 …


#2. Cleaning
(Menghapus simbol, angka, tanda baca, dan karakter tidak penting)

In [26]:
import re
import pandas as pd
from google.colab import files

# Fungsi cleaning teks
def clean_text(text):
    """Remove URLs and every non-letter character from a review.

    Non-letter characters (punctuation, digits, emoji, symbols) are replaced
    by a space rather than deleted, so words separated only by punctuation
    stay separate ("enak,murah" -> "enak murah" instead of "enakmurah").

    Args:
        text: The review as a ``str``.

    Returns:
        The text containing only ASCII letters, with words separated by single
        spaces and no leading/trailing whitespace.
    """
    text = re.sub(r'http\S+', '', text)           # drop URLs
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)      # symbols, digits, punctuation -> space
    text = re.sub(r'\s+', ' ', text)              # collapse whitespace runs
    return text.strip()

# Clean every review (each value is cast to str first) and store the result in
# a new 'cleaned' column.
data['cleaned'] = data['cleaned_review'].map(lambda review: clean_text(str(review)))

# Export the original and the cleaned text side by side.
output_file = 'reviews_cleaned.xlsx'
data.loc[:, ['cleaned_review', 'cleaned']].to_excel(output_file, index=False, engine='openpyxl')

print("Cleaning selesai!")
print("Hasil disimpan dalam file:", output_file)

# Preview the first 20 cleaned rows.
print("\n 20 Baris Pertama Hasil Cleaning:")
display(data.loc[:, ['cleaned_review', 'cleaned']].head(20))

# Hand the workbook to the browser through the Colab download helper.
files.download(output_file)


Cleaning selesai!
Hasil disimpan dalam file: reviews_cleaned.xlsx

 20 Baris Pertama Hasil Cleaning:


Unnamed: 0,cleaned_review,cleaned
0,"makanan laut segar, harga terjangkau, cocok un...",makanan laut segar harga terjangkau cocok untu...
1,langsung di sambut dengan ramah n megah,langsung di sambut dengan ramah n megah
2,salah satu tempat makan seafood di seputaran g...,salah satu tempat makan seafood di seputaran g...
3,"lokasi strategis, parkir mobil cuma bisa di te...",lokasi strategis parkir mobil cuma bisa di tep...
4,"enak, ga terlalu pricey, worth it",enak ga terlalu pricey worth it
5,"enak, lumayan murah karena porsinya besar. keh...",enak lumayan murah karena porsinya besar kehit...
6,gak perlu diteriakin gapa mas mbak.. kalo saya...,gak perlu diteriakin gapa mas mbak kalo saya p...
7,pelayanan yang ramah. bau bakaran yang menggug...,pelayanan yang ramah bau bakaran yang mengguga...
8,menu seafood istimewa rasa istimewa harga terj...,menu seafood istimewa rasa istimewa harga terj...
9,pelayanannya bagus sekali! 🤗🤩 …,pelayanannya bagus sekali


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#3. Normalization
Menyeragamkan ejaan tidak baku

In [27]:
import pandas as pd
from google.colab import files

# Read the output of the cleaning stage.
data = pd.read_excel("reviews_cleaned.xlsx")

# Normalization dictionary: maps a slang / abbreviated / misspelled token to
# the text that normalize_text substitutes for it (some tokens map to "").
#
# Things to know before editing this literal:
# - Several keys appear more than once. In a Python dict literal the LAST
#   occurrence wins, so the effective values of the conflicting keys are:
#   "agam" -> "supaya" (earlier entries: "ragam"),
#   "bgt"  -> "banget" (earlier entry: "sangat"),
#   "gitu" -> "begitu" (earlier entry: "seperti itu"),
#   "nih"  -> ""       (earlier entry: "ini").
# - normalize_text looks tokens up one whitespace-separated word at a time, so
#   the key "cocok banget" (contains a space) can never match.
# - The keys "aja.", "aja?", "aja!" and "bisa2" contain punctuation or digits,
#   which clean_text strips; they can only match if such characters are still
#   present in the 'cleaned' column.
# - NOTE(review): "orderan" -> "penasan" looks like a typo for "pesanan" — confirm.
normalization_dict = {
    "gk": "tidak", "ga": "tidak", "nggak": "tidak", "enggak": "tidak", "gak": "tidak", "ngga": "tidak", "lbuh": "lebih", "lbh": "lebih", "lbih": "lebih",
    "tdk": "tidak", "t": "tidak", "tak": "tidak", "n": "dan","x": "nya", "engga":"tidak", "pol" :"sangat", "gitu":"seperti itu", "gapa": "tidak masalah",
    "udh": "sudah", "udah": "sudah", "sdh": "sudah", "suda": "sudah", "nginep":"menginap", "pake":"pakai", "mbak":"kakak", "masaka":"masakan",
    "blm": "belum", "belom": "belum", "staf" :"karyawan", "favorit":"kesukaan", "mani":"manis", "mns":"manis", "mnis":"manis",
    "dg": "dengan", "dgn": "dengan", "sma": "sama", "sm": "sama", "bener": "benar", "nyobain":"mencoba", "nyobain":"mencoba",
    "banget": "sangat", "gercep":"gerak cepat", "lebuh": "lebih", "enakkkk":"enak", "sari":"dari","seafood": "makanan laut", "agam":"ragam",
    "rame": "ramai", "dateng": "datang", "nyoba": "mencoba","bener": "benar", "w":"", "mantullllllll" : "mantap", "terjyata" : "ternyata", "nunggu":"tunggu",
    "banget": "sangat","rame": "ramai","dateng": "datang","nyoba": "mencoba", "mantap": "lezat", "pelayana":"pelayanan",
    "resto": "restoran",  "keren": "bagus","makannya": "makanannya",    "mantep": "mantap" , "seafoods" : "makanan laut",
    "enakk": "enak",   "enakkk": "enak","pesen": "pesan",   "nggak": "tidak", "orderan":"penasan", "agam": "ragam",
    "gak": "tidak",   "ga": "tidak","udh": "sudah","ca":"", "min":"", "gaada": "tidak ada", "muas":"puas",
    "udah": "sudah",   "tempatnya": "tempat", "pelayana":"pelayanan", "recommended": "rekomendasi",
    "maknyus": "lezat",   "makasih": "terima kasih",  "brani": "berani",  "bgus": "bagus",  "bgt": "sangat",
    "skali": "sekali", "bnyk": "banyak", "trs": "terus", "lg": "lagi", "mantap": "lezat", "resto": "restoran", "keren": "bagus",
    "cocok banget": "sangat cocok", "kaget": "terkejut",  "dr": "dari", "tp": "tapi", "tpi": "tapi", "pd": "pada", "jg": "juga", "aja": "saja", "aja.": "saja",
    "aja?": "saja", "aja!": "saja","kl": "kalau", "klo": "kalau", "klu": "kalau", "kalo": "kalau",
    "krn": "karena", "karna": "karena", "bgt": "banget", "bngt": "banget", "bnget": "banget",
    "yg": "yang", "yng": "yang","dlm": "dalam", "utk": "untuk", "buat": "untuk", "bwt": "buat","bs": "bisa", "bsa": "bisa",
    "trs": "terus", "trus": "terus","smg": "semoga", "moga": "semoga",
    "sy": "saya", "aku": "saya", "aq": "saya", "q": "saya","km": "kamu", "kmu": "kamu", "kam": "kamu", "u": "kamu", "lu": "kamu",
    "loe": "kamu", "lo": "kamu", "gw": "saya", "gua": "saya", "dpt": "dapat", "dapet": "dapat",
    "lg": "lagi", "lgi": "lagi", "bilang":"mengatakan", "diteriakin": "diteriaki",
    "bbrp": "beberapa",  "tmn": "teman", "temen": "teman", "mantapppppp":"mantap",
    "bkn": "bukan", "bukanlah": "bukan", "outdoor":"luar ruangan", "outdoor": "luar ruangan",
    "ok": "oke", "okey": "oke", "okee": "oke", "okeh": "oke",   "btw": "ngomong-ngomong", "thx": "terima kasih", "makasih": "terima kasih",
    "mksh": "terima kasih", "tq": "terima kasih",   "pls": "tolong", "plis": "tolong", "tolonglah": "tolong",
    "maafkan": "maaf",   "bisa2": "bisa-bisa", "bodo": "bodoh", "bodoamat": "tidak peduli",
    "parah": "buruk", "sialam":"salam", "frash":"segar", "agam":"supaya",
    "mantul": "mantap betul",  "mantab": "mantap", "seafoodnya" :"makanan laut",
    "nnti": "nanti", "ntar": "nanti", "kyk":"seperti", "sllu":"selalu",
    "td": "tadi", "tdi": "tadi", "weenak":"enak", "rekomen":"rekomendasi", "masaka":"masakan",
    "skrg": "sekarang", "skrng": "sekarang", "freshh" :"segar", "unjung" :"kunjung",
    "besok2": "besok",  "bentar": "sebentar", "rekomende" : "rekomendasi",
    "lbh": "lebih",  "drpd": "daripada","jk":"jika","lgsg":"langsung","hrg":"harga",
    "yogya":"yogyakarta","tpn": "tapi", "lgs":"langsung", "emang":"memang",
    "emg": "memang",  "ajaib": "aneh",   "ny": "nya",  "nih": "ini", "nie": "ini",
    "tu": "itu", "tuh": "itu", "ituu": "itu", "ya": "iya", "yah": "iya" ,"bener": "benar",
    "mantep": "mantap","nyoba": "mencoba",   "nyobain": "mencoba",
    "liat": "lihat",   "kesini": "ke sini","dateng": "datang",   "gitu": "begitu",
    "aja": "saja",    "nih": "",  "banget": "sangat","rame": "ramai",   "kali": "sekali",  "pas": "cocok",  "staff": "pegawai",
    "resto": "restoran","jogja": "yogyakarta", "sampe":"sampai"
}

#  Fungsi normalisasi
def normalize_text(text, mapping=None):
    """Replace informal or misspelled words with their standard form.

    Args:
        text: The review text. Non-string values are converted with ``str``.
            ``None`` and float NaN (what pandas yields for an empty Excel
            cell) are treated as an empty review instead of the word "nan".
        mapping: Optional ``{token: replacement}`` dict. When omitted, the
            notebook-level ``normalization_dict`` is used.

    Returns:
        The lower-cased text in which every whitespace-separated token found
        in ``mapping`` is replaced by its value. Tokens whose replacement is
        the empty string are dropped, so the result never contains doubled,
        leading or trailing spaces.
    """
    if mapping is None:
        mapping = normalization_dict
    # NaN is the only float value that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    replaced = (mapping.get(token, token) for token in str(text).lower().split())
    # Skip empty replacements so that removed tokens leave no gap behind.
    return " ".join(word for word in replaced if word)

# Normalize every cleaned review into a new 'normalized' column.
data['normalized'] = data['cleaned'].map(normalize_text)

# Export only the normalized text (original note: so that 'cleaned:' does not
# show up in the WordCloud).
output_file = 'reviews_normalized.xlsx'
data.loc[:, ['normalized']].to_excel(output_file, index=False, engine='openpyxl')

print("Normalization selesai!")
print("Hasil disimpan dalam file:", output_file)

# Preview the first 20 normalized rows.
print("\n 20 Baris Pertama Hasil Normalisasi:")
display(data.loc[:, ['normalized']].head(20))

# Hand the workbook to the browser through the Colab download helper.
files.download(output_file)


Normalization selesai!
Hasil disimpan dalam file: reviews_normalized.xlsx

 20 Baris Pertama Hasil Normalisasi:


Unnamed: 0,normalized
0,makanan laut segar harga terjangkau cocok untu...
1,langsung di sambut dengan ramah dan megah
2,salah satu tempat makan makanan laut di seputa...
3,lokasi strategis parkir mobil cuma bisa di tep...
4,enak tidak terlalu pricey worth it
5,enak lumayan murah karena porsinya besar kehit...
6,tidak perlu diteriaki tidak masalah mas kakak ...
7,pelayanan yang ramah bau bakaran yang mengguga...
8,menu makanan laut istimewa rasa istimewa harga...
9,pelayanannya bagus sekali


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#4. Tokenization
Memecah kalimat jadi kata-kata (token)

In [28]:
# Import library
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Fetch the NLTK tokenizer resources requested by this notebook.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the output of the normalization stage.
data = pd.read_excel("reviews_normalized.xlsx")

# Split each review (cast to str first) into a list of word tokens.
data['tokenized'] = data['normalized'].map(lambda review: word_tokenize(str(review)))

# Store the text next to its tokens in a new workbook.
data.loc[:, ['normalized', 'tokenized']].to_excel('reviews_tokenized.xlsx', index=False, engine='openpyxl')

# Preview the first 20 rows as a sanity check.
print("Tokenisasi selesai! Berikut 20 hasil pertama:")
display(data.head(20))



Tokenisasi selesai! Berikut 20 hasil pertama:


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,normalized,tokenized
0,makanan laut segar harga terjangkau cocok untu...,"[makanan, laut, segar, harga, terjangkau, coco..."
1,langsung di sambut dengan ramah dan megah,"[langsung, di, sambut, dengan, ramah, dan, megah]"
2,salah satu tempat makan makanan laut di seputa...,"[salah, satu, tempat, makan, makanan, laut, di..."
3,lokasi strategis parkir mobil cuma bisa di tep...,"[lokasi, strategis, parkir, mobil, cuma, bisa,..."
4,enak tidak terlalu pricey worth it,"[enak, tidak, terlalu, pricey, worth, it]"
5,enak lumayan murah karena porsinya besar kehit...,"[enak, lumayan, murah, karena, porsinya, besar..."
6,tidak perlu diteriaki tidak masalah mas kakak ...,"[tidak, perlu, diteriaki, tidak, masalah, mas,..."
7,pelayanan yang ramah bau bakaran yang mengguga...,"[pelayanan, yang, ramah, bau, bakaran, yang, m..."
8,menu makanan laut istimewa rasa istimewa harga...,"[menu, makanan, laut, istimewa, rasa, istimewa..."
9,pelayanannya bagus sekali,"[pelayanannya, bagus, sekali]"


#5. Stopword Removal
Menghapus kata umum

In [29]:
# Import library
import pandas as pd
import nltk
from nltk.corpus import stopwords
import ast

# Make sure the NLTK stopword corpus is available locally.
nltk.download('stopwords')

# Built-in Indonesian and English stopword lists, as sets for fast lookup.
stop_words_id = {word for word in stopwords.words('indonesian')}
stop_words_en = {word for word in stopwords.words('english')}

# === Custom stopwords ===
# Extra words to drop on top of the NLTK lists.
# Every entry must be followed by a comma: Python concatenates adjacent string
# literals, so a missing comma at the end of a line silently fuses two entries
# into one (e.g. "greetings" "wkwkwkwkwkwkwk" -> "greetingswkwkwkwkwkwkwk")
# and neither word is removed. Trailing commas are kept on every line for that
# reason.
# NOTE(review): "'seafoodbased" starts with an apostrophe; confirm whether the
# plain "seafoodbased" was intended.
tambahan_stopwords = {
    "zona", "seafood", "gejayan", "food", "riview", "greeting", "review", "seafoods","outdoors","fresh","flownya","payment","greetings",
    "wkwkwkwkwkwkwk", "grettingnya", "foodnya","'seafoodbased", "poll","job","cck","seafoodnya", "time",
    "fresh", "good", "best", "worth", "recommended", "recommend", "recomended","recomend", "outdoornya",
    "service", "outdoor", "order", "staff", "indoor", "rekomended","it","fix","seafoodzona", "freshh", "langgananfresh",
    "banget", "mantap", "mantep", "rame", "keren", "bener", "dateng", "nyoba","seafoodhrg",
    "kaget", "resto", "reservasi", "favorit", "aja", "sih", "nya", "nih", "seafoodan",
    "dong", "ya", "deh", "overall", "seafoodsukses","pricey", "it", "kitchen", "hehehe","but","buy",
    "wotht", "mamamia", "lezatosszzzz", "freshcumi","to", "seafoodrasanya","try", "luck", "worthit", "luvv",
    "sea", "experiece", "luv", "affordable", "fkeksibel", "open", "cozy", "seafoodrasanya",
    "smoking", "full", "freshh", "update", "notice", "so", "the", "far", "book", "seafoodnya",
}

# Merge the Indonesian, English and custom stopwords into one set.
stop_words = stop_words_id | stop_words_en | tambahan_stopwords

# Load the output of the tokenization stage.
data = pd.read_excel("reviews_tokenized.xlsx")

# The token lists were written to Excel as text; parse them back into lists.
data['tokenized'] = data['tokenized'].map(ast.literal_eval)

# Fungsi hapus stopword
def remove_stopwords(tokens):
    """Return the tokens whose lower-cased form is not in ``stop_words``.

    The original order and casing of the remaining tokens are preserved.
    """
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

# Strip the stopwords from every token list.
data['no_stopwords'] = data['tokenized'].map(remove_stopwords)

# Export only the 'no_stopwords' column.
output_file = 'reviews_stopword_removed.xlsx'
data.loc[:, ['no_stopwords']].to_excel(output_file, index=False, engine='openpyxl')

# Preview the first 20 rows.
print("Stopword Removal selesai! Berikut 20 hasil pertama:")
display(data.loc[:, ['no_stopwords']].head(20))


Stopword Removal selesai! Berikut 20 hasil pertama:


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,no_stopwords
0,"[makanan, laut, segar, harga, terjangkau, coco..."
1,"[langsung, sambut, ramah, megah]"
2,"[salah, makan, makanan, laut, seputaran, guram..."
3,"[lokasi, strategis, parkir, mobil, tepi, jalan..."
4,[enak]
5,"[enak, lumayan, murah, porsinya, kehitung, cep..."
6,"[diteriaki, mas, kakak, pribadi, senang]"
7,"[pelayanan, ramah, bau, bakaran, menggugah, bi..."
8,"[menu, makanan, laut, istimewa, istimewa, harg..."
9,"[pelayanannya, bagus]"


#6. Stemming
Mengubah kata ke bentuk dasarnya

In [9]:
# Instal dSastrawi (stemmer bahasa Indonesia)
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [30]:
# Import library
import pandas as pd
import ast
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# from google.colab import files  # aktifkan kalau mau unduh hasil

# Build the Sastrawi stemmer for Indonesian.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Load the output of the stopword-removal stage.
data = pd.read_excel("reviews_stopword_removed.xlsx")

# The token lists were written to Excel as text; parse them back into lists.
data['no_stopwords'] = data['no_stopwords'].map(ast.literal_eval)

# Fungsi stemming untuk setiap token
def stem_tokens(tokens):
    """Return a list with ``stemmer.stem`` applied to each token, in order."""
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

# Stem every token list into a new 'stemmed' column.
data['stemmed'] = data['no_stopwords'].map(stem_tokens)

# Fungsi tambahan untuk hapus akhiran "-nya" (biar kata kayak 'makanannya' -> 'makanan')
def remove_suffix_nya(tokens, protected=("tanya", "punya", "hanya", "nyonya")):
    """Strip a trailing "nya" from each token (e.g. "makanannya" -> "makanan").

    Words in which "nya" is part of the root are left alone; blindly slicing
    three characters would turn "tanya" into "ta" and "punya" into "pu". A
    token that is exactly "nya" is also kept as is instead of becoming "".

    Args:
        tokens: Iterable of word strings.
        protected: Words ending in "nya" that must not be shortened.

    Returns:
        A new list of the same length and order as ``tokens``.
    """
    suffix = "nya"
    hasil = []
    for w in tokens:
        root = w[:-len(suffix)]
        # Strip only a real suffix: something must remain and the word must not
        # be a known root that merely ends in "nya".
        if w.endswith(suffix) and root and w not in protected:
            w = root
        hasil.append(w)
    return hasil

# Run the "-nya" clean-up over the stemmed tokens.
data['stemmed'] = data['stemmed'].map(remove_suffix_nya)

# Export the tokens before and after stemming.
data.loc[:, ['no_stopwords', 'stemmed']].to_excel('reviews_stemmed.xlsx', index=False, engine='openpyxl')

# Preview the first 20 rows.
print("Stemming selesai! Berikut 20 hasil pertama:")
display(data.head(20))


Stemming selesai! Berikut 20 hasil pertama:


Unnamed: 0,no_stopwords,stemmed
0,"[makanan, laut, segar, harga, terjangkau, coco...","[makan, laut, segar, harga, jangkau, cocok, ke..."
1,"[langsung, sambut, ramah, megah]","[langsung, sambut, ramah, megah]"
2,"[salah, makan, makanan, laut, seputaran, guram...","[salah, makan, makan, laut, putar, gurame, bak..."
3,"[lokasi, strategis, parkir, mobil, tepi, jalan...","[lokasi, strategis, parkir, mobil, tepi, jalan..."
4,[enak],[enak]
5,"[enak, lumayan, murah, porsinya, kehitung, cep...","[enak, lumayan, murah, porsi, hitung, cepat, k..."
6,"[diteriaki, mas, kakak, pribadi, senang]","[riak, mas, kakak, pribadi, senang]"
7,"[pelayanan, ramah, bau, bakaran, menggugah, bi...","[layan, ramah, bau, bakar, gugah, bikin, ken, ..."
8,"[menu, makanan, laut, istimewa, istimewa, harg...","[menu, makan, laut, istimewa, istimewa, harga,..."
9,"[pelayanannya, bagus]","[layan, bagus]"


#7. Filtering / Selection
Memilih hanya teks yang relevan

In [31]:
# Import library
import pandas as pd
import ast
# from google.colab import files  # aktifkan kalau mau unduh

# Load the output of the stemming stage.
data = pd.read_excel("reviews_stemmed.xlsx")

# The token lists were written to Excel as text; parse them back into lists.
data['stemmed'] = data['stemmed'].map(ast.literal_eval)

# Fungsi filtering (hapus kata < 4 huruf)
def filter_tokens(tokens):
    """Return only the tokens that are at least four characters long."""
    kept = []
    for token in tokens:
        if len(token) >= 4:
            kept.append(token)
    return kept

# Drop the short tokens from every stemmed list.
data['filtered'] = data['stemmed'].map(filter_tokens)

# Export only the filtered tokens.
data.loc[:, ['filtered']].to_excel('reviews_filtered.xlsx', index=False, engine='openpyxl')

# Preview the first 20 rows as a sanity check.
print("Filtering selesai! Berikut 20 hasil pertama:")
display(data.loc[:, ['filtered']].head(20))

print("\nSemua tahap preprocessing selesai dengan sukses!")


Filtering selesai! Berikut 20 hasil pertama:


Unnamed: 0,filtered
0,"[makan, laut, segar, harga, jangkau, cocok, ke..."
1,"[langsung, sambut, ramah, megah]"
2,"[salah, makan, makan, laut, putar, gurame, bak..."
3,"[lokasi, strategis, parkir, mobil, tepi, jalan..."
4,[enak]
5,"[enak, lumayan, murah, porsi, hitung, cepat, k..."
6,"[riak, kakak, pribadi, senang]"
7,"[layan, ramah, bakar, gugah, bikin, mampir, ma..."
8,"[menu, makan, laut, istimewa, istimewa, harga,..."
9,"[layan, bagus]"



Semua tahap preprocessing selesai dengan sukses!


#UNDUH SEMUA DALAM 1 EXCEL

In [None]:
import pandas as pd
import glob

# Gather every per-stage workbook in the working directory.
# The list is deliberately not called `files`: earlier cells bind that name to
# the google.colab `files` module, and rebinding it to a list makes
# `files.download(...)` fail when those cells are re-run in the same kernel.
stage_files = sorted(glob.glob("reviews_*.xlsx"))
print("File ditemukan:", stage_files)

# Fail with a clear message instead of trying to save a workbook with no sheets.
if not stage_files:
    raise FileNotFoundError("No 'reviews_*.xlsx' files found in the working directory.")

# Write one sheet per stage into a single workbook.
with pd.ExcelWriter("full_preprocessing.xlsx", engine="openpyxl") as writer:
    for stage_file in stage_files:
        df = pd.read_excel(stage_file)
        # Sheet name = file name without the "reviews_" prefix and the
        # extension, cut to 31 characters.
        sheet_name = stage_file.replace("reviews_", "").replace(".xlsx", "")
        df.to_excel(writer, sheet_name=sheet_name[:31], index=False)

print("Semua file berhasil digabung jadi 'full_preprocessing.xlsx'!")


File ditemukan: ['reviews_casefolded.xlsx', 'reviews_cleaned.xlsx', 'reviews_filtered.xlsx', 'reviews_normalized.xlsx', 'reviews_stemmed.xlsx', 'reviews_stopword_removed.xlsx', 'reviews_tokenized.xlsx']
Semua file berhasil digabung jadi 'full_preprocessing.xlsx'!
