# 🧹 2. Preprocessing Data

Notebook ini akan membersihkan dan mempersiapkan data review untuk analisis sentimen.

**Proses:**
1. Load data dari `data/raw/`
2. Cleaning teks (URL, emoji, punctuation berlebih, dll)
3. Normalisasi slang bahasa Indonesia
4. Labeling otomatis berdasarkan rating
5. Simpan data bersih ke `data/interim/`

In [None]:
# Import libraries
import pandas as pd
import re
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Make sure the output folder for cleaned data exists before anything is written
interim_dir = Path('data/interim')
interim_dir.mkdir(parents=True, exist_ok=True)

print("✅ Setup complete!")

## Fungsi Text Cleaning

Fungsi untuk membersihkan teks bahasa Indonesia.

In [None]:
# Patterns are compiled once at definition time instead of on every call.
_URL_RE = re.compile(r'http\S+|www\S+')
_EMAIL_RE = re.compile(r'\S+@\S+')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#(\w+)')
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats (heart, check mark, sparkles, ...)
    "\u2B50\u2B55"           # star, heavy large circle
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "\u200D"                 # zero-width joiner used in emoji sequences
    "]+",
    flags=re.UNICODE,
)
_REPEATED_PUNCT_RE = re.compile(r'([!?.]){2,}')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """
    Clean a piece of Indonesian review text.

    Steps, in order:
    - Lowercase
    - Remove URLs
    - Remove e-mail addresses
    - Remove mentions (@user)
    - Strip the '#' from hashtags but keep the word
    - Remove emoji, including the invisible variation-selector / joiner
      characters that would otherwise be left behind
    - Collapse runs of '!', '?' and '.' (e.g. '!!!') into a single character
    - Collapse whitespace and trim both ends

    Args:
        text: Raw review text.

    Returns:
        The cleaned text, or "" when the input is not a non-empty string.
    """
    if not isinstance(text, str) or not text:
        return ""

    text = text.lower()

    # E-mails must go before mentions, otherwise '@domain' would be removed
    # first and the local part of the address would be left in the text.
    text = _URL_RE.sub('', text)
    text = _EMAIL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)

    # Keep the hashtag word, drop only the '#'
    text = _HASHTAG_RE.sub(r'\1', text)

    text = _EMOJI_RE.sub('', text)

    # A run of repeated punctuation is replaced by its last character
    text = _REPEATED_PUNCT_RE.sub(r'\1', text)

    return _WHITESPACE_RE.sub(' ', text).strip()

# Quick check of the cleaning function on a sample review
test_text = "APLIKASI @gojek SANGAT BAGUS!!! Cek https://example.com 😊"
print(f"Original: {test_text}")
print(f"Cleaned: {clean_text(test_text)}")

## Normalisasi Slang Indonesia

Mengubah slang/singkatan menjadi kata baku.

In [None]:
# Indonesian slang / abbreviation -> standard word
SLANG_DICT = {
    'gak': 'tidak', 'ga': 'tidak', 'nggak': 'tidak', 'ngga': 'tidak',
    'gk': 'tidak', 'tdk': 'tidak',
    'udah': 'sudah', 'udh': 'sudah',
    'blm': 'belum', 'blum': 'belum',
    # 'tp' was listed twice in the original literal; the second entry was
    # presumably meant to be the variant 'tpi' (verify against real data).
    'tp': 'tapi', 'tpi': 'tapi',
    'yg': 'yang', 'dgn': 'dengan', 'utk': 'untuk',
    'krn': 'karena', 'krna': 'karena',
    'bgt': 'banget', 'bngdt': 'banget',
    'bg': 'bagus', 'bgs': 'bagus',
    'aja': 'saja',
    'hrs': 'harus',
    'trs': 'terus',
    'org': 'orang',
    'jg': 'juga', 'jgn': 'jangan',
    'gmn': 'gimana', 'gmana': 'gimana',
    'emg': 'memang', 'emng': 'memang',
    'skrg': 'sekarang', 'skrng': 'sekarang',
    'smua': 'semua',
}

# Splits a token into (leading non-word chars, core, trailing non-word chars)
_TOKEN_PARTS_RE = re.compile(r'(\W*)(.*?)(\W*)', re.DOTALL)


def _normalize_token(token):
    """Replace the core of a token with its standard form, keeping punctuation."""
    lead, core, trail = _TOKEN_PARTS_RE.fullmatch(token).groups()
    return lead + SLANG_DICT.get(core, core) + trail


def normalize_slang(text):
    """
    Replace Indonesian slang words with their standard form.

    The text is split on whitespace and each token is looked up in
    SLANG_DICT. Punctuation attached to the start or end of a token
    (e.g. 'bgt!' or '(tp') is ignored for the lookup and kept in the
    output, so slang next to punctuation is normalized too. The lookup is
    case-sensitive; SLANG_DICT keys are lowercase.

    Args:
        text: Text to normalize.

    Returns:
        The normalized text with tokens joined by single spaces.
    """
    return ' '.join(_normalize_token(token) for token in text.split())

# Quick check of the slang normalizer on a sample sentence
test_text = "aplikasi sangat bagus bgt tp kadang error jg"
print(f"Original: {test_text}")
print(f"Normalized: {normalize_slang(test_text)}")

## Auto-Labeling

Labeling otomatis berdasarkan rating:
- **1-2 stars** → negatif
- **3 stars** → netral
- **4-5 stars** → positif

In [None]:
def auto_label(score):
    """
    Map a star rating to a sentiment label.

    Args:
        score: Rating (1-5)

    Returns:
        'positif' for a score of 4 or more, 'netral' for exactly 3,
        'negatif' for anything else.
    """
    if score >= 4:
        return 'positif'
    # Everything that is not exactly 3 falls through to the negative label
    return 'netral' if score == 3 else 'negatif'

# Quick check: print the label for every possible rating
for score in range(1, 6):
    print(f"Score {score} → {auto_label(score)}")

## Load & Preprocess Data

Load semua file dari `data/raw/` dan proses satu per satu.

In [None]:
import os
from glob import glob

# Find every review CSV in data/raw/. glob() returns matches in arbitrary
# order, so the list is sorted to keep the processing order (and the row
# order of the combined output) the same on every run and machine.
raw_files = sorted(glob('data/raw/*_reviews.csv'))

if not raw_files:
    print("❌ Tidak ada file di data/raw/")
    print("⚠️ Jalankan notebook 1_scraping.ipynb terlebih dahulu!")
else:
    print(f"📁 Ditemukan {len(raw_files)} file:")
    for f in raw_files:
        print(f"   - {os.path.basename(f)}")

In [None]:
def preprocess_dataframe(df, app_name):
    """
    Clean, label and de-duplicate the reviews of a single app.

    Steps, in execution order:
    1. Drop rows with a missing 'content' or 'score'
    2. Clean text (clean_text)
    3. Normalize slang (normalize_slang)
    4. Remove very short texts (< 5 characters after cleaning)
    5. Auto-label from the rating (auto_label)
    6. Remove duplicates on the cleaned text, keeping the first occurrence

    Progress and the resulting label distribution are printed along the way.

    Args:
        df: Raw reviews; must contain 'content' and 'score' columns.
        app_name: App identifier, used in the progress output and to fill
            the 'app' column when the input does not have one.

    Returns:
        A new DataFrame with the added 'clean_text' and 'label' columns.
        The DataFrame passed in is not modified.
    """
    print(f"\n🔄 Processing {app_name}...")
    print(f"   Initial rows: {len(df)}")

    # Drop missing. The explicit copy guarantees that the column
    # assignments below write to our own frame, not to the caller's data.
    df = df.dropna(subset=['content', 'score']).copy()
    print(f"   After drop NaN: {len(df)}")

    # Later cells of this notebook read an 'app' column; fill it from
    # app_name when the raw file does not provide one.
    if 'app' not in df.columns:
        df['app'] = app_name

    # Clean text, then normalize slang
    df['clean_text'] = df['content'].apply(clean_text).apply(normalize_slang)

    # Remove very short texts (< 5 characters). astype(str) keeps the .str
    # accessor usable even when the frame is empty and the column dtype is
    # not a string type. Copy again so 'label' is not set on a filtered slice.
    long_enough = df['clean_text'].astype(str).str.len() >= 5
    df = df[long_enough].copy()
    print(f"   After remove short: {len(df)}")

    # Auto-label
    df['label'] = df['score'].apply(auto_label)

    # Remove duplicates
    df = df.drop_duplicates(subset=['clean_text'], keep='first')
    total = len(df)
    print(f"   After dedup: {total}")

    # Show label distribution (the loop body never runs for an empty frame,
    # so there is no division by zero)
    print(f"   Label distribution:")
    for label, count in df['label'].value_counts().items():
        percentage = count / total * 100
        print(f"      {label}: {count} ({percentage:.1f}%)")

    return df

In [None]:
# Run the preprocessing pipeline on every raw file and collect the results
all_clean_data = []
banner = "=" * 60

print(banner)
print("🚀 MULAI PREPROCESSING")
print(banner)

for file_path in raw_files:
    # The app name is the file name without its '_reviews.csv' part
    file_name = os.path.basename(file_path)
    app_name = file_name.replace('_reviews.csv', '')

    # Load and preprocess
    df = pd.read_csv(file_path)
    df_clean = preprocess_dataframe(df, app_name)
    all_clean_data.append(df_clean)

    # Save one cleaned file per app
    output_file = f'data/interim/{app_name}_clean.csv'
    df_clean.to_csv(output_file, index=False, encoding='utf-8')
    print(f"   ✅ Saved to: {output_file}")

print("\n" + banner)
print("✅ PREPROCESSING SELESAI!")
print(banner)

## Gabungkan Semua Data

Gabungkan data dari semua aplikasi menjadi satu file.

In [None]:
# Combine the per-app frames into one.
# pd.concat() raises an unhelpful "No objects to concatenate" on an empty
# list, which happens when no raw files were found; fail with a message
# that tells the user what to do instead.
if not all_clean_data:
    raise ValueError(
        "Tidak ada data untuk digabung. Pastikan data/raw/ berisi file "
        "*_reviews.csv (jalankan notebook 1_scraping.ipynb) lalu jalankan "
        "ulang sel preprocessing."
    )

df_all = pd.concat(all_clean_data, ignore_index=True)
total_rows = len(df_all)

print(f"\n📊 Total data setelah digabung: {total_rows}")

# Overall label distribution
print(f"\n📈 Distribusi Label Keseluruhan:")
label_counts = df_all['label'].value_counts()
for label, count in label_counts.items():
    percentage = count / total_rows * 100
    print(f"   {label}: {count} ({percentage:.1f}%)")

# Distribution per app, in order of first appearance
print(f"\n📱 Distribusi per Aplikasi:")
for app in df_all['app'].unique():
    count = len(df_all[df_all['app'] == app])
    percentage = count / total_rows * 100
    print(f"   {app}: {count} ({percentage:.1f}%)")

# Save the combined data
output_all = 'data/interim/all_apps_clean.csv'
df_all.to_csv(output_all, index=False, encoding='utf-8')
print(f"\n💾 Saved combined data to: {output_all}")

## Preview Data Bersih

Lihat contoh data yang sudah dibersihkan.

In [None]:
# Print up to three sample rows for each sentiment label
print("📋 CONTOH DATA PER LABEL")
print("=" * 80)

for label in ('positif', 'netral', 'negatif'):
    print(f"\n🏷️ {label.upper()}:")
    print("-" * 80)

    has_label = df_all['label'] == label
    for _, row in df_all[has_label].head(3).iterrows():
        original, cleaned = row['content'], row['clean_text']
        print(f"App: {row['app']} | Score: {row['score']}")
        # Both texts are cut to their first 100 characters for display
        print(f"Original: {original[:100]}...")
        print(f"Cleaned: {cleaned[:100]}...")
        print()

## 🎉 Selesai!

Data berhasil dibersihkan dan disimpan di `data/interim/`.

**Output files** (satu file `<app>_clean.csv` untuk setiap file `data/raw/<app>_reviews.csv`; contoh):
- `data/interim/gojek_clean.csv`
- `data/interim/grab_clean.csv`
- `data/interim/maxim_clean.csv`
- `data/interim/all_apps_clean.csv` *(gabungan semua)*

**Next steps:**
- Jalankan notebook `3_sentiment_analysis.ipynb` untuk prediksi sentimen dengan IndoBERT

**Statistik:**
- Total data: lihat output sel "Gabungkan Semua Data" di atas (jumlah baris `df_all`)
- Label: positif, netral, negatif
- Siap untuk analisis sentimen!