# Preprocessing Dataset


## Initial Setup

In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Make sure every NLTK resource used later in the notebook is downloaded
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Load the raw dataset, expose the 'Content' column as 'article_text',
# and drop rows that have no article text (index is renumbered afterwards)
df_article = (
    pd.read_csv("jaklingko_news_raw.csv")
    .rename(columns={'Content': 'article_text'})
    .dropna(subset=['article_text'])
    .reset_index(drop=True)
)

print("Jumlah artikel:", len(df_article))
df_article.head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Jumlah artikel: 156


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,No,Title,Source,Date,URL,article_text,label
0,1,"Nasib Sopir Jaklingko, Saat Tarikan Gas tak Se...",https://www.republika.id,08/06/2025,https://www.republika.id/posts/58688/nasib-sop...,"REPUBLIKA.ID, JAKARTA — Sudah tiga bulan, Didi...",opinion
1,2,"Naik MRT, KRL, Transjakarta, hingga JakLingko ...",https://nasional.kompas.com,08/01/2025,https://nasional.kompas.com/read/2025/08/01/11...,"JAKARTA, KOMPAS.com - Pemerintah menetapkan di...",news
2,3,"Dilema Hadirnya Jaklingko, Sopir Angkot Regula...",https://wartakota.tribunnews.com,09/27/2025,https://wartakota.tribunnews.com/jakarta/86893...,"WARTAKOTALIVE.COM, JAKARTA -Hadirnya Jaklingko...",opinion
3,4,Seluruh Layanan TransJakarta dan Jaklingko Dih...,https://www.metrotvnews.com,08/30/2025,https://www.metrotvnews.com/read/b3JCpLJq-selu...,Jakarta: Seluruh layanan TransJakarta dan Jakl...,opinion
4,5,Pemkot Tangerang Dukung Rencana Perpanjangan R...,https://www.tangerangkota.go.id,04/10/2025,https://www.tangerangkota.go.id/berita/detail/...,Pemerintah Kota (Pemkot) Tangerang menyambut b...,government


## Data Cleaning

In [None]:
def clean_text(text):
    """Normalise raw article text into lowercase, letters-only, single-spaced form.

    Non-string input yields an empty string. Otherwise the text is lowercased
    and the patterns below are each replaced by a single space, in order,
    before leading/trailing whitespace is stripped.
    """
    if not isinstance(text, str):
        return ""

    # Order matters: tags and URLs must go before their punctuation is wiped.
    patterns_in_order = (
        r'<[^>]*>',                  # HTML tags
        r'https?://\S+|www\.\S+',    # URLs
        r'[^a-zA-Z\s]',              # digits & symbols (anything not a letter/whitespace)
        r'\s+',                      # runs of whitespace
    )

    cleaned = text.lower()
    for pattern in patterns_in_order:
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.strip()

# Clean every article and store the result next to the raw text
df_article['cleaned_text'] = df_article['article_text'].apply(clean_text)

# Plain-text before/after preview of the first rows
print("🧹 Perbandingan Sebelum & Sesudah Cleaning:")
print(df_article.loc[:, ['article_text', 'cleaned_text']].head())


🧹 Perbandingan Sebelum & Sesudah Cleaning:
                                        article_text  \
0  REPUBLIKA.ID, JAKARTA — Sudah tiga bulan, Didi...   
1  JAKARTA, KOMPAS.com - Pemerintah menetapkan di...   
2  WARTAKOTALIVE.COM, JAKARTA -Hadirnya Jaklingko...   
3  Jakarta: Seluruh layanan TransJakarta dan Jakl...   
4  Pemerintah Kota (Pemkot) Tangerang menyambut b...   

                                        cleaned_text  
0  republika id jakarta sudah tiga bulan didin bu...  
1  jakarta kompas com pemerintah menetapkan disko...  
2  wartakotalive com jakarta hadirnya jaklingko s...  
3  jakarta seluruh layanan transjakarta dan jakli...  
4  pemerintah kota pemkot tangerang menyambut bai...  


## Sentiment Polarity Analysis


In [None]:
!pip install textblob deep-translator --quiet
import textblob

from textblob import TextBlob
from deep_translator import GoogleTranslator
import numpy as np
import time

# Translation helper (best-effort: never raises)
def translate_text(text):
    """Translate ``text`` to English via GoogleTranslator (source='auto').

    If the translator raises, the error is printed and the input text is
    returned unchanged.
    """
    try:
        translator = GoogleTranslator(source='auto', target='en')
        translated = translator.translate(text)
    except Exception as err:
        print("⚠️ Terjemahan gagal:", err)
        return text
    return translated

# Split long text into word-bounded chunks
def split_text_into_batches(text, max_words=500):
    """Return ``text`` as a list of strings of at most ``max_words`` words each.

    Words are obtained by whitespace splitting and re-joined with single
    spaces. Non-string input returns an empty list.
    """
    if not isinstance(text, str):
        return []

    tokens = text.split()
    batches = []
    for start in range(0, len(tokens), max_words):
        chunk = tokens[start:start + max_words]
        batches.append(' '.join(chunk))
    return batches

# Sentiment scoring for long texts
def get_sentiment_polarity_long(text):
    """Return the mean TextBlob polarity of ``text``, processed batch by batch.

    The text is split with ``split_text_into_batches``; each batch is
    translated with ``translate_text`` and scored as the mean polarity of its
    TextBlob sentences. The result is the mean of the per-batch scores.
    Non-string or blank input, or no scored batch, returns 0.0. A batch that
    raises is reported and skipped.
    """
    if not (isinstance(text, str) and text.strip()):
        return 0.0

    batch_means = []
    for chunk in split_text_into_batches(text):
        try:
            translated = translate_text(chunk)
            sentence_scores = [
                sentence.sentiment.polarity
                for sentence in TextBlob(translated).sentences
            ]
            if len(sentence_scores) > 0:
                batch_means.append(np.mean(sentence_scores))
            # Pause between batches (presumably to avoid hammering the translator)
            time.sleep(0.3)
        except Exception as err:
            print("⚠️ Gagal proses batch:", err)

    if not batch_means:
        return 0.0
    return np.mean(batch_means)

# Run the sentiment pipeline over every article
df_article = df_article.dropna(subset=['cleaned_text'])
df_article['polarity'] = df_article['cleaned_text'].apply(get_sentiment_polarity_long)


def label_sentiment(polarity):
    """Map a polarity score to 'positive', 'negative' or 'neutral'.

    A score of exactly 0 is labelled 'neutral' rather than 'negative':
    ``get_sentiment_polarity_long`` returns 0.0 for blank text and for
    articles where no batch could be scored, so treating 0 as negative would
    mislabel those rows. NaN also falls through to 'neutral'.
    """
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'


df_article['sentiment'] = df_article['polarity'].apply(label_sentiment)

print("✅ Analisis sentimen selesai!")
print(df_article[['cleaned_text', 'polarity', 'sentiment']].head(5))


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Analisis sentimen selesai!
                                        cleaned_text  polarity sentiment
0  republika id jakarta sudah tiga bulan didin bu...  0.050000  positive
1  jakarta kompas com pemerintah menetapkan disko...  0.128977  positive
2  wartakotalive com jakarta hadirnya jaklingko s...  0.001058  positive
3  jakarta seluruh layanan transjakarta dan jakli... -0.020100  negative
4  pemerintah kota pemkot tangerang menyambut bai...  0.057143  positive


## Tokenization & Stopwords Removal

In [None]:
# Indonesian stopwords from the NLTK corpus, held as a set for membership tests
stop_words = set()
stop_words.update(stopwords.words('indonesian'))

def tokenize_and_remove_stopwords(text):
    """Tokenise ``text`` and return the tokens worth keeping.

    A token is kept only if it is longer than two characters and is not in
    ``stop_words``. Token order is preserved.
    """
    kept = []
    for word in word_tokenize(text):
        if len(word) <= 2 or word in stop_words:
            continue
        kept.append(word)
    return kept

# Tokenise each cleaned article and store the filtered token list
df_article['tokens'] = df_article['cleaned_text'].apply(tokenize_and_remove_stopwords)

print("🔤 Contoh Tokenisasi:")
# Last expression of the cell: rendered as the preview table
df_article.loc[:, ['cleaned_text', 'tokens']].head()


🔤 Contoh Tokenisasi:


Unnamed: 0,cleaned_text,tokens
0,republika id jakarta sudah tiga bulan didin bu...,"[republika, jakarta, didin, nama, trainee, sop..."
1,jakarta kompas com pemerintah menetapkan disko...,"[jakarta, kompas, com, pemerintah, menetapkan,..."
2,wartakotalive com jakarta hadirnya jaklingko s...,"[wartakotalive, com, jakarta, hadirnya, jaklin..."
3,jakarta seluruh layanan transjakarta dan jakli...,"[jakarta, layanan, transjakarta, jaklingko, di..."
4,pemerintah kota pemkot tangerang menyambut bai...,"[pemerintah, kota, pemkot, tangerang, menyambu..."


## Stemming

In [None]:
# Install Sastrawi kalau belum ada (jalankan sekali saja di Colab)
!pip install Sastrawi

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build the Sastrawi stemmer used for Indonesian text
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Important domain terms that must be kept as-is (never passed to the stemmer)
protect_words = [
    'pemerintah',
    'pelayanan',
    'angkutan',
    'penumpang',
    'integrasi',
    'transjakarta',
    'mikrotrans',
    'jaklingko',
    'tarif',
    'layanan',
    'jakarta',
    'dki',
    'program',
    'masyarakat',
    'warga',
    'operator',
]

# Stemming with an exception list for important terms
def stem_tokens(tokens):
    """Return a new list where each token is stemmed unless it is protected.

    Tokens found in ``protect_words`` are passed through unchanged; every
    other token is replaced by ``stemmer.stem(token)``.
    """
    return [
        word if word in protect_words else stemmer.stem(word)
        for word in tokens
    ]

# Stem the token list of every article
df_article['stemmed_tokens'] = df_article['tokens'].apply(stem_tokens)

# Re-join the stemmed tokens into one space-separated string per article
df_article['stemmed_text'] = df_article['stemmed_tokens'].apply(' '.join)

print("🪶 Contoh hasil stemming (dengan kata penting dipertahankan):")
# Last expression of the cell: rendered as the preview table
df_article.loc[:, ['tokens', 'stemmed_tokens', 'stemmed_text']].head(5)


Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m204.8/209.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
🪶 Contoh hasil stemming (dengan kata penting dipertahankan):


Unnamed: 0,tokens,stemmed_tokens,stemmed_text
0,"[republika, jakarta, didin, nama, trainee, sop...","[republika, jakarta, din, nama, trainee, sopir...",republika jakarta din nama trainee sopir mikro...
1,"[jakarta, kompas, com, pemerintah, menetapkan,...","[jakarta, kompas, com, pemerintah, tetap, disk...",jakarta kompas com pemerintah tetap diskon tar...
2,"[wartakotalive, com, jakarta, hadirnya, jaklin...","[wartakotalive, com, jakarta, hadir, jaklingko...",wartakotalive com jakarta hadir jaklingko angk...
3,"[jakarta, layanan, transjakarta, jaklingko, di...","[jakarta, layanan, transjakarta, jaklingko, he...",jakarta layanan transjakarta jaklingko henti h...
4,"[pemerintah, kota, pemkot, tangerang, menyambu...","[pemerintah, kota, pemkot, tangerang, sambut, ...",pemerintah kota pemkot tangerang sambut rencan...


## Save To CSV

In [None]:
# Requested column order for the exported dataset
final_cols = [
    'No', 'Title', 'Source', 'Date', 'URL',
    'article_text', 'cleaned_text',
    'tokens', 'stemmed_tokens', 'stemmed_text',
    'polarity', 'sentiment', 'label'
]

# Single definition of the output path so the success message always names
# the file that was actually written.
output_path = "cleaned_jaklingko_text_final.csv"

# Keep only the requested columns that exist in the DataFrame, and report the
# ones that are absent instead of dropping them silently (e.g. when an earlier
# cell such as the sentiment step was not run).
available_cols = [c for c in final_cols if c in df_article.columns]
missing_cols = [c for c in final_cols if c not in df_article.columns]
if missing_cols:
    print(f"⚠️ Kolom tidak ditemukan dan dilewati: {missing_cols}")

# 'utf-8-sig' writes a BOM so Excel detects the encoding correctly
df_article[available_cols].to_csv(output_path, index=False, encoding='utf-8-sig')

print(f"✅ File '{output_path}' berhasil disimpan!")
print(f"Kolom disertakan: {available_cols}")
print(f"Total artikel tersimpan: {len(df_article)}")


✅ File 'cleaned_jaklingko_text.csv' berhasil disimpan!
Kolom disertakan: ['No', 'Title', 'Source', 'Date', 'URL', 'article_text', 'cleaned_text', 'tokens', 'stemmed_tokens', 'stemmed_text']
Total artikel tersimpan: 156
