Proyek ini bertujuan untuk melakukan text summarization otomatis terhadap berita-berita mengenai Perum DAMRI dari berbagai sumber daring.
Proses meliputi tahapan scraping data, preprocessing, embedding berbasis MiniLM, serta pembuatan ringkasan (summary) dengan dua pendekatan:

* Semantic Similarity (MiniLM)
* TF-IDF Extractive Summarization

# Data Preparation -> Clean Data

In [None]:
!pip install trafilatura

## Data Pre-processing

### Data Loading dan Persiapan Awal

In [None]:
import pandas as pd
from time import sleep
import random

In [None]:
# Input: CSV listing the article links (columns read here: 'link', 'sumber').
# Output: CSV that receives the scraped articles.
input_csv = 'data/damri_article/data_raw/Kelompok1_Link Berita_DAMRI - Data.csv'
output_csv = 'data/damri_article/data_processed/scraped_articles.csv'

df_urls = pd.read_csv(input_csv)

# Filter whole rows so that urls[i] and sources[i] always come from the same
# spreadsheet row. Dropping NaN from each column independently shifts one list
# against the other as soon as a single cell is empty.
df_links = df_urls.dropna(subset=['link'])
urls = df_links['link'].astype(str).tolist()
# A missing source must not discard the link itself; keep it with an empty source.
sources = df_links['sumber'].fillna('').astype(str).tolist()

In [None]:
import re
from bs4 import BeautifulSoup

# Site boilerplate words to strip. Word boundaries keep the pattern from
# biting into longer words that merely contain one of them (e.g. "periklanan").
_BOILERPLATE_PATTERN = re.compile(
    r'\b(?:Cookies|Setuju|Kebijakan\s+Privasi|Iklan|Advertisement|Copyright)\b',
    flags=re.IGNORECASE,
)


def clean_text(html_text):
    """Return *html_text* as plain text with markup and site boilerplate removed.

    Args:
        html_text: HTML or plain text. Anything that is not a string
            (e.g. NaN from a DataFrame) yields an empty string.

    Returns:
        The text with tags stripped, boilerplate words removed and every run of
        whitespace collapsed to a single space.
    """
    if not isinstance(html_text, str):
        return ""
    text = BeautifulSoup(html_text, "html.parser").get_text(separator=" ")
    text = _BOILERPLATE_PATTERN.sub('', text)
    # Collapse whitespace after the removal so that deleted words do not leave
    # double spaces behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


### Web Scraping Berita Menggunakan Trafilatura

In [None]:
import requests
import trafilatura

def scrape_content(url):
    """Download *url* and extract the article text and title with trafilatura.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys "url", "title", "content" and "error". On success
        "error" is None and "content" holds the stripped article text. On
        failure (non-200 status, empty extraction, or any Exception raised
        while fetching or parsing) "title" and "content" are None and "error"
        describes the problem.
    """
    def outcome(title=None, content=None, error=None):
        # Single place that fixes the key set and key order of every result.
        return {
            "url": url,
            "title": title,
            "content": content,
            "error": error,
        }

    # Browser-like User-Agent sent with the request.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )

    try:
        response = requests.get(url, headers={"User-Agent": user_agent}, timeout=10)
        if response.status_code != 200:
            return outcome(error=f"Status code: {response.status_code}")

        article = trafilatura.extract(
            response.text,
            include_comments=False,
            include_links=False,
            include_tables=False,
            deduplicate=True,
        )
        if not article or not article.strip():
            return outcome(error="Konten kosong")

        metadata = trafilatura.extract_metadata(response.text)
        title = None
        if metadata and metadata.title:
            title = metadata.title

        return outcome(title=title, content=article.strip())

    except Exception as exc:
        # Best effort: one failing URL must not abort the whole batch.
        return outcome(error=str(exc))


### Proses Scraping Secara Batch

In [None]:
# Scrape every (url, source) pair in order and collect one result dict per URL.
results = []
total = len(urls)
for i, (url, sumber) in enumerate(zip(urls, sources), start=1):
    print(f"[{i}/{total}] Scraping: {url}")
    # Attach the source name to the scrape result before storing it.
    data = {**scrape_content(url), 'sumber': sumber}
    results.append(data)
    # Random 1-2 second pause between requests.
    sleep(random.uniform(1, 2))

In [None]:
# One row per scraped URL; columns come from the keys of the result dicts.
df_result = pd.DataFrame(results)
# Written without the index, UTF-8 with BOM ('utf-8-sig').
df_result.to_csv(output_csv, index=False, encoding='utf-8-sig')

In [None]:
# Report where the results were written and how many rows have non-null content.
print(f"Scraping selesai! Hasil disimpan di: {output_csv}")
print(f"Total artikel berhasil: {df_result['content'].notna().sum()} dari {len(df_result)}")


In [None]:
# Rows without an error message, and the distinct sources they came from.
df_ok = df_result.loc[df_result["error"].isna()]
df_ok["sumber"].unique().tolist()

In [None]:
# Rows that carry an error message, and the distinct sources they came from.
df_error = df_result.loc[df_result["error"].notna()]
df_error["sumber"].unique().tolist()

In [None]:
# Take an independent copy of the error-free rows so that adding columns does
# not write into a slice of df_result.
df_ok = df_result.loc[df_result["error"].isna()].copy()
# Cleaned version of each article body (see clean_text).
df_ok["content_cleaned"] = df_ok["content"].apply(clean_text)

### Pembersihan dan Tokenisasi Teks

In [None]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize


def _to_sentences(text):
    """Split *text* into sentences; anything that is not a string yields []."""
    if not isinstance(text, str):
        return []
    return sent_tokenize(text)


df_ok = df_ok.copy()

# One list of sentences per article, derived from the cleaned text.
df_ok["content_cleaned_tokenize"] = df_ok["content_cleaned"].apply(_to_sentences)

In [None]:
# Show the sentence list of the second successfully scraped article.
# Positional access is required: df_ok keeps the row labels of df_result, so
# the label 1 does not exist whenever that row failed to scrape.
df_ok["content_cleaned_tokenize"].iloc[1]

## Semantic Embedding dengan MiniLM

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# NOTE(review): the articles look Indonesian while this checkpoint is, to my
# knowledge, trained on English data - confirm whether a multilingual MiniLM
# variant would be a better fit.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

df_ok = df_ok.copy()

# Re-join each article's sentences into one string; values that are not lists
# are stringified as-is.
df_ok["text_for_embedding"] = [
    " ".join(sentences) if isinstance(sentences, list) else str(sentences)
    for sentences in df_ok["content_cleaned_tokenize"]
]

# One embedding per article, encoded in batches of 16.
texts = df_ok["text_for_embedding"].tolist()
embeddings = model.encode(texts, convert_to_tensor=True, batch_size=16, show_progress_bar=True)

# Store one row of the result tensor per DataFrame row.
df_ok["embeddings"] = list(embeddings)
df_ok[["sumber", "title", "embeddings"]].head()


In [None]:
import numpy as np

# Walk through the sentence-level pipeline on one article: the first row of df_ok.
sample_sentences = df_ok.iloc[0]["content_cleaned_tokenize"]
# Encode each sentence of that article; the result is returned as a tensor.
sentence_embeddings = model.encode(sample_sentences, convert_to_tensor=True)
print(sentence_embeddings)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all sentences of the sample article.
similarity_matrix = cosine_similarity(sentence_embeddings.cpu().numpy())
print(similarity_matrix)

# Score of a sentence = its mean similarity to every sentence (itself included).
sentence_scores = similarity_matrix.mean(axis=1)


In [None]:
top_n = 3

# Take the top_n best-scoring sentences, then restore document order so the
# summary reads in the order the article was written.
top_sentence_indices = np.sort(np.argsort(sentence_scores)[-top_n:])

summary = " ".join(sample_sentences[i] for i in top_sentence_indices)
print(summary)


## Extractive Summarization berbasis Semantic Similarity

In [None]:
def summarize_text(sentences, model, top_n=3):
    """Build an extractive summary from the most central sentences.

    Each sentence is scored by its mean cosine similarity to all sentences of
    the same text; the top_n best-scoring sentences are returned in their
    original order.

    Args:
        sentences: List of sentence strings of one document.
        model: Object with an ``encode(sentences, convert_to_tensor=True)``
            method whose result supports ``.cpu().numpy()``.
        top_n: Maximum number of sentences in the summary.

    Returns:
        The selected sentences joined by single spaces; "" when there are no
        sentences or top_n is not positive.
    """
    # top_n <= 0 must be handled explicitly: the slice [-0:] would otherwise
    # select every sentence instead of none.
    if not sentences or top_n <= 0:
        return ""
    # Every sentence would be selected anyway, so skip the model call.
    if len(sentences) <= top_n:
        return " ".join(sentences)
    embeddings = model.encode(sentences, convert_to_tensor=True)
    sim = cosine_similarity(embeddings.cpu().numpy())
    scores = sim.mean(axis=1)
    # Best top_n indices, put back into document order.
    top_idx = np.sort(np.argsort(scores)[-top_n:])
    return " ".join(sentences[i] for i in top_idx)

# Summarise every article: the sentence list is passed first, followed by the
# shared model; top_n is forwarded as a keyword argument.
df_ok["summary"] = df_ok["content_cleaned_tokenize"].apply(
    summarize_text, args=(model,), top_n=3
)


In [None]:
# Preview the first rows, now including the generated summaries.
df_ok.head()

In [None]:
# Export the similarity-based summaries as a ';'-separated CSV. The DataFrame
# index is written as the first column because index=False is not passed.
df_ok[["url", "title", "content", "summary"]].to_csv("data/damri_article/data_processed/data_extraction_minilm_summary.csv", sep=";")

## TF-IDF Based Summarization

In [None]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re

nltk.download('punkt', quiet=True)

def tfidf_summarize(text, num_sentences=3, stop_words='english'):
    """Build an extractive summary from the sentences with the highest TF-IDF mass.

    Args:
        text: Raw article text. Non-string or blank input yields "".
        num_sentences: Maximum number of sentences in the summary.
        stop_words: Forwarded to ``TfidfVectorizer(stop_words=...)``. The
            default keeps the previous behaviour.
            NOTE(review): the articles look Indonesian, so the English list
            probably filters very little - consider passing an Indonesian
            stop-word list or None.

    Returns:
        The selected sentences in document order, joined by single spaces.
        "" is returned for empty input or a non-positive num_sentences.
    """
    # num_sentences <= 0 must be rejected here: the slice [-0:] further down
    # would otherwise select every sentence instead of none.
    if not isinstance(text, str) or not text.strip() or num_sentences <= 0:
        return ""

    text = clean_text(text)
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return ""
    if len(sentences) <= num_sentences:
        return " ".join(sentences)

    # Scoring copy of the sentences: only ASCII letters, digits and basic
    # punctuation are kept. The summary is built from the untouched originals.
    clean_sentences = [
        re.sub(r'\s+', ' ', re.sub(r'[^a-zA-Z0-9.,!? ]', '', s)).strip()
        for s in sentences
    ]

    vectorizer = TfidfVectorizer(stop_words=stop_words)
    try:
        tfidf_matrix = vectorizer.fit_transform(clean_sentences)
    except ValueError:
        # Raised when no token survives (empty vocabulary). Without scores,
        # fall back to the leading sentences instead of crashing.
        top_indices = range(num_sentences)
    else:
        # Sum of TF-IDF weights per sentence. np.asarray(...).ravel() works for
        # both the np.matrix and the ndarray that sparse .sum() can return.
        sentence_scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()
        # Best num_sentences indices, put back into document order.
        top_indices = np.sort(np.argsort(sentence_scores)[-num_sentences:])

    summary = " ".join(sentences[i] for i in top_indices)
    return re.sub(r'\s+', ' ', summary).strip()

# TF-IDF summary of the raw article text; num_sentences is forwarded to
# tfidf_summarize as a keyword argument.
df_ok["tf_idf_summary"] = df_ok["content"].apply(tfidf_summarize, num_sentences=3)

df_ok[["url", "title", "tf_idf_summary"]].head()


In [None]:
# The TF-IDF export gets its own file name so that it no longer overwrites the
# MiniLM summary export written earlier in this notebook.
# Forward slashes are used like in the other paths of this notebook: they work
# on Windows as well and avoid the invalid "\d" escape of the old literals.
df_ok[["url", "title", "content", "tf_idf_summary"]].to_csv("data/damri_article/data_processed/data_extraction_tfidf_summary.csv", sep=";")
# Full table with every column.
# NOTE(review): this includes the "embeddings" column, whose tensors are
# written as their text representation - confirm that this is intended.
df_ok.to_csv("data/damri_article/data_processed/data_extraction_summary[full].csv", sep=";")

In [None]:
# Compare both summaries of the first successfully scraped article.
# Positional access is required: df_ok keeps the row labels of df_result, so
# the label 0 does not exist whenever the first URL failed to scrape.
print(df_ok["summary"].iloc[0])
print(df_ok["tf_idf_summary"].iloc[0])

# Sentiment Exploration

## Data Preparation

### Load Dataset

In [None]:
# Forward slashes are used like in the other paths of this notebook: they work
# on Windows as well and avoid the invalid "\d" escape of the old literal.
path_guess = "data/damri_article/data_processed/data_extraction_summary[full].csv"
# NOTE(review): on_bad_lines="skip" silently drops rows that fail to parse;
# compare len(df) against the number of exported rows.
df = pd.read_csv(path_guess, delimiter=";", engine="python", on_bad_lines="skip")

# Empty summaries are read back as NaN. Blank them before the string
# conversion so they do not become the literal text "nan".
df['summary'] = df['summary'].fillna('').astype(str).str.strip()
print('Dataset loaded. Rows:', len(df))
df.head(3)

### Labeling (Keyword-based)

In [None]:
import re

# Keywords that vote for each information aspect. Matching is done on the
# lower-cased text.
label_keywords = {
    "jadwal": ["jadwal", "berangkat", "keberangkatan", "waktu", "operasional", "pukul", "beroperasi"],
    "rute": ["rute", "trayek", "jalur", "pemberhentian", "melayani", "ke"],
    "harga": ["harga", "tiket", "tarif", "biaya", "rp", "promo", "diskon", "gratis"],
    "layanan": ["layanan", "fasilitas", "kenyamanan", "armada", "pembayaran", "aplikasi", "pelayanan"],
    "umum": ["damri", "mobilisasi", "masyarakat", "program", "dukungan", "pengumuman"]
}

# Keywords shorter than this are only counted as standalone tokens. As plain
# substrings "ke" and "rp" occur inside unrelated words ("kenyamanan",
# "terpadu") and would vote for "rute" / "harga" on almost every text.
_MIN_SUBSTRING_KEYWORD_LEN = 3


def _keyword_pattern(keyword):
    """Compile the matcher for one keyword (token match for short keywords)."""
    escaped = re.escape(keyword)
    if len(keyword) < _MIN_SUBSTRING_KEYWORD_LEN:
        # [^\W\d_] is "a letter": the keyword must not touch a letter on either
        # side, while digits may follow directly (e.g. "rp10.000").
        return re.compile(rf"(?<![^\W\d_]){escaped}(?![^\W\d_])")
    # Longer keywords keep substring semantics so that affixed forms still count.
    return re.compile(escaped)


_KEYWORD_PATTERNS = {
    label: [_keyword_pattern(kw) for kw in kws]
    for label, kws in label_keywords.items()
}


def label_aspek_informasi(text):
    """Return the aspect label whose keywords occur most often in *text*.

    Every keyword found at least once adds one point to its label. Ties go to
    the label listed first in ``label_keywords``; a text without any keyword is
    labelled "umum".

    Args:
        text: Text to classify; it is converted with ``str()`` and lower-cased.

    Returns:
        One of the keys of ``label_keywords``.
    """
    t = str(text).lower()
    scores = {
        label: sum(1 for pattern in patterns if pattern.search(t))
        for label, patterns in _KEYWORD_PATTERNS.items()
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "umum"

# Label every summary and show the resulting class distribution.
df['aspek_informasi_auto'] = df['summary'].apply(label_aspek_informasi)
print(df['aspek_informasi_auto'].value_counts())

# Save for manual checking. The path is kept in one variable so that the log
# message always reports the location that was actually written.
labeled_auto_path = "data/damri_article/data_processed/data_extraction_labeled_auto.csv"
df.to_csv(labeled_auto_path, index=False)
print(f"Saved auto-labeled CSV to {labeled_auto_path}")

### Review

In [None]:
import os

# A manually reviewed label file, when present, replaces the auto-labelled df.
review_path = "data/damri_article/data_processed/data_extraction_labeled_reviewed.csv"
has_review = os.path.exists(review_path)
if not has_review:
    print("No reviewed file found; using auto labels.")
else:
    df = pd.read_csv(review_path, engine="python", on_bad_lines="skip")
    print("Loaded reviewed labels.")

## Augmentation

In [None]:
import random

# Small hand-written synonym lexicon: lower-case word -> possible replacements.
syn_lex = {
    "harga": ["tarif","biaya","ongkos"],
    "tiket": ["karcis","tiket perjalanan"],
    "berangkat": ["berlepas","mulai berangkat"],
    "layanan": ["pelayanan","service"],
    "fasilitas": ["sarana","prasarana"],
    "armada": ["kendaraan","bus"],
    "gratis": ["bebas biaya"],
    "rute": ["jalur","trayek"]
}

# Punctuation ignored at the edges of a word when looking it up in syn_lex.
_EDGE_PUNCTUATION = '.,;:?!'


def synonym_replace(text, n_repl=1):
    """Replace up to *n_repl* randomly chosen words with a synonym from syn_lex.

    Args:
        text: Input text; words are separated on whitespace.
        n_repl: Maximum number of words to replace.

    Returns:
        The words re-joined with single spaces. Punctuation attached to a
        replaced word is kept. Text without any whitespace-separated word is
        returned unchanged.
    """
    words = text.split()
    if not words:
        return text
    # Visit the words in random order so the replaced positions vary per call.
    idxs = list(range(len(words)))
    random.shuffle(idxs)
    remaining = n_repl
    for i in idxs:
        if remaining <= 0:
            break
        word = words[i]
        core = word.strip(_EDGE_PUNCTUATION)
        synonyms = syn_lex.get(core.lower())
        if not synonyms:
            continue
        # Re-attach the punctuation that surrounded the original word.
        start = len(word) - len(word.lstrip(_EDGE_PUNCTUATION))
        prefix, suffix = word[:start], word[start + len(core):]
        words[i] = f"{prefix}{random.choice(synonyms)}{suffix}"
        remaining -= 1
    return " ".join(words)

def random_deletion(text, p=0.1):
    """Drop each word of *text* independently with probability *p*.

    A word is kept when a fresh ``random.random()`` draw is greater than *p*.
    If every word would be dropped, the original text is returned unchanged.
    """
    kept = []
    for token in text.split():
        if random.random() > p:
            kept.append(token)
    if not kept:
        return text
    return " ".join(kept)

def random_swap(text):
    """Swap two randomly chosen words of *text*.

    Text with fewer than two whitespace-separated words is returned unchanged;
    otherwise the words are re-joined with single spaces.
    """
    tokens = text.split()
    if len(tokens) >= 2:
        first, second = random.sample(range(len(tokens)), 2)
        tokens[first], tokens[second] = tokens[second], tokens[first]
        return " ".join(tokens)
    return text

def augment_text(text):
    """Return three augmented variants of *text*.

    The variants are produced, in this order, by synonym_replace,
    random_deletion and random_swap.
    """
    variants = []
    for transform in (synonym_replace, random_deletion, random_swap):
        variants.append(transform(text))
    return variants

# Build augmented dataset: every class with fewer than min_samples_per_class
# rows is topped up with augmented copies of its own rows.
# NOTE(review): the augmented rows are created before the train/test split
# further down, so variants of one article can land on both sides of the
# split - confirm whether the evaluation should augment the training part only.
min_samples_per_class = 20
label_col = 'aspek_informasi_auto'
aug_rows = []

for label, group in df.groupby(label_col):
    needed = min_samples_per_class - len(group)
    if needed <= 0:
        continue
    i = 0
    while needed > 0:
        # Cycle through the rows of this class. The whole source row is copied
        # so that the other columns of an augmented row belong to the same
        # article as its summary.
        source_row = group.iloc[i % len(group)]
        for aug in augment_text(str(source_row['summary'])):
            new_row = source_row.copy()
            new_row['summary'] = aug
            new_row[label_col] = label
            aug_rows.append(new_row)
            needed -= 1
            if needed <= 0:
                break
        i += 1

if aug_rows:
    df_augmented = pd.concat([df, pd.DataFrame(aug_rows)], ignore_index=True)
else:
    # Nothing to add: every class already has enough rows.
    df_augmented = df.reset_index(drop=True)
print("Label distribution after augmentation:")
print(df_augmented[label_col].value_counts())
df_augmented.to_csv("data/damri_article/data_processed/data_extraction_labeled_augmented.csv", index=False)

## Baseline Model 
TF-IDF + Naive Bayes & Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Features are the summaries, targets are the aspect labels.
# NOTE(review): df_augmented already contains the augmented rows, so variants
# of one article can appear in both the training and the test part.
X = df_augmented['summary']
y = df_augmented[label_col]

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


def _tfidf_pipeline(classifier):
    """Pair the shared TF-IDF settings (uni+bigrams, 5000 features) with *classifier*."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1,2), max_features=5000)),
        ('clf', classifier)
    ])


pipe_nb = _tfidf_pipeline(MultinomialNB())
pipe_lr = _tfidf_pipeline(LogisticRegression(max_iter=1000))

print("Training Naive Bayes...")
pipe_nb.fit(X_train, y_train)

print("Training Logistic Regression...")
pipe_lr.fit(X_train, y_train)

# Evaluation on the held-out part.
y_pred_nb = pipe_nb.predict(X_test)
y_pred_lr = pipe_lr.predict(X_test)

for header, predictions in (
    ("\n=== MultinomialNB ===", y_pred_nb),
    ("\n=== LogisticRegression ===", y_pred_lr),
):
    print(header)
    print(classification_report(y_test, predictions, digits=4))

## Check Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Fix the label order so that matrix rows/columns and tick labels line up.
labels_sorted = sorted(df_augmented[label_col].unique())
cm = confusion_matrix(y_test, y_pred_lr, labels=labels_sorted)

plt.figure(figsize=(8,6))
# Draw the counts as an annotated heatmap and label the returned axes directly.
ax = sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels_sorted, yticklabels=labels_sorted)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix (Logistic Regression)')
plt.show()