In [30]:
%pip install feedparser

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [31]:
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
import feedparser
import pandas as pd
import time
import re
import concurrent.futures


In [32]:
def scrape_berita(url, source):
    """Download one article page and extract its title, date, category and body.

    Args:
        url: Address of the article page to download.
        source: Label stored unchanged in the "source" field of the result.

    Returns:
        A dict with the keys "title", "source", "date", "time", "category",
        "sub-category", "content" and "url", or None when the response status
        is not 200, the page has no <h1>, no element matches
        ".side-article p", or any exception is raised along the way. A short
        diagnostic is printed in every failure case.
    """
    # Browser-like request headers sent with the page request.
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Referer": "https://www.google.com/",
        "Accept-Language": "id,en-US;q=0.9,en;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    }

    try:
        response = requests.get(url, headers=request_headers, timeout=10)
        if response.status_code != 200:
            print(f"  ⚠️ Status code: {response.status_code}")
            return None

        page = BeautifulSoup(response.content, "html.parser")

        # The first <h1> on the page is taken as the headline.
        heading = page.select_one("h1")
        if not heading:
            print("  ⚠️ Tidak menemukan tag <h1>")
            return None

        paragraphs = page.select(".side-article p")
        if not paragraphs:
            print("  ⚠️ Tidak menemukan isi artikel")
            return None

        # Article body: every non-blank paragraph, one per line.
        body_lines = []
        for paragraph in paragraphs:
            text = paragraph.text.strip()
            if text:
                body_lines.append(text)

        # Date and time: prefer a <time> element, fall back to ".date".
        date_node = page.select_one("time") or page.select_one(".date")
        if date_node:
            raw_date = date_node.text.strip()
        else:
            raw_date = "Tidak ditemukan"

        # Looks for a numeric N/N/NNNN date, optionally followed by H:MM.
        # When nothing matches, the whole date text is kept as the date.
        found = re.search(r"(\d{1,2}/\d{1,2}/\d{4})[ ,]*(\d{1,2}:\d{2})?", raw_date)
        if found:
            date_part = found.group(1)
            time_part = found.group(2) or ""
        else:
            date_part = raw_date
            time_part = ""

        # Breadcrumb: the 2nd and 3rd links become category / sub-category
        # (the 1st link is skipped -- presumably the home link; verify).
        crumbs = page.select(".breadcrumb li a")
        category = ""
        sub_category = ""
        if len(crumbs) > 1:
            category = crumbs[1].text.strip()
        if len(crumbs) > 2:
            sub_category = crumbs[2].text.strip()

        return {
            "title": heading.text.strip(),
            "source": source,
            "date": date_part,
            "time": time_part,
            "category": category,
            "sub-category": sub_category,
            "content": "\n".join(body_lines),
            "url": url,
        }

    except Exception as err:
        # Best effort: a failing page is reported and skipped, never raised.
        print(f"  ❌ Gagal scraping {url}: {err}")
        return None

In [33]:
# --- Fetch RSS feeds and collect article links ---
# Reads link_scrapping.csv (columns `website` and `link`) and uses the rows
# labelled 'tribun' as the list of RSS feed URLs.
dataset = pd.read_csv('link_scrapping.csv')
tribun_rss = dataset[dataset['website'] == 'tribun']['link'].tolist()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Referer": "https://www.google.com/",
}

MAX_URLS = 1000  # upper bound on the number of article links kept for scraping

urls = set()  # a set, so a link that appears in several feeds is kept once
for rss_url in tribun_rss:
    try:
        print(f"Mengambil dari: {rss_url}")
        resp = requests.get(rss_url, headers=headers, timeout=10)
        # Treat 4xx/5xx as a failure instead of parsing an error page as an
        # empty feed.
        resp.raise_for_status()
        feed = feedparser.parse(resp.text)
        for entry in feed.entries:
            # An entry without a link is skipped; it must not abort the
            # remaining entries of the feed.
            link = entry.get("link")
            if link:
                urls.add(link)
    except Exception as e:
        # Best effort: report the failing feed and continue with the next one.
        print(f"❌ Gagal ambil dari {rss_url}: {e}")
    finally:
        # Pause between feeds, including after a failed request.
        time.sleep(1)

# Sort before truncating: slicing a set directly keeps an arbitrary subset
# that can differ from one kernel session to the next.
urls = sorted(urls)[:MAX_URLS]
print(f"✅ Total link yang berhasil dikumpulkan: {len(urls)}")


# --- Scraping Paralel ---
def scrape_wrapper(args):
    """Unpack one ``(url, source)`` pair, log it, and scrape that article.

    Takes a single tuple argument so it can be passed to ``executor.map``.
    Returns whatever ``scrape_berita`` returns for the pair.
    """
    target_url, origin = args
    print(f"Scraping: {target_url} | Source: {origin}")
    return scrape_berita(target_url, origin)

# Pair every collected link with its host name, which is stored as the source.
args_list = [(url, urlparse(url).netloc) for url in urls]

# Scrape on a pool of 10 threads; executor.map yields results in input order.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(scrape_wrapper, args_list))

# Failed scrapes come back as None and are dropped here.
berita_list = list(filter(None, results))

# --- Save to CSV ---
csv_columns = [
    "title", "source", "date", "time", "category", "sub-category", "content", "url"
]
df = pd.DataFrame(berita_list, columns=csv_columns)
df.to_csv("berita.csv", index=False)
print("✅ CSV disimpan! Jumlah artikel:", len(df))

Mengambil dari: https://www.tribunnews.com/rss
Mengambil dari: https://medan.tribunnews.com/rss
Mengambil dari: https://jabar.tribunnews.com/rss
Mengambil dari: https://surabaya.tribunnews.com/rss
Mengambil dari: http://jabar.tribunnews.com/rss
Mengambil dari: https://wartakota.tribunnews.com/rss
Mengambil dari: https://jogja.tribunnews.com/rss
Mengambil dari: http://jateng.tribunnews.com/rss
Mengambil dari: http://aceh.tribunnews.com/rss
Mengambil dari: https://www.kompas.com/rss
Mengambil dari: https://regional.kompas.com/rss
✅ Total link yang berhasil dikumpulkan: 160
Scraping: https://www.tribunnews.com/bisnis/2025/06/10/adupi-desak-regulasi-baru-untuk-daur-ulang-soroti-urgensi-inovasi-dan-ekonomi-sirkular | Source: www.tribunnews.com
Scraping: https://medan.tribunnews.com/2025/06/10/kontroversi-kapal-jkw-dan-dewi-iriana-di-pertambangan-nikel-raja-ampat-begini-tanggapan-bahlil | Source: medan.tribunnews.com
Scraping: https://wartakota.tribunnews.com/2025/06/10/suarakan-penolakan-ta