In [94]:
%pip install feedparser



In [95]:
from IPython import get_ipython
from IPython.display import display
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
import feedparser
import pandas as pd
import time
import re
import concurrent.futures


In [96]:
def parse_website(soup, url, source):
    """
    Mengurai objek BeautifulSoup dengan selector h1 untuk judul
    dan mencoba dua selector untuk konten: div.body-paragraph p atau .deskrip-foto.

    Args:
        soup (BeautifulSoup object): Objek BeautifulSoup dari halaman yang di-scrape.
        url (str): URL artikel.
        source (str): Nama sumber website.

    Returns:
        dict or None: Dictionary data artikel jika berhasil, None jika gagal.
    """
    try:
        # Judul - MENGGUNAKAN SELECTOR h1
        judul_tag = soup.select_one("h1")
        if not judul_tag:
            print(f"  ⚠️ Tidak menemukan h1 untuk judul pada {url}")
            return None
        judul = judul_tag.text.strip()

        # Penulis
        penulis_tag = soup.select_one(".read-page--header--author__name.fn")
        penulis = penulis_tag.text.strip() if penulis_tag else "Tidak Ditemukan"

        # Tanggal
        tanggal_tag = soup.select_one(".read-page--header--author__modified-time")
        tanggal_full = tanggal_tag.text.strip() if tanggal_tag else "Tidak Ditemukan"

        match_date = re.search(r"(\d{1,2} \w+ \d{4})", tanggal_full)
        match_time = re.search(r"(\d{1,2}:\d{2})", tanggal_full)

        date = match_date.group(1) if match_date else tanggal_full
        time_ = match_time.group(1) if match_time else ""

        # Konten - class=article-content-body__item-content
        # Asumsi konten ada di dalam div ini dan mungkin terdiri dari beberapa paragraf atau elemen lain
        konten_div = soup.select_one(".article-content-body__item-content")
        if not konten_div:
            print(f"  ⚠️ Tidak menemukan konten artikel (.article-content-body__item-content) untuk {url}")
            # Anda bisa memilih untuk return None atau dictionary dengan konten kosong
            # return None
            isi = ""
        else:
            # Ambil semua teks di dalam elemen konten_div
            isi = konten_div.get_text(separator="\n", strip=True)

        # Kategori dan Sub-kategori
        category = soup.select_one(".read-page--header--subtitle")
        category = category.text.strip() if category else "Tidak Ditemukan"
        sub_category = ""

        return {
            "title": judul,
            "source": source,
            "date": date,
            "time": time_,
            "category": category,
            "sub-category": sub_category,
            "content": isi,
            "url": url,
            "author": penulis,
        }

    except Exception as e:
        print(f"  ❌ Gagal mengurai {url}: {e}")
        return None

# %%
def scrape_berita(url, source):
    """
    Mengambil HTML dan memanggil fungsi parse_website (yang spesifik untuk Kapanlagi).

    Args:
        url (str): URL artikel.
        source (str): Nama sumber website.

    Returns:
        dict or None: Dictionary data artikel jika berhasil di-scrape dan diurai (menggunakan parse_website), None jika gagal.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Referer": "https://www.google.com/",
        "Accept-Language": "id,en-US;q=0.9,en;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    }
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        # Tambahkan pemeriksaan status code
        if resp.status_code != 200:
            print(f"  ⚠️ Status code: {resp.status_code} for {url}")
            return None

        # Buat objek BeautifulSoup
        soup = BeautifulSoup(resp.content, "html.parser")

        # LANGSUNG PANGGIL FUNGSI parse_website
        # INGAT: Fungsi ini dirancang untuk KAPANLAGI.COM dan kemungkinan besar GAGAL untuk situs lain
        return parse_website(soup, url, source)

    except requests.exceptions.Timeout:
        print(f"  ❌ Timeout saat mengambil {url}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"  ❌ Error saat mengambil {url}: {e}")
        return None
    except Exception as e:
        # Tangani error lain selama proses
        print(f"  ❌ Gagal memproses HTML dari {url}: {e}")
        return None

# %%
def full_rss_scrape(list_of_rss_urls, output_csv="berita.csv", max_articles=1000):
    import xml.etree.ElementTree as ET  # Tambahan untuk sitemap XML
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Referer": "https://www.google.com/",
    }

    urls = set()
    print("➡️ Mengumpulkan link dari RSS/Sitemap...")

    for rss_url in list_of_rss_urls:
        try:
            print(f"  Mengambil dari: {rss_url}")
            resp = requests.get(rss_url, headers=headers, timeout=10)
            content = resp.content.decode("utf-8")

            # Cek apakah ini RSS feed biasa atau sitemap XML (berdasarkan root tag)
            if "<rss" in content or "<feed" in content:
                # ➤ Tangani sebagai RSS standar
                feed = feedparser.parse(content)
                for entry in feed.entries:
                    article_link = entry.get('link') or entry.get('loc')
                    if article_link:
                        urls.add(article_link)
            elif "<urlset" in content:
                # ➤ Tangani sebagai Sitemap XML
                soup = BeautifulSoup(content, "xml")
                loc_tags = soup.find_all("loc")
                for tag in loc_tags:
                    if tag.text.strip():
                        urls.add(tag.text.strip())
            else:
                print(f"  ⚠️ Tidak dikenali sebagai RSS atau Sitemap: {rss_url}")

            time.sleep(1)

        except Exception as e:
            print(f"  ❌ Gagal ambil dari {rss_url}: {e}")

    urls_to_scrape = list(urls)[:max_articles]
    print(f"✅ Total link unik yang berhasil dikumpulkan: {len(urls)}")
    print(f"➡️ Akan men-scrape {len(urls_to_scrape)} artikel...")

    def scrape_wrapper(args):
        url, source = args
        print(f"  Scraping: {url} | Source: {source}")
        return scrape_berita(url, source)

    berita_list = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        args_list = [(url, urlparse(url).netloc) for url in urls_to_scrape]
        results = list(executor.map(scrape_wrapper, args_list))
        for data in results:
            if data:
                berita_list.append(data)

    df = pd.DataFrame(berita_list, columns=[
        "title", "source", "date", "time", "category", "sub-category", "content", "url", "author"
    ])

    print(f"➡️ Menyimpan hasil ke {output_csv}...")
    df.to_csv(output_csv, index=False)
    print(f"✅ CSV disimpan! Jumlah artikel berhasil di-scrape: {len(df)}")

    return df


In [97]:
# Contoh penggunaan fungsi full_rss_scrape:

# Ambil daftar link RSS dari file CSV (misalnya hanya untuk 'tribun')
dataset = pd.read_csv('https://raw.githubusercontent.com/gikirima/indonews-scrapper/refs/heads/main/link_scrapping.csv')
tribun_rss_links = dataset[dataset['website'] == 'tribun']['link'].tolist()
sindonews_rss_links = dataset[dataset['website'] == 'sindonews']['link'].tolist()
liputan6_rss_links = dataset[dataset['website'] == 'liputan6']['link'].tolist()
detik_rss_links = dataset[dataset['website'] == 'detik']['link'].tolist()
kapanlagi_rss_links = dataset[dataset['website'] == 'kapanlagi']['link'].tolist()
fimela_rss_links = dataset[dataset['website'] == 'fimela']['link'].tolist()
okezone_rss_links = dataset[dataset['website'] == 'okezone']['link'].tolist()
posmetro_rss_links = dataset[dataset['website'] == 'posmetro']['link'].tolist()
kompas_rss_links = dataset[dataset['website'] == 'kompas']['link'].tolist()
republika_rss_links = dataset[dataset['website'] == 'republika']['link'].tolist()
tempo_rss_links = dataset[dataset['website'] == 'tempo']['link'].tolist()

In [98]:
# df_sindonews = full_rss_scrape(sindonews_rss_links, output_csv="berita_sindonews.csv", max_articles=500)
# df_liputan6 = full_rss_scrape(liputan6_rss_links, output_csv="berita_liputan6.csv", max_articles=500)
# df_detik = full_rss_scrape(detik_rss_links, output_csv="berita_detik", max_articles=500)
# df_kapanlagi = full_rss_scrape(kapanlagi_rss_links, output_csv="berita_kapanlagi.csv", max_articles=500)
df_fimela = full_rss_scrape(fimela_rss_links, output_csv="berita_fimela.csv", max_articles=6000)
# df_okezone = full_rss_scrape(okezone_rss_links, output_csv="berita_okezone.csv", max_articles=500)
# df_posmetro = full_rss_scrape(posmetro_rss_links, output_csv="berita_posmetro.csv", max_articles=500)
# df_kompas = full_rss_scrape(kompas_rss_links, output_csv="berita_kompas.csv", max_articles=500)
# df_republika = full_rss_scrape(republika_rss_links, output_csv="berita_republika.csv", max_articles=500)
# df_tempo = full_rss_scrape(tempo_rss_links, output_csv="berita_tempo.csv", max_articles=500)

➡️ Mengumpulkan link dari RSS/Sitemap...
  Mengambil dari: https://www.fimela.com/sitemap_news.xml
  Mengambil dari: https://www.fimela.com/beauty/sitemap_news.xml
  Mengambil dari: https://www.fimela.com/entertainment/sitemap_news.xml
  Mengambil dari: https://www.fimela.com/fashion/sitemap_news.xml
  Mengambil dari: https://www.fimela.com/lifestyle/sitemap_news.xml
  Mengambil dari: https://www.fimela.com/food/sitemap_news.xml
✅ Total link unik yang berhasil dikumpulkan: 5020
➡️ Akan men-scrape 5020 artikel...
  Scraping: https://www.fimela.com/beauty/read/5948427/kulit-cerah-alami-6-tips-memilih-produk-perawatan-kulit-yang-sesuai | Source: www.fimela.com
  Scraping: https://www.fimela.com/lifestyle/read/6038071/5-desain-rumah-split-level-sederhana-yang-nyaman-dan-modern-bisa-jadi-inspirasi-hunian-idaman | Source: www.fimela.com
  Scraping: https://www.fimela.com/entertainment/read/6022967/maudy-ayunda-tampil-effortless-tapi-tetap-artsy-lewat-padu-padan-minimalis-ini | Source: www.fi

KeyboardInterrupt: 