## Article Scraping

In [15]:
!pip install feedparser pandas dateparser --quiet

import feedparser
import pandas as pd
import hashlib
from datetime import datetime
from urllib.parse import quote, urlparse, parse_qs
import dateparser
import json

def get_real_url(possible_redirect_link):
    """Return the target of a redirect-style link, or the link unchanged.

    Google News RSS links may take a redirect form such as
    ``https://news.google.com/articles/CAIi...&url=https://actual.site/...``.
    When the link's query string carries a non-empty ``url`` parameter, its
    first value is returned; otherwise (including when parsing raises) the
    input is handed back as-is.
    """
    try:
        query_params = parse_qs(urlparse(possible_redirect_link).query)
        targets = query_params.get("url")
        if targets:
            return targets[0]
    except Exception:
        # Best-effort extraction: any parsing problem falls back to the input.
        pass
    return possible_redirect_link

def normalize_date(date_str):
    """Convert a feed date value to an ISO 8601 string on a best-effort basis.

    Args:
        date_str: A date as text (e.g. an RSS ``published`` value) or a
            ``time.struct_time`` such as feedparser's ``published_parsed``.

    Returns:
        ``None`` for empty input; an ISO 8601 string when the value can be
        interpreted; otherwise the stripped ``str()`` of the input.
    """
    import time
    from datetime import timezone

    if not date_str:
        return None

    # A struct_time must not go through dateparser: its str() form is
    # "time.struct_time(tm_year=...)", which cannot be parsed and would
    # otherwise be returned verbatim by the fallback below.
    if isinstance(date_str, time.struct_time):
        try:
            # NOTE(review): feedparser documents *_parsed values as UTC — confirm.
            return datetime(*date_str[:6], tzinfo=timezone.utc).isoformat()
        except ValueError:
            # Out-of-range field (e.g. a leap second); try the text path.
            pass

    try:
        dt = dateparser.parse(str(date_str), languages=["id", "en"])
        if dt:
            return dt.isoformat()
    except Exception:
        # Deliberately broad: a parser failure must not abort the scrape.
        pass
    # Fallback: return the original value, trimmed.
    return str(date_str).strip()

def scrape_google_news_rss(query, max_items=None):
    """Fetch Google News RSS search results for ``query`` as a list of dicts.

    Args:
        query: Search string, e.g. 'Jaklingko OR "Jak Lingko"'.
        max_items: Optional cap on the number of entries processed. ``None``
            means no cap; ``0`` yields an empty list.

    Returns:
        One dict per entry with the keys ``id``, ``query_used``, ``title``,
        ``url``, ``source``, ``published`` and ``summary``.
    """
    encoded_q = quote(query)
    rss_url = f"https://news.google.com/rss/search?q={encoded_q}&hl=id&gl=ID&ceid=ID:id"
    feed = feedparser.parse(rss_url)

    entries = feed.entries or []
    # Compare against None so that max_items=0 is honoured as "no entries"
    # instead of being mistaken for "no limit".
    if max_items is not None:
        entries = entries[:max_items]

    out = []
    for e in entries:
        title = e.get("title")
        raw_link = e.get("link")
        link = get_real_url(raw_link) if raw_link else None

        # The source may be dict-like, a plain string, or an object with a
        # ``title`` attribute; handle each shape explicitly.
        src = e.get("source")
        if isinstance(src, dict):
            source = src.get("title")
        elif isinstance(src, str):
            # str has a ``title`` *method*; use the text itself instead.
            source = src
        else:
            source = getattr(src, "title", None)
        if not source:
            # No usable source title: fall back to other entry fields.
            source = e.get("author") or e.get("publisher") or None

        # Prefer the textual date fields; the parsed struct is a last resort.
        published_raw = e.get("published") or e.get("updated") or e.get("published_parsed") or None
        published = normalize_date(published_raw)

        # Short snippet, when the feed provides one.
        summary = e.get("summary") or e.get("description") or ""

        # Short identifier derived from the link (or the title when no link).
        uid = hashlib.sha256((link or title or "").encode("utf-8")).hexdigest()[:16]

        out.append({
            "id": uid,
            "query_used": query,
            "title": title,
            "url": link,
            "source": source,
            "published": published,
            "summary": summary,
        })

    return out

if __name__ == "__main__":
    # --- CONFIGURE QUERY HERE ---
    # You can change query to other variants, e.g. 'Jaklingko OR "Jak Lingko" OR "tarif integrasi"'
    query = 'Jaklingko OR "Jak Lingko"'
    max_items = None   # set integer to limit results, e.g. 50

    print("Scraping Google News RSS for query:", query)
    data = scrape_google_news_rss(query, max_items=max_items)
    print(f"Found {len(data)} entries from RSS")

    # Convert to a DataFrame.
    df = pd.DataFrame(data)

    # Deduplicate by URL (keep first). Skipped for an empty frame, which has
    # no "url" column to deduplicate on.
    if not df.empty:
        df = df.drop_duplicates(subset=["url"], keep="first")

    # Output filenames carry today's date.
    date_tag = datetime.now().strftime("%Y%m%d")
    csv_name = f"jaklingko_news_{date_tag}.csv"
    jsonl_name = f"jaklingko_news_{date_tag}.jsonl"

    # Save CSV.
    df.to_csv(csv_name, index=False, encoding="utf-8-sig")
    print("Saved CSV:", csv_name)

    # Save JSONL from the deduplicated frame so both files hold the same
    # rows (writing the raw list would re-introduce the duplicates).
    with open(jsonl_name, "w", encoding="utf-8") as jf:
        for row in df.to_dict(orient="records"):
            jf.write(json.dumps(row, ensure_ascii=False) + "\n")
    print("Saved JSONL:", jsonl_name)

    # Preview
    if df.empty:
        print("⚠️ DataFrame kosong — tidak ada artikel ter-ekstrak dari RSS.")
    else:
        print(df.head(10).to_string(index=False))

    # note for Colab: to download files, use:
    # from google.colab import files
    # files.download(csv_name)


Scraping Google News RSS for query: Jaklingko OR "Jak Lingko"
Found 100 entries from RSS
Saved CSV: jaklingko_news_20250930.csv
Saved JSONL: jaklingko_news_20250930.jsonl
              id                query_used                                                                                                                        title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           url                    source                 published                                                    

In [5]:
!pip install feedparser pandas dateparser --quiet

import feedparser
import pandas as pd
import hashlib
from datetime import datetime
from urllib.parse import quote, urlparse, parse_qs
import dateparser
import json

def get_real_url(possible_redirect_link):
    """Unwrap a redirect link that embeds its destination in ``url=``.

    Google News RSS may hand out redirect links like
    ``https://news.google.com/articles/CAIi...&url=https://actual.site/...``.
    If the query string has a non-empty ``url`` parameter, its first value is
    returned; in every other case the link is returned untouched.
    """
    resolved = possible_redirect_link
    try:
        url_values = parse_qs(urlparse(possible_redirect_link).query).get("url", [])
        if url_values:
            resolved = url_values[0]
    except Exception:
        # Parsing is best-effort; keep the original link on any failure.
        pass
    return resolved

def normalize_date(date_str):
    """Convert a feed date value to an ISO 8601 string on a best-effort basis.

    Args:
        date_str: A date as text (e.g. an RSS ``published`` value) or a
            ``time.struct_time`` such as feedparser's ``published_parsed``.

    Returns:
        ``None`` for empty input; an ISO 8601 string when the value can be
        interpreted; otherwise the stripped ``str()`` of the input.
    """
    import time
    from datetime import timezone

    if not date_str:
        return None

    # A struct_time must not go through dateparser: its str() form is
    # "time.struct_time(tm_year=...)", which cannot be parsed and would
    # otherwise be returned verbatim by the fallback below.
    if isinstance(date_str, time.struct_time):
        try:
            # NOTE(review): feedparser documents *_parsed values as UTC — confirm.
            return datetime(*date_str[:6], tzinfo=timezone.utc).isoformat()
        except ValueError:
            # Out-of-range field (e.g. a leap second); try the text path.
            pass

    try:
        dt = dateparser.parse(str(date_str), languages=["id", "en"])
        if dt:
            return dt.isoformat()
    except Exception:
        # Deliberately broad: a parser failure must not abort the scrape.
        pass
    return str(date_str).strip()

def scrape_google_news_rss(query, max_items=None):
    """Fetch Google News RSS search results for ``query`` as a list of dicts.

    Args:
        query: Search string, e.g. 'Jaklingko OR "Jak Lingko"'.
        max_items: Optional cap on the number of entries processed. ``None``
            means no cap; ``0`` yields an empty list.

    Returns:
        One dict per entry with the keys ``id``, ``query_used``, ``title``,
        ``url``, ``source`` and ``published``.
    """
    encoded_q = quote(query)
    rss_url = f"https://news.google.com/rss/search?q={encoded_q}&hl=id&gl=ID&ceid=ID:id"
    feed = feedparser.parse(rss_url)

    entries = feed.entries or []
    # Compare against None so that max_items=0 is honoured as "no entries"
    # instead of being mistaken for "no limit".
    if max_items is not None:
        entries = entries[:max_items]

    out = []
    for e in entries:
        title = e.get("title")
        raw_link = e.get("link")
        link = get_real_url(raw_link) if raw_link else None

        # Extract the source, which may be dict-like, a plain string, or an
        # object exposing a ``title`` attribute.
        src = e.get("source")
        if isinstance(src, dict):
            source = src.get("title")
        elif isinstance(src, str):
            # str has a ``title`` *method*; use the text itself instead.
            source = src
        else:
            source = getattr(src, "title", None)
        if not source:
            # No usable source title: fall back to other entry fields.
            source = e.get("author") or e.get("publisher") or None

        # Extract the publication date, preferring the textual fields.
        published_raw = e.get("published") or e.get("updated") or e.get("published_parsed") or None
        published = normalize_date(published_raw)

        # Short identifier derived from the link (or the title when no link).
        uid = hashlib.sha256((link or title or "").encode("utf-8")).hexdigest()[:16]

        out.append({
            "id": uid,
            "query_used": query,
            "title": title,
            "url": link,
            "source": source,
            "published": published,
        })

    return out

if __name__ == "__main__":
    # --- CONFIGURE QUERY HERE ---
    query = 'Jaklingko OR "Jak Lingko"'
    # Entry cap actually passed to the scraper (previously this variable was
    # ignored and 150 was hard-coded in the call). Use None for no limit.
    max_items = 150

    print("Scraping Google News RSS for query:", query)
    data = scrape_google_news_rss(query, max_items=max_items)
    print(f"Found {len(data)} entries from RSS")

    df = pd.DataFrame(data)

    if not df.empty:
        # Deduplicate by URL, keeping the first occurrence.
        df = df.drop_duplicates(subset=["url"], keep="first")

        # Reshape to the target table layout.
        df_formatted = pd.DataFrame({
            "No": range(1, len(df) + 1),
            "Title": df["title"],
            "Link": df["url"],
            "Source": df["source"],
            "Date": df["published"],
            "PIC": ""  # left blank for now
        })

        # Output filenames carry today's date.
        date_tag = datetime.now().strftime("%Y%m%d")
        csv_name = f"jaklingko_news_{date_tag}.csv"
        jsonl_name = f"jaklingko_news_{date_tag}.jsonl"

        # Save CSV.
        df_formatted.to_csv(csv_name, index=False, encoding="utf-8-sig")
        print("Saved CSV:", csv_name)

        # Save JSONL, one formatted row per line.
        with open(jsonl_name, "w", encoding="utf-8") as jf:
            for _, row in df_formatted.iterrows():
                jf.write(json.dumps(row.to_dict(), ensure_ascii=False) + "\n")
        print("Saved JSONL:", jsonl_name)

        # Preview
        print(df_formatted.head(10).to_string(index=False))
    else:
        print("⚠️ DataFrame kosong — tidak ada artikel ter-ekstrak dari RSS.")


Scraping Google News RSS for query: Jaklingko OR "Jak Lingko"
Found 100 entries from RSS
Saved CSV: jaklingko_news_20250930.csv
Saved JSONL: jaklingko_news_20250930.jsonl
 No                                                                                                                        Title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          Link                    Source                      Date PIC
  1 Naik Angkot Jaklingko Gratis Itu Menyenangkan, Kecuali Kalau Sopirnya Nyetir Kayak

In [14]:
!pip install feedparser pandas dateparser --quiet

import feedparser
import pandas as pd
import hashlib
from datetime import datetime
from urllib.parse import quote, urlparse, parse_qs
import dateparser
import json

def get_real_url(possible_redirect_link):
    """Return the ``url`` query parameter of a link, or the link itself.

    The first value of a non-empty ``url`` parameter is returned when the
    link's query string has one. If the link cannot be parsed, or has no such
    parameter, the input is returned unchanged.
    """
    try:
        params = parse_qs(urlparse(possible_redirect_link).query)
    except Exception:
        # Parsing is best-effort; an unparseable link is passed through.
        return possible_redirect_link
    candidates = params.get("url")
    return candidates[0] if candidates else possible_redirect_link

def normalize_date(date_str):
    """Convert a feed date value to an ISO 8601 string on a best-effort basis.

    Args:
        date_str: A date as text (e.g. an RSS ``published`` value) or a
            ``time.struct_time`` such as feedparser's ``published_parsed``.

    Returns:
        ``None`` for empty input; an ISO 8601 string when the value can be
        interpreted; otherwise the stripped ``str()`` of the input.
    """
    import time
    from datetime import timezone

    if not date_str:
        return None

    # A struct_time must not go through dateparser: its str() form is
    # "time.struct_time(tm_year=...)", which cannot be parsed and would
    # otherwise be returned verbatim by the fallback below.
    if isinstance(date_str, time.struct_time):
        try:
            # NOTE(review): feedparser documents *_parsed values as UTC — confirm.
            return datetime(*date_str[:6], tzinfo=timezone.utc).isoformat()
        except ValueError:
            # Out-of-range field (e.g. a leap second); try the text path.
            pass

    try:
        dt = dateparser.parse(str(date_str), languages=["id", "en"])
        if dt:
            return dt.isoformat()
    except Exception:
        # Deliberately broad: a parser failure must not abort the scrape.
        pass
    return str(date_str).strip()

def scrape_google_news_rss(query, max_items=None):
    """Fetch Google News RSS search results for ``query`` as a list of dicts.

    Args:
        query: Search string; it is percent-encoded into the RSS search URL.
        max_items: Optional cap on the number of entries processed. ``None``
            means no cap; ``0`` yields an empty list.

    Returns:
        One dict per entry with the keys ``id``, ``query_used``, ``title``,
        ``url``, ``source``, ``published`` and ``summary``.
    """
    encoded_q = quote(query)
    rss_url = f"https://news.google.com/rss/search?q={encoded_q}&hl=id&gl=ID&ceid=ID:id"
    feed = feedparser.parse(rss_url)

    entries = feed.entries or []
    # Compare against None so that max_items=0 is honoured as "no entries"
    # instead of being mistaken for "no limit".
    if max_items is not None:
        entries = entries[:max_items]

    out = []
    for e in entries:
        title = e.get("title")
        raw_link = e.get("link")
        link = get_real_url(raw_link) if raw_link else None

        # The source may be dict-like, a plain string, or an object with a
        # ``title`` attribute; handle each shape explicitly.
        src = e.get("source")
        if isinstance(src, dict):
            source = src.get("title")
        elif isinstance(src, str):
            # str has a ``title`` *method*; use the text itself instead.
            source = src
        else:
            source = getattr(src, "title", None)
        if not source:
            # No usable source title: fall back to other entry fields.
            source = e.get("author") or e.get("publisher") or None

        # Prefer the textual date fields; the parsed struct is a last resort.
        published_raw = e.get("published") or e.get("updated") or e.get("published_parsed") or None
        published = normalize_date(published_raw)

        summary = e.get("summary") or e.get("description") or ""

        # Short identifier derived from the link (or the title when no link).
        uid = hashlib.sha256((link or title or "").encode("utf-8")).hexdigest()[:16]

        out.append({
            "id": uid,
            "query_used": query,
            "title": title,
            "url": link,
            "source": source,
            "published": published,
            "summary": summary,
        })

    return out


if __name__ == "__main__":
    all_data = []

    # Query one month of 2023 at a time.
    for month in range(1, 13):
        start = f"2023-{month:02d}-01"
        # The window always ends on the first day of the following month, so
        # December ends at 2024-01-01 like every other month's boundary
        # (ending at 2023-12-31 made December the only inconsistent window).
        if month == 12:
            end = "2024-01-01"
        else:
            end = f"2023-{month+1:02d}-01"

        query = f'Jaklingko after:{start} before:{end}'
        print(f"Scraping bulan {month} 2023...")

        data = scrape_google_news_rss(query, max_items=100)
        all_data.extend(data)

    # Build the DataFrame.
    df = pd.DataFrame(all_data)

    if df.empty:
        # Without rows there is no "published" column to convert or filter.
        print("⚠️ DataFrame kosong — tidak ada artikel ter-ekstrak dari RSS.")
    else:
        # Convert to datetime and keep only 2023. utc=True gives a single
        # timezone-aware dtype even when the strings carry differing
        # offsets, so the .dt accessor below is always available.
        df["published"] = pd.to_datetime(df["published"], errors="coerce", utc=True)
        df_2023 = df[df["published"].dt.year == 2023].drop_duplicates(subset=["url"]).reset_index(drop=True)

        # Save to CSV.
        date_tag = datetime.now().strftime("%Y%m%d")
        csv_name = f"jaklingko_news_2023_{date_tag}.csv"
        df_2023.to_csv(csv_name, index=False, encoding="utf-8-sig")

        print(f"✅ Artikel tahun 2023: {len(df_2023)}")
        print("Saved CSV:", csv_name)
        print(df_2023.head(10).to_string(index=False))


Scraping bulan 1 2023...
Scraping bulan 2 2023...
Scraping bulan 3 2023...
Scraping bulan 4 2023...
Scraping bulan 5 2023...
Scraping bulan 6 2023...
Scraping bulan 7 2023...
Scraping bulan 8 2023...
Scraping bulan 9 2023...
Scraping bulan 10 2023...
Scraping bulan 11 2023...
Scraping bulan 12 2023...
✅ Artikel tahun 2023: 169
Saved CSV: jaklingko_news_2023_20250930.csv
              id                                   query_used                                                                                                                                           title                                                                                                                                                                                                                                                                                                                                                                                                                                      