In [1]:
import csv
import os
import re
import requests
from bs4 import BeautifulSoup

# Title substrings used by `_should_skip`: a news item whose title contains any
# of these is left out of the CSV. They are Persian words along the lines of
# "watch", "film", "photo", "video" and "pictures".
KEYWORDS = {"ببینید", "فیلم", "عکس", "ویدیو", "ویدئو", "تصاویر", "تصاویری"}

def _normalize_persian(text: str) -> str:
    """Return *text* with a few letter variants folded to one canonical form.

    Falsy input (``None`` or an empty string) yields ``""``.
    """
    if not text:
        return ""

    normalized = text
    # (variant, canonical) pairs, applied one after another in this order.
    for variant, canonical in (("ي", "ی"), ("ك", "ک"), ("ۀ", "ه")):
        normalized = normalized.replace(variant, canonical)
    return normalized

def read_existing_news_ids(csv_file: str) -> set:
    """Collect the ``news_id`` values already stored in *csv_file*.

    Args:
        csv_file: Path of the CSV file to scan. Its first row is expected to
            be a header containing a ``news_id`` column.

    Returns:
        The set of ``news_id`` cell values (as strings). An empty set is
        returned when the file does not exist, has no header row, or the
        header has no ``news_id`` column.
    """
    if not os.path.isfile(csv_file):
        return set()

    # newline="" leaves line-ending handling to the csv module, so quoted
    # fields that contain embedded newlines are parsed as a single cell.
    with open(csv_file, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if not header:
            print("Header is missing.")
            return set()

        try:
            news_id_index = header.index("news_id")
        except ValueError:
            # A header without the column means there are no known ids;
            # report it instead of crashing the caller.
            print("Column 'news_id' is missing from the header.")
            return set()

        # Rows too short to contain the column (e.g. blank lines) are ignored.
        return {row[news_id_index] for row in reader if len(row) > news_id_index}


def get_news(url: str) -> "str | None":
    """Download the page at *url* and return its body as text.

    Args:
        url: Address of the page to fetch. Redirects are followed and the
            request times out after 20 seconds.

    Returns:
        The response text, or ``None`` when the server answers 404 (a
        "Skipped" line is printed in that case).

    Raises:
        requests.HTTPError: For any other error status, via
            ``raise_for_status``. Exceptions raised by ``requests.get``
            itself are not caught here.
    """
    response = requests.get(url, allow_redirects=True, timeout=20)

    # A missing article is an expected outcome, not an error: report and skip.
    if response.status_code == 404:
        print(f"Skipped: {url} - 404 Not Found")
        return None

    response.raise_for_status()  # any other HTTP error status is raised
    return response.text

def parse_news_html(html: str):
    """Extract the fields of one news page from its HTML.

    Args:
        html: Markup of a single news page.

    Returns:
        A dict with the keys ``news_id``, ``category``, ``date``, ``url``,
        ``title``, ``introtext``, ``text`` and ``tags``. Every value is a
        string; a field whose element is not found in the markup is ``""``.
        ``text`` joins the body paragraphs/headings with ``" [n] "`` and
        ``tags`` joins the tag labels with ``","``.
    """
    # Imported locally so the block does not rely on a file-level import.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, "html.parser")

    # news_id: text of the <span> inside <li class="id">; either may be absent.
    news_id_tag = soup.find("li", class_="id")
    news_id_span = news_id_tag.find("span") if news_id_tag else None
    news_id = news_id_span.get_text(strip=True) if news_id_span is not None else ""

    # category: the second rel="index" link of the breadcrumb, when there is one.
    breadcrumb = soup.find("ol", class_="breadcrumb")
    crumb_links = breadcrumb.find_all("a", rel="index") if breadcrumb else []
    category = (
        _normalize_persian(crumb_links[1].get_text(strip=True))
        if len(crumb_links) > 1
        else ""
    )

    # date
    date_span = soup.select_one("div.item-date > span")
    date = _normalize_persian(date_span.get_text(strip=True)) if date_span else ""

    # url + title: both come from the link inside <h1 class="title">.
    h1 = soup.find("h1", class_="title")
    a_title = h1.find("a") if h1 else None
    if a_title is not None and a_title.has_attr("href"):
        # urljoin copes with relative, slash-less and already-absolute hrefs.
        url = urljoin("https://www.khabaronline.ir", a_title["href"])
    else:
        url = ""
    title = _normalize_persian(a_title.get_text(strip=True)) if a_title else ""

    # introtext
    intro = soup.find("p", class_="introtext")
    introtext = _normalize_persian(intro.get_text(" ", strip=True)) if intro else ""

    # news text: paragraphs and headings under .item-body (or the whole
    # document when that container is missing), in document order.
    text_tags = {"p", "h1", "h2", "h3", "h4", "h5", "h6"}
    text_lines = []
    content_root = soup.select_one(".item-body") or soup
    for node in content_root.descendants:
        if getattr(node, "name", None) is None:
            continue  # not a tag (no tag name), nothing to inspect
        if node.name == "div" and "item-code" in (node.get("class") or []):
            break  # everything from the item-code block onwards is ignored
        if node.name not in text_tags:
            continue
        if node.find_parent(["ul", "ol"]) is not None:
            continue  # text nested inside lists is not collected
        txt = node.get_text(" ", strip=True)
        if txt:
            text_lines.append(_normalize_persian(txt))

    # Drop a trailing line that consists only of digits and whitespace.
    if text_lines and re.match(r"^\d[\d\s]*$", text_lines[-1]):
        text_lines = text_lines[:-1]

    text = " [n] ".join(text_lines)

    # tags: joining an empty selection already yields "".
    tags_section = soup.select("section.box.tags a[rel=tag]")
    tags = ",".join(_normalize_persian(a.get_text(strip=True)) for a in tags_section)

    return {
        "news_id": news_id,
        "category": category,
        "date": date,
        "url": url,
        "title": title,
        "introtext": introtext,
        "text": text,
        "tags": tags,
    }

def _should_skip(row_dict: dict) -> bool:
    """Tell whether a parsed news row must be left out of the CSV.

    A row is skipped when any of the following holds, checked in this order:

    1. its title contains one of the ``KEYWORDS``;
    2. its introtext is empty or whitespace only;
    3. its text has fewer than 10 whitespace-separated tokens.

    Missing or ``None`` fields are treated as empty strings.
    """
    headline = row_dict.get("title", "") or ""
    body = row_dict.get("text", "") or ""
    lead = row_dict.get("introtext", "") or ""

    return (
        any(word in headline for word in KEYWORDS)
        or not lead.strip()
        or len(re.findall(r"\S+", body)) < 10
    )

def write_news_csv(row_dict: dict, csv_file: str = "news.csv"):
    """Append one news row to *csv_file*, adding the header row if needed.

    The header is written first when the file does not exist yet or is empty.
    Columns missing from *row_dict* are written as empty cells; keys that are
    not columns are ignored.
    """
    columns = ["news_id", "category", "date", "url", "title", "introtext", "text", "tags"]
    # Only a non-empty existing file already carries its header.
    has_content = os.path.isfile(csv_file) and os.path.getsize(csv_file) > 0

    with open(csv_file, "a", newline="", encoding="utf-8") as out:
        writer = csv.DictWriter(out, fieldnames=columns, restval="", extrasaction="ignore")
        if not has_content:
            writer.writeheader()
        writer.writerow(row_dict)

def parse_news_from_url(url: str, csv_file: str = "news.csv", news_id: str = ""):
    """Fetch one news page, parse it and append it to *csv_file*.

    Args:
        url: Address of the news page.
        csv_file: CSV file the row is appended to; also the source of the
            ids that are already stored.
        news_id: Id the caller expects the page to have. When it is already
            in the CSV the page is not downloaded at all.

    Returns:
        The parsed row dict when it was written, otherwise ``None`` (id
        already stored, page answered 404, or the row was filtered out by
        ``_should_skip``).
    """
    existing_ids = read_existing_news_ids(csv_file)

    # Cheap pre-check on the caller's id: avoids the download entirely.
    if str(news_id) in existing_ids:
        print(f"Existed: {news_id} - Already exists in CSV.")
        return None

    html = get_news(url)
    if html is None:
        return None

    row = parse_news_html(html)

    # The caller's id may be omitted (default "") or differ from the id found
    # on the page, so check the parsed id too before writing a duplicate.
    parsed_id = row["news_id"]
    if parsed_id and parsed_id in existing_ids:
        print(f"Existed: {parsed_id} - Already exists in CSV.")
        return None

    if _should_skip(row):
        print(f"Skipped: {row['news_id']} ({row['title']})")
        return None

    write_news_csv(row, csv_file)
    return row


In [2]:
if __name__ == "__main__":
    # Inclusive id range to crawl; only every 8th id in it is requested.
    start_id = 2105700
    end_id = 2105900
    csv_file = "news_khabaronline_20.csv"

    for news_id in range(start_id, end_id + 1, 8):
        url = f"https://www.khabaronline.ir/news/{news_id}"
        try:
            parse_news_from_url(url, csv_file, news_id)
        except requests.RequestException as e:
            # One failed request (timeout, connection error, non-404 error
            # status) should not abort the rest of the crawl.
            print(f"Error in {news_id}: {e}")


Skipped: 2105756 (ببینید |  تصاویر تازه‌تر از حملات تروریستی امروز در ایرانشهر)
Skipped: 2105772 (ببینید | تصاویر مردم‌آزاری با هلی‌کوپتر شخصی در ساحل شمال ایران!)
Skipped: 2105804 (ببینید | مغانلو پنالتی گرفت؛ سامان گل زد!)
Skipped: https://www.khabaronline.ir/news/2105876 - 404 Not Found
Skipped: https://www.khabaronline.ir/news/2105884 - 404 Not Found
Skipped: https://www.khabaronline.ir/news/2105892 - 404 Not Found
