In [1]:
import csv
import os
import re
import requests
from bs4 import BeautifulSoup
import json

# Persian title substrings (roughly: "watch", "film", "photo", "video" in two
# spellings, "images", "pictorial"). _should_skip rejects any article whose
# title contains one of them — presumably to drop media-only items.
KEYWORDS = {"ببینید", "فیلم", "عکس", "ویدیو", "ویدئو", "تصاویر", "تصاویری"}

def _normalize_persian(text: str) -> str:
    """Replace Arabic letter variants with their Persian counterparts.

    Arabic yeh and kaf become Persian yeh and kaf, and the heh-with-yeh-above
    form becomes plain heh. Any falsy input (``None`` or ``""``) yields ``""``.
    """
    if text:
        normalized = text
        # Applied in order; each pair is (variant to replace, Persian form).
        for variant, persian in (("ي", "ی"), ("ك", "ک"), ("ۀ", "ه")):
            normalized = normalized.replace(variant, persian)
        return normalized
    return ""

def read_existing_news_ids(csv_file: str) -> set:
    """Return the ``news_id`` values already stored in *csv_file*.

    Args:
        csv_file: Path of the CSV file; its first row must be a header.

    Returns:
        A set with the ``news_id`` cell (as a string) of every data row that
        is long enough to contain that column. An empty set is returned when
        the file does not exist, has no header row, or the header has no
        ``news_id`` column (the last two cases also print a notice).
    """
    if not os.path.isfile(csv_file):
        return set()

    # newline="" is what the csv module expects so that it can interpret
    # line endings (including ones inside quoted fields) by itself.
    with open(csv_file, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if not header:
            print("Header is missing.")
            return set()

        try:
            news_id_index = header.index("news_id")
        except ValueError:
            # Not one of our CSV files (or a damaged header): nothing usable.
            print("Header has no 'news_id' column.")
            return set()

        # Short/blank rows simply do not contribute an id.
        return {row[news_id_index] for row in reader if len(row) > news_id_index}


def get_news(url: str) -> str | None:
    """Download a news page and return its HTML, or ``None`` if it is unusable.

    Args:
        url: Address of the article; redirects are followed.

    Returns:
        The response body, or ``None`` when the page is a "not found" page or
        when a redirect led outside ``https://www.tasnimnews.com/fa/news/``.

    Raises:
        requests.HTTPError: For any other HTTP error status
            (via ``raise_for_status``).
    """
    r = requests.get(url, allow_redirects=True, timeout=20)

    # A not-found page is recognised either by the real status code or by the
    # Persian "page not found" <title>. The <title> may be absent, so guard it.
    not_found = r.status_code == 404
    if not not_found:
        title_tag = BeautifulSoup(r.text, "html.parser").find("title")
        page_title = title_tag.get_text(strip=True) if title_tag is not None else ""
        not_found = "صفحه مورد نظر یافت نشد" in page_title
    if not_found:
        print(f"Skipped: {url} - 404 Not Found")
        return None

    # Only keep redirects that stay within the /fa/news/ section.
    if r.history and not r.url.startswith("https://www.tasnimnews.com/fa/news/"):
        print(f"Redirected to a different URL: {r.url}")
        return None

    r.raise_for_status()  # Raise an exception for other HTTP errors
    return r.text

def _extract_tags(soup) -> str:
    """Return the ``keywords`` of the first JSON-LD block that has any, else ""."""
    for ld_script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(ld_script.string or "")
        except json.JSONDecodeError:
            continue  # empty or malformed block: try the next one

        # A JSON-LD document may be a single object or an array of objects.
        for item in data if isinstance(data, list) else [data]:
            if not isinstance(item, dict):
                continue
            keywords = item.get("keywords")
            if isinstance(keywords, (list, tuple)):
                keywords = ", ".join(str(k) for k in keywords)
            if isinstance(keywords, str) and keywords.strip():
                return keywords
    return ""


def parse_news_html(html: str):
    """Extract the article fields from the HTML of a news page.

    Args:
        html: Full HTML source of the article page.

    Returns:
        A dict with the keys ``news_id``, ``category``, ``date``, ``url``,
        ``title``, ``introtext``, ``text`` and ``tags``. Every value is a
        string; a field that cannot be found in the page is ``""``.
        Paragraphs in ``text`` are joined with ``" [n] "``.
    """
    soup = BeautifulSoup(html, "html.parser")

    # news_id: declared in an inline script as  const id = '<digits>'
    id_pattern = re.compile(r"const\s+id\s*=\s*'(\d+)'")
    id_script = soup.find("script", string=id_pattern)
    id_match = id_pattern.search(id_script.string or "") if id_script else None
    news_id = id_match.group(1) if id_match else ""

    # category + date live in the same details list; both default to "" so
    # downstream code never has to deal with None.
    category = ""
    date = ""
    details = soup.find("ul", class_="list-inline details")
    if details is not None:
        service_li = details.find("li", class_="service")
        service_link = service_li.find("a") if service_li is not None else None
        if service_link is not None:
            category = _normalize_persian(service_link.get_text(strip=True))

        time_li = details.find("li", class_="time")
        if time_li is not None:
            date = _normalize_persian(time_li.get_text(strip=True))

    # url: og:url is expected to be protocol-relative ("//host/..."); only
    # then is a scheme prepended, so an already absolute URL is not mangled.
    meta_tag = soup.find("meta", property="og:url")
    url = (meta_tag.get("content") or "") if meta_tag is not None else ""
    if url.startswith("//"):
        url = "https:" + url

    # title
    a_title = soup.find("h1", class_="title")
    title = _normalize_persian(a_title.get_text(strip=True)) if a_title else ""

    # introtext
    intro = soup.find("h3", class_="lead")
    introtext = _normalize_persian(intro.get_text(" ", strip=True)) if intro else ""

    # text: paragraphs of the article itself (not of the whole page),
    # excluding the "read more" container.
    text_lines = []
    article_tag = soup.find("article", class_="single-news")
    if article_tag is not None:
        for p_tag in article_tag.find_all("p"):
            if p_tag.find_parent("div", class_="markup-container readmore-container"):
                continue
            # get_text already decodes entities and strips whitespace
            # (including non-breaking spaces), so blank means empty string.
            txt = p_tag.get_text(" ", strip=True)
            if txt:
                text_lines.append(_normalize_persian(txt))

    # Drop the trailing end-of-message sign-off paragraph, if present.
    if text_lines and re.search(r"انتهای پیام/.*", text_lines[-1]):
        text_lines = text_lines[:-1]

    text = " [n] ".join(text_lines)

    # tags
    tags = _extract_tags(soup)

    return {
        "news_id": news_id,
        "category": category,
        "date": date,
        "url": url,
        "title": title,
        "introtext": introtext,
        "text": text,
        "tags": tags,
    }

def _should_skip(row_dict: dict) -> bool:
    """Tell whether a parsed article must be left out of the CSV.

    The article is skipped when any of these holds (checked in this order):
      1. its title contains one of the KEYWORDS entries,
      2. its introtext is empty or whitespace only,
      3. its text has fewer than 10 whitespace-separated words.

    Missing or ``None`` fields are treated as empty strings.
    """
    headline = row_dict.get("title", "") or ""
    body = row_dict.get("text", "") or ""
    lead = row_dict.get("introtext", "") or ""

    return (
        any(word in headline for word in KEYWORDS)
        or not lead.strip()
        or sum(1 for _ in re.finditer(r"\S+", body)) < 10
    )

def write_news_csv(row_dict: dict, csv_file: str = "news.csv"):
    """Append one article row to *csv_file*, writing the header when needed.

    Nothing is written (a notice is printed instead) when the row has an
    empty category or empty tags. Missing or ``None`` values are treated as
    empty rather than raising.

    Args:
        row_dict: Article fields keyed by column name.
        csv_file: Destination CSV; created if it does not exist. The header
            row is written first when the file is new or empty.
    """
    # Computed once, outside the f-strings: nesting the same quote type inside
    # an f-string is a SyntaxError on Python versions before 3.12.
    news_id = str(row_dict.get("news_id") or "").strip()
    title = str(row_dict.get("title") or "").strip()

    # `or ""` covers a key that is present but holds None.
    category = str(row_dict.get("category") or "").strip()
    if not category:
        print(f"Skipped: {news_id} ({title}) Category is empty.")
        return

    tags = row_dict.get("tags") or ""
    if isinstance(tags, str):
        tags = tags.strip()
    if not tags:
        print(f"Skipped: {news_id} ({title}) Tags is empty.")
        return

    headers = ["news_id", "category", "date", "url", "title", "introtext", "text", "tags"]
    need_header = not os.path.isfile(csv_file) or os.path.getsize(csv_file) == 0

    with open(csv_file, "a", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        if need_header:
            w.writerow(headers)
        w.writerow([row_dict.get(h, "") for h in headers])

def parse_news_from_url(url: str, csv_file: str = "news.csv", news_id: str = ""):
    """Fetch, parse and store one article unless it is already in the CSV.

    Args:
        url: Address of the article page.
        csv_file: CSV file that collects the accepted articles.
        news_id: Optional id of the article, if the caller already knows it;
            lets the function skip the download for ids already stored.

    Returns:
        The parsed row dict when the article passed the skip filter and was
        handed to ``write_news_csv``; ``None`` when it already exists, could
        not be fetched, or was filtered out.
    """
    existing_ids = read_existing_news_ids(csv_file)

    # Cheap pre-check that avoids the download. An empty id (the default)
    # carries no information, so it must never count as "already stored".
    requested_id = str(news_id)
    if requested_id and requested_id in existing_ids:
        print(f"Existed: {news_id} - Already exists in CSV.")
        return None

    html = get_news(url)
    if html is None:
        return None

    row = parse_news_html(html)

    if _should_skip(row):
        print(f"Skipped: {row['news_id']} ({row['title']})")
        return None

    # The caller may not have passed news_id, or the page may carry a
    # different id than requested; re-check with the parsed id so the same
    # article is never written twice.
    parsed_id = row["news_id"]
    if parsed_id and parsed_id in existing_ids:
        print(f"Existed: {parsed_id} - Already exists in CSV.")
        return None

    write_news_csv(row, csv_file)
    return row


In [2]:
if __name__ == "__main__":
    # Inclusive range of article ids to request, and the CSV file that is
    # passed to parse_news_from_url for every id.
    start_id = 3392500
    end_id   = 3392930
    csv_file = "news_tasnim_20.csv"

    # The range step of 8 means only every 8th id in the range is requested.
    for news_id in range(start_id, end_id + 1, 8):
        url = f"https://www.tasnimnews.com/{news_id}"
        parse_news_from_url(url, csv_file, news_id)
        # try:
        #     data = parse_news_from_url(url, csv_file, news_id)
        #     print(f"Done {news_id}")
        #     for k, v in data.items():
        #         print(f"{k}: {v}")
        #     print("-" * 50)
        # except Exception as e:
        #     print(f"Error in {news_id}: {e}")


Skipped: 3392500 (طرح جدید فروش لاماری ایما با پنج رنگ متنوع) Tags is empty.
Skipped: 3392508 (صهیونیست‌ها برای فرار و گرفتن گذرنامه خود را به آب و آتش می زنند) Tags is empty.
Redirected to a different URL: https://www.tasnimnews.com/he/news/2025/09/05/3392516/%D7%91%D7%99%D7%9F-%D7%94%D7%A4%D7%A6%D7%95%D7%A2%D7%99%D7%9D-%D7%AA%D7%99%D7%A0%D7%95%D7%A7-20-%D7%A0%D7%A4%D7%92%D7%A2%D7%99%D7%9D-%D7%91%D7%9E%D7%AA%D7%A7%D7%A4%D7%AA-%D7%9E%D7%AA%D7%A0%D7%97%D7%9C%D7%99%D7%9D-%D7%A2%D7%9C-%D7%97-%D7%9C%D7%AA-%D7%90-%D7%93%D7%91%D7%A2-%D7%91%D7%93%D7%A8%D7%95%D7%9D-%D7%94%D7%A8-%D7%97%D7%91%D7%A8%D7%95%D7%9F
Skipped: 3392540 (آغاز عملیات آب‌رسانی به ۱۰ روستای دلفان) Tags is empty.
Skipped: 3392564 (تمسک به ولایت فقیه محور تحقق اتحاد و انسجام در جامعه است) Tags is empty.
Redirected to a different URL: https://www.tasnimnews.com/ar/news/2025/09/05/3392572/%D8%AE%D8%B7%DB%8C%D8%A8-%D8%AC%D9%85%D8%B9%D8%A9-%D8%B7%D9%87%D8%B1%D8%A7%D9%86-%D8%B2%DB%8C%D8%A7%D8%B1%D8%A9-%D8%A7%D9%84%D8%B1%D8%A6%DB%8C