In [1]:
import requests
from bs4 import BeautifulSoup

def create_session():
    """Build a ``requests.Session`` preloaded with browser-like request headers.

    The headers imitate a desktop Firefox navigation request.

    ``Accept-Encoding`` is intentionally left at the library default.
    requests/urllib3 advertise only the compressions the current environment
    can actually decode (gzip/deflate, plus br/zstd when the optional decoder
    packages are installed).  Hard-coding ``br, zstd`` lets the server reply
    with a body that cannot be decompressed locally, in which case
    ``response.content`` would hold still-compressed bytes and HTML parsing
    would silently find nothing.

    Returns:
        requests.Session: a new session; the caller is responsible for
        closing it.
    """
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "iframe",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Priority": "u=4",
        "TE": "trailers"
    })
    return session


In [2]:
def make_request(session, url):
    """Fetch ``url`` through ``session`` and return the raw response body.

    Args:
        session: Object exposing ``get(url, timeout=...)`` (a requests session).
        url: Address to fetch.

    Returns:
        The response's ``content`` on success, or ``None`` when the request
        raised a ``requests.RequestException`` (including the HTTP error
        raised by ``raise_for_status``).  Failures are reported on stdout.
    """
    try:
        reply = session.get(url, timeout=10)
        # Turn 4xx/5xx answers into exceptions so they take the failure path.
        reply.raise_for_status()
        body = reply.content
        return body
    except requests.RequestException as err:
        print(f"Request failed: {err}")
        return None


In [3]:
def extract_unique_navigation_urls(html_content, base_url="https://www.thedailystar.net"):
    """Collect section links from every ``<ul class="sub-category-menu">`` in a page.

    Args:
        html_content: HTML markup (str or bytes).  ``None`` or empty input
            yields an empty list.
        base_url: Prefix prepended to hrefs that start with ``/``.

    Returns:
        list[dict]: ``{"title": ..., "url": ...}`` entries, one per distinct
        URL in first-seen order.  When the same URL occurs more than once the
        last title seen is kept.  An empty list is returned (and the error
        printed) if parsing fails.
    """
    if not html_content:
        return []

    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        unique_links = {}

        for ul in soup.find_all("ul", class_="sub-category-menu"):
            for li in ul.find_all("li"):
                # Only anchors that actually carry an href are navigable.
                a_tag = li.find("a", href=True)
                if a_tag is None:
                    continue

                link = a_tag["href"].strip()
                if not link:
                    # A blank href would otherwise produce an entry whose
                    # url is "" and which no caller can fetch.
                    continue

                if link.startswith("/"):
                    link = base_url + link

                unique_links[link] = a_tag.text.strip()

        return [{"title": title, "url": link} for link, title in unique_links.items()]
    except Exception as e:
        # Best-effort scraping: report the problem instead of hiding it.
        print(f"❌ Error extracting navigation links: {e}")
        return []


In [4]:
def extract_news_links(html_content, base_url="https://www.thedailystar.net"):
    """Gather article links found inside ``<h3>`` headline elements.

    Args:
        html_content: HTML markup of a listing page; falsy input gives ``[]``.
        base_url: Prefix prepended to hrefs that start with ``/``.

    Returns:
        list[dict]: ``{"title": ..., "url": ...}`` entries, one per distinct
        URL (the last title seen for a URL is kept).  On any error the
        message is printed and ``[]`` is returned.
    """
    try:
        if not html_content:
            return []

        parsed = BeautifulSoup(html_content, 'html.parser')
        title_by_url = {}

        for heading in parsed.find_all("h3", class_=["title", "title fs-20"]):
            anchor = heading.find("a", href=True)
            if not anchor:
                continue

            headline = anchor.get_text(strip=True)
            href = anchor["href"]

            # Site-relative paths are prefixed with the base; others pass through.
            target = base_url + href if href.startswith("/") else href

            # Keyed by URL, so a repeated link is stored only once.
            title_by_url[target] = headline

        return [
            {"title": headline, "url": target}
            for target, headline in title_by_url.items()
        ]

    except Exception as e:
        print(f"❌ Error extracting news links: {e}")
        return []




In [5]:
from bs4 import BeautifulSoup

def extract_news_content(html_content):
    """Return the article text of a page as one space-joined string.

    The text is taken from the ``<p>`` elements inside the first
    ``<div class="pb-20 clearfix">``.  An empty string is returned when that
    container is absent or when any error occurs while parsing.
    """
    try:
        parsed = BeautifulSoup(html_content, 'html.parser')
        body = parsed.find("div", class_="pb-20 clearfix")

        if not body:
            return ""

        pieces = []
        for paragraph in body.find_all("p"):
            pieces.append(paragraph.get_text(strip=True))
        return " ".join(pieces)
    except Exception:
        return ""


In [6]:
from bs4 import BeautifulSoup

def extract_unique_news(html_content):
    """Extract title/content pairs from ``<article class="article-section">`` elements.

    Args:
        html_content: HTML markup of an article page; falsy input gives ``[]``.

    Returns:
        list[dict]: ``{"title": ..., "content": ...}`` entries, one per
        distinct title (a later article with the same title replaces the
        earlier one).  A missing headline is recorded as ``"No Title"`` and a
        missing or paragraph-less body as ``"No Content"``.  On any error the
        message is printed and ``[]`` is returned.
    """
    try:
        if not html_content:
            return []

        parsed = BeautifulSoup(html_content, 'html.parser')
        content_by_title = {}

        for item in parsed.find_all("article", class_="article-section"):
            # Headline, with a placeholder when the <h1> is missing.
            heading = item.find("h1", class_="article-title")
            if heading:
                headline = heading.get_text(strip=True)
            else:
                headline = "No Title"

            # Body paragraphs, joined with single spaces.
            body = item.find("div", class_="pb-20 clearfix")
            if body:
                pieces = [p.get_text(strip=True) for p in body.find_all("p")]
            else:
                pieces = []

            if pieces:
                text = " ".join(pieces)
            else:
                text = "No Content"

            # Keyed by title so duplicates collapse into a single entry.
            content_by_title[headline] = text

        return [
            {"title": headline, "content": text}
            for headline, text in content_by_title.items()
        ]
    except Exception as e:
        print(f"❌ Error extracting news: {e}")
        return []


In [7]:
def main():
    """Scrape the news landing page and report article-link counts per section.

    For every section found in the landing page's sub-category menu this
    prints ``Scraping <section title>`` followed by the number of distinct
    article links found on that section's page.  A failed request is reported
    by ``make_request`` and simply counts as zero links.

    Returns:
        None
    """
    url = "https://www.thedailystar.net/news"

    # The context manager closes the session's pooled connections on exit.
    with create_session() as session:
        html_content = make_request(session, url)
        urls = extract_unique_navigation_urls(html_content)

        for page in urls:
            print(f"Scraping {page['title']}")
            new_page = make_request(session, page["url"])

            # Relative hrefs must be resolved against the site root, which is
            # extract_news_links' default base_url.  Passing the lowercased
            # section title here produced links like "bangladesh/news/...".
            news_titles = extract_news_links(new_page)
            print(len(news_titles))

            # TODO: fetch each news["url"] with make_request and parse it with
            # extract_unique_news once per-article scraping is enabled.
        


In [8]:
# Run the scraper: prints each section title followed by its article-link count.
main()

Scraping Bangladesh
21
Scraping Investigative Stories
18
Scraping Asia
21
Scraping World
21
