# Scrapping BBC Headlines

# Fetch the news page

In [5]:
import requests
from bs4 import BeautifulSoup

# URL for BBC News homepage
news_url = "https://www.bbc.com/news"

# Fetch and parse the page
response = requests.get(news_url)
news_soup = BeautifulSoup(response.content, "html.parser")

# Try multiple selectors for headlines
headlines = news_soup.find_all("h3", class_="gs-c-promo-heading__title")

# If not found, try anchor tags with class 'gs-c-promo-heading'
if not headlines:
    promo_anchors = news_soup.select("a.gs-c-promo-heading")
    headlines = [a for a in promo_anchors if a.text.strip()]

# If still not found, fallback to all anchor tags with '/news/' in href and non-empty text
if not headlines:
    headlines = [
        a for a in news_soup.find_all("a", href=True)
        if "/news/" in a["href"] and a.text.strip()
    ]

if not headlines:
    print("No headlines found using known selectors.")
else:
    for idx, headline in enumerate(headlines, start=1):
        # Get the headline text
        headline_text = headline.text.strip()
        print(f"{idx}. {headline_text}")

1. Israel-Gaza War
2. War in Ukraine
3. US & Canada
4. UK
5. Africa
6. Asia
7. Australia
8. Europe
9. Latin America
10. Middle East
11. In Pictures
12. BBC InDepth
13. BBC Verify
14. Israel-Gaza War
15. War in Ukraine
16. US & Canada
17. UK
18. UK Politics
19. England
20. N. Ireland
21. N. Ireland Politics
22. Scotland
23. Scotland Politics
24. Wales
25. Wales Politics
26. Africa
27. Asia
28. China
29. India
30. Australia
31. Europe
32. Latin America
33. Middle East
34. In Pictures
35. BBC InDepth
36. BBC Verify
37. LIVETrump hails 'bullseye' strikes on Iranian nuclear sites as UN says damage is unclearTehran accuses the US of having "waged a war" against it "under a fabricated and absurd pretext".
38. Decoy flights and seven B-2 stealth bombers - how US says it hit Iran's nuclear sitesA timeline of how the complex mission unfolded was laid out in a Pentagon briefing.9 hrs agoWorld
39. US strikes on Iran trigger protests internationallyDemonstrators take to the streets in France, Pakis

# Headlines With Links

In [6]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# URL for BBC News homepage
news_url = "https://www.bbc.com/news"

# Fetch and parse the page
response = requests.get(news_url)
news_soup = BeautifulSoup(response.content, "html.parser")

# Try multiple selectors for headlines
headlines = news_soup.find_all("h3", class_="gs-c-promo-heading__title")

# If not found, try anchor tags with class 'gs-c-promo-heading'
if not headlines:
    promo_anchors = news_soup.select("a.gs-c-promo-heading")
    headlines = [a for a in promo_anchors if a.text.strip()]

# If still not found, fallback to all anchor tags with '/news/' in href and non-empty text
if not headlines:
    headlines = [
        a for a in news_soup.find_all("a", href=True)
        if "/news/" in a["href"] and a.text.strip()
    ]

if not headlines:
    print("No headlines found using known selectors.")
else:
    for idx, headline in enumerate(headlines, start=1):
        # Get the headline text and link
        headline_text = headline.text.strip()
        # Try to get the URL from the parent anchor or from the tag itself
        link = None
        if headline.name == "a" and headline.has_attr("href"):
            link = headline["href"]
        else:
            parent_a = headline.find_parent("a", href=True)
            if parent_a:
                link = parent_a["href"]
        # Make sure the link is absolute
        if link and link.startswith("/"):
            link = "https://www.bbc.com" + link
        # Print headline
        print(f"{idx}. {headline_text}")
        if link:
            print(f"   Link: {link}")
            # Fetch the news article page
            try:
                article_resp = requests.get(link)
                article_soup = BeautifulSoup(article_resp.content, "html.parser")
                # Try to extract all paragraphs in the article body
                # BBC often uses <article> tag or role="main"
                article_tag = article_soup.find("article")
                if not article_tag:
                    article_tag = article_soup.find(attrs={"role": "main"})
                if article_tag:
                    paragraphs = article_tag.find_all("p")
                else:
                    paragraphs = article_soup.find_all("p")
                # Combine the text of all paragraphs
                article_text = " ".join([p.get_text(strip=True) for p in paragraphs])
                # Print a snippet (first 400 chars)
                snippet = article_text[:400] + ("..." if len(article_text) > 400 else "")
                # Try to extract date and time
                date_str = ""
                # Look for <time> tag with datetime attribute
                time_tag = article_soup.find("time")
                if not time_tag:
                    # Try to find meta tag with property 'article:published_time'
                    meta_time = article_soup.find("meta", attrs={"property": "article:published_time"})
                    if meta_time and meta_time.has_attr("content"):
                        date_str = meta_time["content"]
                if not date_str and time_tag and time_tag.has_attr("datetime"):
                    date_str = time_tag["datetime"]
                elif not date_str and time_tag:
                    date_str = time_tag.get_text(strip=True)
                if date_str:
                    try:
                        dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
                        date_str = dt.strftime("%Y-%m-%d %H:%M:%S %Z")
                    except Exception:
                        pass
                print(f"   Date: {date_str if date_str else '(No date found)'}")
                print(f"   News: {snippet}")
            except Exception as e:
                print(f"   Date: (No date found)")
                print(f"   News: (Could not fetch article: {e})")
        else:
            print("   Link: (No link found)")
            print("   Date: (No date found)")
            print("   News: (No article found)")

1. Israel-Gaza War
   Link: https://www.bbc.com/news/topics/c2vdnvdg6xxt
   Date: (No date found)
2. War in Ukraine
   Link: https://www.bbc.com/news/war-in-ukraine
   Date: (No date found)
   News: Russian missiles and drones hit residential areas and hospitals in Ukraine's capital, the interior minister says. Commander-in-chief Oleksandr Syrsky disputed Russian claims that Ukraine had been pushed out of Kursk. Ukrainian officials say a drone smashed into the building, leaving a number of people dead and wounded. Kyiv says it received 1,245 bodies on Monday, while Moscow says 78 of its dead ...
3. US & Canada
   Link: https://www.bbc.com/news/us-canada
   Date: (No date found)
   News: (Could not fetch article: HTTPSConnectionPool(host='www.bbc.com', port=443): Max retries exceeded with url: /news/us-canada (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001D2322F3390>, 'Connection to www.bbc.com timed out. (connect timeout=None)')))
4. UK
   Link: ht

KeyboardInterrupt: 