# The Guardian News Web Crawler
## Using the BeautifulSoup library

In [1]:
# Stage, commit and push the notebook to the GitHub repository.
# NOTE(review): `git add .` stages every change in the working directory; the
# captured output of this cell shows .ipynb_checkpoints/ being committed too.
# Consider staging the notebook by name or ignoring that directory.
!git add .
!git commit -m "Initialise the_guardian.ipynb"
!git push

[main 7ec91be] Initialise the_guardian.ipynb
 3 files changed, 26 insertions(+), 5 deletions(-)
 create mode 100644 .ipynb_checkpoints/the_guardian-checkpoint.ipynb
 create mode 100644 the_guardian.ipynb
Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 8 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 815 bytes | 815.00 KiB/s, done.
Total 5 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/gaylejuntilla/MA3831-A3.git
   5703832..7ec91be  main -> main


In [2]:
import requests
from bs4 import BeautifulSoup

In [4]:
# Fetch the topic landing page and preview the start of its parsed HTML.
response = requests.get(
    "https://www.theguardian.com/australia-news/australian-election-2025",
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
response.raise_for_status()  # fail here instead of parsing an HTTP error page
soup = BeautifulSoup(response.text, 'lxml')
# Print only the beginning of the document; dumping the whole page floods the cell output.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html lang="en">
 <head>
  <!-- Hello there, HTML enthusiast! -->
  <!-- DCR commit hash 23a8130d3967b689f2aa10bc02d3f3600bc15d57 -->
  <title>
   Australian election 2025 | The Guardian
  </title>
  <meta content="Latest Australian election 2025 news, comment and analysis from the Guardian, the world's leading liberal voice" name="description"/>
  <meta charset="utf-8"/>
  <link href="https://www.theguardian.com/australia-news/australian-election-2025" rel="canonical"/>
  <meta content="width=device-width,minimum-scale=1,initial-scale=1" name="viewport"/>
  <meta content="#052962" name="theme-color"/>
  <link href="https://assets.guim.co.uk/static/frontend/manifest.json" rel="manifest"/>
  <link href="https://assets.guim.co.uk/static/frontend/icons/homescreen/apple-touch-icon.svg" rel="apple-touch-icon" sizes="any"/>
  <link href="https://assets.guim.co.uk/static/frontend/icons/homescreen/apple-touch-icon-512.png" rel="apple-touch-icon" sizes="512x512"/>
  <link href="

In [61]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
from datetime import datetime
import time

class GuardianScraper:
    """Crawl a Guardian listing page and collect the articles it links to.

    Every ``<li>`` inside the listing's ``<main>`` element is treated as a
    candidate article. Each linked page is downloaded and its title, body
    text and publication date are stored as one row of ``self.articles``.
    Pagination is followed through the numbered page links.

    Attributes:
        start_url: Listing URL the crawl starts from by default.
        delay: Seconds to wait after each article request.
        timeout: Timeout in seconds passed to every ``requests.get`` call.
        articles: DataFrame with the columns listed in ``COLUMNS``.
        current_page: 1-based number of the listing page being processed.
    """

    BASE_URL = "https://www.theguardian.com"
    COLUMNS = ['headline', 'title', 'content', 'date', 'url']
    # Site-specific CSS class names used to locate the publication date and the
    # pagination bar. NOTE(review): they look auto-generated, so they may stop
    # matching when the site changes -- override them on the class if so.
    DATE_SPAN_CLASS = 'dcr-u0h1qy'
    PAGINATION_CLASS = 'dcr-stdtpu'
    # Accepted layouts of the first four tokens of the date text,
    # e.g. "Tue 1 Apr 2025" and "Sun 30 March 2025".
    DATE_INPUT_FORMATS = ("%a %d %b %Y", "%a %d %B %Y")

    def __init__(self, start_url, delay=5, timeout=30):
        """Create a scraper.

        Args:
            start_url: URL of the first listing page.
            delay: Pause in seconds after each article request.
            timeout: Timeout in seconds for each HTTP request.
        """
        self.start_url = start_url
        self.delay = delay
        self.timeout = timeout
        self.articles = pd.DataFrame(columns=self.COLUMNS)  # DataFrame to store article data
        self.current_page = 1

    def scrape_articles(self, url=None):
        """Scrape every article reachable from ``url``, following pagination.

        Args:
            url: Listing page to start from; defaults to ``self.start_url``.

        Raises:
            requests.RequestException: if a listing page cannot be fetched or
                answers with an HTTP error status. Rows gathered before the
                failure remain available in ``self.articles``.
        """
        if url is None:
            url = self.start_url

        # URLs already stored (or attempted during this call) are not fetched
        # again; several <li> elements can resolve to the same first link.
        seen_urls = set(self.articles['url'])

        while url:
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')

            container = soup.find('main')
            if container is None:
                print(f"No <main> element found at {url}, stopping...")
                break

            page_rows = []
            try:
                for article in container.find_all('li'):
                    self._collect_list_item(article, seen_urls, page_rows)
            finally:
                # Flush even when the loop is interrupted (e.g. KeyboardInterrupt)
                # so the rows gathered so far are not lost.
                self._store_rows(page_rows)

            url = self.get_next_page(soup)
            if not url:
                print("No more pages found, stopping...")
                break
            self.current_page += 1

    def _collect_list_item(self, article, seen_urls, page_rows):
        """Scrape one listing item and append its row to ``page_rows`` on success."""
        try:
            article_url = self.fetch_article_url(article)
        except Exception as e:
            # Items without a usable link end up here; no request was made.
            print(f"Error processing article: {e}")
            return

        if article_url in seen_urls:
            return
        seen_urls.add(article_url)
        print(article_url)

        try:
            title, content, date = self.scrape_article_details(article_url)
        except Exception as e:
            print(f"Error processing article: {e}")
        else:
            if title and content and date:
                page_rows.append({
                    # No separate listing headline is extracted, so this mirrors the title.
                    'headline': title,
                    'title': title,
                    'content': content,
                    'date': date,
                    'url': article_url,
                })
                print(f"Collected: {title}")
            else:
                print(f"Skipping article: {title}")

        # Pause after every article request, whether it succeeded or failed.
        time.sleep(self.delay)

    def _store_rows(self, rows):
        """Append a batch of row dicts to ``self.articles`` in a single operation."""
        if not rows:
            return
        new_rows = pd.DataFrame(rows, columns=self.COLUMNS)
        if len(self.articles) == 0:
            self.articles = new_rows
        else:
            self.articles = pd.concat([self.articles, new_rows], ignore_index=True)

    def _absolute_url(self, link):
        """Prefix site-relative links (starting with '/') with ``BASE_URL``."""
        if link.startswith('/'):
            return self.BASE_URL + link
        return link

    def fetch_article_url(self, article):
        """Return the absolute URL of the first link inside a listing item.

        Raises:
            TypeError: if the item contains no ``<a>`` element.
            KeyError: if the ``<a>`` element has no ``href`` attribute.
        """
        return self._absolute_url(article.find('a')['href'])

    @classmethod
    def _format_date(cls, date_text):
        """Convert e.g. 'Tue 1 Apr 2025 05.30 AEDT' to '01 April 2025'.

        Only the first four whitespace-separated tokens are used, which drops
        any trailing time and timezone.

        Raises:
            ValueError: if the text matches none of ``DATE_INPUT_FORMATS``.
        """
        date_only = ' '.join(date_text.split()[:4])
        for date_format in cls.DATE_INPUT_FORMATS:
            try:
                return datetime.strptime(date_only, date_format).strftime("%d %B %Y")
            except ValueError:
                continue
        raise ValueError(f"Unrecognised date format: {date_text!r}")

    def scrape_article_details(self, article_url):
        """Download one article page and extract its title, body text and date.

        Args:
            article_url: Absolute URL of the article.

        Returns:
            ``(title, content, date)`` with ``date`` formatted like
            '01 April 2025', or ``(None, None, None)`` when the page answers
            with an HTTP error status or cannot be parsed.

        Raises:
            requests.RequestException: if the request itself fails
                (connection error, timeout, invalid URL).
        """
        response = requests.get(article_url, timeout=self.timeout)

        try:
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            title = soup.find('h1').text

            date_text = soup.find('span', {'class': self.DATE_SPAN_CLASS}).text.strip()
            date = self._format_date(date_text)
            print(date)  # For example: '30 March 2025'

            # Body text: all paragraphs inside the <article> element, space-joined.
            paragraphs = soup.find('article').find_all('p')
            content = " ".join(p.text for p in paragraphs).strip()

            return title, content, date

        except Exception as e:
            print(f"Error processing article: {e}")
            return None, None, None  # Return None in case of an error

    def get_next_page(self, soup):
        """Return the URL of listing page ``current_page + 1``.

        Args:
            soup: Parsed listing page.

        Returns:
            The absolute URL of the next page, or ``None`` when the pagination
            bar is missing or contains no link with that number.
        """
        pagination_links = soup.find('div', {'class': self.PAGINATION_CLASS})
        if pagination_links is None:
            return None

        wanted_page = self.current_page + 1
        for page_link in pagination_links.find_all('a', string=re.compile(r'\d+')):
            # The regex also matches labels that merely contain a digit;
            # skip anything that is not a plain page number.
            try:
                page_number = int(page_link.text.strip())
            except ValueError:
                continue
            if page_number == wanted_page:
                return self._absolute_url(page_link['href'])

        return None

    def save_data_to_file(self, filename='guardian_articles.csv'):
        """Write the collected articles to ``filename`` as CSV, without the index."""
        self.articles.to_csv(filename, index=False)
        print(f"Data saved to {filename}.")



In [62]:
# Crawl entry point: the "all" listing of the Australian election 2025 topic.
start_url = "https://www.theguardian.com/australia-news/australian-election-2025/all"
left_scraper = GuardianScraper(start_url)


In [None]:
# Walk the listing from start_url, following pagination until no next page is found.
left_scraper.scrape_articles(start_url)

https://www.theguardian.com/australia-news/2025/apr/01/rba-interest-rates-hold-today
01 April 2025
Collected: RBA holds rates at 4.1% but Michele Bullock says room for cuts if global trade war takes toll
https://www.theguardian.com/australia-news/2025/apr/01/australian-election-peter-dutton-liberal-coalition-gas-energy-plan-power-prices
01 April 2025
Collected: Gas producers question whether Coalition’s energy plan will cut consumer prices
https://www.theguardian.com/australia-news/2025/apr/01/melbourne-airport-train-peter-dutton-funding-cuts-suburban-rail-loop
01 April 2025
Collected: What do Peter Dutton’s proposed funding cuts really mean for Melbourne’s long-awaited airport rail?
https://www.theguardian.com/australia-news/2025/apr/01/labor-dutton-trump-comparison-doge-school-curriculum
01 April 2025
Collected: Labor accuses Dutton of copying Trump with suggestion children being ‘indoctrinated’ at school
https://www.theguardian.com/australia-news/2025/apr/01/fifteen-seconds-of-summa

https://www.theguardian.com/australia-news/video/2025/mar/28/key-takeaways-from-duttons-sliding-doors-budget-reply-video
28 March 2025
Collected: Key takeaways from Dutton's 'sliding doors' budget reply – video
https://www.theguardian.com/australia-news/2025/mar/28/australian-federal-election-date-anthony-albanese-calls-poll-for-3-may
28 March 2025
Collected: Anthony Albanese calls Australian federal election for 3 May
https://www.theguardian.com/australia-news/audio/2025/mar/28/its-a-date-the-2025-federal-election-has-finally-been-called-full-story-podcast
28 March 2025
Collected: It’s a date: the 2025 federal election has finally been called – Full Story podcast
https://www.theguardian.com/australia-news/video/2025/mar/28/voting-101-the-australian-election-has-been-called-heres-what-that-means-for-you-video
28 March 2025
Collected: Voting 101: The Australian election has been called, here’s what that means for you - video
https://www.theguardian.com/australia-news/commentisfree/2025/

In [40]:
# Stage only the notebook file, then commit and push it to the remote repository.
!git add the_guardian.ipynb
!git commit -m "works pretty well"
!git push

[main 0ef3915] works pretty well
 1 file changed, 439 insertions(+), 53 deletions(-)
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 8 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 4.02 KiB | 316.00 KiB/s, done.
Total 3 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/gaylejuntilla/MA3831-A3.git
   ce43d36..0ef3915  main -> main
