# Sky News Web Crawler

In [3]:
!git add sky_news.ipynb
!git commit -m "scraper draft"
#!git push

[main a4e6e86] scraper draft
 1 file changed, 124 insertions(+), 16 deletions(-)


In [25]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
from datetime import datetime
import time

class SkyNewsScraper:
    def __init__(self, start_url, cutoff_date="1 January 2025"):
        self.start_url = start_url
        self.cutoff_date = datetime.strptime(cutoff_date, "%d %B %Y")  # Convert string to datetime
        self.articles = pd.DataFrame(columns=['headline', 'title', 'content', 'date', 'url'])  # DataFrame to store article data
        self.current_page = 1
        
    def scrape_articles(self, url):
        """Scrapes articles from the page and handles pagination."""
        
        while url:
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'lxml')

            articles = soup.find_all('article', class_=lambda x: x == "storyblock")

            if not articles:
                print("No articles found on page, stopping...")
                break

            last_article_date = None  # Track the last article's date
            
            for article in articles:
                try:
                    # Extract article info
                    article_info = self.fetch_article_info(article)
                    article_date = datetime.strptime(article_info['date'], "%d %B %Y")  # Convert to datetime object
                    
                    # Check if this article is older than cutoff_date
                    if article_date < self.cutoff_date:
                        print(f"Reached article older than cutoff date: {article_info['date']}, stopping...")
                        return  # Exit function to stop scraping

                    # Scrape article details
                    title, content = self.scrape_article_details(article_info['article_url'])

                    if title and content:
                        new_row = pd.DataFrame({
                            'headline': [article_info['headline']],
                            'title': [title],
                            'content': [content],
                            'date': [article_info['date']],
                            'url': [article_info['article_url']]
                        })
                        self.articles = pd.concat([self.articles, new_row], ignore_index=True)
                        print(f"Collected: {title}")
                    else:
                        print(f"Skipping article: {title}")

                except Exception as e:
                    print(f"Error processing article: {e}")
                    continue

                time.sleep(5)
                
            # Step 2: Find the next page (pagination)
            url = self.get_next_page(soup)
            if not url:
                print("No more pages found, stopping...")
                break
            self.current_page += 1
    
    def fetch_article_info(self, article):
        """Helper method to extract article information like headline and URL."""
        attributes = article.find('h4', {'class': "storyblock_title"})
        article_url = attributes.find('a')['href']
        headline = attributes.find('a').text
        date_str = article.find('time')['datetime']  # Extract the date string
        date_obj = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S%z")  # Convert to datetime object
        date = date_obj.strftime("%d %B %Y")

        return {'headline': headline, 'article_url': article_url, 'date': date} 

    def scrape_article_details(self, article_url):
        """Scrapes article content, title, date, and other details from individual article pages."""
        response = requests.get(article_url)
        soup = BeautifulSoup(response.text, 'html.parser')

        try:
            # Extracting title
            title = soup.find('h1').text

            # Extracting content (Assuming paragraphs are in 'div' with class 'content__article-body')
            content_element = soup.find('div', {'id': 'story-primary'})
            paragraphs = content_element.find_all('p')
            content = " ".join([p.text for p in paragraphs]).strip()

            return title, content

        except Exception as e:
            print(f"Error processing article: {e}")
            return None, None, None  # Return None in case of an error

    def get_next_page(self, soup):
        """Finds the next page URL from pagination links at the bottom of the page."""
        pagination_links = soup.find('ul', {'class': 'page-numbers'})  # Find pagination section

        if pagination_links:
            all_pages = pagination_links.find_all('a', text=re.compile(r'\d+'))  # Find all numbered page links

            # Otherwise, continue to the next page in sequence
            for page_link in all_pages:
                if int(page_link.text.strip()) == self.current_page + 1:
                    return page_link['href']  # Get the next sequential page

    def save_data_to_file(self, filename='guardian_articles.csv'):
        """Saves the collected articles to a CSV file."""
        self.articles.to_csv(filename, index=False)
        print(f"Data saved to {filename}.")


In [26]:
skynews_url = "https://www.skynews.com.au/australia-news/politics"
right_scraper = SkyNewsScraper(start_url=skynews_url)

In [None]:
right_scraper.scrape_articles(url=skynews_url)

Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: Peter Dutton accuses Anthony Albanese of ducking additional debates despite the PM agreeing to Sky News People’s Forum
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: 'This is not Australian': Federal Election takes disturbing turn, as LNP member’s corflutes defaced with outlawed Nazi symbols
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: Tony Abbott tells Peta Credlin that Australia is being 'bullied and pushed around' by China under the Albanese govt
Collected: Teenage hairdressing apprentice 'in shock' after Bradfield Teal candidate Nicolette Boele's sexualised joke at Sydney salon
Error processing article: 'NoneType' object has no attribut

Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' obje

  all_pages = pagination_links.find_all('a', text=re.compile(r'\d+'))  # Find all numbered page links


Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: Federal Election: Peter Dutton warns small businesses about Labor's proposed super changes, taxing unrealised capital gains
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object

Collected: Prime Minister Anthony Albanese trips over question on power sharing deal with Greens in first major gaffe of 2025 Federal Election
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: Labor MP suggests government providing 'lot of support' to small businesses despite insolvencies and calls for tax cut
Collected: Opposition Leader Peter Dutton campaigns in key seat as fight over energy dominates Federal Election campaign
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: Federal Election 2025: Anthony Albanese under f

Collected: Newspoll, Resolve Political Monitor, and YouGov show surge in Labor’s primary vote as election campaign begins
Collected: Independent MP Dai Le provides major hint about who she’ll side with if election results in hung parliament
Collected: 'Prices will come down': Coalition to unveil more details about its energy price reduction pledge ‘over the next couple of days’
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: YouGov poll shows likely minority government as Labor falls short by just one seat
Collected: Opposition Leader Peter Dutton labels Prime Minister Anthony Albanese ‘weak as water’ amid battle over supermarket policy
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to u

Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: Coalition refuses to promise specific energy price cuts following Labor’s failed $275 power promise
Collected: Prime Minister Anthony Albanese speaks to reporters in Canberra as 2025 Federal Election campaign kicks off
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error process

Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: Labor pledges to make supermarket price gouging illegal if re-elected in bid to gain voters ahead of federal election
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has n

Collected: Leaders hit the campaign trail to heckles, cheers and jeers as PM Albanese and Opposition Leader Peter Dutton face disruptions
Collected: Dutton's visit to Chinese restaurant in Brisbane crashed by anti-nuclear campaigner just hours after climate protester interrupted his XXXX factory visit
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: Anthony Albanese declares Labor 'deserves to be re-elected' but Peter Dutton says only he can win majority government
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: Nationals Leader David Littleproud says Peter Dutton's 'strong campaign' can win back Greens-held seats of Ryan, Brisbane
Error processing article

Collected: Right wing, anti-immigration activist interrupts PM Anthony Albanese’s press conference on first day of Federal Election campaign
Collected: Opposition Leader Peter Dutton interrupted by anti-nuclear protester while visiting XXXX factory in Brisbane
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: Former premier Campbell Newman predicts Greens to lose all three of its Queensland seats in federal election
Collected: Federal Election 2025: Labor and Coalition go head-to-head on key issues, including cost of living, energy and defence
Collected: Residents in NSW marginal seat where PM Anthony Albanese bought $4.

Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' obje

Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: Opposition Leader Peter Dutton vows to slash tens of billions of 'wasteful' government spending in budget reply
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: 'The way to do it': Liberal Party urged to attack Labor’s primary vote in order to win Teal seats in election
Collected: 'That is a lie': Sky News host Laura Jayes clashes with Labor Minister Tim Ayres over cost of living, power prices in fiery interview
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'f

Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Error processing article: 'NoneType' object has no attribute 'find_all'
Error processing article: too many values to unpack (expected 2)
Collected: Question Time underway ahead of Coalition's budget reply, with tax cuts set to be key topic of debate
Collected: 'Buried in the backyard': Hanson-Young’s admission after making headlines for brandishing dead salmon in parliament
