# Sky News Web Crawler

In [2]:
!git add sky_news.ipynb
!git commit -m "copy "
#!git push

[main 9515212] copy template from The Guardian
 1 file changed, 68 insertions(+), 2 deletions(-)


In [None]:
# Changed to check each article's publish date during pagination.
import re
import time
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

class SkyNewsScraper:
    """Crawl paginated news listing pages and collect article text.

    Each listing page is scanned for ``<article>`` elements. For every
    article dated on or after ``cutoff_date`` the linked page is fetched
    and its title and body text are appended to ``self.articles``.

    Attributes:
        start_url: URL supplied at construction time.
        cutoff_date: ``datetime``; crawling stops at the first article
            dated before it.
        articles: ``pandas.DataFrame`` with the columns ``headline``,
            ``title``, ``content``, ``date`` and ``url``.
        current_page: 1-based number of the listing page being processed.
    """

    # Seconds to wait for any single HTTP response before giving up, so a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 30
    # Seconds to pause after each article request.
    REQUEST_DELAY = 5

    def __init__(self, start_url, cutoff_date="1 January 2025"):
        """Set up an empty result table.

        Args:
            start_url: URL of the first listing page.
            cutoff_date: Oldest publish date to keep, formatted as
                ``"%d %B %Y"`` (for example ``"1 January 2025"``).

        Raises:
            ValueError: If ``cutoff_date`` does not match the format.
        """
        self.start_url = start_url
        self.cutoff_date = datetime.strptime(cutoff_date, "%d %B %Y")
        self.articles = pd.DataFrame(columns=['headline', 'title', 'content', 'date', 'url'])
        self.current_page = 1
        # Base used to resolve relative links; updated for every listing page.
        self._current_url = start_url

    def scrape_articles(self, url):
        """Scrape articles starting at ``url`` and follow pagination.

        Stops when a page has no ``<article>`` elements, when no link to
        the next page number is found, or as soon as an article older than
        ``cutoff_date`` is seen (this assumes the listing is ordered
        newest first).

        Args:
            url: URL of the listing page to start from.
        """
        while url:
            self._current_url = url
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(response.text, 'lxml')

            articles = soup.find_all('article')
            if not articles:
                print("No articles found on page, stopping...")
                break

            for article in articles:
                # Listing-level parsing makes no HTTP request, so a failure
                # here moves on immediately without the politeness delay.
                try:
                    article_info = self.fetch_article_info(article)
                    article_date = datetime.strptime(article_info['date'], "%d %B %Y")
                except Exception as e:
                    print(f"Error processing article: {e}")
                    continue

                if article_date < self.cutoff_date:
                    print(f"Reached article older than cutoff date: {article_info['date']}, stopping...")
                    return

                # Listing links may be relative; resolve against this page.
                article_url = urljoin(url, article_info['article_url'])

                # Per-article boundary: one bad article must not end the crawl.
                try:
                    title, content = self.scrape_article_details(article_url)

                    if title and content:
                        new_row = pd.DataFrame({
                            'headline': [article_info['headline']],
                            'title': [title],
                            'content': [content],
                            'date': [article_info['date']],
                            'url': [article_url],
                        })
                        self.articles = pd.concat([self.articles, new_row], ignore_index=True)
                        print(f"Collected: {title}")
                    else:
                        print(f"Skipping article: {title}")
                except Exception as e:
                    print(f"Error processing article: {e}")

                # Always pause after an article request, including failed ones.
                time.sleep(self.REQUEST_DELAY)

            url = self.get_next_page(soup)
            if not url:
                print("No more pages found, stopping...")
                break
            self.current_page += 1

    def fetch_article_info(self, article):
        """Extract headline, link and date from one listing ``<article>``.

        Args:
            article: Parsed ``<article>`` element from a listing page.

        Returns:
            dict with ``headline`` (link text), ``article_url`` (the raw
            ``href`` value) and ``date`` (string formatted ``"%d %B %Y"``).

        Raises:
            AttributeError, TypeError, KeyError: If the expected
                ``h4.storyblock_title > a`` or ``<time datetime=...>``
                markup is missing.
            ValueError: If the ``datetime`` attribute is not ``dd/mm/YYYY``.
        """
        heading = article.find('h4', {'class': "storyblock_title"})
        link = heading.find('a')
        article_url = link['href']
        headline = link.text

        # The listing stores dates as dd/mm/YYYY; normalise to the same
        # "%d %B %Y" format used for cutoff_date.
        date_str = article.find('time')['datetime']
        date = datetime.strptime(date_str, "%d/%m/%Y").strftime("%d %B %Y")

        return {'headline': headline, 'article_url': article_url, 'date': date}

    def scrape_article_details(self, article_url):
        """Fetch one article page and return its title and body text.

        Args:
            article_url: Absolute URL of the article page.

        Returns:
            ``(title, content)``: the ``<h1>`` text and the text of all
            ``<p>`` elements inside ``div#story-primary`` joined by single
            spaces. ``(None, None)`` if either element is missing.
        """
        response = requests.get(article_url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')

        try:
            title = soup.find('h1').text

            # Body paragraphs live in the div with id "story-primary".
            content_element = soup.find('div', {'id': 'story-primary'})
            paragraphs = content_element.find_all('p')
            content = " ".join(p.text for p in paragraphs).strip()

            return title, content

        except AttributeError as e:
            # find() returned None for a required element.
            print(f"Error processing article: {e}")
            # Two values, matching the caller's `title, content = ...`.
            return None, None

    def get_next_page(self, soup):
        """Return the URL of listing page ``current_page + 1``, if linked.

        Args:
            soup: Parsed listing page.

        Returns:
            Absolute URL of the next page, or ``None`` when the page has
            no ``ul.page-numbers`` block or no link whose label is the
            next page number.
        """
        pagination_links = soup.find('ul', {'class': 'page-numbers'})
        if pagination_links is None:
            return None

        next_page = self.current_page + 1
        for page_link in pagination_links.find_all('a'):
            label = page_link.get_text(strip=True)
            # Skip "Next", "Page 2" and similar labels that int() would
            # reject; only purely numeric labels are page numbers.
            if not label.isdecimal() or int(label) != next_page:
                continue
            href = page_link.get('href')
            if href:
                # Pagination hrefs may be relative to the current page.
                return urljoin(self._current_url, href)
        return None

    # NOTE(review): the default filename still says "guardian"; it is kept
    # unchanged so existing callers keep writing to the same file.
    def save_data_to_file(self, filename='guardian_articles.csv'):
        """Write the collected articles to ``filename`` as CSV, no index column.

        Args:
            filename: Destination path for the CSV file.
        """
        self.articles.to_csv(filename, index=False)
        print(f"Data saved to {filename}.")
