In [1]:
# (Re)initialise the Git repository in the notebook's working directory.
! git init

Reinitialized existing Git repository in /Users/gaylejuntilla/MA3831_A3/.git/


In [13]:
# Stage every change under the working directory, then record it as one commit.
!git add .
!git commit -m "Initialise abc_news.ipynb"

[main 7717188] Initialise abc_news.ipynb


In [14]:
# Push the local main branch to origin; -u also sets origin/main as its upstream.
!git push -u origin main

Enumerating objects: 12, done.
Counting objects: 100% (12/12), done.
Delta compression using up to 8 threads
Compressing objects: 100% (11/11), done.
Writing objects: 100% (11/11), 3.21 KiB | 3.21 MiB/s, done.
Total 11 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), done.[K
To https://github.com/gaylejuntilla/MA3831-A3.git
   cd64767..7717188  main -> main
branch 'main' set up to track 'origin/main'.


In [24]:
# One-off dependency install. Uncomment and run only if the package is missing:
# %pip install webdriver_manager

Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver_manager
Successfully installed webdriver_manager-4.0.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
from datetime import datetime, timezone, timedelta

# Chrome launch options used when the scraper creates its WebDriver.
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--headless',  # Run headless (no UI)
    '--no-sandbox',
    '--disable-dev-shm-usage',
):
    options.add_argument(chrome_flag)

class ABCNewsScraper:
    """Scrape ABC News election articles published on or after a cutoff date.

    The scraper opens ``start_url`` in Chrome, clicks "Load More" until the
    oldest listed article predates the cutoff, then visits every listed
    article to collect its headline, title, date, body text and URL.

    Relies on the module-level ``options`` object (Chrome options) being
    defined before an instance is created.

    Attributes:
        start_url: Listing page the scraper starts from.
        cutoff_date: Timezone-aware (Brisbane) datetime; older articles are skipped.
        articles: DataFrame with columns headline, title, date, content, url.
        driver: The Selenium Chrome WebDriver used for all page access.
    """

    # Brisbane is UTC+10 all year round (no daylight saving).
    BRISBANE_TZ = timezone(timedelta(hours=10))

    # Listing entries on the start page; shared by the loading and extraction steps.
    ARTICLE_SELECTOR = 'ul[aria-labelledby="Latest election articles"] li'

    def __init__(self, start_url, cutoff_date="1 October 2024"):
        """Create the scraper and start a Chrome session.

        Args:
            start_url: URL of the article listing page.
            cutoff_date: Earliest publication date to keep, either a
                ``"DD Month YYYY"`` string or a ``datetime``. It is
                interpreted as Brisbane local time.
        """
        self.start_url = start_url
        self.cutoff_date = self._parse_cutoff(cutoff_date)
        self.articles = pd.DataFrame(columns=['headline', 'title', 'date', 'content', 'url'])  # Store data
        # Selenium 4 removed the positional driver path and the `chrome_options`
        # keyword; `options=` is accepted by both Selenium 3.8+ and 4.x.
        self.driver = webdriver.Chrome(options=options)

    @classmethod
    def _parse_cutoff(cls, cutoff_date):
        """Return ``cutoff_date`` as a timezone-aware Brisbane datetime.

        Article dates are timezone-aware, and Python refuses to compare naive
        with aware datetimes, so the cutoff must carry a timezone as well.
        """
        if isinstance(cutoff_date, datetime):
            parsed = cutoff_date
        else:
            parsed = datetime.strptime(cutoff_date, "%d %B %Y")
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=cls.BRISBANE_TZ)
        return parsed

    @staticmethod
    def _parse_utc_timestamp(value):
        """Parse an ISO-8601 UTC timestamp such as ``2024-10-01T03:04:05.000Z``.

        Returns a timezone-aware UTC datetime. Unlike a fixed ``strptime``
        pattern this does not require the fractional part to be ``.000``.
        """
        # `fromisoformat` only understands a trailing "Z" from Python 3.11 on.
        parsed = datetime.fromisoformat(value.strip().replace("Z", "+00:00"))
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    def run(self):
        """Runs the scraper: loads page, extracts articles, stops at cutoff date."""
        try:
            self.driver.get(self.start_url)
            time.sleep(3)  # Let page load

            # Step 1: Load articles until the cutoff date
            self.load_articles_until_cutoff()

            # Step 2: Extract articles after loading all
            self.extract_articles()
        finally:
            # Always release the browser, even when scraping fails part-way.
            self.driver.quit()

        self.save_data_to_file()

    def load_articles_until_cutoff(self):
        """Keep loading more articles until the last article is older than the cutoff date."""
        while True:
            # Get all currently loaded articles
            articles = self.driver.find_elements(By.CSS_SELECTOR, self.ARTICLE_SELECTOR)

            if not articles:
                print("No articles found on page. Stopping.")
                break

            # Find the date of the last article
            try:
                last_article = articles[-1]
                last_date_str = last_article.find_element(By.XPATH, ".//time[1]").get_attribute("datetime")
                last_article_date = self.convert_to_brisbane_time(self._parse_utc_timestamp(last_date_str))

                # Stop loading if the last article's date is older than the cutoff
                if last_article_date < self.cutoff_date:
                    print("Reached cutoff date. Stopping 'Load More' clicks.")
                    break

            except Exception as e:
                print(f"Error extracting date from last article: {e}")
                break  # Stop in case of unexpected errors

            # Try clicking "Load More" to get more articles
            try:
                load_more_button = WebDriverWait(self.driver, 5).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[data-component="PaginationLoadMoreButton"]')))
                load_more_button.click()
                time.sleep(2)  # Wait for more articles to load

            except Exception:
                # `Exception` rather than a bare `except`, so Ctrl-C still interrupts.
                print("No more 'Load More' button. Ending loading.")
                break

    def convert_to_brisbane_time(self, utc_datetime):
        """Convert a UTC datetime to a timezone-aware Brisbane (UTC+10) datetime.

        Args:
            utc_datetime: A datetime in UTC. Naive values are assumed to be UTC.

        Returns:
            The same instant as a ``datetime`` whose tzinfo is UTC+10.
        """
        if utc_datetime.tzinfo is None:
            utc_datetime = utc_datetime.replace(tzinfo=timezone.utc)
        return utc_datetime.astimezone(self.BRISBANE_TZ)

    def extract_articles(self):
        """Extracts article details: headline, date, content, and URL."""
        # Wait first and query afterwards, so the element references are fresh.
        time.sleep(10)
        articles = self.driver.find_elements(By.CSS_SELECTOR, self.ARTICLE_SELECTOR)
        listing_handle = self.driver.current_window_handle

        for article in articles:
            try:
                # Extract date and convert it to Brisbane time
                article_date_str = article.find_element(By.XPATH, ".//time[1][@datetime]").get_attribute('datetime')
                article_date = self.convert_to_brisbane_time(self._parse_utc_timestamp(article_date_str))

                # Stop if we've reached the cutoff date
                if article_date < self.cutoff_date:
                    break

                # Extract the article URL
                article_url = article.find_element(By.TAG_NAME, 'a').get_attribute("href")

                # Extract headline
                headline = article.find_element(By.CSS_SELECTOR, "a[data-component='Link']").text

                # Extract title and content from the article page itself
                title, content = self._read_article_page(article_url, listing_handle)

                # Add data to DataFrame
                self.articles.loc[len(self.articles)] = [headline, title, article_date, content, article_url]
                print(f"Collected: {headline}")

            except Exception as e:
                print(f"Error processing article: {e}")

    def _read_article_page(self, article_url, listing_handle):
        """Open ``article_url`` in a temporary tab and return ``(title, content)``.

        The temporary tab is always closed and focus is returned to
        ``listing_handle``, so a failing article cannot leave the driver on
        the wrong tab for the articles that follow.
        """
        self.driver.execute_script("window.open('');")  # Open new tab
        try:
            self.driver.switch_to.window(self.driver.window_handles[-1])
            self.driver.get(article_url)
            time.sleep(3)  # time to load article

            title = self.driver.find_element(By.CSS_SELECTOR, "h1[data-component='Typography']").text
            content_elements = self.driver.find_elements(By.CSS_SELECTOR, 'div[class*="ArticleRender_article"] p')
            content = " ".join(p.text for p in content_elements)
            return title, content
        finally:
            # Close every tab except the listing, then switch back to it.
            for handle in self.driver.window_handles:
                if handle != listing_handle:
                    self.driver.switch_to.window(handle)
                    self.driver.close()
            self.driver.switch_to.window(listing_handle)

    def save_data_to_file(self):
        """Saves the scraped data to a CSV file."""
        self.articles.to_csv("abc_news_articles.csv", index=False)
        print("Data saved to abc_news_articles.csv.")


In [23]:
# Stage and commit the latest edits; the push below is left commented out.
!git add .
!git commit -m "fix small errors"
#!git push

[main 5ddb222] fix small errors
 1 file changed, 10 insertions(+), 22 deletions(-)
