In [None]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import re
from datetime import datetime

Defining a function to extract location and date information

In [None]:
def extract_simplified_information(html: str) -> tuple:
    """
    Extracts the location and the date from the page title.

    The title is looked up inside an ``<h1>`` element: the location comes
    first, followed by a line break tag or a comma, followed by a date made
    of one word and a four-digit number (for example ``March 2024``).
    Matching is case-insensitive and tolerates attributes on the ``<h1>``
    tag, whitespace right after it, and the ``<br>``, ``<br/>`` and
    ``<br />`` spellings of the line break.

    Args:
        html (str): HTML code containing the information.

    Returns:
        tuple: Location and date as a tuple, or (None, None) if not found.
            Only the part of the location before its first comma is kept.
    """
    match = re.search(
        r'<h1\b[^>]*>\s*(?P<location>.+?)(?:<br\s*/?>|,)\s*(?P<date>\w+ \d{4})',
        html,
        re.IGNORECASE,
    )
    if match is None:
        return None, None

    # The lazy location group can still span "City, Region" when the first
    # comma is not directly followed by a date, so keep only the leading part.
    location = match.group('location').split(',')[0].strip()
    date = match.group('date').strip()
    return location, date

Defining the URL to scrape

In [None]:
# Location name -> code that is inserted into the weather page address
locations = {
    'Canberra': '2801',
    'Tuggeranong': '2802',
    'Mount Ginini': '2804',
}

# Address of the latest Australian weather data page for the chosen location
url = "https://reg.bom.gov.au/climate/dwo/IDCJDW{}.latest.shtml".format(locations['Canberra'])

Fetching the web page

In [None]:
# Make an HTTP request to the URL. The timeout keeps the cell from waiting
# indefinitely when the server does not answer.
page_html = requests.get(url, timeout=30)

# Fail here on 4xx/5xx responses instead of parsing an error page below
page_html.raise_for_status()

# Parse the HTML content of the page using BeautifulSoup
soup = bs(page_html.content, 'lxml')

Extracting data from the web page

In [None]:
# Find the table in the HTML that contains the weather data
table = soup.find('table', attrs={'class': 'data'})
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next lookup.
    raise ValueError("No <table class='data'> element was found in the page")

# Extract all rows from the table body
table_body = table.find('tbody')
if table_body is None:
    raise ValueError("The data table has no <tbody> element")
rows = table_body.find_all('tr')

Processing table headers and rows

In [None]:
# Extracting table headers (days of the month).
# The outer loop over rows must come first: comprehension clauses are read
# left to right, so `row` has to be bound before `row.find_all` is evaluated.
days = [header.text for row in rows for header in row.find_all('th')]

# Extracting weather data for each day; cells holding only a non-breaking
# space are recorded as 'NA'.
data = []
for row in rows:
    day_data = ['NA' if cell.text == '\xa0' else cell.text for cell in row.find_all('td')]
    data.append(day_data)

Extracting date information

In [None]:
# Pull the <h1> title out of the "content" div, then split it into the
# location and the month/year text
header = soup.find('div', class_='content').h1
location, date = extract_simplified_information(str(header))

Formatting date information

In [None]:
# Parse the "<month name> <year>" text, then build one ISO-style date string
# (YYYY-MM-DD) for every day number read from the table
month_year_datetime = datetime.strptime(date, "%B %Y")
formatted_dates = [
    format(month_year_datetime.replace(day=int(day_text)), "%Y-%m-%d")
    for day_text in days
]

Creating and displaying the DataFrame

In [None]:
# Column labels, assigned positionally to the values of each scraped row
columns = [
    'Day',
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustDir', 'WindGustSpeed', 'WindGustTime',
    'Temp9am', 'Humidity9am', 'Cloud9am', 'WindDir9am', 'WindSpeed9am', 'Pressure9am',
    'Temp3pm', 'Humidity3pm', 'Cloud3pm', 'WindDir3pm', 'WindSpeed3pm', 'Pressure3pm',
]

# Build the frame in one chain: load the rows, discard the columns that are
# not needed, then append the formatted date and the location
df = (
    pd.DataFrame(data, columns=columns)
    .drop(columns=['Day', 'WindGustTime'])
    .assign(Date=formatted_dates, Location=location)
)

# Display the DataFrame
display(df)