In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Data directories, relative to this notebook's location.
# CURATED_DATA_DIR is where this notebook writes its CSV outputs.
CURATED_DATA_DIR = "../../data/curated"
# NOTE(review): LANDING_DATA_DIR and RAW_DATA_DIR are not referenced in the
# cells visible here -- confirm they are still needed.
LANDING_DATA_DIR = "../../data/landing"
RAW_DATA_DIR = "../../data/raw"


# Download the postcode-to-suburb mapping from Australia Post
Example: 3053 --> Carlton; 3052 --> Parkville

Note that scraping the median income and population data takes about 20 minutes to run.

In [2]:
def clean_name(x):
    """
    Normalise a suburb name: keep only the text before the first comma,
    lower-case it and trim the surrounding white space.
    Example: "Carlton" --> "carlton"
    """
    # everything after the first comma (if any) is discarded
    before_comma, _, _ = x.partition(",")
    return before_comma.lower().strip()

def scrape_website(url, timeout=30):
    """Scrape the suburb names listed on one postcode page.

    Reads the text of the <td class="second"> cell of every <tr> inside the
    page's first <tbody> and normalises each value with clean_name.

    Args:
        url: The URL of the website to scrape.
        timeout: Seconds to wait for the server before requests raises a
            timeout error, so one stalled request cannot hang the whole
            scraping loop.

    Returns:
        A list of cleaned suburb names (possibly empty), or None when the
        page contains no <tbody>.
    """

    # get the content of html
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # find the location at which the data is located
    tbody = soup.find("tbody")
    if not tbody:
        return None

    # extract the data: 3053 --> carlton
    suburb_lst = []
    for tr_tag in tbody.find_all("tr"):
        td_second = tr_tag.find("td", class_="second")
        if td_second is None:
            # a row without the expected cell would otherwise raise
            # AttributeError on `.text`; skip it instead
            continue
        suburb_lst.append(clean_name(td_second.text))
    return suburb_lst


# Build the postcode -> suburbs table.
# One [postcode, suburbs] row is collected for every postcode in 3000-3996
# whose page yields at least one suburb.
new_rows = []
for postcode in range(3000,3997):
    url = f"https://auspost.com.au/postcode/{postcode}"
    suburbs = scrape_website(url)
    if not suburbs:
        # page missing or no suburbs listed for this postcode
        continue
    new_rows.append([postcode, suburbs])

# finalise the dataframe
# (built in one step: DataFrame.append is deprecated and removed in pandas 2.0)
df = pd.DataFrame(new_rows, columns=['postcode', 'suburbs'])
print(df.shape)
df


(719, 2)


  df = df.append(new_rows, ignore_index=True)


Unnamed: 0,postcode,suburbs
0,3000,[melbourne]
1,3001,[melbourne]
2,3002,[east melbourne]
3,3003,[west melbourne]
4,3004,"[melbourne, st kilda road central, st kilda ro..."
...,...,...
714,3990,[glen forbes]
715,3991,[bass]
716,3992,"[blackwood forest, dalyston, ryanston, west cr..."
717,3995,"[anderson, archies creek, cape paterson, harme..."


In [3]:
# write the postcode -> suburbs table to the curated data directory
postcode_csv_path = f"{CURATED_DATA_DIR}/postcode_to_suburb.csv"
df.to_csv(postcode_csv_path, index=False)

# Download the population and income data from ABS

In [11]:
import re
# accumulators for the scraped tables; the scraping loop below fills them
median_income_df, population_df = pd.DataFrame(), pd.DataFrame()

# text matched against the `title` attribute of the <th> that labels the
# median income figure on each page
median_text = (
    "Median incomes have been estimated for each income range "
    "using data from the Survey of Income and Housing."
)

# NOTE: the code in this cell takes roughly 20 minutes to run


def _clean_number(raw_text):
    """Trim surrounding white space and remove every "$" and "," from raw_text."""
    return re.sub(r"[,$]", "", raw_text.strip())


def scrape_abs(website, timeout=30):
    """Scrape the median income and the population from one ABS page.

    Args:
        website: URL of the page to scrape.
        timeout: Seconds to wait for the server before requests raises a
            timeout error, so one stalled request cannot hang the whole
            scraping loop.

    Returns:
        A (median_income, population) tuple of strings with "$" and ","
        removed. An element is None when its value cannot be located, and
        (None, None) is returned when the page has no
        "summaryTable qsPeople" table.
    """
    response = requests.get(website, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    median_income, population = None, None

    # find the median income data:
    # the <th> whose title matches `median_text`, then the <td> next to it
    th_tag = soup.find("th", title=re.compile(median_text))
    td_tag = th_tag.find_next_sibling('td') if th_tag else None
    if td_tag is not None:
        median_income = _clean_number(td_tag.text)

    # find the population data in the table class = "summaryTable qsPeople"
    table = soup.find('table', class_='summaryTable qsPeople')
    if not table:
        # without this table the page is treated as having no data at all,
        # so any median income found above is discarded as well
        return None, None

    # the <th> containing the text "People", then the <td> next to it
    # (`string=` is the current name of the deprecated `text=` argument)
    th_tag = table.find('th', string=re.compile("People"))
    td_tag = th_tag.find_next_sibling('td') if th_tag else None
    if td_tag is not None:
        population = _clean_number(td_tag.text)

    return median_income, population
    


def _rows_to_year_frame(rows, year):
    """Turn [[postcode, value], ...] rows into a one-row frame for `year`.

    The result has `year` as its only index label and one column per
    postcode. The column axis keeps the name 0, which later cells rely on
    when renaming it to 'postcode'. An empty `rows` list gives a frame with
    the year row and no columns instead of raising.
    """
    return pd.DataFrame(rows, columns=[0, year]).set_index(0).transpose()


# one single-row frame per year, concatenated once after the loop
# (DataFrame.append is deprecated and removed in pandas 2.0)
income_frames_by_year = []
population_frames_by_year = []

for year in range(2006, 2022, 5):  # 2006, 2011, 2016, 2021
    print(year)
    median_income_rows = []
    population_rows = []

    for postcode in range(3000,3997):
        url = (
            "https://www.abs.gov.au/census/find-census-data/"
            f"quickstats/{year}/POA{postcode}"
        )
        # extract data
        median_income, population = scrape_abs(url)
        # url does not exist / nothing could be scraped for this postcode
        if not median_income and not population:
            continue
        # a row is kept when at least one value was found, so the other
        # value may be None
        median_income_rows.append([postcode, median_income])
        population_rows.append([postcode, population])

    income_frames_by_year.append(
        _rows_to_year_frame(median_income_rows, year))
    population_frames_by_year.append(
        _rows_to_year_frame(population_rows, year))

# rows: years, columns: postcodes (NaN where a postcode has no data that year)
median_income_df = pd.concat(income_frames_by_year)
population_df = pd.concat(population_frames_by_year)


2006


  median_income_df_year = median_income_df_year.append(median_income_rows,
  population_df_year = population_df_year.append(population_rows,


2011


  median_income_df_year = median_income_df_year.append(median_income_rows,
  population_df_year = population_df_year.append(population_rows,


2016


  median_income_df_year = median_income_df_year.append(median_income_rows,
  population_df_year = population_df_year.append(population_rows,


2021


  median_income_df_year = median_income_df_year.append(median_income_rows,
  population_df_year = population_df_year.append(population_rows,


In [12]:
# inspect the scraped population table (rows: years, columns: postcodes)
population_df

Unnamed: 0,3000,3002,3003,3004,3005,3006,3008,3010,3011,3012,...,3708,3718,3762,3785,3893,3967,3990,3062,3336,3358
2006,14538,4330,3037,5822,590.0,9365,3348,1359.0,16048,20767,...,,,,,,,,,,
2011,20627,4714,3744,7737,692.0,11302,5096,1517.0,18053,23127,...,,,,,,,,,,
2016,37975,4964,5515,9307,525.0,18808,10437,1593.0,21462,26298,...,60.0,72.0,131.0,77.0,23.0,125.0,106.0,,,
2021,43084,4896,8025,11482,,22699,15495,,22278,27023,...,66.0,91.0,117.0,69.0,25.0,108.0,118.0,6.0,17737.0,3440.0


In [13]:
from sklearn.linear_model import LinearRegression
def inference(df):
    """Extrapolate every column of `df` to 2023 and 2026 with a linear trend.

    A separate LinearRegression (value ~ year) is fitted per column on the
    rows where that column is not null. A column with a single observation
    therefore predicts that same value for both years.

    Args:
        df: DataFrame whose index holds the observation years and whose
            columns each hold one series (one postcode in this notebook).

    Returns:
        (preds_2023, preds_2026): two lists with one prediction per column,
        in column order. A column without any observation yields NaN in
        both lists instead of making the fit raise.
    """
    # both target years are predicted with a single call per column
    target_years = np.array([2023, 2026]).reshape((-1, 1))
    preds_2023 = []
    preds_2026 = []

    for postcode in df.columns:
        observed = df[postcode].dropna()
        if observed.empty:
            # nothing to fit on: LinearRegression.fit raises on 0 samples
            preds_2023.append(np.nan)
            preds_2026.append(np.nan)
            continue

        training_data = np.array(list(observed.index)).reshape((-1, 1))
        response = np.array(observed).reshape((-1, 1))
        lin_reg = LinearRegression()
        lin_reg.fit(training_data, response)

        # predictions have shape (2, 1) because the response is 2-D
        pred_2023, pred_2026 = lin_reg.predict(target_years)[:, 0]
        preds_2023.append(pred_2023)
        preds_2026.append(pred_2026)

    return preds_2023, preds_2026

def _with_forecasts(history_df):
    """Return one row per postcode with 2023 and 2026 forecast columns added.

    `history_df` has years as rows and postcodes as columns. The result is
    transposed (postcodes as rows, years as columns), gains the columns 2023
    and 2026 produced by `inference`, and exposes the postcode as a regular
    'postcode' column.
    """
    history = history_df.copy()
    preds_2023, preds_2026 = inference(history)

    by_postcode = history.transpose()
    by_postcode[2023] = np.array(preds_2023)
    by_postcode[2026] = np.array(preds_2026)

    # after reset_index the former index column is labelled 0
    return by_postcode.reset_index().rename({0: 'postcode'}, axis = 1)


# the same pipeline for both datasets (previously copy-pasted, with the
# population predictions stored in income-named variables)
samp_income = _with_forecasts(median_income_df)
samp_population = _with_forecasts(population_df)
samp_income

Unnamed: 0,postcode,2006,2011,2016,2021,2023,2026
0,3000,785,994,955,1306,1299.56,1391.0
1,3002,1596,1989,2285,2345,2536.92,2689.5
2,3003,1424,1704,1766,1788,1889.76,1959.0
3,3004,1569,1921,2006,2090,2209.62,2308.5
4,3005,1439,1944,2182,,2746.60,2969.5
...,...,...,...,...,...,...,...
695,3967,,,939,1274,1408.00,1609.0
696,3990,,,1075,1562,1756.80,2049.0
697,3062,,,,1187,1187.00,1187.0
698,3336,,,,2304,2304.00,2304.0


In [14]:
# inspect the population table: observed years plus the 2023/2026 forecasts
samp_population

Unnamed: 0,postcode,2006,2011,2016,2021,2023,2026
0,3000,14538,20627,37975,43084,48623.340000,54802.500000
1,3002,4330,4714,4964,4896,5096.120000,5213.000000
2,3003,3037,3744,5515,8025,8259.900000,9264.000000
3,3004,5822,7737,9307,11482,12111.500000,13224.500000
4,3005,590,692,525,,524.333333,504.833333
...,...,...,...,...,...,...,...
695,3967,,,125,108,101.200000,91.000000
696,3990,,,106,118,122.800000,130.000000
697,3062,,,,6,6.000000,6.000000
698,3336,,,,17737,17737.000000,17737.000000


In [15]:
# keep only the postcode and the two forecast columns, then give the
# forecast columns dataset-specific names
samp_population = samp_population[['postcode', 2023, 2026]].rename(
    columns={2023: 'population_2023', 2026: 'population_2026'})
samp_income = samp_income[['postcode', 2023, 2026]].rename(
    columns={2023: 'income_2023', 2026: 'income_2026'})

In [16]:
# plain-text preview of the first five rows of both output tables
print(samp_income.head(5), samp_population.head(5), sep="\n")


   postcode  income_2023  income_2026
0      3000      1299.56       1391.0
1      3002      2536.92       2689.5
2      3003      1889.76       1959.0
3      3004      2209.62       2308.5
4      3005      2746.60       2969.5
   postcode  population_2023  population_2026
0      3000     48623.340000     54802.500000
1      3002      5096.120000      5213.000000
2      3003      8259.900000      9264.000000
3      3004     12111.500000     13224.500000
4      3005       524.333333       504.833333


In [17]:
# write both forecast tables to the curated data directory
income_csv_path = f"{CURATED_DATA_DIR}/income_historical.csv"
population_csv_path = f"{CURATED_DATA_DIR}/population_historical.csv"

samp_income.to_csv(income_csv_path, index=False)
samp_population.to_csv(population_csv_path, index=False)
