This Python code scrapes the suburb index from the Australia Post website (https://auspost.com.au/postcode/suburb-index/).

It works through the letters of the alphabet in a random order and waits for a short random
delay between letters, so that the requests look less like an automated crawl.

In [9]:
from requests import Session
import re
import numpy
import time
import sys
import csv
import datetime
import random
import string

def get_letter(letter, follow_subsequent=True):
    """Scrape the suburb names listed on one page of the suburb index.

    Args:
        letter: Page identifier appended to the suburb-index URL, e.g. "t"
            for the first page of a letter or "t2" for a continuation page.
        follow_subsequent: When True, every continuation page linked from
            this page is fetched as well. Those fetches pass
            follow_subsequent=False, so the recursion is one level deep.

    Returns:
        A list of the link texts captured from this page, in page order,
        followed by the results of any continuation pages that were followed.
    """
    print("doing get_letter(\"{}\").".format(letter))
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Safari/605.1.15",
        "Origin": "https://auspost.com.au/postcode",
        "Referer": "https://auspost.com.au/postcode",
        "Accept-Language": "en-au",
        "Host": "auspost.com.au",
        "Accept-Encoding": "gzip, deflate",
        "X-Requested-With": "XMLHttpRequest",
        "Accept": "*/*",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Connection": "keep-alive",
    }
    target_url = "https://auspost.com.au/postcode/suburb-index/{}".format(letter)

    # The context manager closes the session (and its pooled connections)
    # once the page has been downloaded, instead of leaking one per call.
    with Session() as s:
        response = s.get(target_url, headers=headers)
        # Split once up front; re-splitting the whole body on every loop
        # iteration made the scan quadratic in the number of lines.
        page_lines = response.text.splitlines()

    # Compile both patterns once per page rather than once per line.
    suburb_pattern = re.compile(
        r'.*class=\"pol-suburb-index-link js-pol-suburb-index-link\">(.+)</a>',
        flags=re.IGNORECASE,
    )
    # Escape the identifier so it is always matched literally in the pattern.
    subsequent_page_pattern = re.compile(
        r'.*<a href=\"/postcode/suburb-index/({}\d)\">\d</a>'.format(re.escape(letter)),
        flags=re.IGNORECASE,
    )

    result_list = []
    subsequent_page_list = []
    for line in page_lines:
        suburb_match = suburb_pattern.match(line)
        if suburb_match:
            result_list.append(suburb_match.group(1))
        page_match = subsequent_page_pattern.match(line)
        if page_match:
            print("Found subsequent page: {}".format(page_match.group(1)))
            subsequent_page_list.append(page_match.group(1))

    if follow_subsequent:
        for subsequent_page in subsequent_page_list:
            result_list.extend(get_letter(subsequent_page, False))
    return result_list

def scrape_AP_suburb_list():
    """Scrape every letter of the suburb index and return the names sorted.

    The 26 lowercase letters are visited in a shuffled order. Before every
    letter except the first, the function sleeps for a random delay drawn
    from an exponential distribution (scale 10, multiplied by 1/60).

    Returns:
        A sorted list combining the results of get_letter() for each letter.
    """
    letters = list(string.ascii_lowercase)
    random.shuffle(letters)
    total = len(letters)

    suburbs = []
    for position, letter in enumerate(letters, start=1):
        print("Doing letter {} of {}.".format(position, total))
        if position > 1:
            # No pause before the very first request.
            pause = 1/60 * numpy.random.exponential(10, 1)[0]
            print("Delaying {} s.".format(pause))
            sys.stdout.flush()
            time.sleep(pause)
        suburbs += get_letter(letter)

    suburbs.sort()
    return suburbs

def write_result_list_to_file(AP_suburb_list, filename='AP_suburb_list.csv'):
    """Write the suburb names to a one-column CSV file with a 'name' header.

    Args:
        AP_suburb_list: Iterable of suburb names; each becomes one row.
        filename: Path of the CSV file to create or overwrite. Defaults to
            'AP_suburb_list.csv' in the current working directory.
    """
    # newline='' lets the csv module control line endings itself; without it
    # rows are written with an extra '\r' on platforms that translate '\n'.
    # The explicit encoding keeps the output independent of the locale.
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['name'])
        csv_out.writerows([item] for item in AP_suburb_list)

In [10]:
# Run the full scrape: visits all 26 letter pages (plus any continuation
# pages) over the network and keeps the sorted result for the cells below.
AP_suburb_list = scrape_AP_suburb_list()

Doing letter 1 of 26.
doing get_letter("t").
Found subsequent page: t2
Found subsequent page: t3
Found subsequent page: t4
Found subsequent page: t5
doing get_letter("t2").
doing get_letter("t3").
doing get_letter("t4").
doing get_letter("t5").
Doing letter 2 of 26.
Delaying 0.08513744738802942 s.
doing get_letter("k").
Found subsequent page: k2
Found subsequent page: k3
Found subsequent page: k4
Found subsequent page: k5
doing get_letter("k2").
doing get_letter("k3").
doing get_letter("k4").
doing get_letter("k5").
Doing letter 3 of 26.
Delaying 0.13581963854453855 s.
doing get_letter("v").
Doing letter 4 of 26.
Delaying 0.5939432006690641 s.
doing get_letter("c").
Found subsequent page: c2
Found subsequent page: c3
Found subsequent page: c4
Found subsequent page: c5
doing get_letter("c2").
doing get_letter("c3").
doing get_letter("c4").
doing get_letter("c5").
Doing letter 5 of 26.
Delaying 0.1811288274418808 s.
doing get_letter("i").
Doing letter 6 of 26.
Delaying 0.1947868686227867

In [12]:
len(AP_suburb_list)

11721

In [13]:
AP_suburb_list

['Aarons Pass',
 'Abba River',
 'Abbey',
 'Abbeyard',
 'Abbeywood',
 'Abbotsbury',
 'Abbotsford',
 'Abbotsham',
 'Abeckett Street',
 'Abels Bay',
 'Abercorn',
 'Abercrombie',
 'Abercrombie River',
 'Aberdare',
 'Aberdeen',
 'Aberfeldie',
 'Aberfeldy',
 'Aberfoyle',
 'Aberfoyle Park',
 'Aberglasslyn',
 'Abergowrie',
 'Abermain',
 'Abernethy',
 'Abingdon Downs',
 'Abington',
 'Abminga Station',
 'Acacia Creek',
 'Acacia Gardens',
 'Acacia Hills',
 'Acacia Ridge',
 'Acheron',
 'Acland',
 'Acton',
 'Acton Park',
 'Ada',
 'Adaminaby',
 'Adams Estate',
 'Adamstown',
 'Adamstown Heights',
 'Adamsvale',
 'Adare',
 'Adavale',
 'Addington',
 'Adelaide',
 'Adelaide',
 'Adelaide',
 'Adelaide',
 'Adelaide Airport',
 'Adelaide Bc',
 'Adelaide Lead',
 'Adelaide Park',
 'Adelaide River',
 'Adelong',
 'Adjungbilly',
 'Advancetown',
 'Adventure Bay',
 'Aeroglen',
 'Afterlee',
 'Agery',
 'Agnes',
 'Agnes Banks',
 'Agnes Water',
 'Ainslie',
 'Aintree',
 'Airdmillan',
 'Airds',
 'Aire Valley',
 'Aireys Inl