### Collecting Melbourne housing prices with BeautifulSoup
> In this tutorial we will scrape housing prices in Melbourne from this website: https://www.domain.com.au/auction-results/melbourne/

#### Required Python packages
> We need urllib to connect to the website, and BeautifulSoup to parse the HTML source.
> After that, the data can be stored in a JSON file.

In [10]:
# Load python packages
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import json

In [26]:
# Define functions for parsing and collecting data

def getListings(url):
    """Download an auction-results page and hand every listing to getListing().

    Each <article class="css-3xqrp1"> element is scanned child by child: the
    <h3> inside a <header> child supplies the suburb name, and every <ul>
    child is passed to getListing() together with the current suburb.

    Args:
        url: Address of the auction-results page to download.

    Returns:
        None in every case. The scan is best effort: an HTTPError while
        opening the URL, or an AttributeError while parsing, ends it early
        and silently.
    """
    try:
        # The context manager closes the HTTP response once the body is read.
        with urlopen(url) as html:
            page = html.read()
    except HTTPError:
        return None
    try:
        # Parsing data
        bs = BeautifulSoup(page, 'html.parser')
        articles = bs.find_all('article', {'class': 'css-3xqrp1'})
        for atc in articles:
            # Reset per article so a <ul> seen before any <header> neither
            # raises UnboundLocalError nor inherits the previous suburb.
            suburb = None
            for c in atc.children:
                if c.name == 'header':
                    suburb = c.h3.text
                elif c.name == 'ul':
                    getListing(c, suburb)
    except AttributeError:
        return None
    
def getListing(tag, suburb=None):
    """Build one listing record from a <ul> tag and append it to `listings`.

    The children of `tag` are read by position: child 0 is the street
    address, child 1 the house type/info, child 2 the sold status/price and
    child 3 the agent. Children 1-3 are only used when they are <li>
    elements; anything missing keeps the placeholder "Unknown".

    Args:
        tag: Element whose children describe a single listing.
        suburb: Suburb name stored with the record (None when not known).

    Returns:
        None. The record is appended to the module-level `listings` list.
        A tag without any children is skipped.
    """
    listing = list(tag.children)
    if not listing:
        # Nothing to record; indexing listing[0] would raise IndexError.
        return
    lagen = htype = hInfo = soldInfo = price = "Unknown"
    ladd = listing[0].text
    # Length checks guard against lists with fewer than four entries.
    if len(listing) > 1 and listing[1].name == 'li':
        htype, hInfo = getHouseInfo(listing[1])
    if len(listing) > 2 and listing[2].name == 'li':
        soldInfo, price = getSoldInfo(listing[2])
    if len(listing) > 3 and listing[3].name == 'li':
        lagen = listing[3].text
    listings.append({'suburb': suburb, 'street': ladd, 'agent': lagen,
                     'type': htype, 'info': hInfo,
                     'sold': soldInfo, 'price': price})
    
def getSoldInfo(tag):
    """Return the (sold status, price) text pair from a listing's sold-info tag.

    Args:
        tag: Element whose first child holds the sold status and whose
            second child, when present, holds the price.

    Returns:
        A 2-tuple of strings. "Unknown" replaces any missing part, including
        both parts when the tag has no children.
    """
    sold = list(tag.children)
    if not sold:
        # An empty element would otherwise raise IndexError on sold[0].
        return "Unknown", "Unknown"
    if len(sold) >= 2:
        return sold[0].text, sold[1].text
    return sold[0].text, "Unknown"

def getHouseInfo(tag):
    """Return the (house type, house info) text pair from a listing's house tag.

    Args:
        tag: Element whose first child holds the property type and whose
            second child, when present, holds the extra info text.

    Returns:
        A 2-tuple of strings. "Unknown" replaces any missing part, including
        both parts when the tag has no children.
    """
    house = list(tag.children)
    if not house:
        # An empty element would otherwise raise IndexError on house[0].
        return "Unknown", "Unknown"
    if len(house) >= 2:
        return house[0].text, house[1].text
    return house[0].text, "Unknown"

In [28]:
# Module-level accumulator: getListing() appends one dict per listing here,
# so it must exist before getListings() is called.
listings = [] # Store all listings
url = "https://www.domain.com.au/auction-results/melbourne/"
# Scrape the page; results land in `listings` (the call itself returns None).
getListings(url)

#### Check results

In [32]:
# Number of listings collected into `listings`
len(listings)

637

In [33]:
# Show listings 2 to 5: the slice [1:5] covers indices 1-4, so the first
# entry (index 0) is skipped
listings[1:5]

[{'suburb': 'Abbotsford',
  'street': '1/47 Nicholson St',
  'agent': 'Biggin & Scott Richmond',
  'type': 'Townhouse',
  'info': '2 beds',
  'sold': 'Sold prior to auction',
  'price': '$1.12m'},
 {'suburb': 'Abbotsford',
  'street': '12 Paterson St',
  'agent': 'Biggin & Scott Richmond',
  'type': 'House',
  'info': '4 beds',
  'sold': 'Sold',
  'price': '$1.886m'},
 {'suburb': 'Abbotsford',
  'street': '4 Turner St',
  'agent': 'Jellis Craig Fitzroy',
  'type': 'House',
  'info': '3 beds',
  'sold': 'Sold',
  'price': 'Price withheld'},
 {'suburb': 'Airport West',
  'street': '2/74 Fraser St',
  'agent': 'Barry Plant Essendon',
  'type': 'Unit',
  'info': '3 beds',
  'sold': 'Sold prior to auction',
  'price': '$671.5k'}]

#### Store results

In [16]:
# Store into a json file for further analysis
import os

output_path = 'data/listings.json'
# open(..., 'w') does not create missing parent directories, so make sure
# the target directory exists first.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(listings, f)