In [1]:
import pandas as pd
import requests
import re
from scrapy.selector import Selector



# Parsing tables

We can use the pandas `read_html` function to easily parse tables from websites. As an example, we can parse a table of the world's fastest marathon runners:

In [2]:
# Parse every <table> on the page; header=0 uses each table's first row as column names.
fastest_marathon_runners = pd.read_html(
    "https://www.runnersworld.com/races-places/a20823734/these-are-the-worlds-fastest-marathoners-and-marathon-courses/",
    header=0,
)
fastest_marathon_runners[0]

Unnamed: 0,Runner,Finish Time,Pace/Mile,Marathon
0,Kelvin Kiptum (Kenya),2:00:35,4:36.0,"Chicago, 2023"
1,Eliud Kipchoge (Kenya),2:01:09,4:37.2,"Berlin, 2022"
2,Kenenisa Bekele (Ethiopia),2:01:41,4:38.5,"Berlin, 2019"
3,Sisay Lemma (Ethiopia),2:01:48,4:38.7,"Valencia, 2023"
4,Birhanu Legese (Ethiopia),2:02:48,4:41.0,"Berlin, 2019"
5,Mosinet Geremew (Ethiopia),2:02:55,4:41.3,"London, 2019"
6,Dennis Kimetto (Kenya),2:02:57,4:41.4,"Berlin, 2014"
7,Evans Chebet (Kenya),2:03:00,4:41.5,"Valencia, 2020"
8,Gabriel Geay (Tanzania),2:03:00,4:41.5,"Valencia, 2022"
9,Lawrence Cherono (Kenya),2:03:04,4:41.6,"Valencia, 2022"


Keep in mind that the website contains multiple tables; the code above displays only the first one.

# Exercise 1 - Parsing Wikipedia

We are looking for statistical data regarding the Melbourne area. A good first stop is often Wikipedia, which in this case provides https://en.wikipedia.org/wiki/Local_government_areas_of_Victoria#Municipalities_of_Greater_Melbourne . Your task is to parse the first table found on this page, which contains details about each local government area.

In [3]:
# read_html yields one DataFrame per table on the page; keep only the first one.
land_areas = pd.read_html(
    "https://en.wikipedia.org/wiki/Local_government_areas_of_Victoria#Municipalities_of_Greater_Melbourne"
)[0]
land_areas

Unnamed: 0,Local government area,Council seat,Date established,Land area,Population (2021) [1],Pop. density,Councillors (2022),Map,Council / Shire Icon
0,City of Melbourne,Melbourne,12 August 1842,37 km2 (14 sq mi),149615,11897,11,,
1,City of Port Phillip,St Kilda,22 June 1994,21 km2 (8.1 sq mi),101942,13961,9,,
2,City of Stonnington,Malvern,22 June 1994,26 km2 (10 sq mi),104703,11576,9,,
3,City of Yarra,Richmond,22 June 1994,20 km2 (7.7 sq mi),90114,12758,9,,
4,City of Banyule,Greensborough,15 December 1994,63 km2 (24 sq mi),126236,5354,9,,
5,City of Bayside,Sandringham,15 December 1994,37 km2 (14 sq mi),101306,7400,7,,
6,City of Boroondara,Camberwell,22 June 1994,60 km2 (23 sq mi),167900,7826,11,,
7,City of Darebin,Preston,22 June 1994,54 km2 (21 sq mi),148570,7751,9,,
8,City of Glen Eira,Caulfield North,15 December 1994,39 km2 (15 sq mi),148908,10218,9,,
9,City of Hobsons Bay,Altona,22 June 1994,64 km2 (25 sq mi),91322,3904,7,,


If you look at the data, you will notice that the "Land area" column cannot be parsed as a pure number because it also contains the measurement unit. Clean up this data and add a new column to your dataframe containing the square kilometers as a number.

In [4]:
def normalize_string(text):
    """Return ``text`` with every non-breaking space (U+00A0) replaced by a plain space.

    The scraped website uses non-breaking spaces where ordinary whitespace is
    expected, which would otherwise defeat searches for " ".
    """
    nbsp_to_space = {0xA0: " "}
    return text.translate(nbsp_to_space)

def extract_land_area_str(text):
    """Parse the leading number of a land-area string using plain string operations.

    Args:
        text: Land-area cell such as "37 km2 (14 sq mi)". Non-breaking spaces
            are normalised to plain spaces first, and thousands separators
            (",") are dropped before conversion.

    Returns:
        The text in front of the first space, converted to ``float``. When the
        text contains no space at all, the whole text is converted.

    Raises:
        ValueError: if that leading token is not a valid number.
    """
    text = normalize_string(text)
    # partition() hands back the whole string when no space is present. The
    # earlier text[:text.find(" ")] slice turned find()'s -1 "not found" result
    # into text[:-1] and silently chopped off the last digit ("37" -> 3.0).
    number_part = text.partition(" ")[0]
    return float(number_part.replace(",", ""))

def extract_land_area_regex(text):
    """Parse the leading number of a land-area string with a regular expression.

    Args:
        text: Land-area cell such as "37 km2 (14 sq mi)". The number must sit
            at the very start of the string; thousands separators (",") are
            dropped before conversion.

    Returns:
        The leading run of digits, "," and "." converted to ``float``.

    Raises:
        ValueError: if ``text`` does not start with such a run, or the run is
            not a valid number (e.g. a lone ".").
    """
    # Raw string literal: "\d" and "\." inside a normal string are invalid
    # escape sequences and trigger a warning on current Python versions.
    match = re.match(r"([\d,\.]+)", text)
    if match is None:
        # Fail with the same exception type float() uses for bad input instead
        # of an AttributeError on None.
        raise ValueError(f"no leading number found in {text!r}")
    return float(match.group(1).replace(",", ""))

def extract_land_area_split(text):
    """Parse the number that precedes the first "km" in a land-area string.

    Thousands separators (",") are removed before the conversion to ``float``.
    If "km" does not occur, the whole text is converted.
    """
    before_unit, _, _ = text.partition("km")
    without_separators = before_unit.replace(",", "")
    return float(without_separators)



# Sanity-check the parser on the first row before applying it to the whole column.
example = land_areas["Land area"].iat[0]
print(example, extract_land_area_split(example), sep="\n")

# Add the numeric land area (square kilometres) as a new column.
land_areas["square_km"] = land_areas["Land area"].map(extract_land_area_split)
land_areas

37 km2 (14 sq mi)
37.0


Unnamed: 0,Local government area,Council seat,Date established,Land area,Population (2021) [1],Pop. density,Councillors (2022),Map,Council / Shire Icon,square_km
0,City of Melbourne,Melbourne,12 August 1842,37 km2 (14 sq mi),149615,11897,11,,,37.0
1,City of Port Phillip,St Kilda,22 June 1994,21 km2 (8.1 sq mi),101942,13961,9,,,21.0
2,City of Stonnington,Malvern,22 June 1994,26 km2 (10 sq mi),104703,11576,9,,,26.0
3,City of Yarra,Richmond,22 June 1994,20 km2 (7.7 sq mi),90114,12758,9,,,20.0
4,City of Banyule,Greensborough,15 December 1994,63 km2 (24 sq mi),126236,5354,9,,,63.0
5,City of Bayside,Sandringham,15 December 1994,37 km2 (14 sq mi),101306,7400,7,,,37.0
6,City of Boroondara,Camberwell,22 June 1994,60 km2 (23 sq mi),167900,7826,11,,,60.0
7,City of Darebin,Preston,22 June 1994,54 km2 (21 sq mi),148570,7751,9,,,54.0
8,City of Glen Eira,Caulfield North,15 December 1994,39 km2 (15 sq mi),148908,10218,9,,,39.0
9,City of Hobsons Bay,Altona,22 June 1994,64 km2 (25 sq mi),91322,3904,7,,,64.0


# Exercise 2 - Parsing citypopulation.de 
We have found another web site containing more detailed information. To expand on our Wikipedia data, we want to parse this second data source too. This is slightly more complicated, as we need to follow all the detail links on http://www.citypopulation.de/en/australia/melbourne/ to find the actual data. To get you started, this first part has already been implemented:

In [5]:
# Load the overview page. The timeout keeps the notebook from hanging on a
# stalled connection, and raise_for_status() surfaces HTTP errors here rather
# than as confusing parsing failures further down.
melbourne_response = requests.get(
    "http://www.citypopulation.de/en/australia/melbourne/", timeout=30)
melbourne_response.raise_for_status()
melbourne_page = melbourne_response.text
melbourne_document = Selector(text=melbourne_page)

# Missing data/special cases - excluded to simplify processing.
# Filtering against a set (instead of list.remove) does not fail if the site
# stops listing one of these entries.
excluded_links = {
    "/en/australia/melbourne/208031192__moorabbin_airport/",
    "/en/australia/melbourne/206041507__royal_botanic_gardens_vic/",
    "/en/australia/melbourne/206041127__west_melbourne_industria/",
}

# Extract the links of interest: site-relative hrefs found in table body cells.
links = [
    href
    for href in melbourne_document.css("tbody tr td a::attr(href)").getall()
    if href.startswith("/en") and href not in excluded_links
]

links

['/en/australia/melbourne/206071139__abbotsford/',
 '/en/australia/melbourne/210011226__airport_west/',
 '/en/australia/melbourne/206051128__albert_park/',
 '/en/australia/melbourne/206021110__alphington_fairfield/',
 '/en/australia/melbourne/213021341__altona/',
 '/en/australia/melbourne/213021342__altona_meadows/',
 '/en/australia/melbourne/213021343__altona_north/',
 '/en/australia/melbourne/213011328__ardeer_albion/',
 '/en/australia/melbourne/206061135__armadale/',
 '/en/australia/melbourne/206031113__ascot_vale/',
 '/en/australia/melbourne/207011146__ashburton/',
 '/en/australia/melbourne/212051319__ashwood_chadstone/',
 '/en/australia/melbourne/208031183__aspendale_gardens_water/',
 '/en/australia/melbourne/210011533__avondale_heights/',
 '/en/australia/melbourne/207011147__balwyn/',
 '/en/australia/melbourne/207011148__balwyn_north/',
 '/en/australia/melbourne/211011251__bayswater/',
 '/en/australia/melbourne/211031263__bayswater_north/',
 '/en/australia/melbourne/212011546__be

The next function should scrape the data for a single location. Your task is to implement the todos in `get_detail_data` so that the function returns the parsed data. You should at least return the counts for male / female inhabitants. You can extend this and e.g. also return the number of inhabitants per age group.

In [6]:
# This function loads the data of interest from the given link
def get_detail_data(link):
    """Scrape the age-group and gender counts for a single location.

    Args:
        link: Site-relative path of a detail page, e.g.
            "/en/australia/melbourne/206071139__abbotsford/".

    Returns:
        A dict that maps every label in the first column of the page's age
        table to the value in its second column, followed by the keys
        "link" (the argument, unchanged), "males" and "females".

    Raises:
        IndexError: if the page yields fewer than four tables, or the gender
            table has no "Males"/"Females" row.
    """
    url = "http://www.citypopulation.de" + link
    tables = pd.read_html(url)

    # NOTE: the tables are selected purely by their position on the page
    # (index 1 = gender, index 3 = age groups); a layout change on the site
    # will require updating these indices.
    gender_table = tables[1]
    age_table = tables[3]

    # First column holds the age-group label, second column the count.
    result = dict(zip(age_table.iloc[:, 0], age_table.iloc[:, 1]))

    gender_labels = gender_table.iloc[:, 0]
    males = gender_table[gender_labels == "Males"].iloc[0, 1]
    females = gender_table[gender_labels == "Females"].iloc[0, 1]

    result['link'] = link
    result['males'] = males
    result['females'] = females
    return result


# Smoke test: fetch and parse the detail page behind the first extracted link.
print(get_detail_data(links[0]))

{'90+ years': 39, '80-89 years': 160, '70-79 years': 392, '60-69 years': 597, '50-59 years': 776, '40-49 years': 1152, '30-39 years': 2788, '20-29 years': 2272, '10-19 years': 368, '0-9 years': 560, 'link': '/en/australia/melbourne/206071139__abbotsford/', 'males': 4637, 'females': 4457}


In [7]:
# Limit to the first 15 entries for testing purposes
# (each entry triggers one page download in get_detail_data).
df = pd.DataFrame(list(map(get_detail_data, links[:15])))
df

Unnamed: 0,90+ years,80-89 years,70-79 years,60-69 years,50-59 years,40-49 years,30-39 years,20-29 years,10-19 years,0-9 years,link,males,females
0,39,160,392,597,776,1152,2788,2272,368,560,/en/australia/melbourne/206071139__abbotsford/,4637,4457
1,86,442,685,803,985,1079,1409,973,719,988,/en/australia/melbourne/210011226__airport_west/,3979,4191
2,105,569,1485,1722,2249,2259,2653,2020,1388,1388,/en/australia/melbourne/206051128__albert_park/,7665,8159
3,56,251,513,896,1282,1237,1577,1297,931,939,/en/australia/melbourne/206021110__alphington_...,4257,4735
4,137,632,1182,1500,1791,2079,2070,1194,1295,1662,/en/australia/melbourne/213021341__altona/,6470,7084
5,146,626,1483,2357,2594,2410,2762,2198,1795,2116,/en/australia/melbourne/213021342__altona_mead...,8939,9535
6,206,923,1062,1134,1484,1961,2715,1949,1379,2135,/en/australia/melbourne/213021343__altona_north/,7293,7638
7,84,248,512,672,721,1025,1549,1254,567,876,/en/australia/melbourne/213011328__ardeer_albion/,3933,3569
8,89,392,886,844,1066,1177,1696,1661,713,814,/en/australia/melbourne/206061135__armadale/,4253,5087
9,83,364,801,1493,1884,2278,2738,2201,1456,1884,/en/australia/melbourne/206031113__ascot_vale/,7466,7728
