# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
from xml.etree import ElementTree as ET

In [7]:
document = ET.parse( './data/mondial_database.xml' )

In [8]:
root = document.getroot()

In [93]:
# 10 countries with the lowest infant mortality rates
d = {}
for country in root.findall('country'):
    rate = country.find('infant_mortality')
    name = country.find('name')
    if (rate and name) is not None:
        d[name.text] = float(rate.text)
dd = sorted(d.items(), key=lambda x: x[1])
print(dd[:10])

[('Monaco', 1.81), ('Japan', 2.13), ('Norway', 2.48), ('Bermuda', 2.48), ('Singapore', 2.53), ('Sweden', 2.6), ('Czech Republic', 2.63), ('Hong Kong', 2.73), ('Macao', 3.13), ('Iceland', 3.15)]


In [146]:
# 10 cities with the largest population
d2 = {}
for country in root.findall('country'):
    for city in country.findall('city'):
        p = city.findall('population')
        if p:
            pop = p[-1].text
            name = city.find('name')
            d2[name.text] = int(pop)
dd2 = sorted(d2.items(), key=lambda x: x[1], reverse=True)
print(dd2[:10])

[('Seoul', 9708483), ('Al Qahirah', 8471859), ('Bangkok', 7506700), ('Hong Kong', 7055071), ('Ho Chi Minh', 5968384), ('Singapore', 5076700), ('Al Iskandariyah', 4123869), ('New Taipei', 3939305), ('Busan', 3403135), ('Pyongyang', 3255288)]


In [147]:
# 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
d3 = {}
for country in root.findall('country'):
    ethnic = country.find('ethnicgroup')
    if ethnic is not None:
        p = country.findall('population')
        if p:
            pop = p[-1].text
            if d3.get(ethnic.text) is None:
                d3[ethnic.text] = int(pop)
            else:
                d3[ethnic.text] += int(pop)
dd3 = sorted(d3.items(), key=lambda x: x[1], reverse=True)
print(dd3[:10])

[('Han Chinese', 1360720000), ('Dravidian', 1210854977), ('European', 816382124), ('Javanese', 252124458), ('Mestizo', 225127188), ('African', 218663211), ('Chinese', 206681204), ('Arab', 203940018), ('Bengali', 149772364), ('Russian', 143666931)]


In [185]:
# name and country of a) longest river, b) largest lake and c) airport at highest elevation

# part a
len_max = 0
for river in root.findall('river'):
    if river.find('length') is not None:
        #print(airport.find('elevation').text)
        if float(river.find('length').text) > len_max:
            len_max = float(river.find('length').text)
            river_name = river.find('name').text
            if river.find('located') is not None:
                country_name = river.find('located').get('country')
            else:
                country_name = None
if country_name is not None:
    print('a) Longest rivern:', river_name,',', country_name,',', len_max)

# part b
a_max = 0
for lake in root.findall('lake'):
    if lake.find('area') is not None:
        if float(lake.find('area').text) > a_max:
            a_max = float(lake.find('area').text)
            lake_name = lake.find('name').text
            country_name2 = lake.get('country')
print('b) Largest lake:', lake_name,',',country_name2,',', a_max)

# part c
ele_max = 0
for airport in root.findall('airport'):
    if airport.find('elevation').text is not None:
        if int(airport.find('elevation').text) > ele_max:
            ele_max = int(airport.find('elevation').text)
            airport_name = airport.find('name').text
            country_name3 = airport.get('country')
print('c) Airport at highest elevation:', airport_name,',',country_name3,',', ele_max)

a) Longest rivern: Amazonas , CO , 6448.0
b) Largest lake: Caspian Sea , R AZ KAZ IR TM , 386400.0
c) Airport at highest elevation: El Alto Intl , BOL , 4063
