# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [3]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [4]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

<xml.etree.ElementTree.ElementTree at 0x14f69595748>

In [9]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [14]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print(capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [4]:
# 10 countries with the lowest infant mortality rates

import operator

document = ET.parse( './data/mondial_database.xml' )
d = {}
for element in document.iterfind('country'):
    if element.find('infant_mortality') is None:
        pass
    else:
        country = element.find('name').text
        im = element.find('infant_mortality').text
        d[country] = float(im)
sorted_d = sorted(d.items(), key=operator.itemgetter(1))
sorted_d[:10]

[('Monaco', 1.81),
 ('Japan', 2.13),
 ('Bermuda', 2.48),
 ('Norway', 2.48),
 ('Singapore', 2.53),
 ('Sweden', 2.6),
 ('Czech Republic', 2.63),
 ('Hong Kong', 2.73),
 ('Macao', 3.13),
 ('Iceland', 3.15)]

In [89]:
# 10 cities with the largest population

largePop = {}
for element in document.iter('city'):
    if element.find('population[@year="2011"]') is None:
        pass
    else:
        pop = element.find('population[@year="2011"]').text
        city = element.find('name').text
        largePop[city] = int(pop)
sorted_largePop = sorted(largePop.items(), key=operator.itemgetter(1))
sorted_largePop[-10:]

[('Sydney', 4605992),
 ('Chennai', 4646732),
 ('Luanda', 5000000),
 ('Ahmadabad', 5577940),
 ('Hyderabad', 6731790),
 ('Dhaka', 7423137),
 ('Tehran', 8154051),
 ('Bangalore', 8443675),
 ('Delhi', 11034555),
 ('Mumbai', 12442373)]

In [51]:
# 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
eg = {}
for element in document.iter('country'):
    if element.find('ethnicgroup') is None:
        pass
    else:
        egName = element.find('ethnicgroup[1]').text
        egPercentage = element.find('ethnicgroup[1]').get('percentage')
        pop = element.find('population[last()]').text
        egPop = float(egPercentage) * float(pop)
        eg[egName] = int(egPop)
sorted_eg = sorted(eg.items(), key=operator.itemgetter(1))
sorted_eg[-10:]

[('Mediterranean Nordic', 4681591600),
 ('English', 5359232674),
 ('Turkish', 6393539043),
 ('Viet/Kinh', 7607837530),
 ('Javanese', 11345600610),
 ('Russian', 11464621093),
 ('Japanese', 12653421200),
 ('Bengali', 14677691672),
 ('Dravidian', 30271374425),
 ('Han Chinese', 124505880000)]

In [58]:
# Name and country of a) longest river, b) largest lake and c) airport at highest elevation
longest = 0
longest_river_name = ''
country = ''
for element in document.iter('river'):
    if element.find('length') is None:
        pass
    else:
        length = float(element.find('length').text)
        if length > longest:
            longest = length
            longest_river_name = element.find('name').text
            country = element.get('country')
print(longest_river_name + ' ' + str(longest) + ' ' + country)

Amazonas 6448.0 CO BR PE


In [62]:
# Largest Lake

largest = 0
largest_lake_name = ' '
country = ' '

for element in document.iter('lake'):
    if element.find('area') is None:
        pass
    else:
        area = float(element.find('area').text)
        if area > largest:
            largest = area
            largest_lake_name = element.find('name').text
            country = element.get('country')
print(largest_lake_name + ' ' + str(largest) + ' ' + country)            

Caspian Sea 386400.0 R AZ KAZ IR TM


In [70]:
# airport at highest elevation

highest = 0
highest_airport_name = ''
country = ''

for element in document.iter('airport'):
    if element.find('elevation').text is None:
        pass
    else:
        elevation = float(element.find('elevation').text)
        if elevation > highest:
            highest = elevation
            highest_airport_name = element.find('name').text
            country = element.get('country')
print(highest_airport_name + ' ' + str(highest) + ' ' + country)            

El Alto Intl 4063.0 BOL
