# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
# Parse the sample XML file used by the example cells below; ET.parse returns
# an ElementTree wrapping the whole document.
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# List the name of every entry directly below the root element, one per line.
# Each of those entries is expected to carry a <name> child.
for country in document_tree.getroot():
    name_node = country.find('name')
    print(name_node.text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# Print every country followed by a comma-separated list of its cities.
for country in document_tree.iterfind('country'):
    # iter() walks the whole subtree, so <city> elements nested at any depth
    # below the country are included. It replaces getiterator(), which is
    # deprecated since Python 2.7.
    city_names = [city.find('name').text for city in country.iter('city')]
    # Join the names once instead of growing a string and slicing off the
    # trailing separator; the printed line is identical.
    print('* ' + country.find('name').text + ': ' + ', '.join(city_names))

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [2]:
# Parse the full database file that the exercise questions are answered from.
document = ET.parse( './data/mondial_database.xml' )

### 1. 10 countries with the lowest infant mortality rates

In [6]:
# Root element of the parsed document; the exercise cells below iterate over
# its direct children.
mondial = document.getroot()

In [4]:
# Inspect the tag name of the root element.
mondial.tag

'mondial'

In [5]:
# Inspect the attribute dictionary of the root element.
mondial.attrib

{}

In [49]:
# Collect a (country name, infant mortality rate) tuple for every country that
# reports a rate, then list the ten lowest rates.
elements = []
# Only <country> children of the root are relevant here. findtext() replaces
# the manual scan over getchildren(), which is deprecated since Python 2.7.
for country in mondial.findall('country'):
    # findtext() returns None when the tag is absent and '' when it is empty;
    # in both cases there is no rate to rank, so the country is skipped.
    rate_text = (country.findtext('infant_mortality') or '').strip()
    if not rate_text:
        continue
    # The name is looked up per country, so it can never be carried over from
    # the previous loop iteration.
    elements.append((country.findtext('name'), float(rate_text)))

# Ascending by rate: the lowest mortality comes first.
elements.sort(key=lambda entry: entry[1])
print ("Top Ten with lowest infant mortality")
print ('Country     Rate')
# Slicing copes with fewer than ten entries instead of raising IndexError.
for name, rate in elements[:10]:
    print(name + '    ' + str(rate))

Top Ten with lowest infant mortality
Country     Rate
Monaco    1.81
Japan    2.13
Norway    2.48
Bermuda    2.48
Singapore    2.53
Sweden    2.6
Czech Republic    2.63
Hong Kong    2.73
Macao    3.13
Iceland    3.15


### 2. 10 cities with the largest population

In [65]:
# Rank by the most recent population figure and list the ten largest.
# NOTE(review): the section heading asks for cities, but this cell reads the
# <population> children of each <country> and labels its rows 'Country' --
# confirm which ranking is intended before relying on the result.
elements = []
for country in mondial.findall('country'):
    # Direct <population> children only; each carries a 'year' attribute.
    counts = country.findall('population')
    if not counts:
        # No figure recorded for this country: nothing to rank.
        continue
    # Pick the node with the latest year. max() keeps the first node on ties,
    # which matches a strict '>' scan in document order.
    latest = max(counts, key=lambda node: int(node.attrib['year']))
    # Store the year that belongs to the chosen figure, not the year of
    # whichever <population> node happened to be visited last.
    elements.append((country.findtext('name'),
                     float(latest.text),
                     int(latest.attrib['year'])))

# Descending by population: the largest comes first.
elements.sort(key=lambda entry: entry[1], reverse=True)
print ("Top Ten with largest population")
print ('Country     Population    Year')
# Slicing copes with fewer than ten entries instead of raising IndexError.
for name, population, year in elements[:10]:
    print(name + '    ' + str(population) + '    ' + str(year))

Top Ten with largest population
Country     Population    Year
China    1360720000.0    2013
India    1210854977.0    2011
United States    318857056.0    2014
Indonesia    252124458.0    2014
Brazil    202768562.0    2014
Pakistan    173149306.0    2010
Nigeria    164294516.0    2011
Bangladesh    149772364.0    2011
Russia    143666931.0    2014
Japan    127298000.0    2013


### 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [71]:
# Estimate the head count of every ethnic group in every country from the
# country's latest population figure and the group's percentage, then list the
# ten largest estimates.
# NOTE(review): the section heading asks for totals summed over all countries;
# this cell ranks each (country, group) pair on its own, so a group present in
# several countries appears more than once -- confirm the intended result.
elements = []
for country in mondial.findall('country'):
    # Resolve the latest population BEFORE touching the ethnic groups, so the
    # estimate does not depend on the order of child elements and can never
    # reuse the figure of the previously processed country.
    counts = country.findall('population')
    if not counts:
        # Without a population figure the percentages cannot be scaled.
        continue
    # max() keeps the first node on ties, like a strict '>' scan would.
    latest = max(counts, key=lambda node: int(node.attrib['year']))
    population = int(latest.text)
    # Year of the chosen figure (not of the last <population> node visited).
    year = int(latest.attrib['year'])
    name = country.findtext('name')

    for group in country.findall('ethnicgroup'):
        share = float(group.attrib['percentage'])
        # share is a float, so the division is a true division on Python 2 as
        # well; int() truncates the estimate to whole persons.
        eth_pop = int(population * share / 100)
        elements.append((name, population, year, group.text, eth_pop))

# Descending by estimated group size.
elements.sort(key=lambda entry: entry[4], reverse=True)
print ("Top Ten ethnic groups")
print ('Country     Population    Year    EthnicGroup     EthnicPopulation')
# Slicing copes with fewer than ten entries instead of raising IndexError.
for name, population, year, eth_name, eth_pop in elements[:10]:
    print(name + '    ' + str(population) + '    ' + str(year) + '    ' + str(eth_name) + '    ' + str(eth_pop))

Top Ten ethnic groups
Country     Population    Year    EthnicGroup     EthnicPopulation
China    1360720000    2013    Han Chinese    1245058800
India    1210854977    2011    Indo-Aryan    871815583
India    1210854977    2011    Dravidian    302713744
United States    318857056    2014    European    254958101
Nigeria    164294516    2011    African    162651570
Bangladesh    149772364    2011    Bengali    146776916
Japan    127298000    2013    Japanese    126534212
Russia    143666931    2014    Russian    114646210
Indonesia    252124458    2014    Javanese    113456006
Brazil    202768562    2014    European    108886717


### 4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [97]:
# Longest river: collect a (country codes, river name, length) tuple for each
# top-level <river> element and rank the tuples by length.
elements = []
for river in document.findall('river'):
    # Rivers without a usable <length> are skipped. Reusing a variable set by
    # an earlier iteration would silently assign them the previous river's
    # length (or raise NameError if the very first river lacked one).
    length_text = (river.findtext('length') or '').strip()
    if not length_text:
        continue
    # The 'country' attribute holds the code(s) of the countries involved.
    elements.append((river.attrib['country'],
                     river.findtext('name'),
                     float(length_text)))

# Descending by length: the longest river comes first.
elements.sort(key=lambda entry: entry[2], reverse=True)
print ("Longest River")
print ('Country     River    Length')
# Slicing prints the top entry and copes with an empty list.
for country_codes, river_name, river_length in elements[:1]:
    # The name is already text; wrapping it in str() would fail on Python 2
    # for names containing non-ASCII characters.
    print(country_codes + '    ' + river_name + '    ' + str(river_length))

Longest River
Country     River    Length
CO BR PE    Amazonas    6448.0


In [128]:
# Largest lake: collect a (country codes, lake name, area) tuple for each
# top-level <lake> element and rank the tuples by area.
elements = []
for lake in document.findall('lake'):
    # Lakes without a usable <area> are skipped. Reusing a variable set by an
    # earlier iteration would silently assign them the previous lake's area
    # (or raise NameError if the very first lake lacked one).
    area_text = (lake.findtext('area') or '').strip()
    if not area_text:
        continue
    # The 'country' attribute holds the code(s) of the bordering countries.
    elements.append((lake.attrib['country'],
                     lake.findtext('name'),
                     float(area_text)))

# Descending by area: the largest lake comes first.
elements.sort(key=lambda entry: entry[2], reverse=True)
print ("Largest Lake")
print ('Country     Lake    Area')
# Slicing prints the top entry and copes with an empty list.
for country_codes, lake_name, lake_area in elements[:1]:
    # The name is already text; wrapping it in str() would fail on Python 2
    # for names containing non-ASCII characters.
    print(country_codes + '    ' + lake_name + '    ' + str(lake_area))

Largest Lake
Country     Lake    Area
R AZ KAZ IR TM    Caspian Sea    386400.0


In [127]:
# Highest airport: collect a (country code, airport name, elevation) tuple for
# each top-level <airport> element and rank the tuples by elevation.
elements = []
for airport in document.findall('airport'):
    # An absent or empty <elevation> is recorded as 0, the fallback this cell
    # has always used. The decision is made on the text itself: the previous
    # `node.text > 0` compared a string with an int, which is always True on
    # Python 2 and a TypeError on Python 3, and an absent tag reused the
    # elevation of the previous airport.
    elevation_text = (airport.findtext('elevation') or '').strip()
    airport_elevation = int(elevation_text) if elevation_text else 0
    elements.append((airport.attrib['country'],
                     airport.findtext('name'),
                     airport_elevation))

# Descending by elevation: the highest airport comes first.
elements.sort(key=lambda entry: entry[2], reverse=True)
print ("Highest Airport")
print ('Country     Airport    Elevation')
# Slicing prints the top entry and copes with an empty list.
for country_code, airport_name, airport_elevation in elements[:1]:
    # The name is already text; wrapping it in str() would fail on Python 2
    # for names containing non-ASCII characters.
    print(country_code + '    ' + airport_name + '    ' + str(airport_elevation))

Highest Airport
Country     Airport    Elevation
BOL    El Alto Intl    4063


### Test Area

In [113]:
# Scratch cell: dump every top-level <airport> element together with its
# direct children, to inspect which tags and attributes are available.
# The shared scratch list is reset here, as this cell has always done.
elements = []
for airport in document.findall('airport'):
    # One header line per airport: tag name and attribute dictionary.
    print('%s %s' % (airport.tag, airport.attrib))
    # Iterating an element yields its direct children; this replaces
    # getchildren(), which is deprecated since Python 2.7.
    for node in airport:
        print('- %s - %s - %s' % (node.tag, node.attrib, node.text))

airport {'city': 'cty-Afghanistan-2', 'iatacode': 'HEA', 'country': 'AFG'}
- name - {} - Herat
- latitude - {} - 34.210017
- longitude - {} - 62.2283
- elevation - {} - 977
- gmtOffset - {} - 5
airport {'city': 'cty-Afghanistan-Kabul', 'iatacode': 'KBL', 'country': 'AFG'}
- name - {} - Kabul Intl
- latitude - {} - 34.565853
- longitude - {} - 69.212328
- elevation - {} - 1792
- gmtOffset - {} - 5
airport {'city': 'cty-Albania-Tirane', 'iatacode': 'TIA', 'country': 'AL'}
- name - {} - Tirana Rinas
- latitude - {} - 41.414742
- longitude - {} - 19.720561
- elevation - {} - 38
- gmtOffset - {} - 1
airport {'city': 'cty-Algeria-14', 'iatacode': 'TEE', 'country': 'DZ'}
- name - {} - Cheikh Larbi Tebessi
- latitude - {} - 35.431611
- longitude - {} - 8.120717
- elevation - {} - 811
- gmtOffset - {} - 1
airport {'city': 'cty-Algeria-6', 'iatacode': 'BLJ', 'country': 'DZ'}
- name - {} - Batna Airport
- latitude - {} - 35.752106
- longitude - {} - 6.308589
- elevation - {} - 822
- gmtOffset - {