# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
# Load XML document
document = ET.parse( './data/mondial_database.xml' )

# Import pandas and numpy
import pandas as pd
import numpy as np

## 1.) 10 countries with the lowest infant mortality rates

In [12]:
countryList = [country.find('name').text for country in document.iterfind('country')]
countryMortalityRate = pd.Series(np.nan,index=countryList)
for country in document.iterfind('country'):
    countryName = country.find('name').text;
    if ET.iselement(country.find('infant_mortality')):
        countryMortalityRate[countryName] = country.find('infant_mortality').text
print "Top 10 countries with lowest infant mortality: \n{}".format(countryMortalityRate.sort_values(ascending = True)[:10])

Top 10 countries with lowest infant mortality: 
Monaco            1.81
Japan             2.13
Bermuda           2.48
Norway            2.48
Singapore         2.53
Sweden            2.60
Czech Republic    2.63
Hong Kong         2.73
Macao             3.13
Iceland           3.15
dtype: float64


## 2.) 10 cities with the largest population (NOTE: Census data is preferred over current)

In [204]:
cityList = [city.find('name').text for country in document.iterfind('country') for city in country.getiterator('city')]
cityPopulation = pd.Series(np.nan,index=cityList)
for country in document.iterfind('country'):
    for city in country.getiterator('city'):
        cityName = city.find('name').text;
        
        # Check if population data is available
        if ET.iselement(city.find('population')):
            # Determine the latest population figures. pick the most current census. estimates are ranked the least reliable
            populationMeasurements = city.findall('population');
            populationData         = np.zeros([len(populationMeasurements),3])
            for (index,population) in enumerate(populationMeasurements):
                populationData[index,0] = int(population.text)
                populationData[index,1] = int(population.attrib['year'])
                if population.attrib.has_key('measured'):
                    populationData[index,2] = population.attrib['measured']=='census';
            populationData = populationData[np.argsort(populationData[:,1])[::-1],:]; # sort first by population year (in descending order)
            populationData = populationData[np.argsort(populationData[:,2]),:]; # then sort by census (all other values are less reliable)
            cityPopulation[cityName] = populationData[0,0];
print "Top 10 cities with the largest population: \n{}".format(cityPopulation.sort_values(ascending = False)[:10])

Top 10 cities with the largest population: 
Istanbul            13710512.0
Mumbai              12442373.0
Moskva              11979529.0
Delhi               11034555.0
Shenzhen            10358381.0
Seoul               10229262.0
Karachi              9863000.0
São Paulo            9811776.0
Tokyo                8591695.0
Ciudad de México     8555272.0
dtype: float64


## 3.) 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [205]:
ethnicGroups = set([group.text for country in document.iterfind('country') for group in country.findall('ethnicgroup')])
ethnicPopulationData = pd.Series(0.0,index=ethnicGroups)
for country in document.iterfind('country'):
    # Determine the latest population figures. pick a census if possible
    populationMeasurements = country.findall('population');
    populationData         = np.zeros([len(populationMeasurements),3])
    for (index,population) in enumerate(populationMeasurements):
        populationData[index,0] = int(population.text)
        populationData[index,1] = int(population.attrib['year'])
        if population.attrib.has_key('measured'):
            populationData[index,2] = population.attrib['measured']=='census';
    populationData = populationData[np.argsort(populationData[:,1])[::-1],:]; # sort first by population year (in descending order)
    populationData = populationData[np.argsort(populationData[:,2]),:]; # then sort by census (all other values are less reliable)
    populationValue = float(populationData[0,0]);
    
    # Now update ethnic group data by weighting the population value for the country by ethnic weighting
    for group in country.findall('ethnicgroup'):
        ethnicPopulationData[group.text] += float(group.attrib['percentage'])*populationValue;
print "Top 10 ethnic groups by largest population: \n{}".format(ethnicPopulationData.sort_values(ascending = False)[:10])

Top 10 ethnic groups by largest population: 
Han Chinese    1.245059e+11
Indo-Aryan     8.718156e+10
European       4.900780e+10
Dravidian      3.027137e+10
African        2.978845e+10
Bengali        1.467769e+10
Mestizo        1.394420e+10
Russian        1.318714e+10
Japanese       1.265342e+10
Javanese       1.135228e+10
dtype: float64


## 4.) name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [323]:
#a.) Longest river
longestRiverData= {'name':'none', 'country':'none', 'riverLength':0.0}; # name, country, river length
for river in document.findall('river'):
    if(ET.iselement(river.find('length'))):
        riverLength = float(river.find('length').text)
        if(riverLength > longestRiverData['riverLength']):
            longestRiverData['name']      = river.find('name').text;
            longestRiverData['country']   = river.attrib['country'];
            longestRiverData['riverLength'] = riverLength;
print "Longest river: {} ({} @ {} km)".format(longestRiverData['name'],longestRiverData['country'],longestRiverData['riverLength'])

#b.) Largest lake
largestLakeData= {'name':'none', 'country':'none', 'lakeArea':0.0}; # name, country, lakeArea
for lake in document.findall('lake'):
    if(ET.iselement(lake.find('area'))):
        lakeArea = float(lake.find('area').text)
        if(lakeArea > largestLakeData['lakeArea']):
            largestLakeData['name']     = lake.find('name').text;
            largestLakeData['country']  = lake.attrib['country'];
            largestLakeData['lakeArea'] = lakeArea;
print "Largest lake: {} ({} @ {} in sq. km)".format(largestLakeData['name'],largestLakeData['country'],largestLakeData['lakeArea'])

# c.) Airport at highest elevation
highestAirportData = {'name':'none', 'country':'none', 'elevation':0.0}; # name, country, elevation
for airport in document.findall('airport'):
    elevation = airport.find('elevation').text;
    if(elevation):
        elevation = float(elevation)
        if(elevation > highestAirportData['elevation']):
            highestAirportData['name']      = airport.attrib['country'];
            highestAirportData['country']   = airport.find('name').text;
            highestAirportData['elevation'] = elevation;
print "Airport at highest elevation: {} ({} @ {} meters)".format(highestAirportData['name'],highestAirportData['country'],highestAirportData['elevation'])

Longest river: Amazonas (CO BR PE @ 6448.0 km)
Largest lake: Caspian Sea (R AZ KAZ IR TM @ 386400.0 in sq. km)
Airport at highest elevation: BOL (El Alto Intl @ 4063.0 meters)
