# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
# Load the XML file used by the example cells below into an ElementTree.
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
# Each direct child of the root is looked up for its <name> sub-element.
for child in document_tree.getroot():
    # Call form of print with a single argument: identical output on
    # Python 2 and valid syntax on Python 3 (the bare statement is not).
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [12]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    # Element.iter() walks the whole subtree of the country, so cities nested
    # at any depth are found. It replaces getiterator(), which is deprecated
    # and no longer exists on Python 3.9+.
    city_names = [city.find('name').text for city in element.iter('city')]
    # Build the line in one go: join() avoids the trailing ', ' that had to be
    # sliced off, and a single print call works on both Python 2 and 3.
    print('* ' + element.find('name').text + ': ' + ', '.join(city_names))

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [20]:
# Parse the exercise data set; `document` is the tree queried by the Q1-Q4 cells.
# NOTE(review): the exercise text refers to 'data/mondial_database.xml', while
# this opens the file from the current working directory - confirm the path.
document = ET.parse( 'mondial_database.xml' )

In [21]:
# Parse the same XML file again and keep a handle on its root element.
# NOTE(review): the file is already loaded as `document` in the previous cell,
# and the cells shown in this notebook query `document`, not `tree`/`root` -
# confirm whether this second parse is still needed.
import xml.etree.ElementTree as ET
tree = ET.parse('mondial_database.xml')
root = tree.getroot()

In [22]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

# Q1: Find the top 10 countries with the lowest infant mortality rates

In [23]:
# Map country name -> infant mortality rate, skipping countries that have no
# <infant_mortality> element, then show the ten lowest rates.
country_mortality_dict = {}
for country_node in document.iterfind('country'):
    rate_node = country_node.find('infant_mortality')
    if rate_node is None:
        continue
    country_name = country_node.find('name').text
    country_mortality_dict[country_name] = float(rate_node.text)

rate_pairs = country_mortality_dict.items()
df = pd.DataFrame(rate_pairs, columns=['name', 'infant_mortality'])
# Ascending sort puts the lowest rates first.
df.sort_values(by='infant_mortality').head(10)

Unnamed: 0,name,infant_mortality
35,Monaco,1.81
210,Japan,2.13
72,Norway,2.48
65,Bermuda,2.48
77,Singapore,2.53
107,Sweden,2.6
56,Czech Republic,2.63
144,Hong Kong,2.73
53,Macao,3.13
188,Iceland,3.15


# Q2:  Find the 10 cities with the largest population:

In [46]:
# Collect one [id, name, population] row for every <city> element anywhere in
# the document, then show the ten rows with the largest population value.
# The accumulator is deliberately not called `list`: that name would shadow the
# builtin for every later cell in the session.
city_rows = []
for city in document.findall('.//city'):
    name_node = city.find('name')
    # A missing <name> or <population> is recorded as NaN rather than skipped.
    name = np.nan if name_node is None else name_node.text
    # NOTE(review): find() returns the first <population> child only; if a city
    # carries several population entries this may not be the most recent one,
    # and the 'population_2011' column label is not checked against the data -
    # confirm against the XML.
    population_node = city.find('population')
    population = np.nan if population_node is None else int(population_node.text)
    city_rows.append([city.attrib['id'], name, population])
df = pd.DataFrame(city_rows, columns=['id', 'name', 'population_2011'])
df.sort_values('population_2011', ascending=False).head(10)

Unnamed: 0,id,name,population_2011
1928,cty-South-Korea-2,Seoul,10229262.0
1527,cty-India-2,Mumbai,9925891.0
2810,cty-Brazil-Sao-Paulo,São Paulo,9412894.0
1757,cty-RI-11,Jakarta,8259266.0
1341,cty-China-Shanghai,Shanghai,8205598.0
2109,cty-Mexico-Mexico-City,Ciudad de México,8092449.0
479,cty-Russia-Moscow,Moskva,8010954.0
1876,cty-Japan-Tokyo,Tokyo,7843000.0
1340,cty-China-3,Beijing,7362426.0
1582,cty-India-New-Delhi,Delhi,7206704.0


# Q3:  Find the 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [31]:
# For every country, scale each ethnic group's percentage by the country's most
# recent population figure and accumulate the result per group name.
ethnicgroups = dict()
for country in document.iterfind('country'):
    # (year, count) pairs: tuple comparison makes max() pick the latest year.
    populations = [
        (int(population.get('year')), int(population.text))
        for population in country.findall('population')
    ]
    if not populations:
        # max() raises ValueError on an empty list, and without a population
        # figure there is nothing to scale the percentages by.
        continue
    # Despite the name, this is the population of the latest year, not the
    # numerically largest one.
    max_population = max(populations)[1]
    for ethnicgroup in country.findall('ethnicgroup'):
        population_ethnic_group = float(ethnicgroup.get('percentage')) * max_population / 100
        ethnicgroups[ethnicgroup.text] = ethnicgroups.get(ethnicgroup.text, 0) + population_ethnic_group
df = pd.DataFrame.from_dict(ethnicgroups, orient='index')
# Session-wide display setting: thousands separators, two decimals.
pd.options.display.float_format = '{:20,.2f}'.format
sorted_df = df.sort_values([0], ascending=False).head(10)
print(sorted_df)

                               0
Han Chinese     1,245,058,800.00
Indo-Aryan        871,815,583.44
European          494,872,219.72
African           318,325,120.37
Dravidian         302,713,744.25
Mestizo           157,734,354.94
Bengali           146,776,916.72
Russian           131,856,996.08
Japanese          126,534,212.00
Malay             121,993,550.37


# Q4:  Find the name and country of a) longest river, b) largest lake and c) airport at highest elevation

# a) Longest River:


In [33]:
# Lookup table from a country's `car_code` attribute to its name; the
# `country` attribute of rivers holds whitespace-separated codes of this kind.
code_to_country_dict = {}
for country in document.iterfind('country'):
    code_to_country_dict[country.get('car_code')] = country.find('name').text

# One row per (river, country) pair: a river crossing several countries
# appears once for each of them with the same name and length.
rivers = []
for river in document.iterfind('river'):
    # Name and length do not depend on the country, so read them once per river.
    river_name = river.find('name').text
    length = river.find('length')
    if length is None or length.text is None:
        # Missing or empty <length>: keep the river, with NaN as its length.
        length = np.nan
    else:
        length = float(length.text)
    for country in river.get('country').split():
        rivers.append([river_name, length, code_to_country_dict[country]])
river_df = pd.DataFrame(rivers, columns=['name', 'length', 'country'])

In [35]:
# Preview the first rows of the river table.
river_df.head()

Unnamed: 0,name,length,country
0,Thjorsa,230.0,Iceland
1,Joekulsa a Fjoellum,206.0,Iceland
2,Glomma,604.0,Norway
3,Lagen,322.0,Norway
4,Goetaaelv,93.0,Sweden


In [36]:
# Pick the row holding the greatest length (idxmax ignores NaN by default).
# river_df has one row per (river, country) pair, so the selected row names
# only one of the countries the river is listed under.
longest_river_idx = river_df['length'].idxmax()
max_river = river_df.loc[longest_river_idx]
max_river

name                  Amazonas
length                6,448.00
country               Colombia
Name: 298, dtype: object

# b) Largest Lake

In [37]:
# One row per (lake, country) pair: a lake shared by several countries appears
# once for each of them with the same name and area.
lakes = []
for lake in document.iterfind('lake'):
    # Name and area do not depend on the country, so read them once per lake.
    lake_name = lake.find('name').text
    area = lake.find('area')
    if area is None or area.text is None:
        # Missing or empty <area>: keep the lake, with NaN as its area.
        area = np.nan
    else:
        area = float(area.text)
    for country in lake.get('country').split():
        # code_to_country_dict (built in the river cell) maps code -> name.
        lakes.append([lake_name, area, code_to_country_dict[country]])
lake_df = pd.DataFrame(lakes, columns=['name', 'area', 'country'])

In [38]:
# Preview the first rows of the lake table.
lake_df.head()

Unnamed: 0,name,area,country
0,Inari,1040.0,Finland
1,Oulujaervi,928.0,Finland
2,Kallavesi,472.0,Finland
3,Saimaa,4370.0,Finland
4,Paeijaenne,1118.0,Finland


In [39]:
# Pick the row holding the greatest area (idxmax ignores NaN by default).
# lake_df has one row per (lake, country) pair, so the selected row names only
# one of the countries the lake is listed under.
largest_lake_idx = lake_df['area'].idxmax()
max_lake = lake_df.loc[largest_lake_idx]
max_lake

name               Caspian Sea
area                386,400.00
country                 Russia
Name: 68, dtype: object

# c) Airport at highest elevation:

In [40]:
# One row per (airport, country) pair; airports without a usable <elevation>
# value are left out entirely.
airports = []
for airport in document.iterfind('airport'):
    # Elevation does not depend on the country, so check and parse it once.
    elevation = airport.find('elevation')
    if elevation is None or elevation.text is None:
        # Missing or empty <elevation>: nothing to rank, skip this airport.
        continue
    elevation = float(elevation.text)
    airport_name = airport.find('name').text
    for country in airport.get('country').split():
        # code_to_country_dict (built in the river cell) maps code -> name.
        airports.append([airport_name, elevation, code_to_country_dict[country]])
airport_df = pd.DataFrame(airports, columns=['name', 'elevation', 'country'])

In [41]:
# Preview the first rows of the airport table.
airport_df.head()

Unnamed: 0,name,elevation,country
0,Herat,977.0,Afghanistan
1,Kabul Intl,1792.0,Afghanistan
2,Tirana Rinas,38.0,Albania
3,Cheikh Larbi Tebessi,811.0,Algeria
4,Batna Airport,822.0,Algeria


In [43]:
# Pick the row holding the greatest elevation value.
highest_airport_idx = airport_df['elevation'].idxmax()
max_airport = airport_df.loc[highest_airport_idx]
max_airport

name                El Alto Intl
elevation               4,063.00
country                  Bolivia
Name: 80, dtype: object