# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [16]:
from xml.etree import ElementTree as ET
import pandas as pd

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [17]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [18]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [19]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [20]:
document = ET.parse( './data/mondial_database.xml' )
root = document.getroot()

In [21]:
# 10 countries with the largest population

# Trying to extract only year

population = pd.DataFrame()

d = {}

for element in document.iterfind('country'):
    for subelement in element.iterfind('population'):
        d[element.find('name').text] = subelement.attrib['year']
        
for element in document.iterfind('country'):
    for subelement in element.iterfind('population'):
        if subelement.attrib['year'] == d[element.find('name').text]:
            entry = pd.DataFrame({'country': element.find('name').text, 'population': subelement.text, 'year': int(subelement.attrib['year'])}, index = range(1))
            population = population.append(entry, ignore_index=True)
            
population.head(10)

Unnamed: 0,country,population,year
0,Albania,2800138,2011
1,Greece,10816286,2011
2,Macedonia,2059794,2011
3,Serbia,7120666,2011
4,Montenegro,620029,2011
5,Kosovo,1733872,2011
6,Andorra,78115,2011
7,France,64933400,2011
8,Spain,46815916,2011
9,Austria,8499759,2013


In [22]:
population.sort_values(by='population', ascending =False).head(10)

Unnamed: 0,country,population,year
200,Benin,9983884,2013
12,Hungary,9937628,2011
128,Haiti,9896400,2010
169,Tuvalu,9827,2010
47,Jersey,97857,2011
226,Somalia,9636173,2010
37,Sweden,9555893,2012
18,Belarus,9460692,2013
127,Dominican Republic,9445281,2010
63,Azerbaijan,9356500,2013


In [23]:
# 10 countries with the lowest infant mortality rates
from decimal import Decimal
mortality = pd.DataFrame()

for element in document.iterfind('country'):
    try:
        entry = pd.DataFrame({'country': element.find('name').text, 'mortality rate': Decimal(element.find('infant_mortality').text)}, index = range(1))
    except AttributeError:
        pass
    mortality = mortality.append(entry, ignore_index=True)

In [24]:
mortality.sort_values(by='mortality rate', ascending = True).head(10)

Unnamed: 0,country,mortality rate
38,Monaco,1.81
98,Japan,2.13
36,Norway,2.48
117,Bermuda,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


In [25]:
#10 cities with the largest population

population = pd.DataFrame()

d = {}

for element in document.iterfind('country'):
    for subelement in element.iterfind('city'):
        for subelement_pop in subelement.iterfind('population'):
            d[subelement.find('name').text] = subelement_pop.attrib['year']

for element in document.iterfind('country'):
    for subelement in element.iterfind('city'):
        for subelement_pop in subelement.iterfind('population'):
            if subelement_pop.attrib['year'] == d[subelement.find('name').text]:
                entry = pd.DataFrame({'city': subelement.find('name').text, 'population': int(subelement_pop.text), 'year': int(subelement_pop.attrib['year'])}, index = range(1))
                population = population.append(entry, ignore_index=True)


In [26]:
population.sort_values(by='population', ascending =False).head(10)

Unnamed: 0,city,population,year
164,Seoul,9708483,2010
153,Al Qahirah,8471859,2006
74,Bangkok,7506700,1999
122,Hong Kong,7055071,2009
86,Ho Chi Minh,5968384,2009
200,Singapore,5076700,2010
152,Al Iskandariyah,4123869,2006
204,New Taipei,3939305,2012
165,Busan,3403135,2010
101,Pyongyang,3255288,2008


In [29]:
# 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
from decimal import Decimal
ethnic_group = pd.DataFrame()

def convert(x):
    return x/100

for element in document.iterfind('country'):
    for subelement in element.iterfind('ethnicgroup'):
        ethnic_entry = pd.DataFrame({'country': element.find('name').text, 'ethnicity': subelement.text, 'percentage': 
                                     Decimal(subelement.attrib['percentage'])}, index = range(1))
        ethnic_group = ethnic_group.append(ethnic_entry, ignore_index=True)
ethnic_group['percentage'] = ethnic_group['percentage'].apply(convert)

ethnic_group.head(10)

Unnamed: 0,country,ethnicity,percentage
0,Albania,Albanian,0.95
1,Albania,Greek,0.03
2,Greece,Greek,0.93
3,Macedonia,Macedonian,0.642
4,Macedonia,Albanian,0.252
5,Macedonia,Turkish,0.039
6,Macedonia,Gypsy,0.027
7,Macedonia,Serb,0.018
8,Serbia,Serb,0.829
9,Serbia,Montenegrin,0.009


In [52]:
grouping = pd.DataFrame()

for i in ethnic_group.ethnicity.unique():
    temp = 0
    for j in ethnic_group[ethnic_group.ethnicity == i]['percentage']:
        temp = temp * j
    entry = pd.DataFrame({'ethnicity': i, 'total_of_world': temp}, index = range(1))

    grouping = grouping.append(entry, ignore_index=True)

grouping

Unnamed: 0,ethnicity,total_percentage
0,Albanian,0.011012400
1,Greek,0.021483
2,Macedonian,0.016050
3,Turkish,8.02182610944E-12
4,Gypsy,0.027
5,Serb,8.5950720E-8
6,Montenegrin,0.00387
7,Hungarian,6.913583820E-12
8,Roma,1.4922600E-11
9,Bosniak,0.0000053424


In [63]:
total = 0
for i in population.population:
    total = int(i) + total

def amt(x):
    return x*total
    
grouping['total_percentage'] = grouping['total_percentage'].apply(amt)

grouping

Unnamed: 0,ethnicity,total_percentage
0,Albanian,2448633.944181600
1,Greek,4776797.339622
2,Macedonian,3568756.565700
3,Turkish,0.00178367256055873437696
4,Gypsy,6003515.718
5,Serb,19.111351796052480
6,Montenegrin,860503.91958
7,Hungarian,0.001537252190040017880
8,Roma,0.003318076431608400
9,Bosniak,1187.8956434016
