# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET
import pandas as pd

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
document = ET.parse( './data/mondial_database.xml' )
root = document.getroot()

In [6]:
# 10 countries with the largest population

# Trying to extract only year

population = pd.DataFrame()

d = {}

for element in document.iterfind('country'):
    for subelement in element.iterfind('population'):
        d[element.find('name').text] = subelement.attrib['year']
        
for element in document.iterfind('country'):
    for subelement in element.iterfind('population'):
        if subelement.attrib['year'] == d[element.find('name').text]:
            entry = pd.DataFrame({'country': element.find('name').text, 'population': int(subelement.text), 'year': int(subelement.attrib['year'])}, index = range(1))
            population = population.append(entry, ignore_index=True)
            
population.head(10)

Unnamed: 0,country,population,year
0,Albania,2800138,2011
1,Greece,10816286,2011
2,Macedonia,2059794,2011
3,Serbia,7120666,2011
4,Montenegro,620029,2011
5,Kosovo,1733872,2011
6,Andorra,78115,2011
7,France,64933400,2011
8,Spain,46815916,2011
9,Austria,8499759,2013


In [7]:
population.sort_values(by='population', ascending =False).head(10)

Unnamed: 0,country,population,year
55,China,1360720000,2013
67,India,1210854977,2011
120,United States,318857056,2014
88,Indonesia,252124458,2014
176,Brazil,202768562,2014
57,Pakistan,173149306,2010
202,Nigeria,164294516,2011
65,Bangladesh,149772364,2011
23,Russia,143666931,2014
98,Japan,127298000,2013


In [8]:
# 10 countries with the lowest infant mortality rates
from decimal import Decimal
mortality = pd.DataFrame()

for element in document.iterfind('country'):
    try:
        entry = pd.DataFrame({'country': element.find('name').text, 'mortality rate': Decimal(element.find('infant_mortality').text)}, index = range(1))
    except AttributeError:
        pass
    mortality = mortality.append(entry, ignore_index=True)

In [9]:
mortality.sort_values(by='mortality rate', ascending = True).head(10)

Unnamed: 0,country,mortality rate
38,Monaco,1.81
98,Japan,2.13
36,Norway,2.48
117,Bermuda,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


In [10]:
#10 cities with the largest population

city = pd.DataFrame()

d = {}

for element in document.iterfind('country'):
    for subelement in element.iterfind('city'):
        for subelement_pop in subelement.iterfind('population'):
            d[subelement.find('name').text] = subelement_pop.attrib['year']

for element in document.iterfind('country'):
    for subelement in element.iterfind('city'):
        for subelement_pop in subelement.iterfind('population'):
            if subelement_pop.attrib['year'] == d[subelement.find('name').text]:
                entry = pd.DataFrame({'city': subelement.find('name').text, 'population': int(subelement_pop.text), 'year': int(subelement_pop.attrib['year'])}, index = range(1))
                city = city.append(entry, ignore_index=True)

In [11]:
city.sort_values(by='population', ascending =False).head(10)

Unnamed: 0,city,population,year
164,Seoul,9708483,2010
153,Al Qahirah,8471859,2006
74,Bangkok,7506700,1999
122,Hong Kong,7055071,2009
86,Ho Chi Minh,5968384,2009
200,Singapore,5076700,2010
152,Al Iskandariyah,4123869,2006
204,New Taipei,3939305,2012
165,Busan,3403135,2010
101,Pyongyang,3255288,2008


In [12]:
#10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
from decimal import Decimal
ethnic_group = pd.DataFrame()

def rounding(x):
    return x/100

for element in document.iterfind('country'):
    for subelement in element.iterfind('ethnicgroup'):
        ethnic_entry = pd.DataFrame({'country': element.find('name').text, 'ethnicity': subelement.text, 
                                     'country_pop': (Decimal(subelement.attrib['percentage'])/100) * int(population[population.country == element.find('name').text]['population']),
                                     }, index = range(1))
        ethnic_group = ethnic_group.append(ethnic_entry, ignore_index=True)

ethnic_pop = pd.DataFrame()

for i in ethnic_group.ethnicity.unique():
    temp = 0
    for j in ethnic_group[ethnic_group.ethnicity == i]['country_pop']:
        temp = temp + j
    entry = pd.DataFrame({'ethnicity': i, 'ethnic_pop': temp}, index = range(1))
    ethnic_pop = ethnic_pop.append(entry, ignore_index=True)

ethnic_pop.sort_values(by='ethnic_pop', ascending =False).head(10)

Unnamed: 0,ethnic_pop,ethnicity
80,1245058800.0,Han Chinese
106,871815583.44,Indo-Aryan
128,494872219.7196,European
16,318325120.369,African
105,302713744.25,Dravidian
150,157734354.937,Mestizo
98,146776916.72,Bengali
33,131856996.077,Russian
139,126534212.0,Japanese
110,121993550.374,Malay


In [17]:
# name and country of a) longest river, b) largest lake and c) airport at highest elevation

rivers = pd.DataFrame()
for element in document.iterfind('river'):
    try:
        rivers_entry = pd.DataFrame({'country': element.attrib['country'], 'name': element.find('name').text, 
                                 'length': float(element.find('length').text),
                                 }, index = range(1))
    except AttributeError:
        pass
    rivers = rivers.append(rivers_entry, ignore_index=True)

rivers.sort_values(by='length', ascending =False).head(10)

Unnamed: 0,country,length,name
174,CO BR PE,6448,Amazonas
137,CN,6380,Jangtse
136,CN,4845,Hwangho
123,R,4400,Lena
205,RCB ZRE,4374,Zaire
138,CN LAO THA K VN,4350,Mekong
115,R KAZ CN,4248,Irtysch
186,RMM RN WAN RG,4184,Niger
160,USA,4130,Missouri
119,R,4092,Jenissej


In [32]:
lakes = pd.DataFrame()
for element in document.iterfind('lake'):
    try:
        lakes_entry = pd.DataFrame({'country': element.attrib['country'], 'name': element.find('name').text, 
                                 'area': float(element.find('area').text),
                                 }, index = range(1))
    except AttributeError:
        pass
    lakes = lakes.append(lakes_entry, ignore_index=True)

lakes.sort_values(by='area', ascending =False).head(10)

Unnamed: 0,area,country,name
54,386400,R AZ KAZ IR TM,Caspian Sea
109,82103,CDN USA,Lake Superior
81,68870,EAT EAK EAU,Lake Victoria
106,59600,CDN USA,Lake Huron
108,57800,USA,Lake Michigan
47,41650,IL JOR WEST,Dead Sea
83,32893,ZRE Z BI EAT,Lake Tanganjika
98,31792,CDN,Great Bear Lake
43,31492,R,Ozero Baikal
89,29600,MW MOC EAT,Lake Malawi


In [33]:
airports = pd.DataFrame()
for element in document.iterfind('airport'):
    try:
        airports_entry = pd.DataFrame({'country': element.attrib['country'], 'name': element.find('name').text, 
                                 'elevation': float(element.find('elevation').text),
                                 }, index = range(1))
    except AttributeError:
        pass
    airports = airports.append(airports_entry, ignore_index=True)

airports.sort_values(by='elevation', ascending =False).head(10)

#Why is this not working? It is essentially a cut copy and paste of the previous two

TypeError: float() argument must be a string or a number