# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [3]:
document = ET.parse( './data/mondial_database.xml' )

In [10]:
root = document.getroot()

In [11]:
root.tag

'mondial'

In [12]:
root.attrib

{}

In [16]:
root.text

'\n   '

In [34]:
# print names of all countries and their cities
country = []
infantm = []
for element in document.iterfind('country'):
    country.append(element.find('name').text + ':')
    #capitals_string = ''
    try:
        infantm.append(element.find('infant_mortality').text)
    except:
        infantm.append(np.nan)
    #for subelement in element.getiterator('city'):
        #capitals_string += subelement.find('').text + ', '
    #print capitals_string[:-2]

In [20]:
document.iterfind('country')

<generator object select at 0x00000000082EB3A8>

In [33]:
import pandas as pd
import numpy as np


In [35]:
df = pd.DataFrame({
         "country" : country, "infant_mortality": infantm
    })

In [41]:
df.dtypes

country             object
infant_mortality    object
dtype: object

In [43]:
df['infant_mortality'] = df.infant_mortality.astype(np.float)

In [49]:
#Answer 1
df_clean = df[df.infant_mortality.notnull()]
df_clean.sort_values(by = 'infant_mortality').head(10)

Unnamed: 0,country,infant_mortality
38,Monaco:,1.81
98,Japan:,2.13
117,Bermuda:,2.48
36,Norway:,2.48
106,Singapore:,2.53
37,Sweden:,2.6
10,Czech Republic:,2.63
78,Hong Kong:,2.73
79,Macao:,3.13
44,Iceland:,3.15


In [72]:
city = []
population = []
for element in document.iterfind('country'):
    for subelement in element.getiterator('city'):
        city.append(subelement.find('name').text + ':')
        try:
            population.append(subelement.find('population').text)
        except:
            population.append(np.nan)

In [73]:
pop = pd.DataFrame({
         "city" : city, "population": population
    })

In [75]:
pop.dtypes

city          object
population    object
dtype: object

In [77]:
pop['population'] = pop.population.astype(np.float)

In [78]:
pop.head()

Unnamed: 0,city,population
0,Tirana:,192000.0
1,Shkodër:,62000.0
2,Durrës:,60000.0
3,Vlorë:,56000.0
4,Elbasan:,53000.0


In [79]:
#Answer 2
pop_clean = pop[pop.population.notnull()]
pop_clean.sort_values(by = 'population').tail(10)

Unnamed: 0,city,population
1582,Delhi:,7206704.0
1340,Beijing:,7362426.0
1876,Tokyo:,7843000.0
479,Moskva:,8010954.0
2109,Ciudad de México:,8092449.0
1341,Shanghai:,8205598.0
1757,Jakarta:,8259266.0
2810,São Paulo:,9412894.0
1527,Mumbai:,9925891.0
1928,Seoul:,10229262.0


In [154]:
ethnicgroup = []
percentage = []
for element in document.iterfind('country'):
        try:
            ethnicgroup.append(element.find('ethnicgroup').text)
        except:
            ethnicgroup.append(np.nan)
            for subelement in element.iterfind('ethnicgroup'):  
                try:
                    percentage.append(subelement.get('percentage'))       
                except:
                    percentage.append(np.nan)

In [165]:
ethnicgroup = []
percentage = []
#for element in document.iterfind('country'):
    #print element.find('ethnicgroup').text
for subelement in element.iterfind('ethnicgroup'):  
    print subelement.get('percentage')      
                #except:
                    #percentage.append(np.nan)

In [157]:
eth = pd.DataFrame({
         "ethnicgroup" : ethnicgroup, "percentage": percentage
    })

ValueError: arrays must all be same length

In [152]:
eth

Unnamed: 0,ethnicgroup,percentage
0,Albanian,
1,Greek,
2,Macedonian,
3,Serb,
4,Montenegrin,
5,Albanian,
6,Spanish,
7,,
8,Mediterranean Nordic,
9,Austrian,


In [98]:
ethnicgroup1 = []
percentage = []
for element in document.iterfind('country'):
    ethnicgroup1.append(element.find('ethnicgroup percentage').text)
        #try:
            #percentage.append(element.find('percentage').text)
        #except:
            #percentage.append(np.nan)

AttributeError: 'NoneType' object has no attribute 'text'

In [169]:
for country in root.findall('country'):
    ethnicgroup = country.find('ethnicgroup').text
    percentage = country.get['percentage']
    print ethnicgroup, percentage

TypeError: 'instancemethod' object has no attribute '__getitem__'

In [118]:
document['mondial']['country']

TypeError: 'ElementTree' object has no attribute '__getitem__'