# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

import pandas as pd

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
# Parse the reduced sample file; the example cells below read from `document_tree`
document_tree = ET.parse('./data/mondial_database_less.xml')

In [3]:
# Print the <name> of every direct child of the root element (one per country)
for country_node in document_tree.getroot():
    print(country_node.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for country in document_tree.iterfind('country'):
    # Element.iter() replaces getiterator(), which is deprecated since Python 2.7;
    # it walks the whole subtree below the country, not just direct children
    city_names = [city.find('name').text for city in country.iter('city')]
    # join() builds the comma-separated list without a trailing separator to trim
    print('* ' + country.find('name').text + ': ' + ', '.join(city_names))

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
# Load the complete Mondial database; the exercise cells below all query `root`
document = ET.parse('./data/mondial_database.xml')
root = document.getroot()

    

In [150]:
# 10 countries with the lowest infant mortality rates
# Collect (country name, infant mortality rate) pairs, skipping countries without a figure
infant_mortality_rates = []

for country in root.iterfind('country'):
    rate_node = country.find('infant_mortality')
    # Explicit `is not None` check: a missing element is None, and an element
    # that is present but empty has text None and cannot be converted to float
    if rate_node is not None and rate_node.text:
        infant_mortality_rates.append((country.find('name').text, float(rate_node.text)))

# Ascending sort puts the lowest rates first; slicing (instead of indexing 0..9)
# also works when fewer than 10 countries report a rate
for name, _rate in sorted(infant_mortality_rates, key=lambda pair: pair[1])[:10]:
    print(name)


Monaco
Japan
Norway
Bermuda
Singapore
Sweden
Czech Republic
Hong Kong
Macao
Iceland


In [151]:
# Top 10 cities by population
# Collect (city name, population) pairs for every city that reports a population
city_populations = []

for country in root.iterfind('country'):
    # iter() walks the whole subtree below the country (getiterator() is deprecated)
    for city in country.iter('city'):
        population_entries = city.findall('population')
        if not population_entries:
            # No figure for this city: skip it instead of silently reusing the
            # value left over from the previously processed city
            continue
        # The last <population> entry is taken as the most recent estimate --
        # assumes entries are listed in chronological order; TODO confirm against the data
        latest_population = int(population_entries[-1].text)
        city_populations.append((city.find('name').text, latest_population))

# Creating a dataframe; sort_values() replaces DataFrame.sort(), which was removed from pandas
city_df = pd.DataFrame(city_populations, columns=['City', 'Population'])
city_df.sort_values('Population', ascending=False).head(10)

Unnamed: 0,City,Population
1341,Shanghai,22315474
771,Istanbul,13710512
1527,Mumbai,12442373
479,Moskva,11979529
1340,Beijing,11716620
2810,São Paulo,11152344
1342,Tianjin,11090314
1064,Guangzhou,11071424
1582,Delhi,11034555
1067,Shenzhen,10358381


In [152]:
# Top 10 ethnic groups in the world
# Collect (ethnic group, estimated head count) pairs, one per group per country
ethnic_group_estimates = []

for country in root.findall('country'):
    population_entries = country.findall('population')
    if not population_entries:
        # Without a population figure the percentages cannot be turned into head
        # counts; skip rather than reuse the previous country's population
        continue
    # The last <population> entry is taken as the most recent estimate --
    # assumes entries are listed in chronological order; TODO confirm against the data
    country_population = int(population_entries[-1].text)

    for ethnic_group in country.findall('ethnicgroup'):
        # The `percentage` attribute is the group's share of the country's population
        share = float(ethnic_group.attrib['percentage']) * 0.01
        ethnic_group_estimates.append((ethnic_group.text, int(share * country_population)))

# Creating a dataframe and summing each group's estimates over all countries;
# sort_values() replaces DataFrame.sort(), which was removed from pandas
ethnic_df = pd.DataFrame(ethnic_group_estimates, columns=['Ethnic Group', 'Population'])
ethnic_totals = ethnic_df.groupby('Ethnic Group').sum()
ethnic_totals.sort_values('Population', ascending=False).head(10)




Unnamed: 0_level_0,Population
Ethnic Group,Unnamed: 1_level_1
Han Chinese,1245058800
Indo-Aryan,871815583
European,494872201
African,318325104
Dravidian,302713744
Mestizo,157734349
Bengali,146776916
Russian,131856989
Japanese,126534212
Malay,121993548


In [118]:
# Airport at maximum elevation
# One (name, `country` attribute, elevation) record per airport with a usable elevation
airport_records = []

for airport in root.findall('airport'):
    elevation_node = airport.find('elevation')
    # Skip airports whose <elevation> element is missing or empty; calling .text
    # on a missing element would raise AttributeError
    if elevation_node is None or elevation_node.text is None:
        continue
    airport_records.append(
        (airport.find('name').text, airport.get('country'), int(elevation_node.text))
    )

if airport_records:
    # max() returns the first maximal record, the same tie-breaking as list.index(max(...))
    airport_name, airport_country, max_elevation = max(airport_records, key=lambda record: record[2])
    # Printing the airport name, country and elevation
    print('%s %s %s' % (airport_name, airport_country, max_elevation))
else:
    print('No airport elevation data found')

El Alto Intl BOL 4063


In [119]:
# Longest River
# One (name, `country` attribute, length) record per river with a usable length
river_records = []

for river in root.findall('river'):
    length_node = river.find('length')
    # Skip rivers whose <length> element is missing or empty (float(None) would raise)
    if length_node is None or length_node.text is None:
        continue
    river_records.append(
        (river.find('name').text, river.get('country'), float(length_node.text))
    )

if river_records:
    # max() returns the first maximal record, the same tie-breaking as list.index(max(...))
    river_name, river_countries, max_length = max(river_records, key=lambda record: record[2])
    # Printing the name, country and length
    print('%s %s %s' % (river_name, river_countries, max_length))
else:
    print('No river length data found')

Amazonas CO BR PE 6448.0


In [120]:
# Largest lake
# One (name, `country` attribute, area) record per lake with a usable area
lake_records = []

for lake in root.findall('lake'):
    area_node = lake.find('area')
    # Skip lakes whose <area> element is missing or empty (float(None) would raise)
    if area_node is None or area_node.text is None:
        continue
    lake_records.append(
        (lake.find('name').text, lake.get('country'), float(area_node.text))
    )

if lake_records:
    # max() returns the first maximal record, the same tie-breaking as list.index(max(...))
    lake_name, lake_countries, max_area = max(lake_records, key=lambda record: record[2])
    # Printing the name, country and the area
    print('%s %s %s' % (lake_name, lake_countries, max_area))
else:
    print('No lake area data found')

Caspian Sea R AZ KAZ IR TM 386400.0
