# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [1]:
from xml.etree import ElementTree as ET
import numpy as np
import pandas as pd

In [2]:
# Parse the Mondial XML file into an ElementTree; the relative path is resolved against the current working directory.
# NOTE(review): the exercise text refers to 'data/mondial_database.xml' -- confirm which location is correct.
document = ET.parse( 'mondial_database.xml' )

# 1. 10 countries with the lowest infant mortality rates

In [3]:
# Gather (country name, infant mortality) pairs from the XML tree.
# Countries that have no <infant_mortality> element are left out.
names = []
rates = []
for country in document.iterfind('country'):
    rate_node = country.find('infant_mortality')
    if rate_node is None:
        continue
    names.append(country.find('name').text)
    rates.append(rate_node.text)
# Assemble the two lists into a dataframe, one row per country
IM_dataframe = pd.DataFrame({
    'Country': names,
    'Infant_Mortality': rates,
})
# The rates were read as strings; convert them to numbers so the sort is numeric, not lexical
IM_dataframe[['Infant_Mortality']] = IM_dataframe[['Infant_Mortality']].apply(pd.to_numeric)
# Display the 10 rows with the smallest infant mortality
IM_dataframe.sort_values(by='Infant_Mortality').head(10)


Unnamed: 0,Country,Infant_Mortality
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


# 2. 10 cities with the largest population

In [4]:
# Collect the name and population of every city, visiting all <city>
# elements nested anywhere below each <country>. Cities without any
# <population> element are skipped; for the others the last <population>
# child in document order is used (assumed to be the most recent count).
city_name = []
pop = []
for country in document.iterfind('country'):
    # Element.getiterator() was removed in Python 3.9; Element.iter() is the
    # supported equivalent and walks the same subtree.
    for city in country.iter('city'):
        latest = city.find('population[last()]')
        if latest is None:
            continue
        city_name.append(city.find('name').text)
        pop.append(latest.text)

# Build the dataframe, convert the population strings to numbers so the sort
# is numeric, and display the 10 most populous cities.
pop_dataframe = pd.DataFrame(
    {'City': city_name,
     'Population': pop})
pop_dataframe[['Population']] = pop_dataframe[['Population']].apply(pd.to_numeric)
pop_dataframe.sort_values(by='Population', ascending=False).head(10)

Unnamed: 0,City,Population
1251,Shanghai,22315474
707,Istanbul,13710512
1421,Mumbai,12442373
443,Moskva,11979529
1250,Beijing,11716620
2594,São Paulo,11152344
1252,Tianjin,11090314
974,Guangzhou,11071424
1467,Delhi,11034555
977,Shenzhen,10358381


# 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [5]:
# Estimate the head-count of every ethnic group in every country and sum the
# estimates per group across all countries.
country_name = []
pop = []
ethnicity = []
for country in document.iterfind('country'):
    # Last <population> child in document order (assumed to be the latest
    # estimate) and the <ethnicgroup> children of this country.
    latest_pop = country.find('population[last()]')
    groups = country.findall('ethnicgroup')
    # Countries lacking either piece of information cannot contribute.
    if latest_pop is None or not groups:
        continue
    # Both values are the same for every group of the country, so look them
    # up once instead of once per group.
    total = int(latest_pop.text)
    name = country.find('name').text
    # Read the group name from the element being iterated rather than through
    # a separately maintained 'ethnicgroup[n]' index, so the name and the
    # percentage always come from the same element. (Element.getiterator(),
    # used previously, was removed in Python 3.9.)
    for ethnicgroup in groups:
        country_name.append(name)
        ethnicity.append(ethnicgroup.text)
        # Group population = country population * share of the group
        percent = float(ethnicgroup.get('percentage')) / 100
        pop.append(int(total * percent))
# Create the dataframe from the lists built in the loop
ethnicpop = pd.DataFrame(
    {'Country': country_name,
     'Ethnic_Group': ethnicity,
     'Population': pop})
# Sum only the Population column per ethnic group: on recent pandas versions
# an unrestricted sum() would also concatenate the Country strings.
ethnicpop.groupby('Ethnic_Group')[['Population']].sum().sort_values(by='Population', ascending=False).head(10)

Unnamed: 0_level_0,Population
Ethnic_Group,Unnamed: 1_level_1
Han Chinese,1245058800
Indo-Aryan,871815583
European,494872201
African,318325104
Dravidian,302713744
Mestizo,157734349
Bengali,146776916
Russian,131856989
Japanese,126534212
Malay,121993548


# 4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
# Longest river
# Only rivers that have both a <name> and a <length> element are kept.
river_labels = []
river_lengths = []
river_countries = []
for river in document.iterfind('river'):
    name_node = river.find('name')
    length_node = river.find('length')
    if name_node is None or length_node is None:
        continue
    river_labels.append(name_node.text)
    river_lengths.append(length_node.text)
    # The 'country' attribute is stored exactly as it appears in the XML
    river_countries.append(river.get('country'))
rivers = pd.DataFrame({
    'River_Name': river_labels,
    'Length': river_lengths,
    'Country': river_countries,
})
# Lengths were read as strings; make them numeric before sorting
rivers[['Length']] = rivers[['Length']].apply(pd.to_numeric)
# Display the single longest river
rivers.sort_values(by='Length', ascending=False).head(1)

Unnamed: 0,Country,Length,River_Name
174,CO BR PE,6448.0,Amazonas


In [7]:
# Largest lake
# Only lakes that have both a <name> and an <area> element are kept.
lake_labels = []
lake_areas = []
lake_countries = []
for lake in document.iterfind('lake'):
    name_node = lake.find('name')
    area_node = lake.find('area')
    if name_node is None or area_node is None:
        continue
    lake_labels.append(name_node.text)
    lake_areas.append(area_node.text)
    # The 'country' attribute is stored exactly as it appears in the XML
    lake_countries.append(lake.get('country'))
lakes = pd.DataFrame({
    'Lake_Name': lake_labels,
    'Size': lake_areas,
    'Country': lake_countries,
})
# Areas were read as strings; make them numeric before sorting
lakes[['Size']] = lakes[['Size']].apply(pd.to_numeric)
# Display the single largest lake
lakes.sort_values(by='Size', ascending=False).head(1)

Unnamed: 0,Country,Lake_Name,Size
54,R AZ KAZ IR TM,Caspian Sea,386400.0


In [8]:
# Highest elevation airport
# Only airports that have both a <name> and an <elevation> element are kept.
airport_labels = []
airport_heights = []
airport_countries = []
for airport in document.iterfind('airport'):
    name_node = airport.find('name')
    elevation_node = airport.find('elevation')
    if name_node is None or elevation_node is None:
        continue
    airport_labels.append(name_node.text)
    airport_heights.append(elevation_node.text)
    # The 'country' attribute is stored exactly as it appears in the XML
    airport_countries.append(airport.get('country'))
airports = pd.DataFrame({
    'Airport_Name': airport_labels,
    'Elevation': airport_heights,
    'Country': airport_countries,
})
# Elevations were read as strings; make them numeric before sorting
airports[['Elevation']] = airports[['Elevation']].apply(pd.to_numeric)
# Display the single highest airport
airports.sort_values(by='Elevation', ascending=False).head(1)


Unnamed: 0,Airport_Name,Country,Elevation
80,El Alto Intl,BOL,4063.0
