# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [101]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [102]:
# Parse the small sample file into an ElementTree for the examples below
document_tree = ET.parse('./data/mondial_database_less.xml')

In [103]:
# Walk the direct children of the root element and print each one's <name>
root_element = document_tree.getroot()
for node in root_element:
    name_element = node.find('name')
    print(name_element.text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [104]:
# print names of all countries and their cities
for country_element in document_tree.iterfind('country'):
    # Header line for the country; the trailing space after the colon is kept
    # so the printed output stays the same as before.
    print('* ' + country_element.find('name').text + ': ')
    # Element.iter() walks all descendants with the given tag; it replaces
    # getiterator(), which was deprecated and removed in Python 3.9.
    city_names = [city.find('name').text
                  for city in country_element.iter('city')]
    # join() builds the comma-separated list without a trailing separator
    print(', '.join(city_names))

* Albania: 
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: 
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: 
Skopje, Kumanovo
* Serbia: 
Beograd, Novi Sad, Niš
* Montenegro: 
Podgorica
* Kosovo: 
Prishtine
* Andorra: 
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [105]:
# Parse the full data file used by the exercise cells below
document = ET.parse('./data/mondial_database.xml')

In [106]:
# Import pandas for future use
import pandas as pd

In [107]:
# Map each country name to its infant mortality rate (as a float).
infantMoralityRates = {}
# Only <country> elements are visited, so other top-level elements in the
# document can never end up in the table.
for country in document.iterfind('country'):
    countryName = country.find('name').text
    # Grab infant mortality
    infant_mortality_element = country.find('infant_mortality')
    # Countries without an <infant_mortality> element are skipped. An identity
    # test is used because find() returns None when nothing matches.
    if infant_mortality_element is not None:
        infantMoralityRates[countryName] = float(infant_mortality_element.text)

In [108]:
# Ten lowest infant mortality rates, smallest value first
pd.Series(infantMoralityRates).sort_values().head(10)

Monaco            1.81
Japan             2.13
Norway            2.48
Bermuda           2.48
Singapore         2.53
Sweden            2.60
Czech Republic    2.63
Hong Kong         2.73
Macao             3.13
Iceland           3.15
dtype: float64

In [109]:
# 2. 10 cities with the largest population

In [110]:
# Collect one record per (city, census year) so the largest cities can be
# ranked afterwards.
largestPopulations = []  # List for holding extracted data

for country in document.iterfind('country'):
    countryName = country.find('name').text
    # City populations. Element.iter() visits every descendant <city>,
    # however deeply it is nested under the country; it replaces
    # getiterator(), which was removed in Python 3.9.
    for city in country.iter('city'):
        cityName = city.find('name').text
        # A city may carry several <population> entries, one per census year
        for census in city.findall('population'):
            largestPopulations.append({
                'country': countryName,
                'city': cityName,
                'year': census.attrib['year'],
                'population': int(census.text),
            })

# Create a DF
largestPop_DF = pd.DataFrame(largestPopulations)

In [111]:
# Sort by population (largest first), keep only the highest-population record
# for each city, then take the top 10.
# Duplicates are identified by the (country, city) pair: de-duplicating on the
# city name alone would merge distinct cities that share a name across countries.
(
    largestPop_DF
    .sort_values('population', ascending=False)
    .drop_duplicates(['country', 'city'])
    .head(10)
)

Unnamed: 0,city,country,population,year
3750,Shanghai,China,22315474,2010
2607,Istanbul,Turkey,13710512,2012
4398,Delhi,India,12877470,2001
4303,Mumbai,India,12442373,2011
1546,Moskva,Russia,11979529,2013
3746,Beijing,China,11716620,2010
8208,São Paulo,Brazil,11152344,2010
3754,Tianjin,China,11090314,2010
3364,Guangzhou,China,11071424,2010
3371,Shenzhen,China,10358381,2010


In [112]:
# 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates 
#   over all countries)

In [113]:
# Create two data sets, one with populations and another with ethnic group info
populations = []   # one record per (country, census year)
ethnicGroups = []  # one record per (country, ethnic group)
# Only <country> elements carry the data collected here, so iterate over
# those rather than over every child of the root.
for country in document.iterfind('country'):
    countryName = country.find('name').text
    # Grab populations: direct <population> children of the country
    for pop in country.findall('population'):
        populations.append({
            'country': countryName,
            'year': pop.attrib['year'],
            'population': int(pop.text),
        })
    # EthnicGroups: the percentage attribute is stored as a fraction (0-1)
    for eth in country.findall('ethnicgroup'):
        ethnicGroups.append({
            'country': countryName,
            'percent': float(eth.attrib['percentage']) / 100,
            'group': eth.text,
        })

In [114]:
# Turn both record lists into dataframes (one row per collected record)
populationsDf, ethnicGroupsDf = map(pd.DataFrame, (populations, ethnicGroups))

In [115]:
# Keep a single population row per country: order rows by country, then by
# year descending, so the first row of each country is its latest entry,
# and drop every later row for that country.
mostRecentPop = (
    populationsDf
    .sort_values(by=['country', 'year'], ascending=[True, False])
    .drop_duplicates(subset=['country'])
)

In [116]:
# Attach the latest population figure of each country to its ethnic-group
# rows; the left join keeps every ethnic-group row even when no population
# row matches.
merged = pd.merge(ethnicGroupsDf, mostRecentPop, how='left', on='country')

In [117]:
# Estimated head count per (country, group): fraction of the population
# times the country's population, rounded to a whole number
merged['count'] = (merged['percent'] * merged['population']).round()

In [118]:
# Total the per-country counts for each ethnic group, order the totals from
# largest to smallest and show the first ten
(
    merged
    .groupby('group')[['count']]
    .sum()
    .sort_values(by='count', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,count
group,Unnamed: 1_level_1
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0


In [119]:
# 4. name and country of 
#     a) longest river, 
#     b) largest lake and 
#     c) airport at highest elevation

In [120]:
# Section to find the longest river
rivers = []

for river in document.iterfind('river'):
    riverLength = river.find('length')
    # Rivers without a <length> element cannot be ranked, so skip them.
    # find() returns None when there is no match, hence the identity test.
    if riverLength is None:
        continue
    rivers.append({
        'river': river.find('name').text,
        'length': float(riverLength.text),
        # The country attribute is stored as read from the element; it can
        # hold several codes in one string when a river has more than one.
        'country': river.attrib['country'],
    })

# Create the DF
riversDF = pd.DataFrame(rivers)

In [121]:
# Longest river: rank by length, largest first, and show the top row
# (its country field lists three country codes)
riversDF.sort_values(by='length', ascending=False).head(1)

Unnamed: 0,country,length,river
174,CO BR PE,6448.0,Amazonas


In [122]:
# Section to find the largest lake - assuming "largest" means largest area
lakes = []

for lake in document.iterfind('lake'):
    lakeName = lake.find("name").text
    areaElement = lake.find('area')
    # find() returns only the first <located> child, so one country code is
    # recorded per lake even if the lake has several <located> entries.
    locatedElement = lake.find('located')
    # Skip lakes that lack either an area or a location. Identity tests are
    # used because find() returns None when nothing matches.
    if areaElement is None or locatedElement is None:
        continue
    lakes.append({
        'country': locatedElement.attrib['country'],
        'area': float(areaElement.text),
        'name': lakeName,
    })

lakesDF = pd.DataFrame(lakes)

In [123]:
# Largest lake by area: rank descending and show the top row.
# The top entry turns out to be a sea, even though it is tagged as a lake.
lakesDF.sort_values(by='area', ascending=False).head(1)

Unnamed: 0,area,country,name
52,386400.0,R,Caspian Sea


In [124]:
# Section to find the highest airport
airport = []

for ap in document.iterfind('airport'):
    apEle = ap.find('elevation')
    # Skip airports with no <elevation> element at all (find() returns None)
    # as well as those whose element is present but has no text.
    if apEle is None or apEle.text is None:
        continue
    airport.append({
        'name': ap.find('name').text,
        'country': ap.attrib['country'],
        'elevation': float(apEle.text),
    })

# Create the DF
airportDF = pd.DataFrame(airport)

In [125]:
# Highest airport: rank by elevation, largest first, and show the top row
airportDF.sort_values(by='elevation', ascending=False).head(1)

Unnamed: 0,country,elevation,name
80,BOL,4063.0,El Alto Intl
