# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [167]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [168]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [169]:
# print names of all countries
#treeroot = document_tree.getroot
#name = treeroot.find('name')

for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [170]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':',)
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print(capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [171]:
document = ET.parse( './data/mondial_database.xml' )

In [172]:
#top 10 countries with the lowest infant mortality rate

import pandas as pd
countries = []
infantrates = []

def attribute_test(node, attribute_name, listname):
    node_present = node.find(attribute_name)
    
    if(node_present is None):
        return False
    else:
        listname.append(node.find(attribute_name).text)   
        return True

for country in document.findall('country'):
    if(attribute_test(country, 'infant_mortality', infantrates)):
        attribute_test(country, 'name', countries)

infantrates_df = pd.DataFrame({ 'country' : countries,
                               'infant_rates': infantrates })
infantrates_df['infant_rates'] = infantrates_df['infant_rates'].astype(float)
infantrates_df.sort_values(['infant_rates'], ascending=True).head(10)

Unnamed: 0,country,infant_rates
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


In [173]:
#top 10 cities with largest population

cities = []
city_populations = []

for country in document.findall('country'):
    for node in country.getiterator():
        for city in node.findall('city'):
            populus = []
            for population in city.findall('population'):
                populus.append(int(population.text))
            if(len(populus) > 0):
                city_populations.append(populus[len(populus)-1])
                cities.append(city.find('name').text)
            
populations_df = pd.DataFrame({'cityname' : cities,
                               'population' : city_populations })
populations_df['population'] = populations_df['population'].astype(int)
populations_df.sort_values(['population'],ascending=False).head(10)


Unnamed: 0,cityname,population
1251,Shanghai,22315474
707,Istanbul,13710512
1421,Mumbai,12442373
443,Moskva,11979529
1250,Beijing,11716620
2594,São Paulo,11152344
1252,Tianjin,11090314
974,Guangzhou,11071424
1467,Delhi,11034555
977,Shenzhen,10358381


In [174]:
#10 ethnic groups with the largest overall populations

ethnic_groups = {}

for country in document.findall('country'):
    for node in country.getiterator():
        populus = []
        for population in node.findall('population'):
            populus.append(int(population.text))
        if(len(populus) > 0):
            for ethnicgroupnode in node.findall('ethnicgroup'):
                ethnicpercent = float(ethnicgroupnode.attrib['percentage'])
                ethnicgroup = (ethnicgroupnode.text)
                ethnic_population = (float(populus[len(populus)-1])*float(ethnicpercent/100))
                if(ethnicgroup not in ethnic_groups):
                    ethnic_groups[ethnicgroup] = int(ethnic_population)
                else:
                    ethnic_groups[ethnicgroup] = ethnic_groups[ethnicgroup] + int(ethnic_population)

#ethnic_groups

#for generating sorted ethnicgroup/population
sorted_ethnicgroupList = sorted(ethnic_groups)
sorted_ethnicgroupPopulation = []

for ethnic_name in sorted_ethnicgroupList:
    sorted_ethnicgroupPopulation.append(ethnic_groups[ethnic_name])

#dataframe for ethnicname and population
ethnicgroup_df = pd.DataFrame({'ethnicname': sorted_ethnicgroupList,
                             'ethnicpopulus': sorted_ethnicgroupPopulation})

#output sorted ethnicname and populus
ethnicgroup_df.sort_values('ethnicpopulus', ascending=False).head(10)

Unnamed: 0,ethnicname,ethnicpopulus
113,Han Chinese,1245058800
120,Indo-Aryan,871815583
89,European,494872201
2,African,318325104
77,Dravidian,302713744
176,Mestizo,157734349
42,Bengali,146776916
217,Russian,131856989
128,Japanese,126534212
163,Malay,121993548


In [175]:
#name and country of 
    #a) longest river, 
    #b) largest lake,
    #c) airport at highest elevation

#get country codes
country_codes = {}

for country in document.findall('country'):
    country_codes[country.attrib['car_code']] = country.find('name').text
        
#a) longest river
maxlength = -100.0
riverdata = []

for river in document.findall('river'):
    countrycode = river.attrib['country']
    river_name = river.find('name').text
    lengthnode = river.find('length')
    if(lengthnode is None):
        length = 0.0
    else:
      length = float(lengthnode.text)
    if(length > maxlength):
        riverdata = []
        riverdata.append(countrycode)
        riverdata.append(river_name)
        riverdata.append(length)
        maxlength = length

print("Longest river:\n\tcountry/countries: ", riverdata[0], "\n\triver name: ", 
      riverdata[1], "\n\tLength: ", riverdata[2])

#b) largest lake
maxarea = -100.0
lakedata = []

for lake in document.findall('lake'):
    countrycode = lake.attrib['country']
    lake_name = lake.find('name').text
    areanode = lake.find('area')
    if(areanode is None):
        area = 0.0
    else:
      area = float(areanode.text)
    if(area > maxarea):
        lakedata = []
        lakedata.append(countrycode)
        lakedata.append(lake_name)
        lakedata.append(area)
        maxarea = area

print("\nLargest lake:\n\tcountry/countries: ", lakedata[0], "\n\tlake name: ", 
      lakedata[1], "\n\tLength: ", lakedata[2])

#c) airport at highest altitude
maxelevation = -100.0
airportdata = []

for airport in document.findall('airport'):
    countrycode = airport.attrib['country']
    airport_name = airport.find('name').text
    elevationnode = airport.find('elevation')
    if(elevationnode.text is None):
        elevation = 0.0
    else:
        elevation = float(elevationnode.text)
    if(elevation > maxelevation):
        airportdata = []
        airportdata.append(countrycode)
        airportdata.append(airport_name)
        airportdata.append(elevation)
        maxelevation = elevation

print("\nAirport at highest elevation:\n\tcountry/countries: ", airportdata[0], 
      "\n\tairport name: ", airportdata[1], "\n\tAirport elevation: ", airportdata[2])

Longest river:
	country/countries:  CO BR PE 
	river name:  Amazonas 
	Length:  6448.0

Largest lake:
	country/countries:  R AZ KAZ IR TM 
	lake name:  Caspian Sea 
	Length:  386400.0

Airport at highest elevation:
	country/countries:  BOL 
	airport name:  El Alto Intl 
	Airport elevation:  4063.0
