In [2]:
from xml.etree import ElementTree as ET
import pandas as pd

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
# Parse the Mondial XML database once; every exercise below queries this tree.
doc = ET.parse('./data/mondial_database.xml')

### Problem 1: 10 countries w/ lowest infant mortality rates

In [7]:
# Map each country name to its infant mortality rate (taken from the
# <infant_mortality> child of every <country> element).
im_dict = {}

for element in doc.iterfind('country'):
    name = element.findtext('name')
    rate_text = element.findtext('infant_mortality')
    # Countries without an <infant_mortality> (or <name>) element are skipped
    # explicitly rather than relying on a bare ``except`` that would also hide
    # unrelated errors.
    if name is None or rate_text is None:
        continue
    try:
        im_dict[name] = float(rate_text)
    except ValueError:
        # Element present but its text is not a number: nothing usable to rank.
        continue

ser = pd.Series(im_dict)
# Series.order() was removed from pandas; sort_values() is its replacement.
ser.sort_values().head(10)  # ascending sort -> the 10 lowest rates



Monaco            1.81
Japan             2.13
Norway            2.48
Bermuda           2.48
Singapore         2.53
Sweden            2.60
Czech Republic    2.63
Hong Kong         2.73
Macao             3.13
Iceland           3.15
dtype: float64

### Problem 2: 10 cities with the largest population

In [38]:
# Collect, for every city that has at least one <population> entry, the most
# recent population figure together with the city name, country name and the
# year of that figure.
cities = []
countries = []
poplist = []
years = []

# Iterate from `doc` (parsed above) rather than an undefined `root`, and use
# iter() because Element.getiterator() was removed in Python 3.9.
for element in doc.iter('country'):
    country_name = element.find('name').text
    for subelement in element.iter('city'):  # includes cities nested in provinces
        latest_year = None
        latest_pop = None
        for pop_el in subelement.iter('population'):
            # A missing year attribute is treated as year 0 so the entry can
            # still be used when it is the only one available.
            pop_year = int(pop_el.get('year', 0))
            # Keep the loop element and the chosen value in separate names:
            # reusing one name for both let an Element leak into the results
            # whenever the entries were not in ascending year order.
            if latest_pop is None or pop_year > latest_year:
                latest_year = pop_year
                latest_pop = int(pop_el.text)
        if latest_pop is not None:  # cities without population data are skipped
            cities.append(subelement.find('name').text)
            countries.append(country_name)
            years.append(latest_year)
            poplist.append(latest_pop)

# One row per city; each list becomes a column.
pop_df = pd.DataFrame({"City": cities, "Country": countries, "Year": years, "Population": poplist})
pop_df.sort_values('Population', ascending=False).head(10)  # 10 most populous cities


Unnamed: 0,City,Country,Population,Year
1251,Shanghai,China,22315474,2010
707,Istanbul,Turkey,13710512,2012
1421,Mumbai,India,12442373,2011
443,Moskva,Russia,11979529,2013
1250,Beijing,China,11716620,2010
2594,São Paulo,Brazil,11152344,2010
1252,Tianjin,China,11090314,2010
974,Guangzhou,China,11071424,2010
1467,Delhi,India,11034555,2011
977,Shenzhen,China,10358381,2010


### Problem 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [31]:
# Estimate the worldwide size of each ethnic group: for every country take its
# latest total population, multiply by each group's percentage share, and sum
# the results per group across all countries.
countries = []
ethnicities = []
percentages = []
populations = []

for country in doc.iterfind('country'):
    country_name = country.find('name').text

    # Only the country's own <population> children are considered. Walking all
    # descendants would also pick up province/city figures, and a newer city
    # census would then be mistaken for the national total.
    year = None
    pop = None
    for popel in country.findall('population'):
        popel_year = int(popel.get('year', 0))
        if year is None or popel_year > year:
            year = popel_year
            pop = int(popel.text)

    # iter() replaces Element.getiterator(), which was removed in Python 3.9.
    for group in country.iter('ethnicgroup'):
        name = group.text
        share = group.get('percentage')
        # Skip entries lacking a name or a percentage instead of raising KeyError.
        if name is None or share is None:
            continue
        countries.append(country_name)
        ethnicities.append(name)
        percentages.append(float(share) * .01)  # percent -> fraction
        populations.append(pop)  # None (-> NaN) when the country has no population data

# One row per (country, ethnic group) pair.
df = pd.DataFrame({"Ethnicity": ethnicities, "Percent of Population": percentages, "Country": countries, "Population Total": populations})
# Head-count of the group within each country.
df["Ethnic total"] = df["Percent of Population"] * df["Population Total"]

totals = df.groupby(["Ethnicity"])["Ethnic total"].sum()  # sum each group over all countries
totals.sort_values(ascending=False).head(10)  # 10 largest groups

Ethnicity
Han Chinese    1.245059e+09
Indo-Aryan     8.718156e+08
European       4.948722e+08
African        3.183251e+08
Dravidian      3.027137e+08
Mestizo        1.577344e+08
Bengali        1.467769e+08
Russian        1.304840e+08
Japanese       1.265342e+08
Malay          1.219936e+08
Name: Ethnic total, dtype: float64

### Problem 4. name and country of a) longest river...

In [37]:
# Find the longest river: gather name, country code(s) and length for every
# <river> element that carries all three, then sort by length.
rivers = []
countries = []
lengths = []

for riverel in doc.iterfind('river'):
    name = riverel.findtext('name')
    country = riverel.get('country')  # space-separated country codes
    length_text = riverel.findtext('length')
    # Skip rivers missing any of the three fields explicitly, instead of
    # hiding every possible error behind a bare ``except``.
    if name is None or country is None or length_text is None:
        continue
    try:
        length = float(length_text)
    except ValueError:
        continue  # <length> present but empty / not numeric
    rivers.append(name)
    countries.append(country)
    lengths.append(length)

# One row per river with a known length.
df = pd.DataFrame({"Name": rivers, "Country": countries, "Length": lengths})

# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
df.sort_values('Length', ascending=False).head(1)  # the longest river



Unnamed: 0,Country,Length,Name
174,CO BR PE,6448.0,Amazonas


### 4b) name and country  of largest lake 

In [44]:
# Find the largest lake: gather name, country code(s) and area for every
# <lake> element that carries all three, then sort by area.
lakes = []
countries = []
areas = []

for lakeel in doc.iterfind('lake'):
    name = lakeel.findtext('name')
    country = lakeel.get('country')  # space-separated country codes
    area_text = lakeel.findtext('area')
    # Skip lakes missing any of the three fields explicitly, instead of
    # hiding every possible error behind a bare ``except``.
    if name is None or country is None or area_text is None:
        continue
    try:
        size = float(area_text)
    except ValueError:
        continue  # <area> present but empty / not numeric
    lakes.append(name)
    countries.append(country)
    areas.append(size)

# One row per lake with a known area.
df = pd.DataFrame({"Name": lakes, "Country": countries, "Area": areas})

# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
df.sort_values('Area', ascending=False).head(1)  # the largest lake



Unnamed: 0,Area,Country,Name
54,386400.0,R AZ KAZ IR TM,Caspian Sea


### 4c) airport at highest elevation

In [45]:
# Find the airport at the highest elevation: gather name, country code and
# elevation for every <airport> element that carries all three, then sort by
# elevation.
airports = []
countries = []
elevations = []

for ap in doc.iterfind('airport'):
    name = ap.findtext('name')
    country = ap.get('country')
    elev_text = ap.findtext('elevation')
    # Skip airports missing any of the three fields explicitly, instead of
    # hiding every possible error behind a bare ``except``.
    if name is None or country is None or elev_text is None:
        continue
    try:
        elev = float(elev_text)
    except ValueError:
        continue  # <elevation> present but empty / not numeric
    airports.append(name)
    countries.append(country)
    elevations.append(elev)

# One row per airport with a known elevation.
df = pd.DataFrame({"Name": airports, "Country": countries, "Elevation": elevations})

# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
df.sort_values('Elevation', ascending=False).head(1)  # the highest airport



Unnamed: 0,Country,Elevation,Name
80,BOL,4063.0,El Alto Intl
