In [None]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

In [127]:
# Preparatory tasks
document_tree = ET.parse( './data/mondial_database.xml' )

# Generate index for country (to be used for various DF's)
index_ctry = [child.find('name').text for child in document_tree.iterfind('country')]

# Create dictionary of country codes to names (for lookup in last exercise)
list_ctry_code = [child.attrib.get('car_code') for child in document_tree.iterfind('country')]
dict_ctry = dict(zip(list_ctry_code,index_ctry))

# Generate index for ethnic groups (for exercise 3)
index_ethnic = []
for element_ctry in document_tree.iterfind('country'):
    for element_ethnic in element_ctry.iterfind('ethnicgroup'):
        index_ethnic.append(element_ethnic.text)
index_ethnic = list(set(index_ethnic))


In [128]:
# Exercise #1
df1 = pd.DataFrame(index=index_ctry,columns=['infant_mortality'])

# insert infant mortality rate per country (if there is one)
for child in document_tree.iterfind('country'):
    country = child.find('name').text
    infant_mortality = [child2.text for child2 in child.iterfind('infant_mortality')]
    if infant_mortality:
        df1.loc[country] = float(infant_mortality[0])
    else:
        df1.loc[country] = np.nan
df1['infant_mortality2'] = df1['infant_mortality'].astype(float)
print("10 Countries with Lowest Infant Mortality Rates")
print("===============================================")
print(df1.nsmallest(10,'infant_mortality2')['infant_mortality2'])
print("")

10 Countries with Lowest Infant Mortality Rates
Monaco            1.81
Japan             2.13
Norway            2.48
Bermuda           2.48
Norway            2.48
Bermuda           2.48
Singapore         2.53
Sweden            2.60
Czech Republic    2.63
Hong Kong         2.73
Macao             3.13
Iceland           3.15
Name: infant_mortality2, dtype: float64



In [132]:
# Exercise 2
df2 = pd.DataFrame([],columns=['population'])

# City can be the child of the country or child of province,
# which is a child of the country.
# Assumption: Last population record is most current population
for element_ctry in document_tree.iterfind('country'):
    for element_city in element_ctry.iterfind('city'):
        city = element_city.find('name').text
        pop_current = np.nan
        for element_pop in element_city.iterfind('population'):
            pop_current = element_pop.text
        if pop_current:
            population = pop_current
        else:
            population = np.nan
        df2.loc[city]= population
    for element_prov in element_ctry.iterfind('province'):
        for element_city in element_prov.iterfind('city'):
            city = element_city.find('name').text
            pop_current = np.nan
            for element_pop in element_city.iterfind('population'):
                pop_current = np.nan
                pop_current = element_pop.text
            if pop_current:
                population = pop_current
            else:
                population = np.nan
            df2.loc[city]= population 
            
df2['population2'] = df2['population'].astype(float)  
print("10 Cities with the Largest Populations")
print("======================================")
print(df2.nlargest(10,'population2')['population2'])
print("")   

10 Cities with the Largest Populations
Shanghai     22315474.0
Istanbul     13710512.0
Mumbai       12442373.0
Moskva       11979529.0
Beijing      11716620.0
São Paulo    11152344.0
Tianjin      11090314.0
Guangzhou    11071424.0
Delhi        11034555.0
Shenzhen     10358381.0
Name: population2, dtype: float64



In [135]:
# Exercise 3     

# Ethnic group is only at the country level, so need to sum it up
# after gathering all ethnicgroup records for every country.
index_df3 = pd.MultiIndex.from_product([index_ctry,index_ethnic],names=['country','ethnic_group'])
df3 = pd.DataFrame(index=index_df3,columns=['population','percentage'])
        
for element_ctry in document_tree.iterfind('country'):  
    country = element_ctry.find('name').text
    
    # get last population value
    pop_current = np.nan
    for element_pop in element_ctry.iterfind('population'):
        pop_current = element_pop.text
        
    for element_ethnic in element_ctry.iterfind('ethnicgroup'):
        ethnic_group = element_ethnic.text
        ethnic_attrib = element_ethnic.attrib
        ethnic_perc = ethnic_attrib.get('percentage')
        df3.loc[country,ethnic_group]=[pop_current,ethnic_perc]

# apply percentage to country's population      
df3['ethnic_pop'] = df3['population'].astype(float) * df3['percentage'].astype(float) / 100

# remove extra records
df3_cond = ~df3['ethnic_pop'].isnull()
df3_clean = df3[df3_cond]['ethnic_pop']

df3_ethnic = pd.DataFrame([],index=index_ethnic,columns=['ethnic_pop'],dtype=float)

for ethnic_group in index_ethnic:
    df3_ethnic.loc[ethnic_group] = df3_clean.loc[:,ethnic_group].sum()
    
print("10 Ethnic Groups with Largest Overall Population")
print("================================================")
print(df3_ethnic.nlargest(10,'ethnic_pop'))


10 Ethnic Groups with Largest Overall Population
               ethnic_pop
Han Chinese  1.245059e+09
Indo-Aryan   8.718156e+08
European     4.948722e+08
African      3.183251e+08
Dravidian    3.027137e+08
Mestizo      1.577344e+08
Bengali      1.467769e+08
Russian      1.318570e+08
Japanese     1.265342e+08
Malay        1.219936e+08


In [197]:
# Exercise 4
index_river = [element_river.find('name').text for element_river in document_tree.iterfind('river')]
index_lake = [element_lake.find('name').text for element_lake in document_tree.iterfind('lake')]
index_airport = [element_airport.find('name').text for element_airport in document_tree.iterfind('airport')]

df_river = pd.DataFrame([],index=index_river,columns=['countries','length'])
df_lake = pd.DataFrame([],index=index_lake,columns=['countries','area'])
df_airport = pd.DataFrame([],index=index_airport,columns=['countries','elevation'])

for element_river in document_tree.iterfind('river'):
    river_name = element_river.find('name').text
    river_countries = element_river.attrib.get('country')
    find_river_length = element_river.find('length')
    if find_river_length != None:
        river_length = find_river_length.text
    else:
        river_length = -1
    df_river.loc[river_name] = [river_countries,river_length]
df_river['length_new'] = df_river['length'].astype(float)
countries = df_river.nlargest(1,'length_new')['countries']
print("Longest river is " + countries.index[0] + ", located in " + countries.values[0])

for element_lake in document_tree.iterfind('lake'):
    lake_name = element_lake.find('name').text
    lake_countries = element_lake.attrib.get('country')
    find_lake_area = element_lake.find('area')
    if find_lake_area != None:
        lake_area = find_lake_area.text
    else:
        lake_area = -1
    df_lake.loc[lake_name] = [lake_countries,lake_area]
df_lake['area_new'] = df_lake['area'].astype(float)
countries = df_lake.nlargest(1,'area_new')['countries']
print("Largest lake is " + countries.index[0] + ", located in " + countries.values[0])

for element_airport in document_tree.iterfind('airport'):
    airport_name = element_airport.find('name').text
    airport_countries = element_airport.attrib.get('country')
    find_airport_elevation = element_airport.find('elevation')
    if find_airport_elevation != None:
        airport_elev = find_airport_elevation.text
    else:
        airport_elev = -1
    df_airport.loc[airport_name] = [airport_countries,airport_elev]
df_airport['elevation_new'] = df_airport['elevation'].astype(float)
countries = df_airport.nlargest(1,'elevation_new')['countries']
print("Highest airport is " + countries.index[0] + ", located in " + countries.values[0])

Longest river is Amazonas, located in CO BR PE
Largest lake is Caspian Sea, located in R AZ KAZ IR TM
Highest airport is El Alto Intl, located in BOL
