****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
document = ET.parse( './data/mondial_database.xml' )
from operator import itemgetter

In [6]:
dict1 = {}
for element in document.iterfind('country'):
    name = element.find('name')
    im = element.find('infant_mortality')
    if im is not None:
        dict1[element.find('name').text] = float(im.text)
    else:
        dict1[element.find('name').text] = 0.0

arr1 = []
for key, value in sorted(dict1.items(), key=lambda item: (item[1], item[0])):
    arr1.append ("%s: %s" % (key, value))
    
print("10 countries with the lowest infant mortality rates")
print("===================================================")

for i in range(10):
    print(arr1[i])

10 countries with the lowest infant mortality rates
Ceuta: 0.0
Christmas Island: 0.0
Cocos Islands: 0.0
Curacao: 0.0
Falkland Islands: 0.0
Holy See: 0.0
Kosovo: 0.0
Melilla: 0.0
Montenegro: 0.0
Niue: 0.0


In [7]:
# Create an empty array to hold the population for the maximum year available for a city
maxYearPopulation = []
i = 0
root = document.getroot()
# For every city
for city in root.iter('city'):
    i=0
    yearsLst = []
    # For every population in the city append the [Year, Population] to yearsLst List
    if len(city.findall('population')) > 0:
            for pop in city.iter('population'):
                yearsLst.append([])
                yearsLst[i].append(pop.attrib['year'])
                yearsLst[i].append(int(pop.text))
                yearsLst[i].append(city.find('name').text)
                i+=1
    # Once done with all the available populations sort the list descending    
            yearsLstSorted = sorted(yearsLst, key = itemgetter(0) , reverse = True)
    # Get the first list in the list of lists i.e. Maximum year and append it to maxYearPopulation List
    maxYearPopulation.append(yearsLstSorted[0])
#print(maxYearPopulation)
# Sort this list descending on population
maxYearPopulationSorted = sorted(maxYearPopulation, key = itemgetter(1), reverse = True)
# Print Top 10 in another List
print ("10 cities with the largest population")
print ("=====================================")
for idx in range(10):
    print (maxYearPopulationSorted[idx][2], end = "")
    print (" - ", end = "")
    print (maxYearPopulationSorted[idx][1])

    


10 cities with the largest population
Shanghai - 22315474
Istanbul - 13710512
Mumbai - 12442373
Moskva - 11979529
Beijing - 11716620
São Paulo - 11152344
Tianjin - 11090314
Guangzhou - 11071424
Delhi - 11034555
Shenzhen - 10358381


In [8]:
from xml.etree import ElementTree as ET
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format
document = ET.parse( './data/mondial_database.xml' )
root = document.getroot()
df = pd.DataFrame(columns=['Year','Population','Country'])
df1 = pd.DataFrame(columns=['Country','Percent','Ethnicgroup'])

for country in root.iter('country'):
    temp = []
    if len(country.findall('population')) > 0:
        for pop in country.findall('population'):
            temp.append(pop.attrib['year'])
        temp.sort(reverse = True)    
        maxYear = temp[0]
        #print(temp[0])

        for pop in country.findall('population'):
            if pop.attrib['year'] == maxYear:
                df2 = pd.DataFrame([[pop.attrib['year'], int(pop.text), country.find('name').text]],\
                                   columns=['Year','Population','Country'])
                frames = [df , df2]
                df = pd.concat(frames)
    if len(country.findall('ethnicgroup')) > 0:
        for eth in country.findall('ethnicgroup'):
            df3 = pd.DataFrame([[ country.find('name').text, float(eth.attrib['percentage']), eth.text ]] ,\
                               columns = ['Country' , 'Percent' , 'Ethnicgroup'])
            frames1 = [df1, df3]
            df1 = pd.concat(frames1)


resultDF = pd.merge (df, df1, on='Country') 

In [9]:
resultDF['Calc'] = resultDF['Percent']/100 * resultDF['Population']
finalResultDF = resultDF[['Ethnicgroup','Calc']]
finalResultDFSum = finalResultDF.groupby('Ethnicgroup').sum()
finalResultSortedDesc = finalResultDFSum.sort_values(['Calc'],ascending=False)

In [10]:
answer = finalResultSortedDesc.head(10)

In [11]:
print("10 ethnic groups with the largest overall populations ")
print("===================================================")
answer

10 ethnic groups with the largest overall populations 


Unnamed: 0_level_0,Calc
Ethnicgroup,Unnamed: 1_level_1
Han Chinese,1245058800.0
Indo-Aryan,871815583.44
European,494872219.72
African,318325120.37
Dravidian,302713744.25
Mestizo,157734354.94
Bengali,146776916.72
Russian,131856996.08
Japanese,126534212.0
Malay,121993550.37


In [54]:
dfRivers = pd.DataFrame(columns=['CountryName','RiverName','Length'])
for river in root.iter('river'):
    if river.find('length') is not None:
        dfTemp = pd.DataFrame([[river.attrib['country'],river.find('name').text , float(river.find('length').text) ]],\
                              columns=['CountryName','RiverName','Length'])
        riverFrames = [dfRivers,dfTemp]
        dfRivers = pd.concat(riverFrames)

In [55]:
dfRivers = dfRivers.sort_values('Length',ascending=False)
dfLongestRiver = dfRivers.head(1)

In [56]:
print("Longest river and the Countries it flows through ")
print("===================================================")
dfLongestRiver

Longest river and the Countries it flows through 


Unnamed: 0,CountryName,RiverName,Length
0,CO BR PE,Amazonas,6448.0


In [58]:
dfLakes = pd.DataFrame(columns=['CountryName','LakeName','Area'])
for lake in root.iter('lake'):
    if lake.find('area') is not None:
        dfTemp = pd.DataFrame([[lake.attrib['country'], lake.find('name').text , float(lake.find('area').text) ]],\
                              columns=['CountryName','LakeName','Area'])
        lakeFrames = [dfLakes,dfTemp]
        dfLakes = pd.concat(lakeFrames)

dfLakes = dfLakes.sort_values('Area',ascending=False)
dfLargestLake = dfLakes.head(1)
print("Largest lake and the Country it is located in ")
print("===================================================")
dfLargestLake

Largest lake and the Country it is located in 


Unnamed: 0,CountryName,LakeName,Area
0,R AZ KAZ IR TM,Caspian Sea,386400.0


In [61]:
dfAirports = pd.DataFrame(columns=['CountryName','AirportName','Elevation'])
for airport in root.iter('airport'):
    if airport.find('elevation') is not None:
        dfTemp = pd.DataFrame([[airport.attrib['country'], airport.find('name').text , airport.find('elevation').text ]],\
                              columns=['CountryName','AirportName','Elevation'])
        airportFrames = [dfAirports,dfTemp]
        dfAirports = pd.concat(airportFrames)

dfAirports = dfAirports.sort_values('Elevation',ascending=False)
dfLargestAirport = dfAirports.head(1)
print("Largest Airport and the Country it is located in ")
print("===================================================")
dfLargestAirport

Largest Airport and the Country it is located in 


Unnamed: 0,CountryName,AirportName,Elevation
0,IR,Mashhad,995
