# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [3]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
# Parse the reduced Mondial sample file; the example cells below read from it.
document_tree = ET.parse('./data/mondial_database_less.xml')

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [15]:
import pandas as pd

# Problem 1: the 10 countries with the lowest infant mortality rates.
# Load the full Mondial database; later cells reuse `document`.
document = ET.parse('./data/mondial_database.xml')

# Map country name -> infant mortality rate. The XPath predicate keeps only
# countries that actually have an <infant_mortality> child.
mydict = {
    node.find('name').text: float(node.find('infant_mortality').text)
    for node in document.findall('country[infant_mortality]')
}

df = pd.DataFrame(mydict.items(), columns=['name', 'infmort'])
df.sort_values('infmort').head(10)

Unnamed: 0,name,infmort
35,Monaco,1.81
210,Japan,2.13
73,Norway,2.48
66,Bermuda,2.48
78,Singapore,2.53
108,Sweden,2.6
57,Czech Republic,2.63
145,Hong Kong,2.73
54,Macao,3.13
188,Iceland,3.15


In [91]:
# Problem 2, exploration: getting used to navigating the XML tree.
# Iterating a tree yields Element objects; the value stored in an element
# is read through its .text attribute, and attributes through .get().
#
# Goal of this cell: check whether every city has a population figure for
# 2011 by listing, per city, the years for which a <population> entry exists.

for city_node in document_tree.iter('city'):
    # A set collapses repeated years, just like keying a dict by year.
    census_years = {int(entry.get('year')) for entry in city_node.findall('population')}
    print(city_node.find('name').text, sorted(census_years))

('Tirana', [1987, 1990, 2011])
(u'Shkod\xebr', [1987, 2011])
(u'Durr\xebs', [1987, 2011])
(u'Vlor\xeb', [1987, 2011])
('Elbasan', [1987, 2011])
(u'Kor\xe7\xeb', [1987, 2011])
('Komotini', [])
('Kavala', [1981, 1991, 2001, 2011])
('Athina', [1981, 1991, 2001, 2011])
('Peiraias', [1981, 1991, 2001, 2011])
('Peristeri', [1991, 2001, 2011])
('Acharnes', [1991, 2001, 2011])
('Patra', [1981, 1991, 2001, 2011])
('Kozani', [])
('Kerkyra', [1991, 2001, 2011])
('Ioannina', [1991, 2001, 2011])
('Thessaloniki', [1981, 1991, 2001, 2011])
('Iraklio', [1981, 1991, 2001, 2011])
('Chania', [1991, 2001, 2011])
('Ermoupoli', [])
('Rhodes', [1991, 2001, 2011])
('Tripoli', [])
('Lamia', [2011])
('Chalkida', [1991, 2001, 2011])
('Larissa', [1981, 1991, 2001, 2011])
('Volos', [1981, 1991, 2001, 2011])
('Mytilini', [])
('Karyes', [2014])
('Skopje', [2002, 2011])
('Kumanovo', [2002, 2011])
('Beograd', [1987, 2002, 2011])
('Novi Sad', [2002, 2011])
(u'Ni\u0161', [2002, 2011])
('Podgorica', [2003, 2011])
('Prish

In [105]:
# In the full database the most recent population figure is not always from
# the same year (some cities have values from 2013, 2014, ...), so for each
# city pick the entry with the highest year.

for city_node in document.iter('city'):
    # year -> population text; a later entry for the same year wins.
    pop_by_year = dict(
        (int(entry.get('year')), entry.text)
        for entry in city_node.findall('population')
    )
    if not pop_by_year:
        # City has no <population> entries at all: nothing to report.
        continue
    newest = max(pop_by_year)
    print(city_node.find('name').text, newest, pop_by_year[newest])

('Tirana', 2011, '418495')
(u'Shkod\xebr', 2011, '77075')
(u'Durr\xebs', 2011, '113249')
(u'Vlor\xeb', 2011, '79513')
('Elbasan', 2011, '78703')
(u'Kor\xe7\xeb', 2011, '51152')
('Kavala', 2011, '58790')
('Athina', 2011, '664046')
('Peiraias', 2011, '163688')
('Peristeri', 2011, '139981')
('Acharnes', 2011, '106943')
('Patra', 2011, '213984')
('Kerkyra', 2011, '102071')
('Ioannina', 2011, '112486')
('Thessaloniki', 2011, '325182')
('Iraklio', 2011, '173993')
('Chania', 2011, '108642')
('Rhodes', 2011, '115490')
('Lamia', 2011, '75315')
('Chalkida', 2011, '102223')
('Larissa', 2011, '162591')
('Volos', 2011, '144449')
('Karyes', 2014, '233')
('Skopje', 2011, '514967')
('Kumanovo', 2011, '107745')
('Beograd', 2011, '1639121')
('Novi Sad', 2011, '335701')
(u'Ni\u0161', 2011, '257867')
('Podgorica', 2011, '150977')
('Prishtine', 2011, '198214')
('Andorra la Vella', 2011, '22256')
('Strasbourg', 2011, '272222')
('Mulhouse', 2011, '110351')
('Bordeaux', 2011, '239399')
('Clermont-Ferrand', 20

In [113]:
# Problem 2: the 10 cities with the largest population, using each city's
# most recent population figure.

# One (name, population) row per <city> element. Rows are collected in a list
# rather than a dict keyed by name, so that distinct cities sharing a name do
# not overwrite each other.
city_rows = []

for city in document.iter('city'):
    # year -> population text for this city
    citypops = {int(pop.get('year')): pop.text for pop in city.findall('population')}
    if citypops:  # cities without any <population> entry are skipped
        latest_year = max(citypops)
        city_rows.append((city.find('name').text, int(citypops[latest_year])))

df = pd.DataFrame(city_rows, columns=['name', 'population'])
df.sort_values('population', ascending=False).head(10)
    

Unnamed: 0,name,population
2778,Shanghai,22315474
1617,Istanbul,13710512
1855,Mumbai,12442373
1061,Moskva,11979529
2209,Beijing,11716620
2682,São Paulo,11152344
536,Tianjin,11090314
838,Guangzhou,11071424
2736,Delhi,11034555
596,Shenzhen,10358381


In [43]:
# PROBLEM 3
# Every <country> has <ethnicgroup> children: the element text is the group
# name and the `percentage` attribute is that group's share of the country's
# population. For each country, take the population figure with the latest
# year, multiply it by each group's percentage, and add the result to a
# running total per ethnic group across all countries.

import pandas as pd
from collections import defaultdict

document = ET.parse('./data/mondial_database.xml')

# ethnic group name -> estimated head count summed over all countries
ethniccounts = defaultdict(float)

for country in document.iter('country'):
    # year -> population for this country
    counpop = {int(pop.get('year')): int(pop.text) for pop in country.findall('population')}
    if not counpop:
        # No population figure: the percentages cannot be turned into counts,
        # so skip the country instead of failing on a missing key.
        continue

    # Latest available population, looked up once per country.
    latest_population = float(counpop[max(counpop)])

    for ethnicity in country.findall('ethnicgroup'):
        ethniccounts[ethnicity.text] += float(ethnicity.get('percentage')) * latest_population / 100

# Truncate the accumulated floats to whole persons.
ethniccounts = {k: int(v) for k, v in ethniccounts.items()}

df = pd.DataFrame(ethniccounts.items(), columns=['ethnicity', 'count']).sort_values('count', ascending=False)
df.head(10)

# sanity check
# print(sum(ethniccounts.values()))

5960384860


In [98]:
# PROBLEM 4: name and country of a) longest river, b) largest lake and c) airport at highest elevation
# LOOKING AT THE DATA FILE, THE RIVERS ARE LISTED AFTER THE COUNTRIES AT THE SAME LEVEL. INFO
# INCLUDES SOURCE AND ESTUARY COUNTRIES AS WELL AS SOMETIMES OTHER COUNTRIES IT RUNS THROUGH.

# THIS TOOK ME ABOUT 1.5 HRS

import pandas as pd
document = ET.parse( './data/mondial_database.xml' )
from collections import defaultdict

longestriver = ['default',int(0),'defaultcountry']

for river in document.findall('river'):
    try:
        if float(river.find('length').text) > float(longestriver[1]):
            longestriver[0:3] = [river.find('name').text, float(river.find('length').text), river.find('./source').get('country')]
            ### TO FIND THE COUNTRY USE TWO METHODS INSTEAD OF ONE!! ONE TO FIND THE ELEMENT AND THE NEXT
            ### TO GET THE ATTR FROM THAT TAG
    except AttributeError: ### HERE I USED TRY AND EXCEPT FOR THE FIRST TIME TO DEAL WITH MISSING DATA STOPPING MY CODE
        print('no length provided for ' + river.find('name').text)

print longestriver
### NO LENGTH FOR THE NILE WAS PROVIDED.........

no length provided for Nile
no length provided for White Nile
no length provided for Bahr el-Djebel/Albert-Nil
no length provided for Victoria Nile
no length provided for Lualaba
['Amazonas', 6448.0, 'PE']


In [104]:
# LARGEST LAKE -- PRETTY MUCH COPY AND PASTE

import pandas as pd
document = ET.parse( './data/mondial_database.xml' )
from collections import defaultdict

largestlake = ['default',0,'defaultcountry']

for lake in document.findall('lake'):
   
    try:
        if float(lake.find('area').text) > float(largestlake[1]):
            largestlake[0:3] = [lake.find('name').text, float(lake.find('area').text), lake.find('./located').get('country')]

    except AttributeError: 
        print('Attr Error for ' + lake.find('name').text)

print largestlake

Attr Error for Barrage de Mbakaou
Attr Error for Lake Nyos
['Caspian Sea', 386400.0, 'R']


In [126]:
# HIGHEST AIRPORT -- PRETTY MUCH COPY AND PASTE. BIGGEST CHALLENGE WAS REALIZING WHEN NO
# ELEVATION WAS GIVEN, IT WAS THROWING A TYPEERROR INSTEAD OF ATTRIBUTEERROR

import pandas as pd
document = ET.parse( './data/mondial_database.xml' )
from collections import defaultdict

highairport = ['default',int(0),'defaultcountry']

for airport in document.findall('airport'):
    try:
        if int(airport.find('elevation').text) > highairport[1]:
            highairport[0:3] = [airport.find('name').text, int(airport.find('elevation').text), airport.get('country')]

    except AttributeError: 
        print('Attr Error for ' + airport.find('name').text)
    except TypeError:
        print('TypeError for ' + airport.find('name').text) # NO ELEVATION GIVEN

print highairport

TypeError for Xiangfan Airport
TypeError for Wenzhou Longwan Airport
TypeError for Beihai Airport
TypeError for Nantong Airport
TypeError for Mianyang Airport
TypeError for Luzhou Airport
TypeError for Shubuling Airport
TypeError for Yancheng Airport
TypeError for Meixian Airport
TypeError for Nanyang Airport
TypeError for Manzhouli
TypeError for Anqing Airport
TypeError for Changde Airport
TypeError for Ganzhou Airport
TypeError for Lianyungang Airport
TypeError for Weifang Airport
TypeError for Changzhi Airport
TypeError for Chifeng Airport
TypeError for Nanchong Airport
TypeError for Jiujiang Lushan Airport
TypeError for Wuhai
TypeError for Guangyuan Airport
TypeError for Jinzhou Airport
TypeError for Quzhou Airport
TypeError for Ulanhot Airport
TypeError for Jieyang Chaoshan International Airport
['El Alto Intl', 4063, 'BOL']
