# XML exercise

Using data from the [**mondial database**](https://drive.google.com/file/d/14lFT4nWHgwN36ij4XZh6OUuup-K9qLgR/view?usp=sharing), find the answers to the following questions:

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [2]:
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize  
from pprint import pprint
import xml.etree.ElementTree as ET

In [3]:
# Absolute path to the Mondial XML file on the author's machine.
file = '/mnt/d/lighthouse/lighthouse_data_notes/Week_2/Day_3/Other_data_types_exercise/mondial.xml'

# Parse the whole document once; later cells query the resulting tree.
mondial = ET.parse(source=file)

In [4]:
# Top-level element of the parsed document; the later cells query from here.
root = mondial.getroot()
root  # echo the element so the cell displays its repr

<Element 'mondial' at 0x7f20d19bff40>

In [5]:
# Collect every <infant_mortality> element that sits directly under a <country>.
# NOTE(review): `x` is not read by any cell visible below — looks like leftover
# exploration; confirm before removing.
x = root.findall("./country/infant_mortality")

# 1. 10 countries with the lowest infant mortality rates

In [13]:
# Build one record per <country>: its name and its infant-mortality value.
# Countries without an <infant_mortality> child get '' as a placeholder so the
# cleaning cell can turn it into NaN and drop the row.
my_dict = {'name': [], 'mort': []}
for country in root.findall('country'):
    # Look the name up by tag rather than by child position, the same way the
    # river / lake / airport cells do.
    my_dict['name'].append(country.findtext('name'))

    # Single lookup; `find` returns None when the child is absent, and the
    # idiomatic test for that is `is None` (not `== None`).
    mort = country.find('infant_mortality')
    my_dict['mort'].append('' if mort is None else mort.text)

df_1 = pd.DataFrame(my_dict)

In [15]:
# Turn the '' placeholders into NaN, drop those rows, and make the column numeric.
# The results are assigned back instead of calling `.replace(..., inplace=True)`
# on `df_1['mort']`: that chained in-place call acts on a temporary object under
# pandas Copy-on-Write and would leave `df_1` unchanged.
df_1 = (
    df_1
    .assign(mort=df_1['mort'].replace('', np.nan))
    .dropna(subset=['mort'])
    .astype({'mort': float})
)

# Ten lowest infant-mortality values.
df_1.sort_values('mort').head(10)

Unnamed: 0,name,mort
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
8,Spain,2.7
78,Hong Kong,2.73
79,Macao,3.13


# 2. 10 cities with the largest population

In [67]:
# Exploration: take the first child of the root and list the text of the first
# child of each of its direct <city> elements.
country = root[0]
for city in country.iterfind('city'):
    print(city[0].text)

Tirana
Shkodër
Durrës
Vlorë
Elbasan
Korçë


In [70]:
# Exploration: walk every <city> below `country` (at any depth) and print the
# text of its first <population> child.
for city in country.iter('city'):
    population = city.find('population')
    print(population.text)

192000
62000
66200
56000
53000
52000


In [105]:
# Build one record per <city> found anywhere below a <country>: name and population.
# Cities without a <population> child get '' as a placeholder so the cleaning
# cell can turn it into NaN and drop the row.
my_dict_2 = {'name': [], 'population': []}

for country in root.findall('country'):
    # `iter` (not `findall`) so that cities nested deeper inside the country
    # element are included as well.
    for city in country.iter('city'):
        # Look the name up by tag rather than by child position, the same way
        # the river / lake / airport cells do.
        my_dict_2['name'].append(city.findtext('name'))

        # Single lookup; `find` returns None when the child is absent, and the
        # idiomatic test for that is `is None` (not `== None`).
        # NOTE(review): `find` returns only the FIRST <population> child. If a
        # city carries several <population> entries (e.g. one per year), this
        # reads just the first one — confirm against the XML.
        pop = city.find('population')
        my_dict_2['population'].append('' if pop is None else pop.text)

df_2 = pd.DataFrame(my_dict_2)

In [106]:
# Turn the '' placeholders into NaN, drop those rows, and make the column integer.
# The results are assigned back instead of calling `.replace(..., inplace=True)`
# on `df_2['population']`: that chained in-place call acts on a temporary object
# under pandas Copy-on-Write and would leave `df_2` unchanged.
df_2 = (
    df_2
    .assign(population=df_2['population'].replace('', np.nan))
    .dropna(subset=['population'])
    .astype({'population': int})
)

In [107]:
# Ten rows with the highest population, largest first.
df_2.sort_values(by='population', ascending=False).head(n=10)

Unnamed: 0,name,population
1926,Seoul,10229262
1528,Mumbai,9925891
2811,São Paulo,9412894
1755,Jakarta,8259266
1340,Shanghai,8205598
2109,Ciudad de México,8092449
479,Moskva,8010954
1874,Tokyo,7843000
1339,Beijing,7362426
1584,Delhi,7206704


### Trying with year

**TODO:** unresolved — the ranking above has not yet been redone taking the population's year into account.

# 3. name and country of:

## A) longest river

In [125]:
# Build one record per <river>: its name, its `country` attribute, and its length.
# Rivers without a <length> child get '' as a placeholder so the cleaning cell
# can turn it into NaN and drop the row.
river_dict = {'name': [], 'country': [], 'length': []}

for river in root.findall('river'):
    river_dict['name'].append(river.find('name').text)
    river_dict['country'].append(river.get('country'))

    # Single lookup; `find` returns None when the child is absent, and the
    # idiomatic test for that is `is None` (not `== None`).
    length = river.find('length')
    river_dict['length'].append('' if length is None else length.text)

df_river = pd.DataFrame(river_dict)
df_river

Unnamed: 0,name,country,length
0,Thjorsa,IS,230
1,Jökulsa a Fjöllum,IS,206
2,Thames,GB,346
3,Severn,GB,354
4,Trent,GB,298
...,...,...,...
434,Murrumbidgee River,AUS,1579
435,Eucumbene River,AUS,83
436,Snowy River,AUS,403
437,Waikato River,NZ,425


In [129]:
# Turn the '' placeholders into NaN, drop those rows, and make the column numeric.
# The results are assigned back instead of calling `.replace(..., inplace=True)`
# on `df_river['length']`: that chained in-place call acts on a temporary object
# under pandas Copy-on-Write and would leave `df_river` unchanged.
df_river = (
    df_river
    .assign(length=df_river['length'].replace('', np.nan))
    .dropna(subset=['length'])
    .assign(length=lambda frame: pd.to_numeric(frame['length'], downcast="float"))
)

In [131]:
# Single row with the greatest length.
df_river.sort_values(by='length', ascending=False).head(n=1)

Unnamed: 0,name,country,length
214,Yangtze,CN,6380.0


## B) Largest lake

In [133]:
# Build one record per <lake>: its name, its `country` attribute, and its area.
# Lakes without an <area> child get '' as a placeholder so the cleaning cell
# can turn it into NaN and drop the row.
lake_dict = {'name': [], 'country': [], 'area': []}

for lake in root.findall('lake'):
    lake_dict['name'].append(lake.find('name').text)
    lake_dict['country'].append(lake.get('country'))

    # Check the area on the lake itself. The earlier version tested `river`,
    # the leftover loop variable from the river cell, so the missing-value
    # check never looked at the current lake and the cell failed on a fresh
    # kernel unless the river cell had been run first.
    area = lake.find('area')
    lake_dict['area'].append('' if area is None else area.text)

df_lake = pd.DataFrame(lake_dict)
df_lake

Unnamed: 0,name,country,area
0,Inarijärvi,SF,1040
1,Oulujärvi,SF,928
2,Saimaa,SF,4370
3,Päijänne,SF,1118
4,Mjoesa-See,N,368
...,...,...,...
184,Lake Eucumbene,AUS,145
185,Lake Jindabyne,AUS,30
186,Lake Hume,AUS,202
187,Lake Taupo,NZ,622


In [134]:
# Turn the '' placeholders into NaN, drop those rows, and make the column numeric.
# The results are assigned back instead of calling `.replace(..., inplace=True)`
# on `df_lake['area']`: that chained in-place call acts on a temporary object
# under pandas Copy-on-Write and would leave `df_lake` unchanged.
df_lake = (
    df_lake
    .assign(area=df_lake['area'].replace('', np.nan))
    .dropna(subset=['area'])
    .assign(area=lambda frame: pd.to_numeric(frame['area'], downcast="float"))
)

In [136]:
# Single row with the greatest area.
df_lake.sort_values(by='area', ascending=False).head(n=1)

Unnamed: 0,name,country,area
59,Caspian Sea,R AZ KAZ IR TM,386400.0


## C) Airport at highest elevation

In [137]:
# Build one record per <airport>: its name, its `country` attribute, and its elevation.
# Airports without an <elevation> child get '' as a placeholder so the cleaning
# cell can turn it into NaN and drop the row.
airport_dict = {'name': [], 'country': [], 'elevation': []}

for airport in root.findall('airport'):
    airport_dict['name'].append(airport.find('name').text)
    airport_dict['country'].append(airport.get('country'))

    # Single lookup; `find` returns None when the child is absent, and the
    # idiomatic test for that is `is None` (not `== None`).
    ele = airport.find('elevation')
    airport_dict['elevation'].append('' if ele is None else ele.text)

df_airport = pd.DataFrame(airport_dict)
df_airport

Unnamed: 0,name,country,elevation
0,Herat,AFG,977
1,Kabul Intl,AFG,1792
2,Tirana Rinas,AL,38
3,Cheikh Larbi Tebessi,DZ,811
4,Batna Airport,DZ,822
...,...,...,...
1312,Livingstone,Z,1007
1313,Ndola,Z,1270
1314,Lusaka Intl,Z,1152
1315,J M Nkomo Intl,ZW,1329


In [138]:
# Turn missing elevations into NaN, drop those rows, and make the column numeric.
# This covers both the '' placeholder and an <elevation> element with no text
# (stored as None). The results are assigned back instead of calling
# `.replace(..., inplace=True)` on `df_airport['elevation']`: that chained
# in-place call acts on a temporary object under pandas Copy-on-Write and would
# leave `df_airport` unchanged.
df_airport = (
    df_airport
    .assign(elevation=df_airport['elevation'].replace('', np.nan))
    .dropna(subset=['elevation'])
    .assign(elevation=lambda frame: pd.to_numeric(frame['elevation'], downcast="float"))
)

In [140]:
# Single row with the greatest elevation.
df_airport.sort_values(by='elevation', ascending=False).head(n=1)

Unnamed: 0,name,country,elevation
81,El Alto Intl,BOL,4063.0
