# WHO datasets

Running this notebook collects all data sources from the WHO. Available datasets are automatically scraped from the homepage https://apps.who.int/gho/data/node.imr.

## Settings

In [1]:
# helpers
from pathlib import Path
import urllib.request
from datetime import datetime
import shutil
import tqdm

# data processing
import pandas as pd
from bs4 import BeautifulSoup

In [21]:
def clean_filename(filename):
    """Return *filename* with every forward slash removed.

    Slashes are dropped (not substituted) so the result cannot be read as a
    nested path when it is later used as a file name.
    """
    slash_free_parts = filename.split("/")
    return "".join(slash_free_parts)

def scarp_all_datasets_from_WHO(url="https://apps.who.int/gho/data/node.imr"):
    """Collect the dataset links found on a WHO index page.

    Args:
        url: Address of the page to scan for anchors.

    Returns:
        A dict mapping the cleaned link text (see ``clean_filename``) to the
        link's raw ``href`` value. Only anchors whose ``href`` contains
        ``"node.imr."`` are kept. If several anchors share the same cleaned
        text, the last one encountered wins.
    """
    # Read and parse while the response is open; the parsed tree does not
    # need the connection afterwards.
    with urllib.request.urlopen(url) as page:
        html = BeautifulSoup(page.read().decode('utf-8'), 'html.parser')

    datasets = {}
    for link in html.find_all('a'):
        href = link.get('href')
        # Anchors without an href are skipped; dataset pages are recognised
        # by the "node.imr." marker appearing anywhere in the href.
        if href is not None and "node.imr." in href:
            datasets[clean_filename(link.text)] = href
    return datasets

def get_csv_file_from_dataset(dataset_links, url =  "https://apps.who.int/gho/athena/data/GHO/",
                                  output_path = Path('../../data/clean/WHO/')):
    """Download one CSV file per dataset into ``output_path``.

    Args:
        dataset_links: Mapping of dataset name to a link containing
            ``"node.imr."``; the text after that marker is appended to
            ``url`` to build the download address.
        url: Base address the dataset identifier is appended to.
        output_path: Directory that receives the ``<dataset name>.csv``
            files. It is created (including parents) when missing.

    Returns:
        None. Files are written as a side effect.

    Raises:
        IndexError: If a link does not contain ``"node.imr."``.
    """
    # Previously ``output_path`` was ignored and files landed in the current
    # working directory; resolve and create the target directory up front.
    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    # There is no cheap way to detect upstream updates, so every file is
    # downloaded again and any existing file of the same name is overwritten.
    for dataset_name, dataset_href in dataset_links.items():
        dataset_id = dataset_href.split("node.imr.")[1]
        destination = output_dir / (dataset_name + ".csv")
        urllib.request.urlretrieve(url + dataset_id + "?format=csv", destination)
        


## Get all datasets from WHO

In [None]:
# Scrape the WHO index page and build the mapping: dataset name -> dataset link
dataset_links = scarp_all_datasets_from_WHO()
# Download every dataset in the mapping as a .csv file
# (one HTTP request per entry in dataset_links)
get_csv_file_from_dataset(dataset_links)