# FAO datasets

Running this notebook collects all price data sources from http://www.fao.org/faostat/en/#home, which is run by the FAO, a UN organization. The available datasets are scraped automatically from the website.

## Settings

In [2]:
# helpers
from pathlib import Path
import urllib.request
from datetime import datetime
import shutil
import tqdm

# data processing
import pandas as pd
import os
from zipfile import ZipFile
from bs4 import BeautifulSoup

In [3]:
def scarp_all_datasets_from_the_FAO(url = "http://fenixservices.fao.org/faostat/static/bulkdownloads/datasets_E.xml"):
    """Build a lookup table of the datasets listed in the FAO XML index.

    The document at ``url`` is fetched and parsed, and every ``dataset``
    element found in it contributes one entry to the result.

    Args:
        url: Location of the XML index to read.

    Returns:
        dict mapping the text of each ``datasetname`` element to a
        ``(filelocation text, dateupdate text)`` tuple.
    """
    with urllib.request.urlopen(url) as response:
        # Tag names are looked up in lower case (datasetname, filelocation,
        # dateupdate), matching how the "lxml" parser exposes them.
        soup = BeautifulSoup(response.read(), "lxml")

        return {
            entry.datasetname.text: (entry.filelocation.text, entry.dateupdate.text)
            for entry in soup.find_all('dataset')
        }

def first_time_download(datasets_info, output_path=Path('../../data/clean/FAO/')):
    """Download every dataset archive and unpack it into ``output_path``.

    For each entry of ``datasets_info`` the zip archive is fetched, its
    contents are extracted next to it in ``output_path`` (overwriting files
    with the same name), and the archive itself is deleted afterwards.

    Args:
        datasets_info: dict mapping a dataset name to a sequence whose first
            item is the URL of the zip archive (the remaining items are not
            used here).
        output_path: Directory receiving the extracted files. It is created
            if it does not exist yet.
    """
    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    for dataset in datasets_info.values():
        link = dataset[0]
        # The archive keeps the name it has on the server.
        archive_path = output_path / link.split("/")[-1]

        # Get file
        urllib.request.urlretrieve(link, archive_path)

        try:
            # Open the archive where it was actually saved and extract into
            # the same directory, not into the current working directory.
            with ZipFile(archive_path, 'r') as archive:
                archive.extractall(output_path)
        finally:
            # Delete the zip file, even when extraction failed, so no
            # partial or corrupt archive is left behind.
            archive_path.unlink()

def write_dataset_info(datasets_info, output_path=Path('../../data/meta/FAO_info.csv')):
    """Save the dataset table to a CSV file.

    The dict is converted with pandas' default (column) orientation: every
    dataset name becomes a column, and the items of its value become the
    rows (row 0 and row 1 for a two-item tuple).

    Args:
        datasets_info: dict mapping a dataset name to a sequence of values.
        output_path: CSV file to write. Missing parent directories are
            created.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # pandas exposes the constructor as pd.DataFrame.from_dict; ``pd.df``
    # does not exist.
    dataframe = pd.DataFrame.from_dict(datasets_info)
    dataframe.to_csv(output_path)

def update(datasets_info, input_path, output_path=Path('/content')):
    """Download only the datasets that are newer than the saved table.

    The CSV at ``input_path`` is expected to have one column per dataset
    name, with the previously recorded update date in its second data row.
    A dataset is downloaded and unpacked into ``output_path`` when its
    recorded date is older than the date in ``datasets_info``, or when it
    has no column in the saved table at all.

    Args:
        datasets_info: dict mapping a dataset name to a sequence whose first
            item is the archive URL and whose second item is the update date.
        input_path: CSV file holding the previously saved table.
        output_path: Directory receiving the extracted files. It is created
            if it does not exist yet.
    """
    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    # Get old version
    already_installed = pd.read_csv(input_path)

    for dataset_name, dataset in datasets_info.items():
        link, new_date = dataset[0], dataset[1]

        if dataset_name in already_installed.columns:
            old_date = already_installed[dataset_name].iloc[1]
            # NOTE(review): dates are compared as plain strings, which is
            # chronological only for ISO-like formats (YYYY-MM-DD) - confirm
            # against the feed.
            if old_date >= new_date:
                continue
        # A dataset absent from the saved table has never been installed,
        # so it falls through and is downloaded.

        archive_path = output_path / link.split("/")[-1]
        urllib.request.urlretrieve(link, archive_path)

        try:
            # Open the archive where it was saved and extract beside it,
            # not in the current working directory.
            with ZipFile(archive_path, 'r') as archive:
                archive.extractall(output_path)
        finally:
            # Remove the archive even if extraction failed.
            archive_path.unlink()

## Get all datasets from the FAO (http://www.fao.org/faostat/en/#home)

In [None]:
# Scrape the FAO XML index: dataset name -> (archive link, update date)
datasets_info = scarp_all_datasets_from_the_FAO()
# Save that table as a CSV (default target: ../../data/meta/FAO_info.csv)
write_dataset_info(datasets_info)
# Download the archive of every listed dataset (default target directory:
# ../../data/clean/FAO/) and unpack the .csv files it contains
first_time_download(datasets_info)