# WFP Hunger Covid Weekly Snapshots

Running this notebooks collects the national and urban weekly snapshots about Hunger and Covid, created by the World Food Programm (WFP). They are all downloaded (see parameter `output_folder_path`) and a metadata csv is created (see parameter `output_metadata_file_path`).

#### How to maintain this code?
New countries or urban areas may be added to the [Snapshots listing pdf](https://static.hungermapdata.org/hungermap/reports/hunger_covid_weekly_snapshot.pdf") over time. If so, they will appear in the last cell output. To include them in the collection, edit the "hyperlinks mapping" csv file (see parameter `countries_code_path`) to add them.

## Settings

In [1]:
# helpers
from pathlib import Path
import urllib.request
from datetime import datetime
import shutil
import tqdm

# data processing
import pdfplumber
import pandas as pd

# conf
import sys
from path_manipulation import get_to_root

sys.path[0] = get_to_root(3,sys.path[0])
from config.config import config
sources = config.sources

In [2]:
def get_pdf_from_url(url, output_path=None):
    """Download a pdf from a given url.
    :param url: web url from where to download the pdf
    :param output_path: a pathlib.Path object to which to write the downloaded pdf
    """
    # create the folder if not existing
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # download
    urllib.request.urlretrieve(url, output_path)

In [3]:
def scrap_pdf_urls_from_countries_snapshots_overview(
    overview_path=sources.wpf_hunger_covid_snapshots_overview_path,
):
    # Returns links_metadata, a list of (country_name, url) pairs
    # Note: this snippet is adapted (+ corrected) from : https://github.com/jsvine/pdfplumber/issues/151

    def try_resolve(x):
        # Some of the PDF's properties are encoded as strings, while others are encoded as PDFObjects.
        # Here, we resolve them all to strings, rather than guess
        try:
            return x.resolve()
        except:
            return x

    def rect_to_bbox(rect, page):
        # Just to transform the PDF spec's version of bounding boxes
        # into more common coordinates
        return [
            rect[0],
            float(page.height) - rect[3],
            rect[2],
            float(page.height) - rect[1],
        ]

    def generate_links(page):
        for anno in page.page_obj.annots:
            anno = anno.resolve()
            if anno.get("Subtype").name != "Link":
                continue

            yield {
                "url": try_resolve(anno["A"])["URI"].decode("utf-8"),
                "bbox": rect_to_bbox(try_resolve(anno["Rect"]), page),
            }

    with pdfplumber.open(overview_path) as pdf:
        my_page = pdf.pages[2]
        links = list(generate_links(my_page))
        links_metadata = []
        for link in links:

            url = link["url"]
            bbox = link["bbox"]

            subpage = my_page.crop(bbox, relative=False)
            text = subpage.extract_text(x_tolerance=3, y_tolerance=3)

            if "EN" in text:
                # if there are two languages, look on the left to find the country
                bbox[0] = bbox[0] - int(140 / 3)
                subpage = my_page.crop(bbox, relative=False)
                text = subpage.extract_text(x_tolerance=3, y_tolerance=3)
                # remove ": EN" from e.g. "Honduras EN:" text
                text = text.replace(": EN", "")
            text = text.replace("o ", "").replace("− ", "")
            link_metadata = (text, url)
            links_metadata.append(link_metadata)

    return links_metadata

In [None]:
# TODO: add to utils
def get_current_date_as_str():
    date_str = datetime.now().strftime("%Y%d%m")
    return date_str


def get_countries_pdf_from_countries_url(
    links_metadata,
    countries_code_path=sources.wfp_hunger_covid_snapshots_hyperlinks_mapping_path,
    output_folder_path=sources.wfp_hunger_covid_weekly_snapshots_folder_path,
    output_metadata_file_path=sources.wfp_hunger_covid_weekly_snapshots_metadata_path,
):
    # This maps names in the snapshots overview to countries iso code and a formated filename,
    # then download all files in a folder. If the folder already exists, it is deleted and will
    # be re-created with the new files.
    # Additionaly, a csv is constructed with the snapshots metadata:
    # country_iso, hyperlink_name, hyperlink_url, formated_name, filename, date of update
    try:
        shutil.rmtree(output_folder_path)
    except:
        # folder already deleted
        pass
    # Get the name:country_iso and name:formated_name mappings
    # using a custom mapping adapted to WFP snapshots' names.
    countries_code = pd.read_csv(countries_code_path, sep=";")
    countries_code = countries_code[countries_code.include_this_url == 1].drop(
        columns="include_this_url"
    )
    countries_code = countries_code.set_index("name_in_weekly_snapshot").to_dict()
    hyperkink_names_known_to_be_ignored = [
        "CLICK TO DOWNLOAD ALL NATIONAL SNAPSHOTS ",
        " ES",
        "of the Congo",
        " ES",
        " ES",
        " FR",
        " ES",
        "Republic",
        "DOWNLOAD TODAY’S DAILY SNAPSHOT ",
        "CLICK TO DOWNLOAD ALL URBAN SNAPSHOTS",
        "@WFPVAM|",
        " @mobileVAM",
        "mvam.org",
    ]

    full_metadatas = []
    print(
        "Ignored Hyperlinks will be listed below (if any). \n N.B.: Consider inclusion if this is a new country/zone considered by the WFP."
    )
    for hyperlink_name, hyperlink_url in tqdm.tqdm(links_metadata):
        # get the country iso and formated name from the hyperlink's text.
        date_str = get_current_date_as_str()
        try:
            country_iso = countries_code["iso_3166_1_code"][hyperlink_name]
            formated_name = countries_code["formated_name"][hyperlink_name]
            output_filename = "snapshot_{}_iso_{}_country_{}.pdf".format(
                date_str, country_iso, formated_name
            )
            output_path = output_folder_path / output_filename
            get_pdf_from_url(hyperlink_url, output_path=output_path)
            full_metadatas.append(
                [
                    country_iso,
                    hyperlink_name,
                    hyperlink_url,
                    formated_name,
                    str(output_path),
                    date_str,
                ]
            )
        except e:

            if hyperlink_name not in hyperkink_names_known_to_be_ignored:
                print("-     '{}'".format(hyperlink_name))
                print("Associated error:", e)

    df = pd.DataFrame(
        full_metadatas,
        columns=[
            "country_iso",
            "hyperlink_name",
            "hyperlink_url",
            "formated_name",
            "output_path",
            "date_of_update_YYYYMMDD",
        ],
    )
    df.to_csv(output_metadata_file_path, sep=";", index=False)

## Get snapshots overview and scrap listing for all urls

In [25]:
overview_url = sources.wfp_hunger_covid_snapshots_overview_url
overview_path = sources.wpf_hunger_covid_snapshots_overview_path
# download the pdf, save it
get_pdf_from_url(overview_url, output_path=overview_path)
# Get the url from the hyperlinks in the snapshot overview
links_metadata = scrap_pdf_urls_from_countries_snapshots_overview(
    overview_path=overview_path
)
# map each location to a country's Iso code and download the pdf.
get_countries_pdf_from_countries_url(links_metadata)

  0%|                                                                                                                                                              | 0/65 [00:00<?, ?it/s]

Ignored Hyperlinks will be listed below (if any). 
 N.B.: Consider inclusion if this is a new country/zone considered by the WFP.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 65/65 [00:48<00:00,  1.33it/s]


In [24]:
# this is for development purpose / debogage

links_metadata
countries_code_path=sources.wfp_hunger_covid_snapshots_hyperlinks_mapping_path
output_folder_path=sources.wfp_hunger_covid_weekly_snapshots_folder_path
output_metadata_file_path=sources.wfp_hunger_covid_weekly_snapshots_metadata_path
try:
    shutil.rmtree(output_folder_path)
except:
    # folder already deleted
    pass
# Get the name:country_iso and name:formated_name mappings
# using a custom mapping adapted to WFP snapshots' names.
countries_code = pd.read_csv(countries_code_path, sep=";")
countries_code = countries_code[countries_code.include_this_url == 1].drop(
    columns="include_this_url"
)
countries_code = countries_code.set_index("name_in_weekly_snapshot").to_dict()
hyperkink_names_known_to_be_ignored = [
    "CLICK TO DOWNLOAD ALL NATIONAL SNAPSHOTS ",
    " ES",
    "of the Congo",
    " ES",
    " ES",
    " FR",
    " ES",
    "Republic",
    "DOWNLOAD TODAY’S DAILY SNAPSHOT ",
    "CLICK TO DOWNLOAD ALL URBAN SNAPSHOTS",
    "@WFPVAM|",
    " @mobileVAM",
    "mvam.org",
]

full_metadatas = []
print(
    "Ignored Hyperlinks will be listed below (if any). \n N.B.: Consider inclusion if this is a new country/zone considered by the WFP."
)
for hyperlink_name, hyperlink_url in tqdm.tqdm(links_metadata):
    # get the country iso and formated name from the hyperlink's text.
    date_str = get_current_date_as_str()
    
    try:
        country_iso = countries_code["iso_3166_1_code"][hyperlink_name]
        formated_name = countries_code["formated_name"][hyperlink_name]
        output_filename = "snapshot_{}_iso_{}_country_{}.pdf".format(
            date_str, country_iso, formated_name
        )
        print(output_filename)
        output_path = output_folder_path / output_filename
        get_pdf_from_url(hyperlink_url, output_path=output_path)
        print(str(output_path))
#         full_metadatas.append(
#             [
#                 country_iso,
#                 hyperlink_name,
#                 hyperlink_url,
#                 formated_name,
#                 str(output_path),
#                 date_str,
#             ]
#         )
    except e:
        print(e)
        if hyperlink_name not in hyperkink_names_known_to_be_ignored:
            print("-     '{}'".format(hyperlink_name))
            break

  0%|                                                                                                                                                              | 0/65 [00:00<?, ?it/s]

Ignored Hyperlinks will be listed below (if any). 
 N.B.: Consider inclusion if this is a new country/zone considered by the WFP.





NameError: name 'e' is not defined

In [15]:
hyperlink_name

'mvam.org'