
# Notebook to create html pages for countries and for Kreise (districts) in Germany

Files:

New strategy (with pelican)

- put notebooks into wwwroot/ipynb folder
- put html into wwwroot/html folder
- pelican files can then go into wwwroot folder

Advantages:
- cleaner than all in one folder
- github can display all files in each subdirectory (there is a limit of 500 files or so)


## Computation strategy for html plots

such as https://oscovida.github.io/html/Turkey.html

- create notebooks for each country from template (in ipynb folder)
- run nbconvert on each notebook to execute it. This creates the html notebook.
- at the same time, record in the MetadataRegion directory/class what regions have been processed.
- the above execution can be requested again for all regions, but those regions that have
  recently (by default within the last 2 hours) been processed will be skipped.
  This is useful to just re-run the command if a calculation timed out (for an unknown reason).

- at any time later, create markdown pages (for pelican) based on that metadata

In [None]:
import sys

from multiprocessing import Pool, cpu_count

from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert import HTMLExporter
from nbconvert.writers import FilesWriter

import nbformat

import os
import shutil

import numpy as np

#  Number of worker processes used for multiprocessing.
#  Set to false (or any other falsy value) if you do not want multiprocessing enabled
cores = 'auto'

if cores == 'auto':
    # one fewer than the CPU count (but at least 1), plus one
    cores = max(1, cpu_count()-1) +1


if cores:
    print(f'Using {cores} processes')
    
# root folder of the web site; notebooks go to <wwwroot>/ipynb, html to <wwwroot>/html
wwwroot = "wwwroot"


In [None]:
%config InlineBackend.figure_formats = ['svg']

from coronavirus import *
from coronavirus import MetadataRegion

# format floats with two decimals in pandas output
pd.set_option('display.float_format', '{:.2f}'.format)  #  Disable pandas scientific notation

Cleaning of cache and copying files has moved to

- `generate-webpage-clean-setup.py` and 
- `generate-webpage-clean-setup.sh`


In [None]:
# common prefix for the titles of the generated overview pages
TITLE_PREFIX = "Tracking plots: "

In [None]:
# fetch the deaths (d) and cases (c) tables
d, c = fetch_deaths(), fetch_cases()

countries = d.index
countries2 = c.index
# both tables must carry the same index labels in the same order
assert (countries2 == countries).all()

In [None]:
# fetch the US cases and deaths data
# NOTE(review): presumably done here so the data is cached for later calls - confirm
data_US_cases = fetch_cases_US()
data_US_deaths = fetch_deaths_US()

In [None]:
# also fetch the data for Germany now, so that it is available later from the cache
germany = fetch_data_germany()

In [None]:
def modify_template(templatefile, output_file_name, mappings, wwwroot):
    """Create a concrete file (e.g. *.ipynb) from a template.

    Every occurrence of each placeholder in the template text is replaced by
    its value, and the result is written to ``wwwroot/output_file_name``.

    Arguments:
    - templatefile: the template with placeholders to be substituted
    - output_file_name: name (relative to wwwroot) to write the modified file to
    - mappings: dictionary with placeholders as keys, and values to be
      substituted. Values are converted with str(), so None becomes "None".
    - wwwroot: directory in which the output file should be written
    """
    # Be explicit about the encoding: region names contain umlauts, and the
    # platform default encoding is not UTF-8 everywhere.
    with open(templatefile, "tr", encoding="utf-8") as f_template:
        template = f_template.read()

    for placeholder, value in mappings.items():
        template = template.replace(placeholder, str(value))

    with open(os.path.join(wwwroot, output_file_name), "tw", encoding="utf-8") as f:
        f.write(template)
    print(f"Written file to {output_file_name}")


In [None]:
def check_country_name_is_known(name):
    """Assert that `name` is one of the index labels of the fetch_deaths() table.

    Raises AssertionError (listing the known names) otherwise.
    """
    known = fetch_deaths().index
    assert name in known, f"{name} is unknown. Known countries are {sorted(known)}"

def germany_check_region_name_is_known(name):
    """Assert that `name` occurs in the 'Bundesland' column of fetch_data_germany().

    Raises AssertionError (listing the known names) otherwise.
    """
    bundeslaender = list(fetch_data_germany()['Bundesland'].drop_duplicates())
    assert name in bundeslaender, \
        f"{name} is unknown. Known regions are {sorted(bundeslaender)}"

def germany_check_subregion_name_is_known(name):
    """Assert that `name` occurs in the 'Landkreis' column of fetch_data_germany().

    Raises AssertionError (listing the known names) otherwise.
    """
    landkreise = list(fetch_data_germany()['Landkreis'].drop_duplicates())
    assert name in landkreise, \
        f"{name} is unknown. Known regions are {sorted(landkreise)}"

# quick self-test of the two checks above, using names that are expected to be present
germany_check_region_name_is_known("Hamburg") 
germany_check_subregion_name_is_known("SK Hamburg") 

    
def sanitise(name):
    """Return `name` made suitable for use in URLs and file names.

    Every space and every comma is replaced by a hyphen; all other
    characters (including umlauts) are left untouched.
    """
    hyphenate = str.maketrans({" ": "-", ",": "-"})
    return name.translate(hyphenate)
    
    
def get_binder_url(notebook):
    """Return the mybinder.org URL for the notebook file name `notebook`.

    Spaces in the name are encoded as %20; nothing else is escaped.
    """
    prefix = "https://mybinder.org/v2/gh/oscovida/binder/master?filepath=ipynb/"
    encoded_name = "%20".join(notebook.split(" "))
    return prefix + encoded_name


def create_ipynb_for_country(country, templatename, wwwroot):
    """Create the ipynb file for `country`, based on the template `templatename`.

    The file is placed in the "ipynb" subfolder of wwwroot; that folder is
    created if it does not exist yet (wwwroot itself must exist).

    Returns the name of the file (without directory), i.e. "<country>.ipynb".
    Raises AssertionError if the country is unknown, or if the output file
    does not exist after writing.
    """
    # Create ipynb folder if required. Several worker processes may get here
    # at the same time, so tolerate that another one created it first.
    ipynb_dir = os.path.join(wwwroot, "ipynb")
    try:
        os.mkdir(ipynb_dir)
    except FileExistsError:
        pass

    check_country_name_is_known(country)

    output_file_name = f"{country}.ipynb"
    output_file_path = os.path.join(ipynb_dir, output_file_name)

    # placeholders in the template and the text that replaces them
    mappings = {
        "%title%" : country,
        "%title2%" : "",
        "%country%" : country,
        "%binderurl%" : get_binder_url(output_file_name),
        "%create_date%" : datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    }

    modify_template(templatename, os.path.join("ipynb", output_file_name), mappings, wwwroot)
    assert os.path.exists(output_file_path), f"{output_file_path} does not exist"
    return output_file_name

def create_ipynb_for_germany(region, subregion, templatename, wwwroot):
    """Create the ipynb file for a region and subregion in Germany, based on
    the template `templatename`.

    The file is placed in the "ipynb" subfolder of wwwroot; that folder is
    created if it does not exist yet (wwwroot itself must exist).

    Returns the name of the file (without directory), i.e.
    "Germany-<region>-<subregion>.ipynb" with both names sanitised.
    Raises AssertionError if region or subregion are unknown, or if the
    output file does not exist after writing.
    """
    germany_check_region_name_is_known(region)
    germany_check_subregion_name_is_known(subregion)

    # Create ipynb folder if required (as done for the country notebooks).
    # Several worker processes may get here at the same time, so tolerate
    # that another one created it first.
    ipynb_dir = os.path.join(wwwroot, "ipynb")
    try:
        os.mkdir(ipynb_dir)
    except FileExistsError:
        pass

    output_file_name = f"Germany-{sanitise(region)}-{sanitise(subregion)}.ipynb"
    output_file_path = os.path.join(ipynb_dir, output_file_name)

    # placeholders in the template and the text that replaces them
    mappings = {
        "%title%" : f"Germany: {subregion} ({region})",
        "%title2%" : "",
        "%region%" : region,
        "%subregion%" : subregion,
        "%binderurl%" : get_binder_url(output_file_name),
        "%create_date%" : datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    }

    modify_template(templatename, os.path.join("ipynb", output_file_name), mappings, wwwroot)
    assert os.path.exists(output_file_path), f"{output_file_path} does not exist"
    return output_file_name

def create_ipynb_for_US(state, subregion, templatename, wwwroot):
    """Create the ipynb file for a state in the US, based on the template
    `templatename`.

    The file is placed in the "ipynb" subfolder of wwwroot; that folder is
    created if it does not exist yet (wwwroot itself must exist).

    Arguments:
    - state: name of the state; must be in get_US_region_list()
    - subregion: must be None
    - templatename: path of the template notebook
    - wwwroot: path to root of webpages

    Returns the name of the file (without directory), i.e. "US-<state>.ipynb"
    with the state name sanitised.
    Raises AssertionError if the state is unknown, subregion is not None, or
    the output file does not exist after writing.
    """
    assert state in get_US_region_list()
    assert subregion is None

    # Create ipynb folder if required (as done for the country notebooks).
    # Several worker processes may get here at the same time, so tolerate
    # that another one created it first.
    ipynb_dir = os.path.join(wwwroot, "ipynb")
    try:
        os.mkdir(ipynb_dir)
    except FileExistsError:
        pass

    output_file_name = f"US-{sanitise(state)}.ipynb"
    output_file_path = os.path.join(ipynb_dir, output_file_name)

    # placeholders in the template and the text that replaces them
    # (modify_template writes a None value as the text "None")
    mappings = {
        "%title%" : f"United States: {state}",
        "%title2%" : "",
        "%region%" : state,
        "%subregion%" : subregion,
        "%binderurl%" : get_binder_url(output_file_name),
        "%create_date%" : datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    }

    modify_template(templatename, os.path.join("ipynb", output_file_name), mappings, wwwroot)
    assert os.path.exists(output_file_path), f"{output_file_path} does not exist"
    return output_file_name


In [None]:
# preprocessor that executes the cells of a notebook
nb_executor = ExecutePreprocessor()
# NOTE(review): with allow_errors=True a failing cell should not abort the
# execution of the notebook - confirm against the nbconvert documentation
nb_executor.allow_errors = True

# exporter (notebook -> html) and writer (html -> file), used by nb_convert_html
html_exporter = HTMLExporter()
html_writer = FilesWriter()

In [None]:
def nb_convert_html(nb_path, outdir):
    """Execute the notebook at `nb_path` and write the result as html.

    Arguments:
    - nb_path: path of the notebook (*.ipynb) to execute
    - outdir: directory to write the html into; created if it does not exist
      yet (its parent directory must exist)

    The output is written to `outdir` under the notebook's base name without
    the ".ipynb" extension; the writer adds the output extension.
    """
    # Several worker processes may get here at the same time, so tolerate
    # that another one created the directory first.
    try:
        os.mkdir(outdir)
    except FileExistsError:
        pass

    filename = os.path.basename(nb_path)
    outpath = os.path.join(outdir, os.path.splitext(filename)[0])

    # be explicit about the encoding: notebooks contain umlauts in region names
    with open(nb_path, encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    # preprocess() returns a (notebook, resources) tuple; keep the notebook
    nb = nb_executor.preprocess(nb)[0]
    body, resources = html_exporter.from_notebook_node(nb)
    html_writer.write(body, resources, outpath)

In [None]:
def nbconvert_ipynb2html(ipynb_name, wwwroot):
    """Execute a notebook and convert it to html.

    The notebook `ipynb_name` (such as "germany.ipynb") is read from the
    "ipynb" subfolder of wwwroot, and the html version is written to the
    "html" subfolder (i.e. "html/germany.html").

    Returns the name of the html file without directory (i.e. "germany.html").
    Raises AssertionError if the html file does not exist afterwards.
    """
    html_dir = os.path.join(wwwroot, "html")
    notebook_path = os.path.join(wwwroot, "ipynb", ipynb_name)

    # execute notebook and create html copy from it
    nb_convert_html(notebook_path, html_dir)

    # same base name as the notebook, with the extension swapped
    stem, _ = os.path.splitext(ipynb_name)
    output_file_name = stem + ".html"
    assert os.path.exists(os.path.join(html_dir, output_file_name))

    return output_file_name
    

In [None]:
def create_markdown_index_list(category):
    """Assemble a markdown table like this:
    
    | Country/Region                       | Total cases   | Total deaths   |
    |:-------------------------------------|:--------------|:---------------|
    | [Afghanistan](html/Afghanistan.html) | 1,351         | 43             |
    | [Albania](html/Albania.html)         | 678           | 27             |
    | [Algeria](html/Algeria.html)         | 3,127         | 415            |
    | [Andorra](html/Andorra.html)         | 731           | 40             |
    
    and return as string.

    Arguments:
    - category: one of "world", "Germany", "US" (only regions of that
      category are listed) or "all-regions" (all known regions are listed)

    The data is taken from MetadataRegion.get_all_as_dataframe().

    Raises NotImplementedError for an unknown category (or an unexpected
    'html-file' entry), and AssertionError if fewer than 4 regions are found.
    """

    known_categories = ["world", "Germany", "US"]

    # gather data
    regions_all = MetadataRegion.get_all_as_dataframe()
    if category in known_categories:
        # select those we are interested in
        regions = regions_all[regions_all['category'] == category]
    elif category == "all-regions":
        regions = regions_all
    else:
        raise NotImplementedError(f"category {category} is unknown."+
                                  f" Known values are {known_categories + ['all-regions']}")

    # sanity check
    assert len(regions) >= 4

    # change index to contain URLs and one-line summary in markdown syntax
    def compose_md_url(x):
        one_line_summary, html = x
        if isinstance(html, str):
            # this is a URL, so always use a forward slash as separator
            return "[" + one_line_summary + "](html/" + html + ")"
        elif isinstance(html, float) and html != html:
            # if html was not produced, then variable html is np.nan;
            # NaN is the only float that is not equal to itself
            print(f"Missing html for {one_line_summary} - will not add link to html: \n{x}")
            return one_line_summary
        else:
            raise NotImplementedError("Don't know how to proceed: ", one_line_summary, html, x)

    new_index = regions[['one-line-summary', 'html-file']].apply(compose_md_url, axis=1)
    regions2 = regions.set_index(new_index)
    regions2.index.name = "Location"

    # select columns
    regions3 = regions2[['max-cases', 'max-deaths', 'cases-last-week']]
    # Thousands comma separator; formatted column by column with Series.map,
    # because DataFrame.applymap is deprecated in recent pandas versions
    regions4 = regions3.apply(lambda column: column.map('{:,}'.format))

    # rename columns
    rename_dict = {'max-cases' : 'Total cases', 
                   'max-deaths' : 'Total deaths',
                   'cases-last-week' : 'New cases last week'}
    regions5 = regions4.rename(columns=rename_dict)

    return regions5.to_markdown()


In [None]:
def get_country_list():
    """Return the sorted list of distinct index labels (countries) of the
    fetch_deaths() table.

    Raises AssertionError if the fetch_cases() table is indexed differently.
    """
    deaths_index = fetch_deaths().index
    cases_index = fetch_cases().index
    assert (cases_index == deaths_index).all()

    # Here we should identify regions in countries, and process those.
    # Instead, as a quick hack to get started, we'll just take one country
    # and the current "get_country" method will sum over all regions of one country if only
    # the country name is given.
    unique_countries = deaths_index.drop_duplicates()
    return sorted(unique_countries)
    

In [None]:
def create_index_page(sections, rootname, wwwroot):
    """Write an overview page as markdown and convert it to html with pandoc.

    Arguments:
    - sections: dictionary; key is the section title, value is markdown text
    - rootname: file name without extension; "<rootname>.md" and
      "<rootname>.html" are created in wwwroot
    - wwwroot: directory in which both files are written

    Returns the name of the html file (without directory).
    Raises subprocess.CalledProcessError if pandoc fails, and
    FileNotFoundError if the pandoc executable cannot be found.
    """
    md_file = rootname + ".md"
    md_path = os.path.join(wwwroot, md_file)

    with open(md_path, "tw", encoding="utf-8") as f:
        for title, text in sections.items():
            f.write(f"# {title}\n\n")
            f.write(text)
    print(f"Written overview to {md_file}.")

    html_file = rootname + ".html"
    html_path = os.path.join(wwwroot, html_file)
    # pass the arguments as a list (no shell), so that paths with spaces or
    # shell metacharacters are handed to pandoc unchanged
    subprocess.check_call(["pandoc", "-t", "html", "-o", html_path, md_path])
    return html_file

In [None]:
def get_germany_subregion_list():
    """Return the list of distinct subregions (Kreise) from
    fetch_data_germany(), ordered according to (i) Land, then (ii) Kreis.
    """
    data = fetch_data_germany()
    by_land_then_kreis = data[['Bundesland', 'Landkreis']].sort_values(
        ['Bundesland', 'Landkreis'])
    kreise = by_land_then_kreis['Landkreis'].drop_duplicates()
    return list(kreise)
 

@joblib_memory.cache
def germany_get_bundesland_from_kreis(kreis):
    """Return the 'Bundesland' entry of the first row of fetch_data_germany()
    whose 'Landkreis' equals `kreis`.
    """
    data = fetch_data_germany()
    rows_for_kreis = data[data['Landkreis'] == kreis]
    return rows_for_kreis.iloc[0]['Bundesland']

In [None]:
def does_wwwroot_exist(wwwroot):
    """Check that the directory `wwwroot` exists.

    Returns None if it exists. Otherwise raises ValueError; the message
    includes a hint on how to obtain the directory. The directory is
    deliberately not created here.
    """
    if not os.path.exists(wwwroot):
        msg = "To put the html into github repo for webhosting, run "
        msg += '"git clone git@github.com:oscovida/oscovida.github.io.git wwwroot" or similar'
        raise ValueError(f"directory {wwwroot} missing. {msg}")

In [None]:
def create_html_for_john_hopkins_countries(countries, wwwroot, expiry_hours=2):
    """ Create ipynb for country, and create html from it. Update metadata.
    
    Arguments:
    - countries: list of strings with country names
    - wwwroot path to root of webpages
    - expiry_hours: if the same data set has been processed within the last expiry_hours hours, 
      skip the task and leave the files and metadata untouched.

    Raises ValueError if wwwroot does not exist. Any exception raised while
    creating the notebook or the html for a country is printed and re-raised,
    which stops the processing of the remaining countries.
    """

    start_time = time.time()
    does_wwwroot_exist(wwwroot)
    skipped = 0

    for i, country in enumerate(countries):
        m = MetadataRegion(country)
        # recently processed regions are left untouched
        if m.last_updated_hours_ago() < expiry_hours:
            print(f"Skipping {country} - was updated {m.last_updated_hours_ago():.1f} hours ago")
            skipped += 1
            continue
            
        # metadata to be used when we create html pages 
        m['source'] = "CSSE Johns Hopkins"
        m['category'] = "world"
        cases, deaths, region_label = get_country_data(country)
        # NOTE(review): assumes the last entry is the most recent (cumulative) number - confirm
        m['max-deaths'] = int(deaths[-1])
        m['max-cases'] = int(cases[-1])
        # countries have no region/subregion; stored as the string "None"
        m['region'] = str(None)
        m['subregion'] = str(None)
        one_line_summary = f"{country}"
        m['one-line-summary'] = one_line_summary  # used as title in table
        
        # compute number of infections in last week
        m['cases-last-week'] = int(get_cases_last_week(cases))
    
        try:
            # progress report: position in list and seconds since start
            print(f"Processing {i+1}/{len(countries)} [{time.time()-start_time:4.0f}s]")
            ipynb_name = create_ipynb_for_country(country, "template-country.ipynb", wwwroot=wwwroot)
            html_name = nbconvert_ipynb2html(ipynb_name, wwwroot=wwwroot)
            m['html-file'] = html_name
            m['ipynb-name'] = ipynb_name
            print(f"Mark {country} as updated")
            # only reached if notebook and html were created without exception
            m.mark_as_updated()
        except Exception as e:
            # report which country failed, then propagate the error
            print(f"Error for {country}", end='')
            print(e)
            raise e

        sys.stdout.flush()
        
    print(f"Created {len(countries)-skipped} (skipped {skipped}) notebooks and html versions in " + \
          f"{time.time()-start_time} seconds")
    
    sys.stdout.flush()

In [None]:
def parallel_html_for_countries(countries, wwwroot, pool):
    """Distribute the creation of the country pages over the workers of `pool`.

    The countries are split into contiguous, (nearly) equally sized chunks -
    at most one per worker process - and every chunk is handed to
    create_html_for_john_hopkins_countries(chunk, wwwroot).

    Arguments:
    - countries: sequence of strings with country names
    - wwwroot: path to root of webpages
    - pool: multiprocessing Pool; the number of chunks is derived from its
      (private) `_processes` attribute

    Does nothing if `countries` is empty.
    """
    countries = list(countries)
    # never create more chunks than there are items, so that no chunk is empty
    n_chunks = min(pool._processes, len(countries))
    if n_chunks == 0:
        return

    # the first `extra` chunks get one additional item
    size, extra = divmod(len(countries), n_chunks)
    tasks = []
    start = 0
    for k in range(n_chunks):
        stop = start + size + (1 if k < extra else 0)
        tasks.append((countries[start:stop], wwwroot))
        start = stop

    pool.starmap(create_html_for_john_hopkins_countries, tasks)
    

In [None]:
def create_markdown_index_page(md_content, title, pelican_file_path, 
                               save_as, wwwroot, slug=None):
    """Create pelican markdown file, with a metadata header like this:
    
    title: Germany
    tags: Data, Plots, Germany
    save-as: germany
    slug: germany
    date: 2020/04/11 08:00

    followed by two empty lines and the markdown content.

    Arguments:
    - md_content: markdown text for the body of the page
    - title: title of the page (also used as a tag)
    - pelican_file_path: path of the markdown file to write
    - save_as: value of the "save-as" entry
    - wwwroot: not used by this function
    - slug: value of the "slug" entry; defaults to save_as
    """

    if slug is None:
        slug = save_as

    date_time = datetime.datetime.now().strftime("%Y/%m/%d %H:%M")

    # be explicit about the encoding: the content contains umlauts in region names
    with open(pelican_file_path, "tw", encoding="utf-8") as f:
        f.write(f"title: {title}\n")
        # f.write(f"category: Data\n")  - have stopped using categories (22 April 2020)
        f.write(f"tags: Data, Plots, {title}\n")
        f.write(f"save-as: {save_as}\n")
        f.write(f"slug: {slug}\n")
        f.write(f"date: {date_time}\n")
        f.write("\n")
        f.write("\n")
        f.write(md_content)
        f.write("\n")


# Create country overview for the world

## Computation

In [None]:
countries = get_country_list()

# create notebooks and html for all countries: in parallel if `cores` is
# set, otherwise one after the other in this process
if cores:
    with Pool(cores) as pool:
        parallel_html_for_countries(countries, wwwroot, pool)
else:
    create_html_for_john_hopkins_countries(countries, wwwroot)


## Creation of markdown

In [None]:
# table of all regions in category "world", written as pelican page
index_md = create_markdown_index_list("world")

# NOTE(review): TITLE_PREFIX ends with a space and the suffix starts with one,
# so the title contains two consecutive spaces
create_markdown_index_page(index_md, title=TITLE_PREFIX + " Countries of the world", 
                           pelican_file_path="pelican/content/countries.md", save_as="countries", 
                           wwwroot=wwwroot)

# Create list of Germany data sets

In [None]:
def create_html_for_Germany(subregions, wwwroot, expiry_hours=2):
    """Create ipynb for each subregion (Kreis) in Germany, and create html
    from it. Update metadata.

    Arguments:
    - subregions: list of strings with names of Kreise
    - wwwroot: path to root of webpages
    - expiry_hours: if a data set has been processed within the last
      expiry_hours hours, do not update it, but skip it.

    Raises ValueError if wwwroot does not exist. Any exception raised while
    creating the notebook or the html for a Kreis is printed and re-raised,
    which stops the processing of the remaining Kreise.
    """
    does_wwwroot_exist(wwwroot)
    start_time = time.time()
    skipped = 0
    
    for i, kreis in enumerate(subregions):
        m = MetadataRegion(kreis)
        # recently processed regions are left untouched
        if m.last_updated_hours_ago() < expiry_hours:
            print(f"Skipping {kreis} - was updated {m.last_updated_hours_ago():.1f} hours ago")
            skipped += 1
            continue
            
        # metadata to be used when we create html pages 
        m['source'] = "Robert Koch Institute"
        m['category'] = "Germany"
        cases, deaths, region_label = get_country_data("Germany",
                                                       subregion=kreis)

        # NOTE(review): assumes the last entry is the most recent (cumulative) number - confirm
        m['max-deaths'] = int(deaths[-1])
        m['max-cases'] = int(cases[-1])
        m['subregion'] = kreis
        bundesland = germany_get_bundesland_from_kreis(kreis)
        m['region'] = bundesland
        one_line_summary = f"Germany: {bundesland} : {kreis}"
        m['one-line-summary'] = one_line_summary  # used as title in table
        # compute number of infections in last week
        m['cases-last-week'] = int(get_cases_last_week(cases))
    

        try:    
            # progress report: position in list and seconds since start
            print(f"Processing {i+1}/{len(subregions)} [{time.time()-start_time:4.0f}s]")
            ipynb_name = create_ipynb_for_germany(region=bundesland, subregion=kreis, 
                                                  templatename="template-germany.ipynb", wwwroot=wwwroot)
            html_name = nbconvert_ipynb2html(ipynb_name, wwwroot=wwwroot)
            m['html-file'] = html_name
            m['ipynb-name'] = ipynb_name
            print(f"Mark {kreis} as updated")
            # only reached if notebook and html were created without exception
            m.mark_as_updated()

        except Exception as e:
            # report which Kreis failed, then propagate the error
            print(f"Error for {kreis}", end='')
            print(e)
            raise
            
        sys.stdout.flush()

    print(f"Created {len(subregions)-skipped} (skipped {skipped}) notebooks and html versions in " + \
          f"{time.time()-start_time} seconds")
    sys.stdout.flush()


In [None]:
def parallel_html_for_germany(subregions, wwwroot, pool):
    """Distribute the creation of the pages for Germany over the workers of `pool`.

    The subregions are split into contiguous, (nearly) equally sized chunks -
    at most one per worker process - and every chunk is handed to
    create_html_for_Germany(chunk, wwwroot).

    Arguments:
    - subregions: sequence of strings with names of Kreise
    - wwwroot: path to root of webpages
    - pool: multiprocessing Pool; the number of chunks is derived from its
      (private) `_processes` attribute

    Does nothing if `subregions` is empty.
    """
    subregions = list(subregions)
    # never create more chunks than there are items, so that no chunk is empty
    n_chunks = min(pool._processes, len(subregions))
    if n_chunks == 0:
        return

    # the first `extra` chunks get one additional item
    size, extra = divmod(len(subregions), n_chunks)
    tasks = []
    start = 0
    for k in range(n_chunks):
        stop = start + size + (1 if k < extra else 0)
        tasks.append((subregions[start:stop], wwwroot))
        start = stop

    pool.starmap(create_html_for_Germany, tasks)
    

## Computation of html (Germany)

In [None]:
wwwroot = "wwwroot"
subregions = get_germany_subregion_list()

# data cleaning: on 13 April, we had a Landkreis "LK Göttingen (alt)"
# with only one data point. This causes plots to fail, because there
# is nothing to plot, and then the legend() command failed.
# We assume that the RKI labels unusual data with '(alt)', and remove those.

alt_data_sets = [x for x in subregions if "(alt)" in x.lower()]
if len(alt_data_sets) > 0:
    print(f"Removing datasets label with '(alt)': {alt_data_sets}")
    for alt in alt_data_sets:
        # NOTE(review): this rebinds the module-level names c and d that were
        # assigned from fetch_cases()/fetch_deaths() in an earlier cell
        c, d = germany_get_region(landkreis=alt)
        print(f"  removed: {alt} : len(cases)={len(c)}, len(deaths)={len(d)}")
    subregions = [x for x in subregions if not "(alt)" in x.lower()]

# Actual calculations: in parallel if `cores` is set, otherwise one after
# the other in this process
    
if cores:
    with Pool(cores) as pool:
        parallel_html_for_germany(subregions, wwwroot, pool)
else:
    create_html_for_Germany(subregions, wwwroot)



In [None]:
# table of all regions in category "Germany", written as pelican page
index_md = create_markdown_index_list(category="Germany")

# NOTE(review): TITLE_PREFIX ends with a space and the suffix starts with one,
# so the title contains two consecutive spaces
create_markdown_index_page(index_md, title= TITLE_PREFIX + " Germany", 
                           pelican_file_path="pelican/content/germany.md", save_as="germany", 
                           wwwroot=wwwroot)

## Computation of html (US)

In [None]:
def parallel_html_for_US(subregions, wwwroot, pool):
    """Distribute the creation of the pages for the US over the workers of `pool`.

    The subregions are split into contiguous, (nearly) equally sized chunks -
    at most one per worker process - and every chunk is handed to
    create_html_for_US(chunk, wwwroot).

    Arguments:
    - subregions: sequence of strings with names of US states
    - wwwroot: path to root of webpages
    - pool: multiprocessing Pool; the number of chunks is derived from its
      (private) `_processes` attribute

    Does nothing if `subregions` is empty.
    """
    subregions = list(subregions)
    # never create more chunks than there are items, so that no chunk is empty
    n_chunks = min(pool._processes, len(subregions))
    if n_chunks == 0:
        return

    # the first `extra` chunks get one additional item
    size, extra = divmod(len(subregions), n_chunks)
    tasks = []
    start = 0
    for k in range(n_chunks):
        stop = start + size + (1 if k < extra else 0)
        tasks.append((subregions[start:stop], wwwroot))
        start = stop

    pool.starmap(create_html_for_US, tasks)
    

In [None]:
def create_html_for_US(states, wwwroot, expiry_hours=2):
    """Create ipynb for each US state, and create html from it. Update metadata.

    Arguments:
    - states: list of strings with names of US states
    - wwwroot: path to root of webpages
    - expiry_hours: if a data set has been processed within the last
      expiry_hours hours, do not update it, but skip it.

    Raises ValueError if wwwroot does not exist. Any exception raised while
    creating the notebook or the html for a state is printed and re-raised,
    which stops the processing of the remaining states.
    """
    does_wwwroot_exist(wwwroot)
    start_time = time.time()
    skipped = 0
    
    for i, state in enumerate(states):
        # the metadata entry is named "US-<state>"
        name = f"US-{state}"
        m = MetadataRegion(name)
        # recently processed regions are left untouched
        if m.last_updated_hours_ago() < expiry_hours:
            print(f"Skipping {name} - was updated {m.last_updated_hours_ago():.1f} hours ago")
            skipped += 1
            continue
            
        # metadata to be used when we create html pages 
        m['source'] = "Johns Hopkins University CSSE"
        m['category'] = "US"
        cases, deaths = get_region_US(state)
        # NOTE(review): assumes the last entry is the most recent (cumulative) number - confirm
        m['max-deaths'] = int(deaths[-1])
        m['max-cases'] = int(cases[-1])
        # NOTE(review): None here, whereas the country variant stores the string "None"
        m['subregion'] = None  # would be county
        m['region'] = state
        one_line_summary = f"US: {state}"
        m['one-line-summary'] = one_line_summary  # used as title in table
        # compute number of infections in last week
        m['cases-last-week'] = int(get_cases_last_week(cases))
    

        try:    
            # progress report: position in list and seconds since start
            print(f"Processing {i+1}/{len(states)} [{time.time()-start_time:4.0f}s]")
            ipynb_name = create_ipynb_for_US(state, subregion=None,
                                             templatename="template-US.ipynb", wwwroot=wwwroot)
            html_name = nbconvert_ipynb2html(ipynb_name, wwwroot=wwwroot)
            m['html-file'] = html_name
            m['ipynb-name'] = ipynb_name
            print(f"Mark {name} as updated")
            # only reached if notebook and html were created without exception
            m.mark_as_updated()

        except Exception as e:
            # report which state failed, then propagate the error
            print(f"Error for {name}", end='')
            print(e)
            raise
            
        sys.stdout.flush()

    print(f"Created {len(states)-skipped} (skipped {skipped}) notebooks and html versions in " + \
          f"{time.time()-start_time} seconds")
    sys.stdout.flush()


states = get_US_region_list()

# Actual calculations: in parallel if `cores` is set, otherwise one after
# the other in this process
    
if cores:
    with Pool(cores) as pool:
        parallel_html_for_US(states, wwwroot, pool)
else:
    create_html_for_US(states, wwwroot)



## Creation of html (Pelican) for US

In [None]:
# table of all regions in category "US", written as pelican page
index_md = create_markdown_index_list("US")

# NOTE(review): TITLE_PREFIX ends with a space and the suffix starts with one,
# so the title contains two consecutive spaces
create_markdown_index_page(index_md, title=TITLE_PREFIX + " United States", 
                           pelican_file_path="pelican/content/US.md", save_as="us", 
                           wwwroot=wwwroot)

## Creation of html page for all regions

In [None]:
# table of all known regions (all categories), written as pelican page
index_md = create_markdown_index_list("all-regions")
create_markdown_index_page(index_md, title=TITLE_PREFIX + " All regions and countries", 
                           pelican_file_path="pelican/content/all-regions.md", save_as="all-regions", 
                           wwwroot=wwwroot)

# Check what went wrong?
- Sometimes, a job times out. If so, the loop below will report this.

In [None]:
# report every region whose last update is more than 2 hours ago
# (2 is also the default of expiry_hours in the create_html_for_* functions)
ms = MetadataRegion.get_all()
for name in ms:
    m = MetadataRegion(name)
    dt = m.last_updated_hours_ago()
    if dt > 2:
        print(f"Problem with '{name}', last update: {dt} ago ")
        
    

# Additional debugging - RKI seems old

In [None]:
# Check when the last data from RKI was updated
# d = fetch_data_germany()

# d.sort_values("Meldedatum", ascending=False)

In [None]:
!pwd