## Setup

In [None]:
import datetime
import json
import os
import shutil
import subprocess
import sys
import time
from multiprocessing import Pool, cpu_count

import ipynb_py_convert
import nbformat
import numpy as np
from nbconvert import HTMLExporter
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert.writers import FilesWriter

In [None]:
# Number of worker processes; set to False to disable multiprocessing.
cores = 'auto'

if cores == 'auto':
    # Use all available cores, but at least one and at most four: larger
    # pools make error messages like the one shown at
    # https://github.com/jupyter/jupyter_client/issues/541 more likely.
    cores = min(4, max(1, cpu_count()))

if cores:
    print(f'Using {cores} processes')

# Root directory for the generated ipynb and html files.
wwwroot = "wwwroot"

In [None]:
# IPython magic (only valid inside a notebook/IPython session): request SVG
# as the format for inline figures.
%config InlineBackend.figure_formats = ['svg']

# NOTE(review): the star import presumably provides the helper names used
# throughout this notebook (fetch_deaths, get_country_data, pd, ...) - confirm
# against the coronavirus module.
from coronavirus import *
from coronavirus import MetadataRegion

pd.set_option('display.float_format', '{:.2f}'.format)  #  Disable pandas scientific notation

Cleaning of the cache and copying of files have moved to:

- `generate-webpage-clean-setup.py` and 
- `generate-webpage-clean-setup.sh`

In [None]:
# Prefix for the titles of the index pages created further down in this notebook.
TITLE_PREFIX = "Tracking plots: "

### Download Datasets

In [None]:
# Download the deaths data set first, then the cases data set.
d = fetch_deaths()
c = fetch_cases()

# Both data sets must have identical indices (same entries, same order).
countries, countries2 = d.index, c.index
assert (countries2 == countries).all()

In [None]:
# Fetch the US cases and deaths data sets.
# NOTE(review): these two variables are not used again in the visible part of
# the notebook - presumably the calls only warm the cache; confirm.
data_US_cases = fetch_cases_US()
data_US_deaths = fetch_deaths_US()

In [None]:
# Also fetch the data for Germany now, so that later calls of
# fetch_data_germany() can be served from the cache.
germany = fetch_data_germany()

### Generic Functions

In [None]:
def does_wwwroot_exist(wwwroot):
    """Check that the output directory `wwwroot` exists.

    Arguments:
    - wwwroot: path to root of webpages

    Returns None if the path exists.

    Raises ValueError if the path does not exist. The error message contains
    a hint on how to create the directory by cloning the webhosting repository.
    """
    if os.path.exists(wwwroot):
        return

    # The directory is deliberately not created here: it is expected to be a
    # clone of the repository that hosts the web pages.
    hint = ("To put the html into github repo for webhosting, run "
            '"git clone git@github.com:oscovida/oscovida.github.io.git wwwroot" or similar')
    raise ValueError(f"directory {wwwroot} missing. {hint}")

## Index Page Generation

In [None]:
def create_markdown_index_list(category):
    """Assemble a markdown table of regions and return it as a string.

    The table has one row per region and looks similar to this
    (illustrative values):

    | Location                             | Total cases   | Total deaths   | New cases last week   |
    |:-------------------------------------|:--------------|:---------------|:----------------------|
    | [Afghanistan](html/Afghanistan.html) | 1,351         | 43             | 120                   |
    | [Albania](html/Albania.html)         | 678           | 27             | 35                    |

    Arguments:
    - category: one of "world", "Germany", "US" (select the regions with that
      category in the metadata) or "all-regions" (no filtering)

    Raises NotImplementedError if the category is unknown, or if the
    'html-file' entry of a region is neither a string nor NaN.
    """

    known_categories = ["world", "Germany", "US"]

    # gather data
    regions_all = MetadataRegion.get_all_as_dataframe()
    if category in known_categories:
        # select those we are interested in
        regions = regions_all[regions_all['category'] == category]
    elif category == "all-regions":
        regions = regions_all
    else:
        raise NotImplementedError(
            f"category {category} is unknown."
            f" Known values are {known_categories + ['all-regions']}")

    # change index to contain URLs and one-line summary in markdown syntax
    def compose_md_url(x):
        one_line_summary, html = x
        if isinstance(html, str):
            return "[" + one_line_summary + "](" + os.path.join('html', html) + ")"
        # If html was not produced, the variable html is NaN. Test the value
        # itself rather than its repr(), which depends on the float type.
        if isinstance(html, float) and np.isnan(html):
            print(f"Missing html for {one_line_summary} - will not add link to html: \n{x}")
            return one_line_summary
        raise NotImplementedError("Don't know how to proceed: ", one_line_summary, html, x)

    new_index = regions[['one-line-summary', 'html-file']].apply(compose_md_url, axis=1)
    regions2 = regions.set_index(new_index)
    regions2.index.name = "Location"

    # select columns
    regions3 = regions2[['max-cases', 'max-deaths', 'cases-last-week']]
    # Thousands comma separator. Formatting column by column gives the same
    # result as DataFrame.applymap, which is deprecated in recent pandas.
    regions4 = regions3.apply(lambda column: column.map('{:,}'.format))

    # rename columns
    rename_dict = {'max-cases': 'Total cases',
                   'max-deaths': 'Total deaths',
                   'cases-last-week': 'New cases last week'}
    regions5 = regions4.rename(columns=rename_dict)

    return regions5.to_markdown()


In [None]:
def create_markdown_index_page(md_content, title, pelican_file_path,
                               save_as, wwwroot, slug=None):
    """Create pelican markdown file with a metadata header like this:

    title: Germany
    tags: Data, Plots, Germany
    save-as: germany
    slug: germany
    date: 2020/04/11 08:00

    followed by two empty lines and `md_content`.

    Arguments:
    - md_content: markdown text that forms the body of the page
    - title: page title; also used as the third tag
    - pelican_file_path: path of the markdown file to write (overwritten if
      it exists)
    - save_as: value of the 'save-as' header line
    - wwwroot: not used by this function
    - slug: value of the 'slug' header line; defaults to `save_as`

    The date written is the current local time.
    """

    if slug is None:
        slug = save_as

    date_time = datetime.datetime.now().strftime("%Y/%m/%d %H:%M")

    # Write as UTF-8 explicitly, so that non-ASCII characters (for example
    # umlauts in region names) do not depend on the locale of the machine.
    with open(pelican_file_path, "tw", encoding="utf-8") as f:
        f.write(f"title: {title}\n")
        # f.write(f"category: Data\n")  - have stopped using categories (22 April 2020)
        f.write(f"tags: Data, Plots, {title}\n")
        f.write(f"save-as: {save_as}\n")
        f.write(f"slug: {slug}\n")
        f.write(f"date: {date_time}\n")
        f.write("\n")
        f.write("\n")
        f.write(md_content)
        f.write("\n")

In [None]:
def create_index_page(sections, rootname, wwwroot):
    """Write the sections to a markdown file and convert it to html with pandoc.

    Arguments:
    - sections: dictionary: key is title, value is markdown text
    - rootname: file name without extension; the files `rootname`.md and
      `rootname`.html are created in `wwwroot`
    - wwwroot: directory in which both files are written

    Returns the name of the html file (without directory).

    Raises subprocess.CalledProcessError if pandoc exits with an error, and
    FileNotFoundError if the pandoc executable cannot be found.
    """
    md_file = rootname + ".md"
    html_file = rootname + ".html"
    md_path = os.path.join(wwwroot, md_file)
    html_path = os.path.join(wwwroot, html_file)

    with open(md_path, "tw", encoding="utf-8") as f:
        for title, text in sections.items():
            f.write(f"# {title}\n\n")
            f.write(text)
            # Separate sections by an empty line; without it, the next heading
            # is glued to the last line of this section if the text does not
            # end with a newline.
            f.write("\n\n")
    print(f"Written overview to {md_file}.")

    # Pass the arguments as a list (no shell), so that paths containing
    # spaces or shell metacharacters are handled correctly.
    subprocess.check_call(["pandoc", "-t", "html", "-o", html_path, md_path])
    return html_file

## Abstract Report Class

In [None]:
class BaseReport:
    """Base class for reports that are generated from a template file.

    `generate_notebook` fills the placeholders of the template with the
    values from `mapping` and writes the result as a notebook;
    `generate_html` executes that notebook and exports it to html.
    """

    def __init__(self, country, title, overview_function, overview_args,
                 data_load_function, data_load_args, output_file, wwwroot):
        """Store the template values and compute the output paths.

        Arguments:
        - country, title, overview_function, overview_args,
          data_load_function, data_load_args: values substituted into the
          template (see `mapping`)
        - output_file: name of the output files without extension; it is
          sanitised (see `sanitise`) before use
        - wwwroot: path to root of webpages; the notebook is written to
          <wwwroot>/ipynb and the html to <wwwroot>/html
        """
        self.country = country
        self.title = title

        self.overview_function = overview_function
        self.overview_args = overview_args

        self.data_load_function = data_load_function
        self.data_load_args = data_load_args

        # Build both file names from the same stem. Replacing ".ipynb" by
        # ".html" inside the full name would also alter a stem that happens
        # to contain ".ipynb".
        stem = self.sanitise(output_file)
        self.output_file_name = stem + ".ipynb"
        self.output_ipynb_path = os.path.join(
            wwwroot, "ipynb", self.output_file_name)
        self.output_html_path = os.path.join(
            wwwroot, "html", stem + ".html")

        self.create_date = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")

    @staticmethod
    def sanitise(name):
        """Given a country name as a string, sanitise it for use as URL and
        filename by getting rid of spaces and commas

        return cleaned string.

        (Leave umlauts for now)
        """
        s = name.replace(" ", "-")
        s = s.replace(",", "-")
        return s

    @property
    def get_binder_url(self):
        """URL that opens the notebook of this report on mybinder.org.

        Note that this is a property: access it without calling it.
        """
        base = "https://mybinder.org/v2/gh/oscovida/binder/master?filepath=ipynb/"
        return base + self.output_file_name.replace(" ", "%20")

    @property
    def mapping(self):
        """Dictionary with the values for the placeholders of the template."""
        return {
            "TITLE": self.title,
            "COUNTRY": self.country,
            "BINDER_URL": self.get_binder_url,
            "CREATION_DATE": self.create_date,
            "OVERVIEW_FUNCTION": self.overview_function,
            "OVERVIEW_ARGS": self.overview_args,
            "DATA_LOAD_FUNCTION": self.data_load_function,
            "DATA_LOAD_ARGS": self.data_load_args
        }

    def generate_notebook(self, templatefile="./template-report.py"):
        """Fill the template and write the notebook to `output_ipynb_path`.

        Arguments:
        - templatefile: path to the template; its placeholders are replaced
          using str.format_map with `mapping`
        """
        with open(templatefile, 'r', encoding="utf-8") as f:
            template_str = f.read()

        template_str = template_str.format_map(self.mapping)

        notebook = ipynb_py_convert.py2nb(template_str)

        with open(self.output_ipynb_path, 'tw', encoding="utf-8") as f:
            print(self.output_ipynb_path)
            json.dump(notebook, f, indent=2)

        print(f"Written file to {self.output_file_name}")

    def generate_html(self, kernel_name='python3'):
        """Execute the notebook and write the result to `output_html_path`.

        Arguments:
        - kernel_name: name of the Jupyter kernel used to execute the notebook
        """
        nb_executor = ExecutePreprocessor(kernel_name=kernel_name)
        # Do not abort when a cell raises: the html page is still written.
        nb_executor.allow_errors = True

        html_exporter = HTMLExporter()
        html_writer = FilesWriter()

        # Only keep the file open while reading it, not during the
        # (potentially long) execution of the notebook.
        with open(self.output_ipynb_path, encoding="utf-8") as f:
            nb = nbformat.read(f, as_version=4)

        nb = nb_executor.preprocess(nb)[0]
        body, resources = html_exporter.from_notebook_node(nb)

        # HTML writer automatically adds .html to the end, so get rid of it.
        # Only the extension is stripped; ".html" elsewhere in the path
        # (for example in a directory name) must stay untouched.
        html_base, _ = os.path.splitext(self.output_html_path)
        html_writer.write(body, resources, html_base)

        print(f"Written file to {self.output_html_path}")

## Country Report Generation

In [None]:
class Country(BaseReport):
    """Report for one country of the deaths data set."""

    def __init__(self, country, wwwroot='wwwroot'):
        # The country name is passed to the template as a quoted string.
        quoted_name = f'"{country}"'

        self.check_country_is_known(country)

        super().__init__(
            country=country,
            title=country,
            overview_function="overview",
            overview_args=quoted_name,
            data_load_function="get_country_data",
            data_load_args=quoted_name,
            output_file=f"{country}",
            wwwroot=wwwroot,
        )

    @staticmethod
    def check_country_is_known(country):
        """Assert that `country` is part of the index of the deaths data set."""
        known = fetch_deaths().index
        assert country in known, f"{country} is unknown. Known countries are {sorted(known)}"

In [None]:
def get_country_list():
    """Return the sorted list of country names, without duplicates.

    The names are taken from the index of the deaths data set; the index of
    the cases data set must be identical.
    """
    deaths = fetch_deaths()
    cases = fetch_cases()
    assert (cases.index == deaths.index).all()

    # Here we should identify regions in countries, and process those.
    # Instead, as a quick hack to get started, we'll just take one country
    # and the current "get_country" method will sum over all regions of one
    # country if only the country name is given.
    unique_countries = deaths.index.drop_duplicates()
    return sorted(unique_countries)
    

In [None]:
def create_html_for_john_hopkins_countries(countries, wwwroot, expiry_hours=2):
    """ Create ipynb for country, and create html from it. Update metadata.

    Arguments:
    - countries: list of strings with country names
    - wwwroot path to root of webpages; must exist
    - expiry_hours: currently not used, because the check that skips regions
      updated within the last expiry_hours hours is commented out

    Any exception raised while generating a report is printed and re-raised.
    """

    start_time = time.time()
    does_wwwroot_exist(wwwroot)
    skipped = 0

    for i, country in enumerate(countries):
        m = MetadataRegion(country)
        # Expiry check, currently disabled:
        # if m.last_updated_hours_ago() < expiry_hours:
        #     print(f"Skipping {country} - was updated {m.last_updated_hours_ago():.1f} hours ago")
        #     skipped += 1
        #     continue

        # metadata to be used when we create html pages
        m['source'] = "CSSE Johns Hopkins"
        m['category'] = "world"
        cases, deaths, region_label = get_country_data(country)
        m['max-deaths'] = int(deaths[-1])
        m['max-cases'] = int(cases[-1])
        m['region'] = str(None)
        m['subregion'] = str(None)
        one_line_summary = f"{country}"
        m['one-line-summary'] = one_line_summary  # used as title in table

        # compute number of infections in last week
        m['cases-last-week'] = int(get_cases_last_week(cases))

        try:
            print(f"Processing {i+1}/{len(countries)} [{time.time()-start_time:4.0f}s]")
            # Use a separate name for the report: rebinding `country` would
            # make the error message below print the report object instead of
            # the country name.
            report = Country(country, wwwroot=wwwroot)
            report.generate_notebook()
            report.generate_html()
            # Same assignment as in create_html_for_Germany: the html path
            # belongs to 'html-file', the notebook path to 'ipynb-name'.
            m['html-file'] = report.output_html_path
            m['ipynb-name'] = report.output_ipynb_path
            print(f"Mark {report.title} as updated")
            m.mark_as_updated()
        except Exception as e:
            print(f"Error for {country}", end='')
            print(e)
            raise

        sys.stdout.flush()

    print(f"Created {len(countries)-skipped} (skipped {skipped}) notebooks and html versions in " +
          f"{time.time()-start_time} seconds")

    sys.stdout.flush()

In [None]:
def parallel_html_for_countries(countries, wwwroot, pool):
    """Create the country reports in parallel.

    `countries` is split into at most as many chunks as `pool` has worker
    processes; create_html_for_john_hopkins_countries(chunk, wwwroot) is
    called for every chunk via pool.starmap.

    Arguments:
    - countries: list of strings with country names
    - wwwroot: path to root of webpages
    - pool: multiprocessing Pool

    Does nothing if `countries` is empty.
    """
    countries = list(countries)
    if not countries:
        return

    # Pool has no public attribute for its size, hence the private one.
    processes = pool._processes

    # Ceiling division: smallest chunk size that fits all countries into at
    # most `processes` chunks. Slicing needs no padding elements, so no
    # placeholder entries can end up in the chunks.
    chunk_size = -(-len(countries) // processes)
    tasks = [(countries[start:start + chunk_size], wwwroot)
             for start in range(0, len(countries), chunk_size)]

    pool.starmap(create_html_for_john_hopkins_countries, tasks)
    

In [None]:
# Names of all countries to create reports for.
countries = get_country_list()

# NOTE(review): the next two lines restrict the run to the first 8 countries
# and switch multiprocessing off - presumably leftovers from testing; confirm
# before a full run.
countries = countries[0:8]
cores=False

# A truthy `cores` is used as the number of worker processes; otherwise the
# reports are created one after the other in this process.
if cores:
    with Pool(cores) as pool:
        parallel_html_for_countries(countries, wwwroot, pool)
else:
    create_html_for_john_hopkins_countries(countries, wwwroot)

In [None]:
# Markdown table with all regions of category "world".
index_md = create_markdown_index_list("world")

# Write the table as pelican page pelican/content/countries.md.
# NOTE(review): TITLE_PREFIX already ends with a space, so the resulting title
# contains two consecutive spaces - confirm whether that is intended.
create_markdown_index_page(index_md, title=TITLE_PREFIX + " Countries of the world", 
                           pelican_file_path="pelican/content/countries.md", save_as="countries", 
                           wwwroot=wwwroot)

## Germany Report Generation

In [None]:
class Germany(BaseReport):
    """Report for one German subregion (Landkreis) of a region (Bundesland)."""

    def __init__(self, region, subregion, wwwroot='wwwroot'):
        country = "Germany"

        # Both names must occur in the data set for Germany.
        self.germany_check_region_is_known(region)
        self.germany_check_subregion__is_known(subregion)

        super().__init__(
            country=country,
            title=f"{country}: {subregion} ({region})",
            overview_function="overview",
            overview_args=f'country="{country}", subregion="{subregion}"',
            data_load_function="germany_get_region",
            data_load_args=f'landkreis="{subregion}"',
            output_file=f"Germany-{region}-{subregion}",
            wwwroot=wwwroot,
        )

    @staticmethod
    def germany_check_region_is_known(region):
        """Assert that `region` occurs in the 'Bundesland' column."""
        known_regions = list(fetch_data_germany()['Bundesland'].drop_duplicates())
        assert region in known_regions, f"{region} is unknown."

    @staticmethod
    def germany_check_subregion__is_known(subregion):
        """Assert that `subregion` occurs in the 'Landkreis' column."""
        known_subregions = list(fetch_data_germany()['Landkreis'].drop_duplicates())
        assert subregion in known_subregions, f"{subregion} is unknown."

In [None]:
def get_germany_subregion_list():
    """Return the list of subregions (Kreise) without duplicates.

    The list is ordered by (i) Land and then (ii) Kreis.
    """
    data = fetch_data_germany()
    sort_columns = ['Bundesland', 'Landkreis']
    by_land_then_kreis = data[sort_columns].sort_values(sort_columns)
    kreise = by_land_then_kreis['Landkreis'].drop_duplicates()
    return list(kreise)
 

@joblib_memory.cache
def germany_get_bundesland_from_kreis(kreis):
    """Return the Bundesland of the first row whose Landkreis equals `kreis`."""
    data = fetch_data_germany()
    rows_of_kreis = data[data['Landkreis'] == kreis]
    first_row = rows_of_kreis.iloc[0]
    return first_row['Bundesland']

In [None]:
def create_html_for_Germany(subregions, wwwroot, expiry_hours=2):
    """Create notebook and html page for every Kreis in `subregions`.

    For each Kreis, the metadata entry is filled in, the report is generated
    and the entry is marked as updated.

    Arguments:
    - subregions: list of strings with Kreis names
    - wwwroot: path to root of webpages; must exist
    - expiry_hours: not used at the moment, because the check that skips
      recently updated data sets is commented out

    Any exception raised while generating a report is printed and re-raised.
    """
    does_wwwroot_exist(wwwroot)
    start_time = time.time()
    skipped = 0
    total = len(subregions)

    for position, kreis in enumerate(subregions, start=1):
        meta = MetadataRegion(kreis)
        # Expiry check, currently disabled:
        # if meta.last_updated_hours_ago() < expiry_hours:
        #     print(f"Skipping {kreis} - was updated {meta.last_updated_hours_ago():.1f} hours ago")
        #     skipped += 1
        #     continue

        # metadata to be used when we create html pages
        meta['source'] = "Robert Koch Institute"
        meta['category'] = "Germany"
        cases, deaths, region_label = get_country_data("Germany", subregion=kreis)

        meta['max-deaths'] = int(deaths[-1])
        meta['max-cases'] = int(cases[-1])
        meta['subregion'] = kreis
        bundesland = germany_get_bundesland_from_kreis(kreis)
        meta['region'] = bundesland
        # the one-line summary is used as title in the table
        meta['one-line-summary'] = f"Germany: {bundesland} : {kreis}"
        # number of infections in the last week
        meta['cases-last-week'] = int(get_cases_last_week(cases))

        try:
            elapsed = time.time() - start_time
            print(f"Processing {position}/{total} [{elapsed:4.0f}s]")
            report = Germany(bundesland, kreis, wwwroot=wwwroot)
            report.generate_notebook()
            report.generate_html()
            meta['html-file'] = report.output_html_path
            meta['ipynb-name'] = report.output_ipynb_path
            print(f"Mark {report.title} as updated")
            meta.mark_as_updated()
        except Exception as e:
            print(f"Error for {kreis}", end='')
            print(e)
            raise

        sys.stdout.flush()

    created = total - skipped
    duration = time.time() - start_time
    print(f"Created {created} (skipped {skipped}) notebooks and html versions in {duration} seconds")
    sys.stdout.flush()

In [None]:
def parallel_html_for_germany(subregions, wwwroot, pool):
    """Create the reports for Germany in parallel.

    `subregions` is split into at most as many chunks as `pool` has worker
    processes; create_html_for_Germany(chunk, wwwroot) is called for every
    chunk via pool.starmap.

    Arguments:
    - subregions: list of strings with Kreis names
    - wwwroot: path to root of webpages
    - pool: multiprocessing Pool

    Does nothing if `subregions` is empty.
    """
    subregions = list(subregions)
    if not subregions:
        return

    # Pool has no public attribute for its size, hence the private one.
    processes = pool._processes

    # Ceiling division: smallest chunk size that fits all subregions into at
    # most `processes` chunks. Slicing needs no padding elements, so no
    # placeholder entries can end up in the chunks.
    chunk_size = -(-len(subregions) // processes)
    tasks = [(subregions[start:start + chunk_size], wwwroot)
             for start in range(0, len(subregions), chunk_size)]

    pool.starmap(create_html_for_Germany, tasks)

In [None]:
wwwroot = "wwwroot"
subregions = get_germany_subregion_list()

# data cleaning: on 13 April, we had a Landkreis "LK Göttingen (alt)"
# with only one data point. This causes plots to fail, because there
# is nothing to plot, and then the legend() command failed.
# We assume that the RKI labels unusual data with '(alt)', and remove those.

alt_data_sets = [x for x in subregions if "(alt)" in x.lower()]
if len(alt_data_sets) > 0:
    print(f"Removing datasets label with '(alt)': {alt_data_sets}")
    for alt in alt_data_sets:
        # The data is only loaded to report its length in the message below.
        # NOTE(review): this rebinds the module-level names `c` and `d`, which
        # the "Download Datasets" cell assigned as d=deaths, c=cases.
        c, d = germany_get_region(landkreis=alt)
        print(f"  removed: {alt} : len(cases)={len(c)}, len(deaths)={len(d)}")
    subregions = [x for x in subregions if not "(alt)" in x.lower()]

# Actual calculations

# NOTE(review): the next two lines restrict the run to the first 8 subregions
# and fix the number of processes to 2 - presumably leftovers from testing;
# confirm before a full run.
subregions = subregions[0:8]
cores=2
    
# A truthy `cores` is used as the number of worker processes; otherwise the
# reports are created one after the other in this process.
if cores:
    with Pool(cores) as pool:
        parallel_html_for_germany(subregions, wwwroot, pool)
else:
    create_html_for_Germany(subregions, wwwroot)

In [None]:
index_md = create_markdown_index_list(category="Germany")

create_markdown_index_page(index_md, title= TITLE_PREFIX + " Germany", 
                           pelican_file_path="pelican/content/germany.md", save_as="germany", 
                           wwwroot=wwwroot)

## USA Report Generation

In [None]:
class USA(BaseReport):
    """Report for one region of the United States."""

    def __init__(self, region, wwwroot='wwwroot'):
        super().__init__(
            country="USA",
            title=f"United States: {region}",
            overview_function="overview",
            overview_args=f'country="US", region="{region}"',
            data_load_function="get_country_data",
            data_load_args=f'"US", "{region}"',
            output_file=f"US-{region}",
            wwwroot=wwwroot,
        )

In [None]:
def parallel_html_for_US(subregions, wwwroot, pool):
    """Create the reports for the US in parallel.

    `subregions` is split into at most as many chunks as `pool` has worker
    processes; create_html_for_US(chunk, wwwroot) is called for every chunk
    via pool.starmap.

    Arguments:
    - subregions: list of strings with the names of the US regions
    - wwwroot: path to root of webpages
    - pool: multiprocessing Pool

    Does nothing if `subregions` is empty.
    """
    subregions = list(subregions)
    if not subregions:
        return

    # Pool has no public attribute for its size, hence the private one.
    processes = pool._processes

    # Ceiling division: smallest chunk size that fits all subregions into at
    # most `processes` chunks. Slicing needs no padding elements, so no
    # placeholder entries can end up in the chunks.
    chunk_size = -(-len(subregions) // processes)
    tasks = [(subregions[start:start + chunk_size], wwwroot)
             for start in range(0, len(subregions), chunk_size)]

    pool.starmap(create_html_for_US, tasks)
    

In [None]:
def create_html_for_US(states, wwwroot, expiry_hours=2):
    """Create notebook and html page for every entry of `states`.

    For each state, the metadata entry "US-<state>" is filled in, the report
    is generated and the entry is marked as updated.

    Arguments:
    - states: list of strings with the names of the US regions
    - wwwroot: path to root of webpages; must exist
    - expiry_hours: not used at the moment, because the check that skips
      recently updated data sets is commented out

    Any exception raised while generating a report is printed and re-raised.
    """
    does_wwwroot_exist(wwwroot)
    start_time = time.time()
    skipped = 0

    for i, state in enumerate(states):
        name = f"US-{state}"
        m = MetadataRegion(name)
        # Expiry check, currently disabled:
        # if m.last_updated_hours_ago() < expiry_hours:
        #     print(f"Skipping {name} - was updated {m.last_updated_hours_ago():.1f} hours ago")
        #     skipped += 1
        #     continue

        # metadata to be used when we create html pages
        m['source'] = "Johns Hopkins University CSSE"
        m['category'] = "US"
        cases, deaths = get_region_US(state)
        m['max-deaths'] = int(deaths[-1])
        m['max-cases'] = int(cases[-1])
        m['subregion'] = None  # would be county
        m['region'] = state
        one_line_summary = f"US: {state}"
        m['one-line-summary'] = one_line_summary  # used as title in table
        # compute number of infections in last week
        m['cases-last-week'] = int(get_cases_last_week(cases))

        try:
            print(f"Processing {i+1}/{len(states)} [{time.time()-start_time:4.0f}s]")
            usa = USA(state, wwwroot=wwwroot)
            usa.generate_notebook()
            usa.generate_html()
            # Same assignment as in create_html_for_Germany: the html path
            # belongs to 'html-file', the notebook path to 'ipynb-name'.
            m['html-file'] = usa.output_html_path
            m['ipynb-name'] = usa.output_ipynb_path
            print(f"Mark {usa.title} as updated")
            m.mark_as_updated()
        except Exception as e:
            print(f"Error for {name}", end='')
            print(e)
            raise

        sys.stdout.flush()

    print(f"Created {len(states)-skipped} (skipped {skipped}) notebooks and html versions in " +
          f"{time.time()-start_time} seconds")
    sys.stdout.flush()

In [None]:
# Names of the US regions to create reports for.
states = get_US_region_list()

# Actual calculations
# NOTE(review): the next two lines restrict the run to the first 8 states and
# fix the number of processes to 2 - presumably leftovers from testing;
# confirm before a full run.
states = states[0:8]
cores=2

# A truthy `cores` is used as the number of worker processes; otherwise the
# reports are created one after the other in this process.
if cores:
    with Pool(cores) as pool:
        parallel_html_for_US(states, wwwroot, pool)
else:
    create_html_for_US(states, wwwroot)
