## Setup

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import datetime
import json
import os
import shutil
import sys
from multiprocessing import Pool, cpu_count

import ipynb_py_convert
import nbformat
import numpy as np
from nbconvert import HTMLExporter
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert.writers import FilesWriter

from itertools import compress
import functools
import concurrent.futures
from tqdm.notebook import trange, tqdm
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import concurrent.futures

In [None]:
from report_generators.reporters import CountryReport, GermanyReport, USAReport

In [None]:
# Debug switch; it is not read anywhere in the visible part of this notebook.
debug = False

In [None]:
#  Crude workaround for not knowing how to use concurrent.futures properly:
#  a module-level `shutdown` flag that `create_html_report_single` polls so
#  that running workers stop once a KeyboardInterrupt has been caught.
#  NOTE: a `global` statement at module level has no effect; the name only
#  comes into existence when one of the report cells below assigns to it.
global shutdown

In [None]:
#  Number of parallel workers used for report generation.
#  Set to a falsy value (False, None, 0) to generate the reports serially,
#  or to 'auto' to derive the number from the CPU count.
workers = 16 #'auto'

if workers == 'auto':
    # try at most 4 to reduce probability of error message like
    # the one shown at https://github.com/jupyter/jupyter_client/issues/541
    # (the previous code referenced an undefined name `cores` here)
    workers = min(max(1, cpu_count()), 4)


if workers:
    print(f'Using {workers} processes')

# Root directory of the generated web site; passed to every reporter below.
wwwroot = "wwwroot"

In [None]:
%config InlineBackend.figure_formats = ['svg']

from coronavirus import *
from coronavirus import MetadataRegion

pd.set_option('display.float_format', '{:.2f}'.format)  #  Disable pandas scientific notation

Cleaning of the cache and copying of files have moved to:

- `generate-webpage-clean-setup.py` and 
- `generate-webpage-clean-setup.sh`

In [None]:
# Prefix prepended to the title of every index page generated below.
TITLE_PREFIX = "Tracking plots: "

### Download Datasets

In [None]:
# Fetch the global tables: deaths first, then cases.
d = fetch_deaths()
c = fetch_cases()

# Sanity check: both tables must be indexed by the same countries, in the
# same order.
countries, countries2 = d.index, c.index
assert (countries2 == countries).all()

In [None]:
# Fetch the US tables: cases first, then deaths.
data_US_cases, data_US_deaths = fetch_cases_US(), fetch_deaths_US()

In [None]:
# Also fetch the data for Germany now, so it is available later from the
# cache. The returned value is not used again in the visible cells;
# `get_germany_regions_list` calls `fetch_data_germany()` itself.
germany = fetch_data_germany()

### Generic Functions

In [None]:
def does_wwwroot_exist(wwwroot):
    """Check that the output directory `wwwroot` exists.

    Parameters
    ----------
    wwwroot : str
        Path of the directory the generated html is written to.

    Returns
    -------
    None
        If the path exists.

    Raises
    ------
    ValueError
        If the path does not exist. The message includes a hint on how to
        create the directory (previously the hint was assembled but never
        shown).
    """
    if os.path.exists(wwwroot):
        return

    hint = ('To put the html into github repo for webhosting, run '
            '"git clone git@github.com:oscovida/oscovida.github.io.git wwwroot" or similar')
    raise ValueError(f"directory {wwwroot} missing. {hint}")

## Index Page Generation

In [None]:
def create_markdown_index_list(category):
    """Assemble a markdown table of regions and return it as a string.

    The table has one row per region. The first column ("Location") holds
    the region's one-line summary, rendered as a markdown link to its html
    page when that page exists. Illustrative output:

    | Location                             | Total cases   | Total deaths   | New cases last week   |
    |:-------------------------------------|:--------------|:---------------|:----------------------|
    | [Afghanistan](html/Afghanistan.html) | 1,351         | 43             | 120                   |
    | [Albania](html/Albania.html)         | 678           | 27             | 35                    |

    Parameters
    ----------
    category : str
        One of "world", "Germany", "US" (select regions of that category)
        or "all-regions" (no filtering).

    Raises
    ------
    NotImplementedError
        If `category` is unknown, or if a region's 'html-file' entry is
        neither a string nor NaN.
    """

    known_categories = ["world", "Germany", "US"]

    # gather data
    regions_all = MetadataRegion.get_all_as_dataframe()
    if category in known_categories:
        # select those we are interested in
        regions = regions_all[regions_all['category'] == category]
    elif category == "all-regions":
        regions = regions_all
    else:
        raise NotImplementedError(f"category {category} is unknown."+
                                  f" Known values are {known_categories + ['all-regions']}")

    # change index to contain URLs and one-line summary in markdown syntax
    def compose_md_url(x):
        one_line_summary, html = x
        if isinstance(html, str):
            return "[" + one_line_summary + "](" + os.path.join('html', html) +")"
        # If html was not produced, the variable html is NaN. Test the value
        # itself instead of its repr(), which depends on the scalar type and
        # on the NumPy version.
        if isinstance(html, (float, np.floating)) and np.isnan(html):
            print(f"Missing html for {one_line_summary} - will not add link to html: \n{x}")
            return one_line_summary
        raise NotImplementedError("Don't know how to proceed: ", one_line_summary, html, x)

    new_index = regions[['one-line-summary', 'html-file']].apply(compose_md_url, axis=1)
    regions2 = regions.set_index(new_index)
    regions2.index.name = "Location"

    # select columns
    regions3 = regions2[['max-cases', 'max-deaths', 'cases-last-week']]
    # Thousands comma separator.
    # NOTE(review): newer pandas releases deprecate `applymap` in favour of
    # `DataFrame.map`; kept as is so that older pandas versions keep working.
    regions4 = regions3.applymap('{:,}'.format)

    # rename columns
    rename_dict = {'max-cases' : 'Total cases',
                   'max-deaths' : 'Total deaths',
                   'cases-last-week' : 'New cases last week'}
    regions5 = regions4.rename(columns=rename_dict)

    return regions5.to_markdown()


In [None]:
def create_markdown_index_page(md_content, title, pelican_file_path,
                               save_as, wwwroot, slug=None):
    """Write a pelican markdown file: a metadata header followed by content.

    The file written looks like this:

    title: Germany
    tags: Data, Plots, Germany
    save-as: germany
    slug: germany
    date: 2020/04/11 08:00

    followed by two blank lines, `md_content` and a final newline.

    Parameters
    ----------
    md_content : str
        Markdown body of the page.
    title : str
        Page title; also appended to the tags.
    pelican_file_path : str
        Path of the markdown file to (over)write.
    save_as : str
        Value of the 'save-as' metadata entry.
    wwwroot : str
        Not used by this function; kept so that existing callers keep working.
    slug : str, optional
        Value of the 'slug' metadata entry; defaults to `save_as`.
    """

    if slug is None:
        slug = save_as

    date_time = datetime.datetime.now().strftime("%Y/%m/%d %H:%M")
    # have stopped using a "category: Data" entry (22 April 2020)
    header = [
        f"title: {title}",
        f"tags: Data, Plots, {title}",
        f"save-as: {save_as}",
        f"slug: {slug}",
        f"date: {date_time}",
    ]

    # Region names contain non-ASCII characters, so write UTF-8 explicitly
    # rather than relying on the platform's default encoding.
    with open(pelican_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(header))
        f.write("\n\n\n")
        f.write(md_content)
        f.write("\n")

In [None]:
def create_index_page(sections, rootname, wwwroot):
    """Write `<rootname>.md` into `wwwroot` and convert it to html with pandoc.

    Parameters
    ----------
    sections : dict
        Key is the section title, value is the markdown text of the section.
    rootname : str
        File name (without extension) of the markdown and html files.
    wwwroot : str
        Directory in which both files are created.

    Returns
    -------
    str
        Name of the html file (without the `wwwroot` directory).

    Raises
    ------
    subprocess.CalledProcessError
        If pandoc exits with a non-zero status.
    """
    # Imported locally because `subprocess` is not among the notebook's
    # explicit imports.
    import subprocess

    md_file = rootname + ".md"
    html_file = rootname + ".html"
    md_path = os.path.join(wwwroot, md_file)
    html_path = os.path.join(wwwroot, html_file)

    with open(md_path, "w", encoding="utf-8") as f:
        for title, text in sections.items():
            f.write(f"# {title}\n\n")
            f.write(text)
    print(f"Written overview to {md_file}.")

    # Pass the arguments as a list (no shell) so that paths containing
    # spaces or shell metacharacters are handed to pandoc unchanged.
    subprocess.check_call(["pandoc", "-t", "html", "-o", html_path, md_path])
    return html_file

### Serial and Parallel Report Functions

In [None]:
def create_html_report_single(region, *,
                              Reporter, wwwroot,
                              expiry_hours=2, attempts=3, force=False, verbose=False):
    """Generate the html report for one region, retrying on failure.

    Parameters
    ----------
    region
        Region identifier, passed as first argument to `Reporter`.
    Reporter
        Callable/class invoked as ``Reporter(region, wwwroot=..., verbose=...)``.
        The result must offer ``metadata.last_updated_hours_ago()`` and
        ``generate()``.
    wwwroot : str
        Output directory handed to the reporter.
    expiry_hours : number
        A report updated less than this many hours ago is considered fresh
        and is not regenerated (unless `force` is set).
    attempts : int
        Maximum number of tries before the last exception is re-raised.
    force : bool
        Regenerate even if the existing report is still fresh.
    verbose : bool
        Passed on to the reporter.

    Raises
    ------
    KeyboardInterrupt
        If the module-level `shutdown` flag is set.
    Exception
        Whatever the reporter raised on the final attempt.
    """
    for attempt in range(attempts):
        # Cooperative cancellation: the flag is set by the calling cell once
        # it has caught a KeyboardInterrupt.
        if globals().get('shutdown'):
            raise KeyboardInterrupt
        try:
            report = Reporter(region, wwwroot=wwwroot, verbose=verbose)
            if report.metadata.last_updated_hours_ago() < expiry_hours and not force:
                # Report is still fresh: nothing to do. (Previously this was
                # `continue`, which rebuilt the reporter on every remaining
                # attempt without ever generating anything.)
                return
            report.generate()
        except Exception as e:
            # KeyboardInterrupt derives from BaseException, so it is never
            # caught here and propagates on its own.
            if attempt + 1 == attempts:
                print(f"Error for {region}")
                print(e)
                raise
        else:
            return

In [None]:
def create_html_reports_serial(regions, *,
                               Reporter, wwwroot,
                               expiry_hours=2, attempts=3, force=False, verbose=False):
    """Generate the reports for `regions` one after the other.

    A progress bar shows the region currently being processed. All keyword
    arguments are forwarded to `create_html_report_single`; `expiry_hours`
    used to be dropped here, so a non-default value had no effect.

    Parameters
    ----------
    regions : list
        Region identifiers. An entry that is itself a list is labelled in
        the progress bar by its last element.
    Reporter, wwwroot, expiry_hours, attempts, force, verbose
        See `create_html_report_single`.
    """
    pbar = tqdm(regions)
    for region in pbar:
        region_str = region[-1] if isinstance(region, list) else region
        pbar.set_description(f"Processing {region_str}")
        create_html_report_single(region,
                                  Reporter=Reporter, wwwroot=wwwroot,
                                  expiry_hours=expiry_hours, attempts=attempts,
                                  force=force, verbose=verbose)

In [None]:
def create_html_reports_parallel(regions, workers, pool, *,
                                 Reporter, wwwroot,
                                 expiry_hours=2, attempts=3, force=False, verbose=False):
    """Distribute `regions` over `workers` tasks and run them on `pool`.

    Regions are dealt out round-robin (region i goes to worker
    ``i % workers``), and every worker processes its share with
    `create_html_reports_serial`.

    Parameters
    ----------
    regions : sequence
        Region identifiers; entries that are None are skipped.
    workers : int
        Number of tasks to split the regions into (must be positive).
    pool
        Object with a ``map(func, iterable)`` method, e.g. a
        `concurrent.futures` executor.
    Reporter, wwwroot, expiry_hours, attempts, force, verbose
        Forwarded to `create_html_reports_serial`. `expiry_hours` used to be
        dropped here.
    """
    regions = list(regions)
    # Round-robin split. This replaces the former pad-with-None / zip /
    # filter(None.__ne__) construction, which relied on the truthiness of
    # NotImplemented (deprecated since Python 3.9).
    regions_per_worker = [
        [r for r in regions[w::workers] if r is not None]
        for w in range(workers)
    ]

    print(f"Using {workers} workers with tasks:")
    for n, share in enumerate(regions_per_worker):
        if len(share) > 5:
            print(f"\t{n}: {len(share)} regions...")
        else:
            print(f"\t{n}: {share}")
    print("")

    wrapper = functools.partial(create_html_reports_serial,
                                Reporter=Reporter, wwwroot=wwwroot,
                                expiry_hours=expiry_hours, attempts=attempts,
                                force=force, verbose=verbose)

    # NOTE(review): with a concurrent.futures executor, map() returns a lazy
    # result iterator. It is deliberately not consumed here (as before), so
    # exceptions raised inside a worker are not re-raised in the caller;
    # confirm that this best-effort behaviour is intended.
    pool.map(wrapper, regions_per_worker)

In [None]:
def create_html_reports(*, Reporter, regions, wwwroot,
                        expiry_hours=2, attempts=3, force=False, verbose=False,
                        workers=None):
    """Generate the reports for `regions`, in parallel or serially.

    If `workers` is truthy, a thread pool with that many workers is created
    and the work is handed to `create_html_reports_parallel`; otherwise the
    regions are processed by `create_html_reports_serial`. All remaining
    keyword arguments are passed through unchanged.
    """
    options = dict(Reporter=Reporter, wwwroot=wwwroot,
                   expiry_hours=expiry_hours, attempts=attempts,
                   force=force, verbose=verbose)

    if not workers:
        create_html_reports_serial(regions, **options)
        return

    #  Works with both ThreadPoolExecutor and ProcessPoolExecutor
    #  for this task multithreading and multiprocessing perform
    #  about the same
    with ThreadPoolExecutor(max_workers=workers) as pool:
        create_html_reports_parallel(regions, workers, pool, **options)

## Country Report Generation

In [None]:
def get_country_list():
    """Return the sorted list of unique country names.

    The names are taken from the index of the deaths table; the index of
    the cases table is asserted to be identical.
    """
    deaths = fetch_deaths()
    cases = fetch_cases()

    country_index = deaths.index
    assert (cases.index == country_index).all()

    # Here we should identify regions in countries, and process those.
    # Instead, as a quick hack to get started, we'll just take one country
    # and the current "get_country" method will sum over all regions of one country if only
    # the country name is given.
    unique_countries = country_index.drop_duplicates()
    return sorted(unique_countries)

In [None]:
# Sorted list of unique country names; this rebinds `countries`, which held
# the index of the deaths table from the download cell above.
countries = get_country_list()

In [None]:
#  The KeyboardInterrupt handling used to be in `create_html_reports`
#  wrapping the executor, where it didn't work, but it does work here
#  so... *shrug*
#  Reset the flag first (as the Germany and USA cells do); otherwise a flag
#  left over from an earlier interrupted run makes every worker abort at once.
shutdown = False
try:
    create_html_reports(
            Reporter=CountryReport, regions=countries,
            wwwroot=wwwroot, workers=workers,
        )
except KeyboardInterrupt:
    # Tell the workers that are still running to stop.
    shutdown = True

In [None]:
# Build the markdown table for all countries and write the pelican page.
index_md = create_markdown_index_list(category="world")

create_markdown_index_page(
    md_content=index_md,
    title=TITLE_PREFIX + " Countries of the world",
    pelican_file_path="pelican/content/countries.md",
    save_as="countries",
    wwwroot=wwwroot,
)

## Germany Report Generation

In [None]:
def get_germany_regions_list():
    """Return the unique [Bundesland, Landkreis] pairs as a list of lists.

    The pairs are sorted by Bundesland first and Landkreis second.
    """
    columns = ['Bundesland', 'Landkreis']
    pairs = fetch_data_germany()[columns]
    unique_pairs = pairs.sort_values(columns).drop_duplicates()
    return unique_pairs.values.tolist()

In [None]:
germany_regions = get_germany_regions_list()
wwwroot = "wwwroot"

# data cleaning: on 13 April, we had a Landkreis "LK Göttingen (alt)"
# with only one data point. This causes plots to fail, because there
# is nothing to plot, and then the legend() command failed.
# We assume that the RKI labels unusual data with '(alt)', and remove those.

# One flag per region: True if the Landkreis name contains '(alt)'.
alt_data_sets = ["(alt)" in r[1].lower() for r in germany_regions]
if any(alt_data_sets):
    bad_datasests = list(compress(germany_regions, alt_data_sets))

    print(f"Removing datasets label with '(alt)': {bad_datasests}")

    for bd in bad_datasests:
        c, d, _ = germany_get_region(landkreis=bd[1])
        print(f"\tremoved: {bd} : len(cases)={len(c)}, len(deaths)={len(d)}")

    # Build a new list instead of popping by index: popping ascending indices
    # shifts the remaining entries, so with more than one '(alt)' data set
    # the wrong regions were removed (or an IndexError was raised).
    germany_regions = [region
                       for region, is_alt in zip(germany_regions, alt_data_sets)
                       if not is_alt]

In [None]:
#  The interrupt handling lives in this cell because it did not work when it
#  wrapped the executor inside `create_html_reports` - here it does *shrug*
#  Clear the cancellation flag, run, and set the flag again on Ctrl-C.
shutdown = False
try:
    create_html_reports(
        Reporter=GermanyReport,
        regions=germany_regions,
        wwwroot=wwwroot,
        workers=workers,
    )
except KeyboardInterrupt:
    shutdown = True

In [None]:
# Build the markdown table for the German regions and write the pelican page.
index_md = create_markdown_index_list("Germany")

create_markdown_index_page(
    md_content=index_md,
    title=TITLE_PREFIX + " Germany",
    pelican_file_path="pelican/content/germany.md",
    save_as="germany",
    wwwroot=wwwroot,
)

## USA Report Generation

In [None]:
# Regions handed to USAReport below. `get_US_region_list` is not defined in
# the visible cells - presumably it comes from `from coronavirus import *`.
states = get_US_region_list()

In [None]:
#  The interrupt handling lives in this cell because it did not work when it
#  wrapped the executor inside `create_html_reports` - here it does *shrug*
#  Clear the cancellation flag, run, and set the flag again on Ctrl-C.
shutdown = False
try:
    create_html_reports(
        Reporter=USAReport,
        regions=states,
        wwwroot=wwwroot,
        workers=workers,
    )
except KeyboardInterrupt:
    shutdown = True

In [None]:
# Build the markdown table for the US regions and write the pelican page.
index_md = create_markdown_index_list(category="US")

create_markdown_index_page(
    md_content=index_md,
    title=TITLE_PREFIX + " United States",
    pelican_file_path="pelican/content/US.md",
    save_as="us",
    wwwroot=wwwroot,
)

### HTML Pages for All Regions

In [None]:
# Build one markdown table covering every known region and write the page.
index_md = create_markdown_index_list(category="all-regions")

create_markdown_index_page(
    md_content=index_md,
    title=TITLE_PREFIX + " All regions and countries",
    pelican_file_path="pelican/content/all-regions.md",
    save_as="all-regions",
    wwwroot=wwwroot,
)

## Error Reporting

In [None]:
# List every region whose metadata was last updated more than 2 hours ago.
ms = MetadataRegion.get_all()
for name in ms:
    m = MetadataRegion(name)
    dt = m.last_updated_hours_ago()
    if not dt > 2:
        continue
    print(f"Problem with '{name}', last update: {dt} ago ")