In [1]:
%load_ext autoreload
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
sys.path.insert(0,os.path.dirname(currentdir))

In [4]:
%autoreload 2

import pandas as pd

from tools import DataLoader
from configparser import ConfigParser
from urllib.request import urlopen
import json
import requests
from bs4 import BeautifulSoup

# configuration
parser = ConfigParser()
parser.read("../settings.ini")


%aimport

Modules to reload:
all-except-skipped

Modules to skip:



In [5]:
load=DataLoader(parser)

In [7]:
class Extract:

    def __init__(self, parser: ConfigParser):

        self.data = self.load_jhu(parser)

        self.country_info = self.read_geonames_country_info(parser)

    def load_jhu(self, parser):

        lookup = pd.read_csv(parser.get("urls", "jhu_lookup_url"))
        lookup.rename(columns={"Country_Region": "region"}, inplace=True)

        country_info = self.read_geonames_country_info(parser)

        def read_prepare_data(url):
            data_raw = pd.read_csv(parser.get("urls", url))
            data_raw.rename(columns={"Country/Region": "region"}, inplace=True)
            data = (
                data_raw.groupby("region")
                .sum()
                .drop(columns=["Lat", "Long"])
                .reset_index()
            )

            return data

        def create_timeseries(data, lookup, value_name):
            id_vars = "region"
            var_name = "date"
            timeseries = pd.melt(
                data, id_vars=id_vars, var_name=var_name, value_name=value_name
            )
            timeseries = pd.merge(
                lookup[["iso2", "iso3", "code3", id_vars]
                       ].groupby(id_vars).first(),
                timeseries,
                on=id_vars,
                how="inner",
            )
            timeseries.loc[:, var_name] = pd.to_datetime(
                timeseries.loc[:, var_name])
            return timeseries

        confirmed_data = read_prepare_data("jhu_confirmed_url")
        deaths_data = read_prepare_data("jhu_deaths_url")
        recovered_data = read_prepare_data("jhu_recovered_url")

        confirmed = create_timeseries(confirmed_data, lookup, "confirmed")
        deaths = create_timeseries(deaths_data, lookup, "deaths")
        recovered = create_timeseries(recovered_data, lookup, "recovered")

        data = pd.merge(
            deaths[["date", "region", "iso3", "deaths"]],
            confirmed[["date", "confirmed", "iso3"]],
            on=["iso3", "date"],
            how="inner",
        )
        data = pd.merge(
            data,
            recovered[["date", "recovered", "iso3"]],

            on=["iso3", "date"],
            how="inner",
        )

        data.rename(columns={"confirmed": "cases"}, inplace=True)
    

        return data

    def read_geonames_country_info(self, parser):

        res = requests.get(parser.get("urls", "geonames_countries_url"))
        soup = BeautifulSoup(res.content, "lxml")
        table = soup.find_all("table", id="countries")
        country_info = pd.read_html(str(table), keep_default_na=False)[0]
        country_info.rename(
            columns={
                "ISO-3166alpha2": "iso_alpha2",
                "ISO-3166alpha3": "iso_alpha",
                "ISO-3166numeric": "iso_num",
                "Country": "region",
                "Population": "population",
                "Continent": "continent",
            },
            inplace=True,
        )

        return country_info

In [119]:
%%time

class Transform(Extract):

    def __init__(self, parser: ConfigParser, indicators):

        super().__init__(parser)

        self.data = self.add_country_info(self.data, self.country_info)

        timeseries=data.data.groupby(["date", "continent"]).agg({"iso3": "max","deaths": "sum", "cases": "sum", "recovered": "sum", "population": "sum"})
        timeseries.reset_index(inplace=True)
        timeseries.rename(columns={"continent": "region"}, inplace=True)
       
        world=data.data.groupby(["date"]).agg({"region": "max","iso3": "max","deaths": "sum", "cases": "sum", "recovered": "sum", "population": "sum"})
        world.reset_index(inplace=True)
        world.loc[:,"region"] = "World"
      
        timeseries = timeseries.append(world)
        timeseries.loc[:,"iso3"] = False
       
        timeseries = timeseries.append(data.data.drop(columns={"continent"}))

        for i, indicator in indicators().items():
            timeseries = self.add_indicator(
                timeseries, indicator["name"], indicator["columns"], indicator["norming"], indicator["digits"])

        self.timeseries = timeseries

        self.per_country_max = timeseries[timeseries.date ==timeseries.date.max()]

    def add_indicator(self, data_input, name, attributes, norming, digits):
        '''
        adds columns with values for indicators as calculated from "attributes"
        '''

        data = data_input.copy()
        data.loc[:, name] = (data.loc[:, attributes[0]] /
                             data.loc[:, attributes[1]] * norming).round(digits)

        return data

    def add_country_info(self, data, country_info):
        data = pd.merge(
            data,
            country_info[["iso_alpha", "population", "continent"]],
            left_on="iso3",
            right_on="iso_alpha",
            how="inner",
        )
        
        data.drop(columns=["iso_alpha"], inplace=True)
        
        return data

    def create_timeseries(self, data, region):

        if region:
            timeseries = data.groupby(
                [region, "date"]).sum()

            timeseries.reset_index(inplace=True)
        else:
            timeseries = data.groupby(
                "date").sum()

            timeseries.reset_index(inplace=True)

        return timeseries


class DataLoader(Transform):
    def __init__(self, parser: ConfigParser):

        super().__init__(parser, self.indicators)

        self.regions = self.regions()

        self.countries = self.countries_geojson(parser)

    def countries_geojson(self, parser):

        with urlopen(parser.get("urls", "mapbox_countries_url")) as response:
            countries = json.load(response)

        return countries

    def regions(self):

        regions = {
            "World": {"name": "World", "center": {"lat": 35, "lon": 0}, "zoom": 0.2},
            "EU": {"name": "Europe", "center": {"lat": 52, "lon": 0}, "zoom": 2.5},
            "NA": {"name": "N.America", "center": {"lat": 50, "lon": -95}, "zoom": 2},
            "SA": {"name": "S.America", "center": {"lat": -20, "lon": -70}, "zoom": 1.7},
            "AS": {"name": "Asia", "center": {"lat": 40, "lon": 90}, "zoom": 1.7},
            "AF": {"name": "Africa", "center": {"lat": 5, "lon": 10}, "zoom": 1.6},
            "OC": {"name": "Oceania", "center": {"lat": -30, "lon": 145}, "zoom": 2.2}
        }

        return regions

    def indicators(self):
        indicators = {
            "cases": {
                "name": "cases/1M capita",
                "columns": ["cases", "population"],
                "norming": 100000,
                "digits": 0
            },
            "deaths": {
                "name": "deaths/1M capita",
                "columns": ["deaths", "population"],
                "norming": 100000,
                "digits": 0},
            "recovered": {
                "name": "recovered(%)",
                "columns": ["recovered", "cases"],
                "norming": 100,
                "digits": 0},
            "lethality": {
                "name": "lethality(%)",
                "columns": ["deaths", "cases"],
                "norming": 100,
                "digits": 2},
            "mortality": {
                "name": "mortality(%)",
                "columns": ["deaths", "population"],
                "norming": 100,
                "digits": 3},
        }

        return indicators
    
data = DataLoader(parser)

CPU times: user 1.74 s, sys: 63.2 ms, total: 1.8 s
Wall time: 3.59 s


In [122]:
data.timeseries[data.timeseries.region=="World"]

Unnamed: 0,date,region,iso3,deaths,cases,recovered,population,cases/1M capita,deaths/1M capita,recovered(%),lethality(%),mortality(%)
0,2020-01-22,World,False,17,555,28,7527061250,0.0,0.0,5.0,3.06,0.000
1,2020-01-23,World,False,18,654,30,7527061250,0.0,0.0,5.0,2.75,0.000
2,2020-01-24,World,False,26,941,36,7527061250,0.0,0.0,4.0,2.76,0.000
3,2020-01-25,World,False,42,1434,39,7527061250,0.0,0.0,3.0,2.93,0.000
4,2020-01-26,World,False,56,2118,52,7527061250,0.0,0.0,2.0,2.64,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...
78,2020-04-09,World,False,95437,1594445,353326,7527061250,21.0,1.0,22.0,5.99,0.001
79,2020-04-10,World,False,102505,1690748,375425,7527061250,22.0,1.0,22.0,6.06,0.001
80,2020-04-11,World,False,108483,1770510,401433,7527061250,24.0,1.0,23.0,6.13,0.001
81,2020-04-12,World,False,114071,1845675,421045,7527061250,25.0,2.0,23.0,6.18,0.002


In [57]:
timeseries=data.data.groupby(["date", "continent"]).agg({"iso3": "max","deaths": "sum", "cases": "sum", "recovered": "sum", "population": "sum"})
timeseries.reset_index(inplace=True)
timeseries.rename(columns={"continent": "region"}, inplace=True)
timeseries.loc[:,"iso3"] = False
timeseries#[timeseries.region=="AF"]

Unnamed: 0,date,region,iso3,deaths,cases,recovered,population
0,2020-01-22,AF,False,0,0,0,1264761817
1,2020-01-22,AS,False,17,554,28,4474285900
2,2020-01-22,EU,False,0,0,0,745571470
3,2020-01-22,,False,0,1,0,578407428
4,2020-01-22,OC,False,0,0,0,40635640
...,...,...,...,...,...,...,...
493,2020-04-13,AS,False,11319,309324,150494,4474285900
494,2020-04-13,EU,False,80036,909441,231638,745571470
495,2020-04-13,,False,24960,620018,53566,578407428
496,2020-04-13,OC,False,66,7722,2353,40635640


In [82]:
world=data.data.groupby(["date"]).agg({"iso3": "max","deaths": "sum", "cases": "sum", "recovered": "sum", "population": "sum"})
world.reset_index(inplace=True)
world.loc[:,"continent"] = "World"
world.loc[:,"iso3"] = False
#timeseries.rename(columns={"continent": "region"}, inplace=True)
#timeseries[timeseries.region=="AF"]
world
timeseries.append(timeseries)

Unnamed: 0,date,region,iso3,deaths,cases,recovered,population
0,2020-01-22,AF,False,0,0,0,1264761817
1,2020-01-22,AS,False,17,554,28,4474285900
2,2020-01-22,EU,False,0,0,0,745571470
3,2020-01-22,,False,0,1,0,578407428
4,2020-01-22,OC,False,0,0,0,40635640
...,...,...,...,...,...,...,...
493,2020-04-13,AS,False,11319,309324,150494,4474285900
494,2020-04-13,EU,False,80036,909441,231638,745571470
495,2020-04-13,,False,24960,620018,53566,578407428
496,2020-04-13,OC,False,66,7722,2353,40635640


In [118]:
timeseries=data.data.groupby(["date", "continent"]).agg({"iso3": "max","deaths": "sum", "cases": "sum", "recovered": "sum", "population": "sum"})
timeseries.reset_index(inplace=True)
timeseries.rename(columns={"continent": "region"}, inplace=True)
#timeseries.loc[:,"continent"] = timeseries.loc[:,"region"]

world=data.data.groupby(["date"]).agg({"region": "max","iso3": "max","deaths": "sum", "cases": "sum", "recovered": "sum", "population": "sum"})
world.reset_index(inplace=True)
world.loc[:,"region"] = "World"
#world.drop(columns={"continent"}, inplace=True)

timeseries = timeseries.append(world)
timeseries.loc[:,"iso3"] = False
#timeseries.loc[:,"Region"] = False

timeseries = timeseries.append(data.data.drop(columns={"continent"}))

timeseries[timeseries.region=="AF"]
#world

Unnamed: 0,date,region,iso3,deaths,cases,recovered,population
0,2020-01-22,AF,False,0,0,0,1264761817
6,2020-01-23,AF,False,0,0,0,1264761817
12,2020-01-24,AF,False,0,0,0,1264761817
18,2020-01-25,AF,False,0,0,0,1264761817
24,2020-01-26,AF,False,0,0,0,1264761817
...,...,...,...,...,...,...,...
468,2020-04-09,AF,False,627,12253,1624,1264761817
474,2020-04-10,AF,False,693,12917,2139,1264761817
480,2020-04-11,AF,False,742,13631,2358,1264761817
486,2020-04-12,AF,False,788,14497,2823,1264761817


In [125]:
len(["test"])

1

In [33]:
        indicators = {
            "cases": {
                "name": "cases/1M capita",
                "columns": ["cases", "population"],
                "norming": 100000
            },
            "deaths": {
                "name": "deaths/1M capita",
                "columns": ["deaths", "population"],
                "norming": 100000},
            "recovered": {
                "name": "recovered/1M capita",
                "columns": ["recovered", "population"],
                "norming": 100000},
            "lethality": {
                "name": "lethality(%)",
                "columns": ["deaths", "cases"],
                "norming": 100},
            "mortality": {
                "name": "mortality(%)",
                "columns": ["deaths", "population"],
                "norming": 100},
        }

In [34]:
def dropdown_options(indicators):
    options = []
    for i,j in indicators.items():
        print(j["name"])
        options.append({"label": j["name"], "value": i})
        
    return options
        
test = dropdown_options(indicators)
test

cases/1M capita
deaths/1M capita
recovered/1M capita
lethality(%)
mortality(%)


[{'label': 'cases/1M capita', 'value': 'cases'},
 {'label': 'deaths/1M capita', 'value': 'deaths'},
 {'label': 'recovered/1M capita', 'value': 'recovered'},
 {'label': 'lethality(%)', 'value': 'lethality'},
 {'label': 'mortality(%)', 'value': 'mortality'}]

In [36]:
for i,j in indicators.items():
    print(j["columns"])

['cases', 'population']
['deaths', 'population']
['recovered', 'population']
['deaths', 'cases']
['deaths', 'population']


In [38]:
indicators{0}

SyntaxError: invalid syntax (<ipython-input-38-9e91b292fa09>, line 1)

In [197]:
class Transform(Extract):

    def __init__(self, data, country_info):

        self.info = "transform data without loading it every time XXX"

        #self.data = super().data

        #self.data = self.extract.data

        self.data = self.add_country_info(data, country_info)

    def add_country_info(self, data,country_info):
        data = pd.merge(
            data,
            country_info[["iso_alpha", "population", "continent"]],
            left_on="iso3",
            right_on="iso_alpha",
            how="inner",
        )#.drop(columns=["iso3", "iso2", "code3"])

        return data

NameError: name 'Extract' is not defined

In [215]:
class Person:
    def __init__(self, fname, lname):
        self.firstname = fname
        self.lastname = lname
        
    def printname(self):
        print(self.firstname, self.lastname)

class Student(Person):
    def __init__(self, fname, lname):
        super().__init__(fname, lname)
        self.graduationyear = 2019

x = Student("Mike", "Olsen")
print(x.graduationyear)

2019


In [175]:
transform = Transform(load.data, load.country_info)


In [177]:
transform.data

Unnamed: 0,date,deaths,iso3,cases,recovered,iso_alpha,population,continent
0,2020-01-22,0,AFG,0,0,AFG,37172386,AS
1,2020-01-23,0,AFG,0,0,AFG,37172386,AS
2,2020-01-24,0,AFG,0,0,AFG,37172386,AS
3,2020-01-25,0,AFG,0,0,AFG,37172386,AS
4,2020-01-26,0,AFG,0,0,AFG,37172386,AS
...,...,...,...,...,...,...,...,...
14737,2020-04-07,2,ZWE,11,0,ZWE,14439018,AF
14738,2020-04-08,3,ZWE,11,0,ZWE,14439018,AF
14739,2020-04-09,3,ZWE,11,0,ZWE,14439018,AF
14740,2020-04-10,3,ZWE,13,0,ZWE,14439018,AF


In [50]:
data.deaths.sort_values("date", ascending=False)#[data.confirmed.date=="index"]#

AttributeError: 'DataLoader' object has no attribute 'deaths'

In [37]:
data.country_info.head(3)

Unnamed: 0,iso_alpha2,iso_alpha,iso_num,fips,region,Capital,Area in km²,population,continent
0,AD,AND,20,AN,Andorra,Andorra la Vella,468.0,77006,EU
1,AE,ARE,784,AE,United Arab Emirates,Abu Dhabi,82880.0,9630959,AS
2,AF,AFG,4,AF,Afghanistan,Kabul,647500.0,37172386,AS


In [38]:
data_norm = pd.merge(data.deaths.sort_values(["region", "date"]), data.country_info[["iso_alpha", "population", "continent"]], left_on="iso3", right_on="iso_alpha", how="inner").drop(columns=["iso3", "iso2", "code3"])
data_norm = pd.merge(data_norm, data.confirmed.sort_values(["region", "date"])[["date","confirmed",  "iso3"]], left_on=["iso_alpha", "date"], right_on=["iso3", "date"], how="inner").drop(columns=["iso3"])
data_norm = pd.merge(data_norm, data.recovered.sort_values(["region", "date"])[["date","recovered",  "iso3"]], left_on=["iso_alpha", "date"], right_on=["iso3", "date"], how="inner").drop(columns=["iso3"])

data_norm[data_norm.iso_alpha=="USA"]

Unnamed: 0,region,date,deaths,iso_alpha,population,continent,confirmed,recovered
13440,US,2020-01-22,0,USA,327167434,,1,0
13441,US,2020-01-23,0,USA,327167434,,1,0
13442,US,2020-01-24,0,USA,327167434,,2,0
13443,US,2020-01-25,0,USA,327167434,,2,0
13444,US,2020-01-26,0,USA,327167434,,5,0
...,...,...,...,...,...,...,...,...
13515,US,2020-04-06,10783,USA,327167434,,366667,19581
13516,US,2020-04-07,12722,USA,327167434,,396223,21763
13517,US,2020-04-08,14695,USA,327167434,,429052,23559
13518,US,2020-04-09,16478,USA,327167434,,461437,25410


In [10]:
data_norm.loc[:, "confirmed/1M capita"] = (
    data_norm.confirmed / data_norm.population * 1000000
).round(0)
data_norm.loc[:, "deaths/1M capita"] = (
    data_norm.deaths / data_norm.population * 1000000
).round(0)
data_norm.loc[:, "recovered/1M capita"] = (
    data_norm.recovered / data_norm.population * 1000000
).round(0)

data_norm[data_norm.iso_alpha=="USA"].tail()

NameError: name 'data_norm' is not defined

In [12]:
summary_country = data_norm[data_norm.date==data_norm.date.max()]#.groupby("iso_alpha").sum()



aggregation = {
    "cases": "sum",
    "deaths": "sum",
    "country": "max",
    "population": "max",
}
#summary_country = summary_country.agg(aggregation)

summary_country[summary_country.region=="Germany"]

NameError: name 'data_norm' is not defined

In [13]:
timeseries = data_norm.groupby(["continent", "date"]).sum()

world = timeseries.groupby("date").sum()
world = pd.DataFrame(index=[pd.Series(data="world").repeat(len(world.index)), world.index], data=world.values, columns=world.columns)

timeseries = pd.concat([timeseries, world])
timeseries#.loc["world","confirmed"]
timeseries.loc["NA"].head(50)

NameError: name 'data_norm' is not defined

In [15]:
summary_country.loc[:, "Cases/Mio. capita"] = (
    summary_country.cases / summary_country.population * 1000000
).round(0)
summary_country.loc[:, "Deaths/Mio. capita"] = (
    summary_country.deaths / summary_country.population * 1000000
).round(0)
summary_country.reset_index(inplace=True)

NameError: name 'summary_country' is not defined

In [39]:
# check consistency
test = data.confirmed[data.confirmed.iso2=="CN"]
test[test.date=="1/22/20"]

Unnamed: 0,region,iso2,iso3,code3,date,confirmed
2880,China,CN,CHN,156.0,2020-01-22,548


In [40]:
geonames_country_info = "http://download.geonames.org/export/dump/countryInfo.txt"
country_info = pd.read_csv(geonames_country_info, skiprows=49, delimiter="\t", na_filter=False)
country_info.drop(columns=["#ISO"], inplace=True)
country_info.rename(columns={"fips": "geoId", "ISO3":"iso_alpha", "ISO-Numeric": "iso_num"}, inplace=True)
country_info[country_info.Country=="Canada"].head(3)

Unnamed: 0,iso_alpha,iso_num,geoId,Country,Capital,Area(in sq km),Population,Continent,tld,CurrencyCode,CurrencyName,Phone,Postal Code Format,Postal Code Regex,Languages,geonameid,neighbours,EquivalentFipsCode
37,CAN,124,CA,Canada,Ottawa,9984670.0,37058856,,.ca,CAD,Dollar,1,@#@ #@#,^([ABCEGHJKLMNPRSTVXY]\d[ABCEGHJKLMNPRSTVWXYZ]...,"en-CA,fr-CA,iu",6251999,US,


In [18]:
def create_timeseries(data, lookup, id_vars, var_name, value_name):
    timeseries_d = pd.melt(data, id_vars=id_vars, var_name=var_name, value_name=value_name)
    timeseries = pd.merge(lookup[["iso2", "iso3", "code3", "Country/Region"]].groupby("Country/Region").first(), timeseries_d, on=id_vars, how="right")
    return timeseries

test = create_timeseries(recovered_c, lookup, "Country/Region", "date", "recovered")
test[test.iso2=="CN"]

NameError: name 'recovered_c' is not defined

In [41]:
data.__dict__.keys()

dict_keys(['country_info', 'lookup', 'confirmed', 'deaths', 'recovered', 'countries', 'data_norm', 'timeseries', 'world', 'per_country_max', 'regions'])

In [26]:
recovered_timeseries_1 = pd.merge(lookup[["iso2", "iso3", "code3", "Country/Region"]], recovered_timeseries, left_on="Country/Region", right_on="Country/Region", how="left").drop(columns=["Country/Region"])
recovered_timeseries_1[recovered_timeseries_1.iso2=="CN"]

Unnamed: 0,iso2,iso3,code3,date,recovered
18560,CN,CHN,156.0,1/22/20,28
18561,CN,CHN,156.0,1/23/20,30
18562,CN,CHN,156.0,1/24/20,36
18563,CN,CHN,156.0,1/25/20,39
18564,CN,CHN,156.0,1/26/20,49
...,...,...,...,...,...
21115,CN,CHN,156.0,4/6/20,77310
21116,CN,CHN,156.0,4/7/20,77410
21117,CN,CHN,156.0,4/8/20,77567
21118,CN,CHN,156.0,4/9/20,77679
