In [18]:
import pandas as pd
import hvplot.pandas
import holoviews as hv
from holoviews import opts
from holoviews.plotting.util import process_cmap
import panel as pn
import numpy as np
from datetime import timedelta  
from scipy.optimize import curve_fit
from bokeh.models.formatters import DatetimeTickFormatter, NumeralTickFormatter
import matplotlib.colors as mcolors
import warnings
warnings.filterwarnings('ignore')

# Load the Panel extension and route pandas' .plot() calls to the HoloViews backend
pn.extension()
pd.options.plotting.backend = 'holoviews'

# Graph style used by the Bokeh renderer
hv.renderer('bokeh').theme = 'light_minimal'

# Default options applied to every Scatter and Curve element:
# thousands-separated y ticks and dates such as 'Jan 25' on the x axis
def_opts = dict(
    width = 500,
    height = 400,
    padding = 0.1,
    shared_axes = False,
    yformatter = NumeralTickFormatter(format='0,0'),
    xrotation = 45,
    xformatter = DatetimeTickFormatter(days = '%b %d'),
)
opts.defaults(
    opts.Scatter(**def_opts),
    opts.Curve(line_width=2.5, **def_opts),
)

In [19]:
cmap = list(mcolors.TABLEAU_COLORS.values())  # List of hex colour strings from matplotlib's TABLEAU_COLORS (per the original note, matching the default colormap of Bokeh/mpl)

def Plot_All (df, log = True, lin = True, kind='scatter', ylabel = 'Number of cases', yformatter = '%d', ymax = None):
    '''
    Plots the data in the following way:
    One tab for each column (country) of df, preceded by a first tab "All"
    that plots all columns together. Each tab holds either a single plot or
    a pair of "Linear" / "Logarithmic" sub-tabs.

    ARGUMENTS:
    - df: Dataframe containing all data (one column per country)
    - log: set to False if you don't want the logarithmic scale tab
           (only the linear plot is produced, whatever the value of lin)
    - lin: set to False if you don't want the linear scale tab
    - kind: you can choose the plot kind (scatter by default)
    - ylabel: label of y axis
    - yformatter: tick format of the y axis, applied to the logarithmic plots only
    - ymax: upper y limit of the logarithmic plots (None leaves it unset)

    RETURNS:
    - a pn.Tabs object with the "All" tab followed by one tab per column
    '''
    xformatter = DatetimeTickFormatter(days = '%b %d')   # Formatter for date time axis, so that date is e.g. 'Jan 25'

    opts_comm = {'title' : '', 'width' : 600, 'height':400, 'padding' : 0.1,   # Options in common between linear and log plots
                 'kind' : kind,  'xformatter': xformatter, 'xlabel': '', 'ylabel': ylabel}
    opts_lin = dict(opts_comm)    # Options for linear plots
    opts_log = {**opts_comm, 'logy' : True, 'yformatter' : yformatter, 'ylim' : [0.6, ymax]}  # Options for log plots

    def _plot(data, **extra):
        # Single place deciding between linear only, log only, or both as sub-tabs
        if not log:
            return data.plot(**opts_lin, **extra)
        if not lin:
            return data.plot(**opts_log, **extra)
        return pn.Tabs(('Linear', data.plot(**opts_lin, **extra)),
                       ('Logarithmic', data.plot(**opts_log, **extra)))

    # Create the first tab with all countries in the same plot (default colour cycle)
    tab_states = pn.Tabs(('All', _plot(df)))
    # Create other tabs, one for each country
    for i, Country in enumerate(df.columns):
        # Wrap around the palette so frames with more columns than colours don't raise IndexError
        tab_states.append((Country, _plot(df[Country], color = cmap[i % len(cmap)])))
    return tab_states

# **COVID-19 Data visualization**
**Data source**: 
https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series.
It's a repository from Johns Hopkins, which is updated around midnight each day.

**Source code that you can execute**:
https://mybinder.org/v2/gh/gioarma/covid-19_analysis/b5e55e36aa6ddaf4a797740d2fdfbb707ce901a1?filepath=Covid_19.ipynb

---

# ***WORLD***

## Countries with most infected people

These are the 20 countries with most Coronavirus cases in the world.
In the second tab you can see the countries with the highest number of new cases during the last day.

In [20]:
data = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
# Read the data transposing them: the CSV columns (metadata fields and dates) become rows,
# and every province/country entry becomes a column
df = pd.read_csv(data).T
df.columns = df.loc['Country/Region'].tolist()                                  # Change column names to Country
df = df.drop(['Province/State', 'Country/Region', 'Lat', 'Long'])               # Drop the metadata rows, keeping only the dates
df.index = pd.to_datetime(df.index, format='%m/%d/%y')                          # Change index to DateTime for plotting
df.index.name = 'Date'
df = df.fillna(0.0).astype(int)                                                 # Convert data to int
# Sum all columns regarding the same state. The grouping is done on the transposed frame
# because DataFrame.groupby(axis=1) is deprecated in recent pandas versions.
df = df.T.groupby(level = 0).sum().T

# Plot
def _add_value_labels(bars, values, offset = 0.08):
    '''
    Overlay a text label on each bar of a bar plot.

    ARGUMENTS:
    - bars: the bar plot to annotate
    - values: Series the bars were drawn from (index = bar names)
    - offset: label shift, as a fraction of the largest value

    RETURNS:
    - the overlay of bars and one hv.Text per entry of values, each showing
      the value as a thousands-separated integer
    '''
    labelled = bars
    shift = values.max()*offset
    for name, value in values.items():
        # int() keeps labels such as '1,234' even when the values are floats (e.g. after diff())
        labelled = labelled*hv.Text(x = name, y = value+shift, text = "{:,}".format(int(value)))
    return labelled

# Total cases on the last available day, 20 largest countries
Top_20 = df.iloc[-1, :].sort_values(ascending = False).head(20)
plt_Countries = Top_20.\
             plot(kind = 'bar', title = '', height = 500, legend = False, 
                  ylim = [0, df.max().max()*1.2], xformatter = '%d', hover = False).\
             opts(invert_axes=True, invert_yaxis = True, shared_axes = False)
overlay_Countries = _add_value_labels(plt_Countries, Top_20)

# New cases on the last available day, 20 largest countries
Top_20_new = df.diff().iloc[-1, :].sort_values(ascending = False).head(20)
plt_Countries_new = Top_20_new.plot(kind = 'bar', title = '', height = 500, legend = False,
                                ylim = [0, Top_20_new.max()*1.2], xformatter = '%d',   # Top_20_new holds the overall maximum
                                hover = False)\
                              .opts(invert_axes=True, invert_yaxis = True)
overlay_Countries_new = _add_value_labels(plt_Countries_new, Top_20_new)

pn.Tabs(('Total Cases', overlay_Countries.opts(shared_axes = False)), 
        ('New Cases', overlay_Countries_new.opts(shared_axes = False)))


## Total Cases

Among all countries, we show the data for the following ones:

* China
* Italy
* France
* Germany
* Spain
* United States
* United Kingdom

The plot below shows the number of total cases in the selected countries.
You can view all of them in the same plot, or one by one by switching tabs.

You can also view each plot in linear or logarithmic scale.

In [21]:
# Selected countries; columns are reordered by decreasing maximum number of cases
Countries = ['Italy', 'China', 'France', 'Germany', 'Spain', 'US', 'United Kingdom']
TotCases = df.filter(Countries)
sorted_columns = TotCases.max().sort_values(ascending = False).index.tolist()
TotCases = TotCases[sorted_columns]

# Tabbed plots on the left, a spacer, and a short usage tip on the right
tip_TotCases = pn.pane.Markdown(""" 
        <br><br><br>
        ### Tip:
        You can activate the items in the side bar to move the plots, zoom and save the image.""")
pn.Row(Plot_All(TotCases, kind = 'line', ylabel = 'Total cases'),
       pn.Spacer(width = 50),
       tip_TotCases)

## New cases

Here you can see how many *new* cases are recorded each day in the selected countries

In [22]:
# New cases = difference between successive rows. Negative differences (errors due to
# total cases data decreasing over time) are clipped to 0 before converting to int,
# so the result is always an integer frame and the cell can be re-run safely.
NewCases = TotCases.diff().fillna(0).clip(lower = 0).astype(int)

pn.Row(Plot_All(NewCases, log = False, kind = 'line', ylabel = 'New cases'), 
       pn.Spacer(width = 50), 
       pn.pane.Markdown(""" 
        <br><br><br>
        ### Tip:
        In the "All" tab you can deactivate specific plots by clicking on the corresponding country in the side bar."""))

## Visualizing the end of the pandemic

The following is a plot of the new cases as a function of the total cases for all countries. This plot is not easy to understand because time does not appear on the x axis. However, the total number of cases can only increase with time (or stay constant when the pandemic is over); therefore, going from left to right on the x axis, we are also going forward in time.

The reason why this plot is useful is the following. During the first phase of the epidemic, growth is always **exponential**, which means that the number of *new cases* is proportional to the number of *total cases*:
$$C_{new}\propto C_{tot}$$
Indeed, the number of new daily cases is basically the derivative of the total cases, making this a standard differential equation with an exponential solution. Therefore, in the first part of the epidemic the relation between new and total cases is **linear**, which is what we observe in the initial part of this plot.

The plot uses a **log-log scale**, where a linear relation still visually corresponds to a straight line. The effect of this scale is collapsing all the values that have the same order of magnitude close together so that only drastic changes are visible. The points shown here are actually a weekly average in order to reduce noise and allow for a better visualization of the linear regime.

In [23]:
# One scatter per country: total cases on x, weekly-averaged new cases on y
Plt_MinPhys = hv.Overlay()
for Country in TotCases.columns:
    # Zeros are turned into NaN on both axes (presumably because they cannot be drawn on the log scales set below)
    total_values = TotCases[Country].replace(0, np.nan).values
    # Rolling mean over a 7-day window centred on each day
    weekly_new_values = NewCases[Country].replace(0, np.nan).rolling(7, center = True).mean().values
    plt = hv.Scatter((total_values, weekly_new_values), label = Country)
    Plt_MinPhys = Plt_MinPhys*plt

# Log-log axes and styling shared by all the scatters of the overlay
scatter_opts = opts.Scatter(size = 5, logx = True, logy = True, 
                            ylim = (3, None), xlim = (10, None),
                            legend_position = 'right', xlabel = 'Total Cases', 
                            ylabel = 'New Cases (Weekly Average)', width = 750, height = 500, 
                            color = hv.Cycle('Category10'), 
                            xformatter = NumeralTickFormatter(format='0,0'))
Plt_MinPhys.opts(scatter_opts)

# Annotations: a label on the linear region and an arrow where the curve drops out of it
linear_label = hv.Text(text = 'Linear region:\n Exponential growth', x = 300, y = 1e3)
success_arrow = hv.Arrow(x=70e3, y = 40, direction='>', text = 'Success!  ', 
                         points=10, arrowstyle='->')
Plt_MinPhys*linear_label*success_arrow

If we look at **China**, we see that at a certain point the number of new cases drastically decreases, **escaping the linear region**, i.e. the exponential growth. This is because at that point in China the growth stopped being exponential.
What is nice about this plot is that it makes clear which countries have managed to escape from the exponential growth and are heading towards the end of the pandemic.

For more info about this kind of plot look at this video of **MinutePhysics**, who first proposed this idea:
https://www.youtube.com/watch?v=54XLXg4fYsc

## Growth Factor

The growth factor is given by the number of new cases on one day $N_d$ divided by the number of new cases on the previous day $N_{d-1}$:

$$G = \frac{N_d}{N_{d-1}}$$

When $G$ reaches the value of **1**, then we are at the inflection point, where the number of new cases every day starts to decrease. In the final part of the epidemic it decreases below 1, reaching 0 at the end.

In [24]:
# Growth factor: each day's new cases divided by the previous day's
# (shift() moves every row one position down, so row d is divided by row d-1).
# Infinite ratios (previous day = 0) and zero ratios are discarded as NaN.
G = (NewCases/NewCases.shift()).replace(np.inf, np.nan).replace(0, np.nan)

# Keep only the days where every country has a value, then smooth with a centred 7-day mean
G_smooth = G.dropna().rolling(7, center = True).mean()
panel_G = Plot_All(G_smooth, log = True, lin = False, kind = 'scatter',
                   ylabel = 'Growth factor', yformatter = '%.1f', ymax = 5)
panel_G

---

# ***ITALY***

For this part we take data from Protezione Civile, which are more reliable.

Source: https://github.com/pcm-dpc/COVID-19/tree/master/dati-andamento-nazionale

These curves represent the number of total cases in Italy and the new daily cases.

In [25]:
dati_italia = 'https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv'
# Explicit mapping from the CSV column names to the short names used in the notebook.
# Renaming by name (instead of assigning a list of names by position) keeps the labels
# correct even if the order of the columns in the CSV file changes.
column_names = {'terapia_intensiva': 'int_care', 'totale_positivi': 'pos', 'dimessi_guariti': 'rec',
                'deceduti': 'dead', 'totale_casi': 'tot', 'tamponi': 'tests'}
columns = ['data'] + list(column_names)
Italia = (pd.read_csv(dati_italia, usecols = columns, index_col='data')
            .rename(columns = column_names)
            .reindex(columns = list(column_names.values())))
Italia.index = pd.to_datetime(Italia.index)
Italia.index.name = 'Date'

# Calculate new columns
Italia['new'] = Italia['tot'].diff()                             # New daily cases
Italia['new_pos'] = Italia['pos'].diff()                         # New daily positives
Italia['new_dead'] = Italia['dead'].diff()                       # New dead
Italia['new_rec'] = Italia['rec'].diff()                         # New recovered
# Tested resulted positive (%). Days without any new test give NaN instead of a division by zero.
Italia['new_wtests'] = Italia['new']/Italia['tests'].diff().replace(0, np.nan)*100

my_opts = {'width': 450, 'height': 400, 'xlabel': '', 'shared_axes': False,  'padding':0.1}
plt_TotItalia = Italia.tot.plot(label = 'Total Cases' , color = cmap[0], **my_opts)
plt_NewItalia = Italia.new.plot(label = 'New Cases',color = cmap[1], **my_opts)
plt_TotItalia+plt_NewItalia

In the following we show, among all cases, which **percentage** is currently infected, recovered or dead. The three cases are stacked one on top of the other so that you can have a visual feedback of these percentages.

In [26]:
# Share of the total cases (in %) that is recovered, currently positive or dead
pct_rec = Italia.rec/Italia.tot*100
pct_pos = Italia.pos/Italia.tot*100
pct_dead = Italia.dead/Italia.tot*100

# Overlay the three areas; the order of the overlay is the order in which they are stacked
plt_Percentages = (pct_rec.plot.area(label = 'Recovered', color = 'mediumseagreen')
                   * pct_pos.plot.area(label = 'Positive', color = cmap[0])
                   * pct_dead.plot.area(label = 'Dead', color = 'orangered'))

stacked_Percentages = hv.Area.stack(plt_Percentages)
stacked_Percentages.opts(legend_position = 'right', height = 400, width = 800, 
                         padding = 0, ylabel = 'Cases (%)', xlabel = 'Date', 
                         xformatter = DatetimeTickFormatter(days = '%b %d'), 
                         title = 'Percentage of recovered, positive and dead')

Here we show the curves of infected, recovered and dead people (left column), as well as their daily variation (right column)

In [27]:
my_opts = dict(width = 450, height = 400, xlabel = '', shared_axes = False, padding = 0.1)
cmap_cases = process_cmap('Category20c', ncolors=20)

def _curve_Italia(column, label, shade, **extra):
    '''Plot one column of Italia with the shared options, coloured with entry `shade` of cmap_cases.'''
    return Italia[column].plot(label = label, color = cmap_cases[shade], **extra, **my_opts)

# Left column: totals. Right column: daily variations.
plt_PosItalia = _curve_Italia('pos', 'Total Infected', 0)
plt_NewPosItalia = _curve_Italia('new_pos', 'New Daily Infected', 1)
plt_RecItalia = _curve_Italia('rec', 'Total Recovered', 8)
plt_NewRecItalia = _curve_Italia('new_rec', 'New Daily Recovered', 9)
plt_DeadItalia = _curve_Italia('dead', 'Total Dead', 4)
plt_NewDeadItalia = _curve_Italia('new_dead', 'New Daily Dead', 5)
plt_IntCare = _curve_Italia('int_care', 'In Intensive Care', 12)
plt_TestedPositive = _curve_Italia('new_wtests', 'Positives Among Tested', 13, yformatter = '%d%%')

plt_Cases_Italia = [
    plt_PosItalia, plt_NewPosItalia,
    plt_RecItalia, plt_NewRecItalia,
    plt_DeadItalia, plt_NewDeadItalia,
    plt_IntCare, plt_TestedPositive,
]
hv.Layout(plt_Cases_Italia).cols(2)

In [28]:
#def sigmoid(t, K, A, r):
#    return K/(1+A*np.exp(-r*(t)))
#
#extra_days = 40    # How much we extend the fit beyond the current date for prediction
#m = 15
#N = Italia.index.size
#
#C2m, Cm, C = Italia.tot[N-2*m], Italia.tot[N-m], Italia.tot[N-1]
#K_guess = Cm*(C2m*Cm-2*C2m*C+Cm*C)/(Cm**2-C*C2m)
#r_guess = 1/m*np.log(C*(Cm-C2m)/(C2m*(C-Cm)))
#A_guess = (C-Cm)*(Cm-C2m)/(Cm**2-C*C2m)*(C/C2m*(Cm-C2m)/(C-Cm))**((N-1-m)/m)
#init_guesses = [K_guess, A_guess, r_guess]   
#
## The index is in datetime format, with which I can't fit
## -> I consider days from beginning (0,1,2,3,...) as index
#popt, pcov = curve_fit(sigmoid, [i for i in range(N)],
#                       Italia.tot.values, 
#                       p0 = init_guesses)
#
#fit = pd.Series(index = pd.date_range(Italia.index[0], Italia.index[-1]+timedelta(days = extra_days), freq='D'), 
#                data = sigmoid(np.arange(0, Italia.index.size+extra_days-1, 1), *popt))
#
#my_opts = {'width': 600, 'height': 400, 'xlabel': ''}
#
#plt_TotPred = Italia.tot.plot( kind = 'scatter', label = 'Data', **my_opts)\
#              * fit.plot(color = 'red', label = 'Prediction', hover = False, **my_opts)\
#              #* hv.HLine(popt[0]).opts(color = 'black', line_width=1, line_dash='dashed')\
#              #*hv.Text(text='        Predicted final size: '+str(int(popt[0])), y =popt[0]+5e3, x = fit.index[12], fontsize=11)#* \
#              #hv.Arrow(Italia.index[-1]+timedelta(days=1), fit[Italia.index.size],
#              #         'Predicted for '+(Italia.index[-1]+timedelta(days=1)).strftime('%b %d')+': '+str(int(fit[Italia.index.size])), 
#              #         '^', arrowstyle = '-')
#plt_NewPred = Italia.new.plot(kind = 'scatter', label = 'Data', c = cmap[1], **my_opts)*\
#              fit.diff().plot(color = 'red', label = 'Prediction', hover = False)
#(plt_TotPred.opts(title = 'Total cases') + plt_NewPred.opts(title = 'New cases')).cols(1)

In [29]:
##Italia.index = [i for i in range(len(Italia.index))]   # Convert the index to days since the beginning
#J = 9                      # J will be the number of fitted curves: from index 0 to N-J, then to N-J+1, N-J+2, etc.
#fits = pd.DataFrame(index = [i for i in range(N+1+extra_days)])
#K_values = []
## In each cycle I perform the fit of the curve stopping at index n, with N-J<n<N-1
#for n in range(N-J, N):
#    # Find best initial guesses, based on (11), (12), (13) of [1]
#    C2m, Cm, C = Italia.tot[n-2*m], Italia.tot[n-m], Italia.tot[n]
#    K_guess = Cm*(C2m*Cm-2*C2m*C+Cm*C)/(Cm**2-C*C2m)
#    r_guess = 1/m*np.log(C*(Cm-C2m)/(C2m*(C-Cm)))
#    A_guess = (C-Cm)*(Cm-C2m)/(Cm**2-C*C2m)*(C/C2m*(Cm-C2m)/(C-Cm))**((n-2-m)/m)  # t_k = n-2. E.g. when n=N-1
#    init_guesses = [K_guess, A_guess, r_guess]   
#    
#    # Fit
#    popt, pcov = curve_fit(sigmoid, [i for i in range(n+1)],    # Have to put n+1 because in this cycle N-J<=n<=N, then when I use [] in a df, the last element is not considered
#                           Italia.tot[0:n+1].values, 
#                           p0 = init_guesses)
#    K_values.append(popt[0])
#    
#    # Create fit curves that extend by 'extra_days' beyond the current date
#    fit = pd.Series(index = [i for i in range(N+extra_days)], 
#                             data = sigmoid(np.arange(0, N+extra_days), *popt))
#    #fit.name = 'n = '+str(n)
#    fit.name = Italia.index[n-N].strftime('%b %d')
#    fits = fits.join(fit)        # I put all of them in a single df
#
#plt_pred = Italia.tot.plot(kind = 'scatter')*fit.plot(color = 'red', label = 'Fit')
#plt_pred.opts(padding = 0.1, width = 600, height = 400, yformatter = '%d', 
#              xlabel = ' ')
#
#fits = fits.iloc[:, ::-1]                            # Reverse the order of columns for having the last day on top of the legend
#fits.index = pd.date_range(start = Italia.index[0], 
#                           end = Italia.index[-1]+timedelta(days = extra_days+2))
#K = pd.DataFrame(data =K_values, columns=['K'], 
#                 index = Italia.index[N-J:].strftime('%b %d'))
#
## Plot
#cmap_fits = process_cmap('fire', ncolors=10)
#fits.plot(color = cmap_fits, width = 700, label = 'Fits of the total cases data in the last 9 days')

In [30]:
#K.plot.bar(label = 'Estimated final size of epidemic', ylabel = '', 
#           xlabel = 'Days since Feb 24', height = 400, hover = False, 
#           yformatter = NumeralTickFormatter(format='0,0'))

## ***ITALIAN REGIONS***

In this plot we compare the total cases in all Italian regions.

In [31]:
dati_regioni = 'https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv'
# Explicit mapping from the CSV column names to the names used in the notebook.
# Renaming by name (instead of by position) keeps the labels correct even if
# the order of the columns in the CSV file changes.
column_names = {'data': 'Data', 'denominazione_regione': 'Regione', 'totale_positivi': 'tot_pos',
                'nuovi_positivi': 'new_pos', 'dimessi_guariti': 'rec', 'deceduti': 'dead', 'totale_casi': 'tot'}
cols = list(column_names)
Regioni = (pd.read_csv(dati_regioni, usecols = cols)
             .rename(columns = column_names)
             .reindex(columns = list(column_names.values())))
Regioni['Data'] = pd.to_datetime(Regioni['Data'])        # Convert to datetime index
Regioni.set_index (['Regione', 'Data'], inplace=True)
Regioni.sort_index(inplace=True)
Regioni.loc['P.A. Trento'] = (Regioni.loc['P.A. Trento']+\
                              Regioni.loc['P.A. Bolzano']).values    # Put Trento and Bolzano in the same index
Regioni.rename(index={'P.A. Trento': 'Trentino Alto Adige'}, inplace = True) # Rename it as Trentino Alto Adige
Regioni.drop('P.A. Bolzano', inplace = True)                         # Delete the Bolzano rows
# New cases are computed region by region, so that the first day of a region is never
# compared with the last day of the previous region in the index (it is set to 0 instead)
Regioni['new'] = Regioni.groupby(level = 'Regione')['tot'].diff().fillna(0)
Regioni[Regioni<0] = 0                    # Delete any negative value (e.g. totals corrected downwards)


# Values of the last available day for every (column, region) pair, sorted in decreasing order
Regioni_LastDay = Regioni.unstack(level = 0).iloc[-1].sort_values(ascending = False)
tot_LastDay = Regioni_LastDay.tot         # Total cases per region, still in decreasing order
plt_Regioni = tot_LastDay.plot(kind = 'bar', height = 500, legend = False, 
                               hover = False, ylim = [None, Regioni_LastDay.max()+tot_LastDay.max()*0.15], 
                               label = 'Total cases', color = 'green', 
                               xlabel = '', ylabel = '')\
              .opts(invert_axes=True, invert_yaxis = True)
# Add labels to each bar. Iterating over the Series itself (instead of range(20) with
# positional lookups) labels every region whatever their number is.
label_shift = tot_LastDay.max()*0.08
for regione, value in tot_LastDay.items():
    plt_Regioni = plt_Regioni*hv.Text(x = regione, 
                                      y = value+label_shift, 
                                      text = "{:,}".format(int(value)))
plt_Regioni

#### Total cases

These are the curves of the total cases in all regions, divided by North, Center and South. You can click on the different tabs to display a specific group and also specific regions.

In [32]:
# Regions grouped by geographic area
Nord_list = ['Valle d\'Aosta', 'Piemonte', 'Lombardia', 'Liguria', 'Emilia-Romagna', 'Veneto', 'Trentino Alto Adige', 'Friuli Venezia Giulia']
Centro_list = ['Toscana', 'Marche', 'Umbria', 'Lazio', 'Abruzzo']
Sud_list = ['Campania', 'Molise', 'Puglia', 'Basilicata', 'Calabria', 'Sicilia', 'Sardegna']
Nord, Centro, Sud = (Regioni.loc[names] for names in (Nord_list, Centro_list, Sud_list))

def _tot_by_peak(area):
    '''Total cases of an area, one column per region, ordered by decreasing maximum.'''
    tot = area.unstack(level=0).tot
    return tot[list(tot.max().sort_values(ascending = False).index)]

pn.Tabs(('Nord', Plot_All(_tot_by_peak(Nord), log = False, kind = 'line', ylabel = 'Total cases')),
        ('Centro', Plot_All(_tot_by_peak(Centro), log = False, kind = 'line', ylabel = 'Total cases')),
        ('Sud', Plot_All(_tot_by_peak(Sud), log = False, kind = 'line', ylabel = 'Total cases')))

#### New cases

These are the corresponding plots of the *new* cases.

In [33]:
# One tab per geographic area; inside each, regions are ordered by decreasing maximum of new cases
tabs_new = []
for area_name, area in (('Nord', Nord), ('Centro', Centro), ('Sud', Sud)):
    new_by_region = area.unstack(level=0).new
    by_peak = list(new_by_region.max().sort_values(ascending = False).index)
    tabs_new.append((area_name, Plot_All(new_by_region[by_peak], 
                                         log = False, kind = 'line', ylabel = 'New cases')))
pn.Tabs(*tabs_new)

In [34]:
%%html
<!-- Loads the external code_toggle.js script from a CDN URL pinned to a specific revision. -->
<script src="https://cdn.rawgit.com/parente/4c3e6936d0d7a46fd071/raw/65b816fb9bdd3c28b4ddf3af602bfd6015486383/code_toggle.js"></script>