# Compare Brazilian States Trajectories - Confirmed Cases

Author: Cleber Jorge Amaral, based on work by [Pratap Vardhan](https://twitter.com/PratapVardhan) published at [covid19-dashboard](https://github.com/github/covid19-dashboard), which uses [fastpages](https://github.com/fastai/fastpages) by [Hamel Husain](https://github.com/hamelsmu).

This notebook must be updated manually. Go to https://covid.saude.gov.br/, download the file using the '*Arquivo CSV*' button, and place it in the same folder as this notebook.


In [1]:
#hide
import pandas as pd
import altair as alt
import math
from IPython.display import HTML

# Width and height handed to alt.Chart when the trajectory chart is built.
CHART_WIDTH, CHART_HEIGHT = 600, 450



In [2]:
# Load the case file originally published by the Brazilian Ministry of Health (read here from a GitHub copy).
# Expected format: ['regiao', 'estado', 'data', 'casosNovos', 'casosAcumulados', 'obitosNovos', 'obitosAcumulados']
url = 'https://raw.githubusercontent.com/cleberjamaral/cleberjamaral.github.io/master/knowledge/research/machine-learning/arquivo_geral.csv'
src = pd.read_csv(url, sep=';')
# Work on a copy so `src` keeps the data exactly as downloaded.
df = src.copy()
# Normalise dates to zero-padded 'MM/DD/YY' strings; later cells use them as column labels.
# NOTE(review): the input date format is inferred by pandas - confirm the file is not day-first.
df['data'] = pd.to_datetime(df['data']).dt.strftime('%m/%d/%y')

In [3]:
# Build a wide table: one row per state, one column per date, holding the new cases.
# pivot() is called with keyword arguments because pandas >= 2.0 no longer accepts
# positional ones; passing `values` also yields single-level date columns directly.
df = (
    df[['estado', 'data', 'casosNovos']]
    .groupby(['estado', 'data'], as_index=False)
    .sum()
    .pivot(index='estado', columns='data', values='casosNovos')
    .fillna(0)
)
# The column labels are 'MM/DD/YY' strings, which sort alphabetically rather than
# chronologically once the data spans more than one calendar year. Reorder them by
# their parsed date so row-wise cumulative sums and "last column" lookups stay correct.
df = df.iloc[:, pd.to_datetime(df.columns, format='%m/%d/%y').argsort()]
df = df.reset_index()
#df.to_csv('modified_absolute.csv')
#df.head()

In [4]:
# Turn the per-date new-case columns into running totals along each row.
dt_cols = df.columns[df.columns != 'estado']  # every column except the state label
acc = df.loc[:, dt_cols].cumsum(axis='columns')
acc = acc.assign(**{'Province/State': df['estado']})
#acc.tail()

In [5]:
# Reshape the cumulative table to long form: one row per (state, date) pair.
dff = acc.groupby('Province/State')[dt_cols].sum()
dff = dff.stack().reset_index(name='Confirmed Cases')
dff = dff.rename(columns={'data': 'Date'})
# The date labels are 'MM/DD/YY' strings; parse them into timestamps.
dff['Date'] = pd.to_datetime(dff['Date'], format='%m/%d/%y')
#dff.tail()

In [6]:
#hide
MIN_CASES = 300
# Default to the right-most date column, then walk backwards until a column
# actually holds reported cases (the newest column can be all zeros).
LAST_DATE = dt_cols[-1]
for c in dt_cols[::-1]:
    if df[c].fillna(0).ne(0).any():
        LAST_DATE = c
        break
# States with at least MIN_CASES on LAST_DATE, largest totals first.
on_last_date = dff['Date'].eq(LAST_DATE)
enough_cases = dff['Confirmed Cases'].ge(MIN_CASES)
countries = (
    dff[on_last_date & enough_cases]
    .sort_values(by='Confirmed Cases', ascending=False)['Province/State']
    .values
)

In [7]:
#hide
SINCE_CASES_NUM = 100
# Keep only the selected states; copy so the new column is not written into `dff`.
dff2 = dff.loc[dff['Province/State'].isin(countries)].copy()
# For every row, the first date on which its state reached SINCE_CASES_NUM cases
# (idxmax over a boolean flag returns the index label of the first True).
reached_threshold = dff2['Confirmed Cases'] >= SINCE_CASES_NUM
days_since = (
    dff2.assign(F=reached_threshold)
    .set_index('Date')
    .groupby('Province/State')['F']
    .transform('idxmax')
)
elapsed = dff2['Date'] - days_since.values
dff2['Days since 100 cases'] = elapsed.dt.days.values
# Drop the rows that precede the threshold date.
dff2 = dff2[dff2['Days since 100 cases'] >= 0]
#list(dff2)

In [8]:
#hide_input
# States pre-selected in the chart legend by default.
baseline_countries = ['AM', 'CE', 'DF', 'RJ', 'SC', 'SP']
max_date = dff2['Date'].max()
color_domain = dff2['Province/State'].unique().tolist()
# Smallest power of ten that is >= the largest cumulative case count.
y_domain_max = 10 ** math.ceil(math.log10(dff['Confirmed Cases'].max()))
# Days a curve growing 33% per day needs to climb from SINCE_CASES_NUM to that power of ten.
ref_max_day = math.ceil(math.log(y_domain_max / SINCE_CASES_NUM, 1.33))

def make_since_chart(highlight_countries=None, baseline_countries=baseline_countries):
    """Build the layered "days since 100 cases" trajectory chart.

    Reads the module-level ``dff2``, ``color_domain``, ``max_date``,
    ``ref_max_day``, ``SINCE_CASES_NUM``, ``CHART_WIDTH`` and ``CHART_HEIGHT``.

    Args:
        highlight_countries: State codes to pre-select in the legend and to
            name first in the title. ``None`` (the default) or an empty
            sequence means no state is highlighted.
        baseline_countries: State codes pre-selected together with the
            highlighted ones.

    Returns:
        A layered Altair chart made of the dashed 33%-daily-growth reference
        line and its label, one line per state, and a state label placed on
        each state's most recent data point.
    """
    # Avoid a shared mutable default; also accepts any sequence of codes.
    highlight_countries = list(highlight_countries or [])
    preselected = highlight_countries + list(baseline_countries)

    selection = alt.selection_multi(fields=['Province/State'], bind='legend',
                                    init=[{'Province/State': x} for x in preselected])

    base = alt.Chart(dff2, width=CHART_WIDTH, height=CHART_HEIGHT).encode(
        x='Days since 100 cases:Q',
        y=alt.Y('Confirmed Cases:Q', scale=alt.Scale(type='log')),
        color=alt.Color('Province/State:N', scale=alt.Scale(domain=color_domain), legend=alt.Legend(columns=math.ceil(len(color_domain)/26), symbolLimit=len(color_domain))),
        tooltip=list(dff2),
        # Fade out every state that is not currently selected in the legend.
        opacity=alt.condition(selection, alt.value(1), alt.value(0.05))
    )

    # Reference curve: start at the threshold used for day 0 and grow 33% per day.
    ref = pd.DataFrame(
        [[x, SINCE_CASES_NUM * 1.33 ** x] for x in range(ref_max_day)],
        columns=['Days since 100 cases', 'Confirmed Cases'],
    )
    base_ref = alt.Chart(ref).encode(x='Days since 100 cases:Q', y='Confirmed Cases:Q')

    baseline_label = ', '.join(baseline_countries)
    if highlight_countries:
        title = f"Compare {', '.join(highlight_countries)} trajectory with {baseline_label}"
    else:
        # Without highlighted states the original wording left a gap ("Compare  trajectory with ...").
        title = f"Compare trajectories of {baseline_label}"

    return (
        base_ref.mark_line(color='black', opacity=.5, strokeDash=[3,3]) +
        # Label only the last point of the reference curve.
        base_ref.transform_filter(
            alt.datum['Days since 100 cases'] >= ref_max_day - 1
        ).mark_text(dx=-30, dy=30, align='right', fontSize=10, text='33% Daily Growth') +
        base.mark_line(point=True).add_selection(selection) +
        # Put each state's code next to its most recent data point
        # (the date is compared as epoch milliseconds).
        base.transform_filter(
            alt.datum['Date'] >= int(max_date.timestamp() * 1000)
        ).mark_text(dy=-8, align='right', fontWeight='bold').encode(text='Province/State:N')
    ).properties(
        title=title
    )
# Build the chart with the default baseline states pre-selected; the bare
# name on the last line makes the notebook display it as the cell output.
chart = make_since_chart()
chart

In [9]:
#hide_input
# Show the most recent date that has data as a small right-aligned note.
last_updated = pd.to_datetime(LAST_DATE).strftime("%B, %d %Y")
HTML(f'<small class="float-right">Last Updated on {last_updated}</small>')

Based on the [notebook](https://github.com/pratapvardhan/notebooks/blob/master/covid19/covid19-compare-country-trajectories.ipynb) developed by [Pratap Vardhan](https://twitter.com/PratapVardhan), adapted by [Cleber Jorge Amaral](http://cleberjamaral.github.io/). Data source: [Brazilian Ministry of Health](https://covid.saude.gov.br/)