# Comparativo trajetória por estado [in Portuguese]
> Casos confirmados (escala logarítmica)

- toc: false
- badges: true
- comments: true
- author: Cleber Jorge Amaral
- categories: [covid-19, brasil, comparativo, altair, jupyter]
- image: images/brazil-states-cases-trajectories.png

### NOTA
Os dados utilizados nestas análises são inerentemente incertos. Não há garantia de que os dados e as análises estejam devidamente atualizados e corretos. Antes de tomar conclusões é fundamental verificar dados e conclusões com outras fontes.

In [1]:
#hide
import pandas as pd
import altair as alt
import math
import requests
from altair_saver import save
from IPython.display import HTML

CHART_WIDTH = 600
CHART_HEIGHT = 450



In [2]:
pip install lzma

Defaulting to user installation because normal site-packages is not writeable
[31mERROR: Could not find a version that satisfies the requirement lzma (from versions: none)[0m
[31mERROR: No matching distribution found for lzma[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
#hide
# Get source original file from brasil.io
# Expected to have the columns: ['state', 'date', 'confirmed']
url = ('https://data.brasil.io/dataset/covid19/caso.csv.gz')

# Pretending to be a browser
header = {
  "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/581.0.4044.138 Safari/537.36",
  "x-requested-with": "XMLHttpRequest",
  "accept-encoding": "gzip, deflate, br"
}

response = requests.get(url, headers=header, stream=True)

raw = response.raw
with open('../assets/db/caso.csv.gz', 'wb') as out_file:
    while True:
        chunk = raw.read(1024, decode_content=True)
        if not chunk:
            break
        out_file.write(chunk)

#src = pd.read_csv('caso.csv.gz')
src = pd.read_csv('../assets/db/caso.csv.gz')
df = src[(src['place_type'] == 'state')]
df['date'] = pd.to_datetime(df['date'])
df['date'] = df['date'].dt.strftime('%m/%d/%y')
#df.head()

In [None]:
#hide
# Pivot data regarding dates, putting them as columns
df = df[['state','date','confirmed']].groupby(['state','date'],as_index = False).sum().pivot('state','date').fillna(0)
df = df.droplevel(None, axis=1)
df = df.reset_index()
#df.head()

In [None]:
#hide
STATE_COLUMN = "Estado"
DATE_COLUMN = "Data"
MEAN_FUNCTION = '_Média'
MEDIAN_FUNCTION = '_Mediana'
SUM_FUNCTION = '_Brasil'
CONFIRMED_CASES = 'Casos confirmados'
DEATHS = 'Mortes'
DAYS_SINCE_100_CASES = 'Dias desde 100 casos'
DAYS_SINCE_10_DEATHS = 'Dias desde 10 mortes'
COMPARE_TRAJECTORY_WITH = 'Comparativo da trajetória entre os estados'

df = df.rename(columns={"state": STATE_COLUMN})
acc = df
#df.tail()

In [None]:
#hide
dt_cols = df.columns[~df.columns.isin([STATE_COLUMN])]
#if data of last cells is empty (actually the accumulative minor then prior), copy prior
for c in range(0, len(dt_cols[0::])+1):
    for r in range(0, len(df)):
        if c > 1 and df.iloc[r,c] < df.iloc[r,c-1]:
            df.iloc[r,c] = df.iloc[r,c-1]
#df.to_csv('modified_absolute.csv')
#df.tail()

In [None]:
#hide
# Get mean, median and total
mean = df.mean(axis=0)
median = df.median(axis=0)
total = df.sum(axis=0)
df.loc['Mean',dt_cols]=mean
df.loc['Mean',STATE_COLUMN] = MEAN_FUNCTION
df.loc['Median',dt_cols]=median
df.loc['Median',STATE_COLUMN] = MEDIAN_FUNCTION
df.loc['Total',dt_cols]=total
df.loc['Total',STATE_COLUMN] = SUM_FUNCTION

In [None]:
#hide
# Get differences for each day
dff = acc.groupby(STATE_COLUMN)[dt_cols].sum().stack().reset_index(name=CONFIRMED_CASES).rename(columns={'date': DATE_COLUMN})
dff[DATE_COLUMN] = pd.to_datetime(dff[DATE_COLUMN], format='%m/%d/%y')
#dff.tail()

In [None]:
#hide
MIN_CASES = 300
LAST_DATE = dt_cols[-1]
# sometimes last column may be empty, then go backwards
# it should not occur, just keeping original code
for c in dt_cols[::-1]:
    if not df[c].fillna(0).eq(0).all():
        LAST_DATE = c
        break
countries = dff[dff[DATE_COLUMN].eq(LAST_DATE) & dff[CONFIRMED_CASES].ge(MIN_CASES)
       ].sort_values(by=CONFIRMED_CASES, ascending=False)
countries = countries[STATE_COLUMN].values

In [None]:
#hide
SINCE_CASES_NUM = 100
dff2 = dff[dff[STATE_COLUMN].isin(countries)].copy()
days_since = (dff2.assign(F=dff2[CONFIRMED_CASES].ge(SINCE_CASES_NUM))
              .set_index(DATE_COLUMN)
              .groupby(STATE_COLUMN)['F'].transform('idxmax'))
dff2[DAYS_SINCE_100_CASES] = (dff2[DATE_COLUMN] - days_since.values).dt.days.values
dff2 = dff2[dff2[DAYS_SINCE_100_CASES].ge(0)]
#list(dff2)

In [None]:
#hide_input
preselected_items = ['AM','CE','DF','RJ','SC','SP']
max_date = dff2[DATE_COLUMN].max()
color_domain = list(dff2[STATE_COLUMN].unique())
y_domain_max = pow(10, math.ceil(math.log10(dff[CONFIRMED_CASES].max())))
ref_max_day = math.ceil(math.log(y_domain_max / SINCE_CASES_NUM, 1.33))

def make_since_chart(highlight_items=[], preselected_items=preselected_items):
    selection = alt.selection_multi(fields=[STATE_COLUMN], bind='legend', 
                                    init=[{STATE_COLUMN: x} for x in highlight_items + preselected_items])

    base = alt.Chart(dff2, width=CHART_WIDTH, height=CHART_HEIGHT).encode(
        x=DAYS_SINCE_100_CASES+':Q',
        y=alt.Y(CONFIRMED_CASES+':Q', scale=alt.Scale(type='log')),
        color=alt.Color(STATE_COLUMN+':N', scale=alt.Scale(domain=color_domain), legend=alt.Legend(columns=math.ceil(len(color_domain)/26), symbolLimit=len(color_domain))),
        tooltip=list(dff2),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.05))
    )
    max_day = dff2[DAYS_SINCE_100_CASES].max()
    ref = pd.DataFrame([[x, 100*1.33**x] for x in range(ref_max_day)], columns=[DAYS_SINCE_100_CASES, CONFIRMED_CASES])
    base_ref = alt.Chart(ref).encode(x=DAYS_SINCE_100_CASES+':Q', y=CONFIRMED_CASES+':Q')
    return (
        base_ref.mark_line(color='black', opacity=.5, strokeDash=[3,3]) +
        base_ref.transform_filter(
            alt.datum[DAYS_SINCE_100_CASES] >= ref_max_day - 1
        ).mark_text(dx=-30, dy=30, align='right', fontSize=10, text='33% Daily Growth') +
        base.mark_line(point=True).add_selection(selection) + 
        base.transform_filter(
            alt.datum[DATE_COLUMN] >= int(max_date.timestamp() * 1000)
        ).mark_text(dy=-8, align='right', fontWeight='bold').encode(text=STATE_COLUMN+':N')
    ).properties(
        title=f" {CONFIRMED_CASES+': '+COMPARE_TRAJECTORY_WITH+' '+', '.join(preselected_items)}"
    )
chart = make_since_chart()
chart

In [None]:
#hide
df = src[(src['place_type'] == 'state')]
df['date'] = pd.to_datetime(df['date'])
df['date'] = df['date'].dt.strftime('%m/%d/%y')
#df.head()

In [None]:
#hide
# Pivot data regarding dates, putting them as columns
df = df[['state','date','deaths']].groupby(['state','date'],as_index = False).sum().pivot('state','date').fillna(0)
df = df.droplevel(None, axis=1)
df = df.reset_index()
#df.head()

In [None]:
#hide
df = df.rename(columns={"state": STATE_COLUMN})
#df.to_csv('modified_absolute.csv')
acc = df
#df.tail()

In [None]:
#hide
dt_cols = df.columns[~df.columns.isin([STATE_COLUMN])]
#if data of last cells is empty (actually the accumulative minor then prior), copy prior
for c in range(0, len(dt_cols[0::])+1):
    for r in range(0, len(df)):
        if c > 1 and df.iloc[r,c] < df.iloc[r,c-1]:
            df.iloc[r,c] = df.iloc[r,c-1]
#df.to_csv('modified_absolute.csv')
#df.tail()

In [None]:
#hide
# Get mean, median and total
mean = df.mean(axis=0)
median = df.median(axis=0)
total = df.sum(axis=0)
df.loc['Mean',dt_cols]=mean
df.loc['Mean',STATE_COLUMN] = MEAN_FUNCTION
df.loc['Median',dt_cols]=median
df.loc['Median',STATE_COLUMN] = MEDIAN_FUNCTION
df.loc['Total',dt_cols]=total
df.loc['Total',STATE_COLUMN] = SUM_FUNCTION

In [None]:
#hide
# Get differences for each day
dff = acc.groupby(STATE_COLUMN)[dt_cols].sum().stack().reset_index(name=DEATHS).rename(columns={'date': DATE_COLUMN})
dff[DATE_COLUMN] = pd.to_datetime(dff[DATE_COLUMN], format='%m/%d/%y')
#dff.tail()

In [None]:
#hide
MIN_CASES = 30
LAST_DATE = dt_cols[-1]
# sometimes last column may be empty, then go backwards
# it should not occur, just keeping original code
for c in dt_cols[::-1]:
    if not df[c].fillna(0).eq(0).all():
        LAST_DATE = c
        break
countries = dff[dff[DATE_COLUMN].eq(LAST_DATE) & dff[DEATHS].ge(MIN_CASES)
       ].sort_values(by=DEATHS, ascending=False)
countries = countries[STATE_COLUMN].values

In [None]:
#hide
SINCE_CASES_NUM = 10
dff2 = dff[dff[STATE_COLUMN].isin(countries)].copy()
days_since = (dff2.assign(F=dff2[DEATHS].ge(SINCE_CASES_NUM))
              .set_index(DATE_COLUMN)
              .groupby(STATE_COLUMN)['F'].transform('idxmax'))
dff2[DAYS_SINCE_10_DEATHS] = (dff2[DATE_COLUMN] - days_since.values).dt.days.values
dff2 = dff2[dff2[DAYS_SINCE_10_DEATHS].ge(0)]
#list(dff2)

In [None]:
#hide_input
preselected_items = ['AM','CE','DF','RJ','SC','SP',SUM_FUNCTION]
max_date = dff2[DATE_COLUMN].max()
color_domain = list(dff2[STATE_COLUMN].unique())
y_domain_max = pow(10, math.ceil(math.log10(dff[DEATHS].max())))
ref_max_day = math.ceil(math.log(y_domain_max / SINCE_CASES_NUM, 1.33))

def make_since_chart(highlight_items=[], preselected_items=preselected_items):
    selection = alt.selection_multi(fields=[STATE_COLUMN], bind='legend', 
                                    init=[{STATE_COLUMN: x} for x in highlight_items + preselected_items])

    base = alt.Chart(dff2, width=CHART_WIDTH, height=CHART_HEIGHT).encode(
        x=DAYS_SINCE_10_DEATHS+':Q',
        y=alt.Y(DEATHS+':Q', scale=alt.Scale(type='log')),
        color=alt.Color(STATE_COLUMN+':N', scale=alt.Scale(domain=color_domain), legend=alt.Legend(columns=math.ceil(len(color_domain)/26), symbolLimit=len(color_domain))),
        tooltip=list(dff2),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.05))
    )
    max_day = dff2[DAYS_SINCE_10_DEATHS].max()
    ref = pd.DataFrame([[x, 100*1.33**x] for x in range(ref_max_day)], columns=[DAYS_SINCE_10_DEATHS, DEATHS])
    base_ref = alt.Chart(ref).encode(x=DAYS_SINCE_10_DEATHS+':Q', y=DEATHS+':Q')
    return (
        base_ref.mark_line(color='black', opacity=.5, strokeDash=[3,3]) +
        base_ref.transform_filter(
            alt.datum[DAYS_SINCE_10_DEATHS] >= ref_max_day - 1
        ).mark_text(dx=-30, dy=30, align='right', fontSize=10, text='33% Daily Growth') +
        base.mark_line(point=True).add_selection(selection) + 
        base.transform_filter(
            alt.datum[DATE_COLUMN] >= int(max_date.timestamp() * 1000)
        ).mark_text(dy=-8, align='right', fontWeight='bold').encode(text=STATE_COLUMN+':N')
    ).properties(
        title=f" {DEATHS+': '+COMPARE_TRAJECTORY_WITH+' '+', '.join(preselected_items)}"
    )
chart = make_since_chart()
chart

In [None]:
#hide_input
HTML(f'<small class="float-right">Última atualização em {pd.to_datetime(LAST_DATE).strftime("%d/%m/%Y")}</small>')

Based on the [notebook](https://github.com/pratapvardhan/notebooks/blob/master/covid19/covid19-compare-country-trajectories.ipynb) developed by [Pratap Vardhan](https://twitter.com/PratapVardhan), adapted by [Cleber Jorge Amaral](http://cleberjamaral.github.io/). Data source: [brasil.io](https://brasil.io/home/)

In [None]:
#hide
save(chart,"../images/brazil-states-cases-trajectories.png")