# Importing Libraries

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import requests
import io
import gzip

# Getting the data

In [2]:
# Gzip-compressed CSV of COVID-19 case reports, downloaded on every run.
url = 'https://data.brasil.io/dataset/covid19/caso.csv.gz'

# Bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error instead of handing an error page to gzip
# (which would otherwise surface as a confusing "not a gzipped file").
response = requests.get(url, timeout=60)
response.raise_for_status()

# Decompress in memory. The file contains accented city names, so decode it
# explicitly as UTF-8 rather than relying on the platform's default locale.
# NOTE(review): assumes the published file is UTF-8 encoded — confirm.
bytes_io = io.BytesIO(response.content)
with gzip.open(bytes_io, 'rt', encoding='utf-8') as read_file:
    covid_bra = pd.read_csv(read_file)

covid_bra.tail()

Unnamed: 0,date,state,city,place_type,confirmed,deaths,order_for_place,is_last,estimated_population_2019,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
426442,2020-06-23,SP,Óleo,city,1,0,5,False,2496.0,3533809.0,40.0641,0.0
426443,2020-06-22,SP,Óleo,city,1,0,4,False,2496.0,3533809.0,40.0641,0.0
426444,2020-06-21,SP,Óleo,city,1,0,3,False,2496.0,3533809.0,40.0641,0.0
426445,2020-06-20,SP,Óleo,city,1,0,2,False,2496.0,3533809.0,40.0641,0.0
426446,2020-06-19,SP,Óleo,city,1,0,1,False,2496.0,3533809.0,40.0641,0.0


In [3]:
# Show the dtype pandas inferred for every column of the freshly loaded frame.
covid_bra.dtypes

date                               object
state                              object
city                               object
place_type                         object
confirmed                           int64
deaths                              int64
order_for_place                     int64
is_last                              bool
estimated_population_2019         float64
city_ibge_code                    float64
confirmed_per_100k_inhabitants    float64
death_rate                        float64
dtype: object

# Preparing the dataframe

## Converting date column to datetime type

In [4]:
# Parse the textual dates into real timestamps; every other column is kept.
covid_bra = covid_bra.assign(date=pd.to_datetime(covid_bra['date']))

# Display the dtypes again to confirm the conversion.
covid_bra.dtypes

date                              datetime64[ns]
state                                     object
city                                      object
place_type                                object
confirmed                                  int64
deaths                                     int64
order_for_place                            int64
is_last                                     bool
estimated_population_2019                float64
city_ibge_code                           float64
confirmed_per_100k_inhabitants           float64
death_rate                               float64
dtype: object

## Selecting important columns

In [5]:
# Columns used by the rest of the notebook; everything else is discarded.
selected_columns = ['date', 'state', 'city', 'place_type',
                    'confirmed', 'deaths', 'order_for_place']
covid_bra = covid_bra.loc[:, selected_columns]

covid_bra.head()

Unnamed: 0,date,state,city,place_type,confirmed,deaths,order_for_place
0,2020-07-27,AP,,state,35364,556,129
1,2020-07-26,AP,,state,35220,554,128
2,2020-07-25,AP,,state,35162,554,127
3,2020-07-24,AP,,state,35026,552,126
4,2020-07-23,AP,,state,34838,548,125


## Separating states and cities

In [6]:
# Keep only the state-level rows and drop the 'city' column from the result.
is_state_row = covid_bra['place_type'] == 'state'
covid_bra_states = covid_bra.loc[is_state_row].drop(columns=['city'])

covid_bra_states.tail()

Unnamed: 0,date,state,place_type,confirmed,deaths,order_for_place
371178,2020-02-29,SP,state,2,0,5
371179,2020-02-28,SP,state,2,0,4
371180,2020-02-27,SP,state,1,0,3
371181,2020-02-26,SP,state,1,0,2
371182,2020-02-25,SP,state,1,0,1


In [7]:
# Keep only the city-level rows (all columns retained).
is_city_row = covid_bra['place_type'] == 'city'
covid_bra_cities = covid_bra.loc[is_city_row]

covid_bra_cities.head()

Unnamed: 0,date,state,city,place_type,confirmed,deaths,order_for_place
129,2020-07-26,AP,Amapá,city,383,4,89
130,2020-07-25,AP,Amapá,city,383,4,88
131,2020-07-24,AP,Amapá,city,383,4,87
132,2020-07-23,AP,Amapá,city,382,4,86
133,2020-07-22,AP,Amapá,city,376,4,85


## Reordering the dataframes

In [8]:
# Order the rows by state and, within each state, chronologically, then
# renumber the index from 0.
# A plain two-key sort yields the same ordering as sorting every state group
# through groupby().apply(), without the per-group Python call. It also does
# not depend on apply() passing the grouping column through: newer pandas
# deprecates that, and without it reset_index(drop=True) would lose 'state'.
covid_bra_states = (covid_bra_states
                        .sort_values(['state', 'date'], ascending=True)
                        .reset_index(drop=True))

covid_bra_states.head()

Unnamed: 0,date,state,place_type,confirmed,deaths,order_for_place
0,2020-03-17,AC,state,3,0,1
1,2020-03-18,AC,state,3,0,2
2,2020-03-19,AC,state,4,0,3
3,2020-03-20,AC,state,7,0,4
4,2020-03-21,AC,state,11,0,5


In [9]:
# Order the rows by state, then city, then chronologically within each city,
# and renumber the index from 0.
# A plain three-key sort yields the same ordering as sorting every
# (state, city) group through groupby().apply(), without one Python call per
# city. It also does not depend on apply() passing the grouping columns
# through: newer pandas deprecates that, and without it
# reset_index(drop=True) would lose 'state' and 'city'.
# NOTE(review): unlike groupby, sort_values keeps rows whose 'state' or
# 'city' is missing (placed last) — presumably none exist; confirm.
covid_bra_cities = (covid_bra_cities
                        .sort_values(['state', 'city', 'date'], ascending=True)
                        .reset_index(drop=True))

covid_bra_cities.head()

Unnamed: 0,date,state,city,place_type,confirmed,deaths,order_for_place
0,2020-03-29,AC,Acrelândia,city,2,0,1
1,2020-03-30,AC,Acrelândia,city,6,0,2
2,2020-03-31,AC,Acrelândia,city,7,0,3
3,2020-04-01,AC,Acrelândia,city,8,0,4
4,2020-04-02,AC,Acrelândia,city,8,0,5


## Creating diff columns in covid_bra_states

In [10]:
# Row-to-row change of the cumulative counters, computed separately for each
# state. The first row of every state has no predecessor and is left as NaN.
state_groups = covid_bra_states.groupby('state')
daily_cases = state_groups['confirmed'].diff()
daily_deaths = state_groups['deaths'].diff()

covid_bra_states['cases_per_day'] = daily_cases
covid_bra_states['deaths_per_day'] = daily_deaths

covid_bra_states.head(16)

Unnamed: 0,date,state,place_type,confirmed,deaths,order_for_place,cases_per_day,deaths_per_day
0,2020-03-17,AC,state,3,0,1,,
1,2020-03-18,AC,state,3,0,2,0.0,0.0
2,2020-03-19,AC,state,4,0,3,1.0,0.0
3,2020-03-20,AC,state,7,0,4,3.0,0.0
4,2020-03-21,AC,state,11,0,5,4.0,0.0
5,2020-03-22,AC,state,11,0,6,0.0,0.0
6,2020-03-23,AC,state,17,0,7,6.0,0.0
7,2020-03-24,AC,state,21,0,8,4.0,0.0
8,2020-03-25,AC,state,23,0,9,2.0,0.0
9,2020-03-26,AC,state,23,0,10,0.0,0.0


In [11]:
# Build a lookup of each state's FIRST report, keyed by
# (state, order_for_place), with the cumulative counters renamed to the
# per-day column names they will fill in.
#
# The row is picked by locating the smallest order_for_place in each state
# and reading 'confirmed' and 'deaths' from that same row. Taking .min() of
# every column independently would instead return the lowest value each
# counter ever had, which is not the first report's value whenever a
# cumulative count is later revised downward.
# Relies on covid_bra_states having a unique index (label-based lookup).
first_report_rows = covid_bra_states.groupby('state')['order_for_place'].idxmin()

first_cases = (covid_bra_states
                   .loc[first_report_rows,
                        ['state', 'order_for_place', 'confirmed', 'deaths']]
                   .rename(columns={'confirmed': 'cases_per_day',
                                    'deaths': 'deaths_per_day'})
                   .set_index(['state', 'order_for_place']))

first_cases

Unnamed: 0_level_0,Unnamed: 1_level_0,cases_per_day,deaths_per_day
state,order_for_place,Unnamed: 2_level_1,Unnamed: 3_level_1
AC,1,3,0
AL,1,1,0
AM,1,2,0
AP,1,1,0
BA,1,1,0
CE,1,9,0
DF,1,1,0
ES,1,1,0
GO,1,3,0
MA,1,1,0


In [12]:
# Align both frames on (state, order_for_place) so that update() can copy the
# values held in first_cases onto the matching rows of the state frame.
state_keys = ['state', 'order_for_place']
covid_bra_states_set_index = covid_bra_states.set_index(state_keys)

# update() works in place and only touches cells whose index label and column
# name both exist in first_cases.
covid_bra_states_set_index.update(first_cases)

# Turn the key levels back into ordinary columns.
covid_bra_states = covid_bra_states_set_index.reset_index()

covid_bra_states.head()

Unnamed: 0,state,order_for_place,date,place_type,confirmed,deaths,cases_per_day,deaths_per_day
0,AC,1,2020-03-17,state,3,0,3.0,0.0
1,AC,2,2020-03-18,state,3,0,0.0,0.0
2,AC,3,2020-03-19,state,4,0,1.0,0.0
3,AC,4,2020-03-20,state,7,0,3.0,0.0
4,AC,5,2020-03-21,state,11,0,4.0,0.0


In [13]:
# Cast the two per-day columns to integers (raises if any NaN is still present).
covid_bra_states = covid_bra_states.astype({'cases_per_day': int,
                                            'deaths_per_day': int})

covid_bra_states.head()

Unnamed: 0,state,order_for_place,date,place_type,confirmed,deaths,cases_per_day,deaths_per_day
0,AC,1,2020-03-17,state,3,0,3,0
1,AC,2,2020-03-18,state,3,0,0,0
2,AC,3,2020-03-19,state,4,0,1,0
3,AC,4,2020-03-20,state,7,0,3,0
4,AC,5,2020-03-21,state,11,0,4,0


## Creating diff columns in covid_bra_cities

In [14]:
# Row-to-row change of the cumulative counters, computed separately for each
# city. Grouping must use (state, city): grouping by state alone would
# subtract the last row of one city from the first row of the next city in
# the same state. The first row of every city has no predecessor and is left
# as NaN.
city_groups = covid_bra_cities.groupby(['state', 'city'])
daily_cases = city_groups['confirmed'].diff()
daily_deaths = city_groups['deaths'].diff()

covid_bra_cities['cases_per_day'] = daily_cases
covid_bra_cities['deaths_per_day'] = daily_deaths

covid_bra_cities.head()

Unnamed: 0,date,state,city,place_type,confirmed,deaths,order_for_place,cases_per_day,deaths_per_day
0,2020-03-29,AC,Acrelândia,city,2,0,1,,
1,2020-03-30,AC,Acrelândia,city,6,0,2,4.0,0.0
2,2020-03-31,AC,Acrelândia,city,7,0,3,1.0,0.0
3,2020-04-01,AC,Acrelândia,city,8,0,4,1.0,0.0
4,2020-04-02,AC,Acrelândia,city,8,0,5,0.0,0.0


In [15]:
# Build a lookup of each city's FIRST report, keyed by
# (state, city, order_for_place), with the cumulative counters renamed to the
# per-day column names they will fill in.
#
# The row is picked by locating the smallest order_for_place in each
# (state, city) group and reading 'confirmed' and 'deaths' from that same
# row. Taking .min() of every column independently would instead return the
# lowest value each counter ever had, which is not the first report's value
# whenever a cumulative count is later revised downward.
# Relies on covid_bra_cities having a unique index (label-based lookup).
first_report_rows = (covid_bra_cities.groupby(['state', 'city'])
                        ['order_for_place']
                        .idxmin())

first_cases_city = (covid_bra_cities
                        .loc[first_report_rows,
                             ['state', 'city', 'order_for_place',
                              'confirmed', 'deaths']]
                        .rename(columns={'confirmed': 'cases_per_day',
                                         'deaths': 'deaths_per_day'})
                        .set_index(['state', 'city', 'order_for_place']))

first_cases_city.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,cases_per_day,deaths_per_day
state,city,order_for_place,Unnamed: 3_level_1,Unnamed: 4_level_1
AC,Acrelândia,1,2,0
AC,Assis Brasil,1,1,0
AC,Brasiléia,1,1,0
AC,Bujari,1,1,0
AC,Capixaba,1,1,0


In [16]:
# Align both frames on (state, city, order_for_place) so that update() can
# copy the values held in first_cases_city onto the matching city rows.
city_keys = ['state', 'city', 'order_for_place']
covid_bra_cities_set_index = covid_bra_cities.set_index(city_keys)

# update() works in place and only touches cells whose index label and column
# name both exist in first_cases_city.
covid_bra_cities_set_index.update(first_cases_city)

# Turn the key levels back into ordinary columns.
covid_bra_cities = covid_bra_cities_set_index.reset_index()

covid_bra_cities.head()

Unnamed: 0,state,city,order_for_place,date,place_type,confirmed,deaths,cases_per_day,deaths_per_day
0,AC,Acrelândia,1,2020-03-29,city,2,0,2.0,0.0
1,AC,Acrelândia,2,2020-03-30,city,6,0,4.0,0.0
2,AC,Acrelândia,3,2020-03-31,city,7,0,1.0,0.0
3,AC,Acrelândia,4,2020-04-01,city,8,0,1.0,0.0
4,AC,Acrelândia,5,2020-04-02,city,8,0,0.0,0.0


In [17]:
# Cast the two per-day columns to integers (raises if any NaN is still present).
covid_bra_cities = covid_bra_cities.astype({'cases_per_day': int,
                                            'deaths_per_day': int})

covid_bra_cities.head()

Unnamed: 0,state,city,order_for_place,date,place_type,confirmed,deaths,cases_per_day,deaths_per_day
0,AC,Acrelândia,1,2020-03-29,city,2,0,2,0
1,AC,Acrelândia,2,2020-03-30,city,6,0,4,0
2,AC,Acrelândia,3,2020-03-31,city,7,0,1,0
3,AC,Acrelândia,4,2020-04-01,city,8,0,1,0
4,AC,Acrelândia,5,2020-04-02,city,8,0,0,0


## Adding state_code column in covid_bra_states

In [18]:
# Prefix every state abbreviation with 'BR-' (e.g. 'AC' -> 'BR-AC') and store
# the result in a new 'state_code' column.
prefixed_codes = 'BR-' + covid_bra_states['state'].astype(str)
covid_bra_states = covid_bra_states.assign(state_code=prefixed_codes)

covid_bra_states.head()

Unnamed: 0,state,order_for_place,date,place_type,confirmed,deaths,cases_per_day,deaths_per_day,state_code
0,AC,1,2020-03-17,state,3,0,3,0,BR-AC
1,AC,2,2020-03-18,state,3,0,0,0,BR-AC
2,AC,3,2020-03-19,state,4,0,1,0,BR-AC
3,AC,4,2020-03-20,state,7,0,3,0,BR-AC
4,AC,5,2020-03-21,state,11,0,4,0,BR-AC


# Creating csv files with prepared dataset

In [19]:
# Export the combined (state + city) frame as bare data rows: the file gets
# neither a header line nor the index column.
covid_bra.to_csv(
    r'~/covid-19/data/covid-19-bra.csv',
    header=False,
    index=False,
)

In [20]:
# Export the prepared state-level frame as bare data rows: the file gets
# neither a header line nor the index column.
covid_bra_states.to_csv(
    r'~/covid-19/data/covid-19-bra-states.csv',
    header=False,
    index=False,
)

In [21]:
# NOTE(review): the city-level export below is disabled and the reason is not
# recorded in this notebook — confirm whether it should be restored or removed.
#covid_bra_cities.to_csv(r'~/covid-19/data/covid-19-bra-cities.csv', index=False, header=False)

In [22]:
# ISO-formatted (YYYY-MM-DD) date of the day before the notebook is run,
# based on the local clock.
yesterday = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')

yesterday

'2020-07-27'

In [23]:
# Export only the rows dated `yesterday`, without header line or index column.
rows_from_yesterday = covid_bra.loc[covid_bra['date'] == yesterday]
rows_from_yesterday.to_csv(
    r'~/covid-19/data/covid-19-bra-today.csv',
    header=False,
    index=False,
)

In [24]:
# Export only the state rows dated `yesterday`, without header line or index
# column.
state_rows_from_yesterday = covid_bra_states.loc[covid_bra_states['date'] == yesterday]
state_rows_from_yesterday.to_csv(
    r'~/covid-19/data/covid-19-bra-states-today.csv',
    header=False,
    index=False,
)