# 1 - Data Extraction

Here we will download all the data we need.

Because it is too large (about 150 GB), we'll also apply some pre-processing on the data.

In [30]:
# imports
import os
import json
import requests
import pandas as pd
from tqdm import tqdm

In [2]:
# dotenv incantations
from dotenv import load_dotenv, find_dotenv

# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

True

## 1.1. OpenDataSUS - Jab data

First, let's extract the jabs data. This is a dataset of each and every COVID jab applied in Brazil.

In [27]:
# get API connection info via .env file
COVIDJAB_API_URL = os.environ.get("COVIDJAB_API_URL")
COVIDJAB_API_URL_FIRST = os.environ.get("COVIDJAB_API_URL_FIRST")
COVIDJAB_API_USER = os.environ.get("COVIDJAB_API_USER")
COVIDJAB_API_PASSWD = os.environ.get("COVIDJAB_API_PASSWD")

api_auth = requests.auth.HTTPBasicAuth(
    COVIDJAB_API_USER,
    COVIDJAB_API_PASSWD
)

print(f'API URL: "{COVIDJAB_API_URL}"')
print(f'API URL (first page): "{COVIDJAB_API_URL_FIRST}"')
print(f'API username: "{COVIDJAB_API_USER}"')
print(f'API password: "{COVIDJAB_API_PASSWD}"')

API URL: "https://imunizacao-es.saude.gov.br/_search/scroll"
API URL (first page): "https://imunizacao-es.saude.gov.br/_search?scroll=1m"
API username: "imunizacao_public"
API password: "qlto5t&7r_@+#Tlstigi"


In [28]:
# establish request params
payload = {
    'size': 5,
}

In [25]:
# extract via requests
html = requests.post(COVIDJAB_API_URL_FIRST, 
    auth = api_auth,
    json = payload,
)
json_data = html.json()
print(json.dumps(json_data, indent = 3))

{
   "_scroll_id": "FGluY2x1ZGVfY29udGV4dF91dWlkDnF1ZXJ5VGhlbkZldGNoHhY5VDJCckZvOVFyMm03ZHRPNjF4QUdBAAAAAHzO0XoWbXl3TVIzRUFTMUNUU2p2NHRuaXIzZxZwSHBiRklZX1RKaXpLdm5RWjN0OE53AAAAABoNVZUWYXM2M3RXcVVSUjIwZUk5bmJhUHFHURY5VDJCckZvOVFyMm03ZHRPNjF4QUdBAAAAAHzO0X4WbXl3TVIzRUFTMUNUU2p2NHRuaXIzZxZ1ZldXQzg0YlFGS2FfTWRMQXk3WUxnAAAAAIjfenEWVU9BamwxZ1BSNU9IcFpCRGtVbGtzdxZLZUlSV29fQlRwT0tTSW5RZ1ctOHRBAAAAAITzV90WSV80VGNNRVRUU0NUU0RBYzFoTlNkURZKQWV3TjVla1NtLXE0Vmh6eFU3djRRAAAAAHTWy80WblVHTHVkWFpRYk83ajB1anZlNTVnQRY3akQzZVRla1QwR01Id3RCc1RDa3d3AAAAAAHVqlsWcUlmaVY2MUlROHVlbEVtMVNXNDZTdxZLZUlSV29fQlRwT0tTSW5RZ1ctOHRBAAAAAITzV9wWSV80VGNNRVRUU0NUU0RBYzFoTlNkURZheXE2MUpBU1NxQ0kzRXlpcnhhYWhRAAAAAIVZOFgWc09SS2Q1UHhUM1c5LVNJOVZfemVNQRY5VDJCckZvOVFyMm03ZHRPNjF4QUdBAAAAAHzO0XsWbXl3TVIzRUFTMUNUU2p2NHRuaXIzZxY5VDJCckZvOVFyMm03ZHRPNjF4QUdBAAAAAHzO0XwWbXl3TVIzRUFTMUNUU2p2NHRuaXIzZxZLZUlSV29fQlRwT0tTSW5RZ1ctOHRBAAAAAITzV9sWSV80VGNNRVRUU0NUU0RBYzFoTlNkURZheXE2MUpBU1NxQ0kzRXlpcnhhYWhRAAAAAIVZOFcWc09SS2Q1UHhUM1c5LVNJOVZf

In [26]:
# extract 2nd page

payload = {
    'scroll_id': json_data['_scroll_id'],
    'scroll': '1m'
}

html = requests.post(COVIDJAB_API_URL, 
    auth = api_auth,
    json = payload,
)
json_data = html.json()
print(json.dumps(json_data, indent=3))

{
   "_scroll_id": "FGluY2x1ZGVfY29udGV4dF91dWlkDnF1ZXJ5VGhlbkZldGNoHhY5VDJCckZvOVFyMm03ZHRPNjF4QUdBAAAAAHzO0XoWbXl3TVIzRUFTMUNUU2p2NHRuaXIzZxZwSHBiRklZX1RKaXpLdm5RWjN0OE53AAAAABoNVZUWYXM2M3RXcVVSUjIwZUk5bmJhUHFHURY5VDJCckZvOVFyMm03ZHRPNjF4QUdBAAAAAHzO0X4WbXl3TVIzRUFTMUNUU2p2NHRuaXIzZxZ1ZldXQzg0YlFGS2FfTWRMQXk3WUxnAAAAAIjfenEWVU9BamwxZ1BSNU9IcFpCRGtVbGtzdxZLZUlSV29fQlRwT0tTSW5RZ1ctOHRBAAAAAITzV90WSV80VGNNRVRUU0NUU0RBYzFoTlNkURZKQWV3TjVla1NtLXE0Vmh6eFU3djRRAAAAAHTWy80WblVHTHVkWFpRYk83ajB1anZlNTVnQRY3akQzZVRla1QwR01Id3RCc1RDa3d3AAAAAAHVqlsWcUlmaVY2MUlROHVlbEVtMVNXNDZTdxZLZUlSV29fQlRwT0tTSW5RZ1ctOHRBAAAAAITzV9wWSV80VGNNRVRUU0NUU0RBYzFoTlNkURZheXE2MUpBU1NxQ0kzRXlpcnhhYWhRAAAAAIVZOFgWc09SS2Q1UHhUM1c5LVNJOVZfemVNQRY5VDJCckZvOVFyMm03ZHRPNjF4QUdBAAAAAHzO0XsWbXl3TVIzRUFTMUNUU2p2NHRuaXIzZxY5VDJCckZvOVFyMm03ZHRPNjF4QUdBAAAAAHzO0XwWbXl3TVIzRUFTMUNUU2p2NHRuaXIzZxZLZUlSV29fQlRwT0tTSW5RZ1ctOHRBAAAAAITzV9sWSV80VGNNRVRUU0NUU0RBYzFoTlNkURZheXE2MUpBU1NxQ0kzRXlpcnhhYWhRAAAAAIVZOFcWc09SS2Q1UHhUM1c5LVNJOVZf

In [7]:
print(payload)

{'size': 5, 'scroll_id': 'FGluY2x1ZGVfY29udGV4dF91dWlkDnF1ZXJ5VGhlbkZldGNoHhZLZUlSV29fQlRwT0tTSW5RZ1ctOHRBAAAAAITzG1oWSV80VGNNRVRUU0NUU0RBYzFoTlNkURZwSHBiRklZX1RKaXpLdm5RWjN0OE53AAAAABoNHNsWYXM2M3RXcVVSUjIwZUk5bmJhUHFHURY5VDJCckZvOVFyMm03ZHRPNjF4QUdBAAAAAHzOmzUWbXl3TVIzRUFTMUNUU2p2NHRuaXIzZxZ1ZldXQzg0YlFGS2FfTWRMQXk3WUxnAAAAAIjfO6MWVU9BamwxZ1BSNU9IcFpCRGtVbGtzdxY5VDJCckZvOVFyMm03ZHRPNjF4QUdBAAAAAHzOmzQWbXl3TVIzRUFTMUNUU2p2NHRuaXIzZxZKQWV3TjVla1NtLXE0Vmh6eFU3djRRAAAAAHTWlH8WblVHTHVkWFpRYk83ajB1anZlNTVnQRY3akQzZVRla1QwR01Id3RCc1RDa3d3AAAAAAHVcoQWcUlmaVY2MUlROHVlbEVtMVNXNDZTdxZwSHBiRklZX1RKaXpLdm5RWjN0OE53AAAAABoNHN0WYXM2M3RXcVVSUjIwZUk5bmJhUHFHURZheXE2MUpBU1NxQ0kzRXlpcnhhYWhRAAAAAIVZAGsWc09SS2Q1UHhUM1c5LVNJOVZfemVNQRY5VDJCckZvOVFyMm03ZHRPNjF4QUdBAAAAAHzOmzYWbXl3TVIzRUFTMUNUU2p2NHRuaXIzZxY5VDJCckZvOVFyMm03ZHRPNjF4QUdBAAAAAHzOmzcWbXl3TVIzRUFTMUNUU2p2NHRuaXIzZxZLZUlSV29fQlRwT0tTSW5RZ1ctOHRBAAAAAITzG1sWSV80VGNNRVRUU0NUU0RBYzFoTlNkURZheXE2MUpBU1NxQ0kzRXlpcnhhYWhRAAAAAIVZAGoWc09SS2Q1UHhUM1c5LV

Let's define a preprocessing function for each page so I won't run out of space in my hard drive.

In [101]:
def preprocess_index(df):
    return df.set_index(['document_id', 'paciente_id'])

def preprocess_drop_invalid_rows(df):
    return df[df['status']=='final']

def preprocess_drop_cols(df):
    return df[
        [
            'estabelecimento_uf',
            'vacina_dataAplicacao',
            'vacina_nome',
        ]
    ]

def preprocess_dtype(df):
    df['vacina_dataAplicacao'] = pd.to_datetime(df['vacina_dataAplicacao'])
    df['estabelecimento_uf'] = df['estabelecimento_uf'].astype('category')
    df['vacina_nome'] = df['vacina_nome'].astype('category')

    return df

def preprocess(df):
    df_raw = df.copy()

    df = (df_raw
        .pipe(preprocess_index)
        .pipe(preprocess_drop_invalid_rows)
        .pipe(preprocess_drop_cols)
        .pipe(preprocess_dtype)
    )

    return df

In [114]:
HITS_PER_PAGE = 10000
full_dfs = None

# first page POST params
payload = {
    'size': HITS_PER_PAGE
}

# extract first page
html = requests.post(COVIDJAB_API_URL_FIRST, 
    auth = api_auth,
    json = payload,
)
response_json = html.json()

hits = response_json['hits']['hits']
total_hits = response_json['hits']['total']['value']
total_pages = total_hits // HITS_PER_PAGE
page_count = 0

while hits:
    # get scroll id for making next page request
    scroll_id = response_json['_scroll_id']

    # getting the data into a dataframe
    dataframe_dict = [ hit['_source'] for hit in hits]
    single_df = pd.DataFrame(dataframe_dict)

    # accumulate into a single dataframe
    if full_dfs is None: # first page of results
        full_dfs = single_df.copy()
    else: 
        full_dfs = pd.concat([full_dfs, single_df])

    # extract subsequent pages
    payload = {
        'scroll_id': scroll_id,
        'scroll': '1m'
    }
    
    html = requests.post(COVIDJAB_API_URL, 
        auth = api_auth,
        json = payload,
    )
    response_json = html.json()
    hits = response_json['hits']['hits']
    
    # progress display
    page_count += 1
    if not page_count % 10:
        print(f'Page count: {page_count} of {total_pages} ({page_count/total_pages:.1%} done)')

Page count: 10 of 53959 (0.0% done)
Page count: 20 of 53959 (0.0% done)


KeyboardInterrupt: 

In [34]:
pd.json_normalize(json_data, ['hits', 'hits'])

Unnamed: 0,_index,_type,_id,_score,_source.estalecimento_noFantasia,_source.vacina_lote,_source.estabelecimento_municipio_codigo,_source.estabelecimento_valor,_source.vacina_nome,_source.ds_condicao_maternal,...,_source.paciente_endereco_nmMunicipio,_source.estabelecimento_municipio_nome,_source.vacina_codigo,_source.paciente_enumSexoBiologico,_source.dt_deleted,_source.co_condicao_maternal,_source.vacina_grupoAtendimento_nome,_source.paciente_racaCor_valor,_source.vacina_dataAplicacao,_source.data_importacao_rnds
0,desc-imunizacao-v5,_doc,171a0693-332e-4690-8a51-ae7f0427ddd4-i0b0,1.0,UBS EURIPES DE FATIMA GALDINO PELE,210556A,352310,2773236,COVID-19 SINOVAC/BUTANTAN - CORONAVAC,,...,GUARULHOS,ITAQUAQUECETUBA,86,F,2023-02-01T00:00:00.000Z,,Pessoas de 18 a 64 anos,AMARELA,2022-10-03T00:00:00.000Z,2022-10-03T19:06:24.000Z
1,desc-imunizacao-v5,_doc,d784dd2d-87e3-4e45-be44-48a6590d9b0f-i0b0,1.0,AMA UBS INTEGRADA JD BRASILIA,220252,355030,4050088,COVID-19 SINOVAC/BUTANTAN - CORONAVAC,,...,SAO PAULO,SAO PAULO,86,F,2023-02-01T00:00:00.000Z,,Pessoas de 65 a 69 anos,SEM INFORMACAO,2022-11-01T00:00:00.000Z,2022-11-01T14:06:25.000Z
2,desc-imunizacao-v5,_doc,c0eb9ce1-f0da-4716-98c7-abd463ed91aa-i0b0,1.0,UBS DONA LUIZA,FT5177,351880,9130756,COVID-19 PFIZER - COMIRNATY,,...,GUARULHOS,GUARULHOS,87,M,2023-02-01T00:00:00.000Z,,Pessoas de 18 a 64 anos,AMARELA,2022-11-05T00:00:00.000Z,2022-11-05T18:35:43.000Z
3,desc-imunizacao-v5,_doc,c2891871-74bd-42fc-8f45-0d245539c303-i0b0,1.0,UBS JARDIM GUARANI,220253,355030,2787415,COVID-19 SINOVAC/BUTANTAN - CORONAVAC,,...,GUARULHOS,SAO PAULO,86,M,2023-02-01T00:00:00.000Z,,Pessoas de 18 a 64 anos,SEM INFORMACAO,2022-11-29T00:00:00.000Z,2022-11-29T14:19:45.000Z
4,desc-imunizacao-v5,_doc,f156849d-2a55-4aa0-9f34-44cec1e6ac29-i0b0,1.0,CS II JOSE F ROSAS,220255,353860,2040514,COVID-19 SINOVAC/BUTANTAN - CORONAVAC,,...,PIRACAIA,PIRACAIA,86,F,2023-02-01T00:00:00.000Z,,Pessoas de 18 a 64 anos,SEM INFORMACAO,2022-12-07T00:00:00.000Z,2022-12-07T14:37:04.000Z


In [37]:
dataframe_dict = [ rec['_source'] for rec in json_data['hits']['hits']]
pd.DataFrame(dataframe_dict)

Unnamed: 0,estalecimento_noFantasia,vacina_lote,estabelecimento_municipio_codigo,estabelecimento_valor,vacina_nome,ds_condicao_maternal,paciente_endereco_coPais,@version,document_id,paciente_nacionalidade_enumNacionalidade,...,paciente_endereco_nmMunicipio,estabelecimento_municipio_nome,vacina_codigo,paciente_enumSexoBiologico,dt_deleted,co_condicao_maternal,vacina_grupoAtendimento_nome,paciente_racaCor_valor,vacina_dataAplicacao,data_importacao_rnds
0,UBS EURIPES DE FATIMA GALDINO PELE,210556A,352310,2773236,COVID-19 SINOVAC/BUTANTAN - CORONAVAC,,10,1,171a0693-332e-4690-8a51-ae7f0427ddd4-i0b0,B,...,GUARULHOS,ITAQUAQUECETUBA,86,F,2023-02-01T00:00:00.000Z,,Pessoas de 18 a 64 anos,AMARELA,2022-10-03T00:00:00.000Z,2022-10-03T19:06:24.000Z
1,AMA UBS INTEGRADA JD BRASILIA,220252,355030,4050088,COVID-19 SINOVAC/BUTANTAN - CORONAVAC,,10,1,d784dd2d-87e3-4e45-be44-48a6590d9b0f-i0b0,B,...,SAO PAULO,SAO PAULO,86,F,2023-02-01T00:00:00.000Z,,Pessoas de 65 a 69 anos,SEM INFORMACAO,2022-11-01T00:00:00.000Z,2022-11-01T14:06:25.000Z
2,UBS DONA LUIZA,FT5177,351880,9130756,COVID-19 PFIZER - COMIRNATY,,10,1,c0eb9ce1-f0da-4716-98c7-abd463ed91aa-i0b0,B,...,GUARULHOS,GUARULHOS,87,M,2023-02-01T00:00:00.000Z,,Pessoas de 18 a 64 anos,AMARELA,2022-11-05T00:00:00.000Z,2022-11-05T18:35:43.000Z
3,UBS JARDIM GUARANI,220253,355030,2787415,COVID-19 SINOVAC/BUTANTAN - CORONAVAC,,10,1,c2891871-74bd-42fc-8f45-0d245539c303-i0b0,B,...,GUARULHOS,SAO PAULO,86,M,2023-02-01T00:00:00.000Z,,Pessoas de 18 a 64 anos,SEM INFORMACAO,2022-11-29T00:00:00.000Z,2022-11-29T14:19:45.000Z
4,CS II JOSE F ROSAS,220255,353860,2040514,COVID-19 SINOVAC/BUTANTAN - CORONAVAC,,10,1,f156849d-2a55-4aa0-9f34-44cec1e6ac29-i0b0,B,...,PIRACAIA,PIRACAIA,86,F,2023-02-01T00:00:00.000Z,,Pessoas de 18 a 64 anos,SEM INFORMACAO,2022-12-07T00:00:00.000Z,2022-12-07T14:37:04.000Z


In [115]:
full_dfs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 220000 entries, 0 to 9999
Data columns (total 42 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   estalecimento_noFantasia                  220000 non-null  object 
 1   vacina_lote                               217882 non-null  object 
 2   estabelecimento_municipio_codigo          220000 non-null  object 
 3   estabelecimento_valor                     220000 non-null  object 
 4   vacina_nome                               220000 non-null  object 
 5   ds_condicao_maternal                      71900 non-null   object 
 6   paciente_endereco_coPais                  219977 non-null  object 
 7   @version                                  220000 non-null  object 
 8   document_id                               220000 non-null  object 
 9   paciente_nacionalidade_enumNacionalidade  219996 non-null  object 
 10  vacina_categoria_codig

In [118]:
full_dfs['status'].value_counts(normalize = True)

final               0.938491
entered-in-error    0.061509
Name: status, dtype: float64

In [119]:
pre_df = preprocess(full_dfs[full_dfs['paciente_idade']>=18])

In [120]:
pre_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 164775 entries, ('39e44af5-3a9b-4370-bee6-c28b86534ce9-i0b0', 'e48bffae6603281e2cd42acb403cd5276373502e862abdbfa9031899d2904c02') to ('801f57e6-aebc-49f0-add6-cbde04fe9791-i0b0', '976d4725bcda278e055cf2a01aa0d928a9447971f7647198f4902259f44d36bd')
Data columns (total 3 columns):
 #   Column                Non-Null Count   Dtype              
---  ------                --------------   -----              
 0   estabelecimento_uf    164775 non-null  category           
 1   vacina_dataAplicacao  164775 non-null  datetime64[ns, UTC]
 2   vacina_nome           164775 non-null  category           
dtypes: category(2), datetime64[ns, UTC](1)
memory usage: 13.6+ MB


In [121]:
pre_df['vacina_nome'].value_counts()

COVID-19 PFIZER - COMIRNATY                  84874
COVID-19 ASTRAZENECA/FIOCRUZ - COVISHIELD    32982
COVID-19 JANSSEN - Ad26.COV2.S               29191
COVID-19 SINOVAC/BUTANTAN - CORONAVAC        16391
COVID-19 ASTRAZENECA - ChAdOx1-S              1213
COVID-19 PFIZER - COMIRNATY PEDIÁTRICA          58
COVID-19 SINOVAC - CORONAVAC                    58
COVID-19 PEDIÁTRICA - PFIZER COMIRNATY           8
Name: vacina_nome, dtype: int64

In [65]:
full_dfs[full_dfs['paciente_idade']>=18]['vacina_nome'].value_counts()

COVID-19 PFIZER - COMIRNATY                  549
COVID-19 ASTRAZENECA/FIOCRUZ - COVISHIELD    174
COVID-19 SINOVAC/BUTANTAN - CORONAVAC         65
COVID-19 JANSSEN - Ad26.COV2.S                47
COVID-19 ASTRAZENECA - ChAdOx1-S               6
COVID-19 PFIZER - COMIRNATY PEDIÁTRICA         2
COVID-19 SINOVAC - CORONAVAC                   1
Name: vacina_nome, dtype: int64

In [68]:
full_dfs[(full_dfs['paciente_idade']>=18)&full_dfs['vacina_nome'].str.lower().str.contains('pedi')][['paciente_idade', 'vacina_nome']]

Unnamed: 0,paciente_idade,vacina_nome
2,44,COVID-19 PFIZER - COMIRNATY PEDIÁTRICA
2,22,COVID-19 PFIZER - COMIRNATY PEDIÁTRICA


In [122]:
def mechanisms(name):
    name = name.lower()
    if 'comirnaty' in name:
        return 'mrna'
    elif 'coronavac' in name:
        return 'inactivated'
    elif 'novavax' in name:
        return 'subunit'
    elif 'covishield' in name or 'chad' in name or 'ad26' in name or 'sputnik' in name:
        return 'viral_vector'
    else:
        return 'unknown'
    
full_dfs['mechanism'] = full_dfs['vacina_nome'].apply(mechanisms)


In [124]:
full_dfs['mechanism'].value_counts(normalize = True)

mrna            0.527032
viral_vector    0.311636
inactivated     0.161332
Name: mechanism, dtype: float64

In [125]:
jabs_by_uf_month_type = pre_df.groupby([pd.Grouper(key = 'vacina_dataAplicacao', freq = 'M'), 'estabelecimento_uf', 'vacina_nome']).size()

In [126]:
jabs_by_uf_month_type[jabs_by_uf_month_type>0]

vacina_dataAplicacao       estabelecimento_uf  vacina_nome                              
2001-06-30 00:00:00+00:00  PR                  COVID-19 ASTRAZENECA/FIOCRUZ - COVISHIELD      1
2007-07-31 00:00:00+00:00  SC                  COVID-19 ASTRAZENECA/FIOCRUZ - COVISHIELD      1
2020-01-31 00:00:00+00:00  RJ                  COVID-19 ASTRAZENECA/FIOCRUZ - COVISHIELD      1
2020-08-31 00:00:00+00:00  AM                  COVID-19 PFIZER - COMIRNATY                    1
2020-09-30 00:00:00+00:00  SP                  COVID-19 PFIZER - COMIRNATY                    2
                                                                                           ... 
2023-02-28 00:00:00+00:00  SE                  COVID-19 PFIZER - COMIRNATY                   15
                           SP                  COVID-19 JANSSEN - Ad26.COV2.S                 2
                                               COVID-19 PFIZER - COMIRNATY                  389
                                               