In [83]:
import pandas as pd
import requests
from functools import reduce
from itables import show

In [84]:
# Constants
# All stage directories live under the same relative data root.
_DATA_ROOT = '../data'
RAW_DATA_DIR = f'{_DATA_ROOT}/raw'              # source files read by this notebook
INTERIM_DATA_DIR = f'{_DATA_ROOT}/interim'      # intermediate CSVs written by this notebook
PROCESSED_DATA_DIR = f'{_DATA_ROOT}/processed'  # not referenced by the cells visible here

# Name of the municipality-code column used as the join key between tables.
PRIMARY_KEY = 'CO_MUNICIPIO'

# Dados IBGE

Faz uso da [API de dados agregados do IBGE](https://servicodados.ibge.gov.br/api/docs/agregados?versao=3#api-Variaveis-agregadosAgregadoPeriodosPeriodosVariaveisVariavelGet) para obter a informação do Produto Interno Bruto corrente a preços de mercado (PIB a preços correntes) por município.

In [85]:
DROP_IBGE_COLUMNS = ['localidade.nivel.id', 'localidade.nivel.nome', "localidade.nome"]

def _ibge_json_to_ibge_df(json_data):
    # Extracts the county PIB (Produto Interno Bruto) data from the given JSON data
    target_info_json = json_data[0]["resultados"][0]["series"]
    pibge_df = pd.json_normalize(target_info_json)
    pibge_df = pibge_df.drop(DROP_IBGE_COLUMNS, axis=1)
    return pibge_df

In [86]:
# Build the table with county PIB: one request per year, then one outer merge
# on the locality id so each year becomes its own column.

years = list(range(2013, 2022))
counties_pib_dfs = []

for year in years:
    print(f"Getting data for year {year}")
    county_pib_url = f"https://servicodados.ibge.gov.br/api/v3/agregados/5938/periodos/{year}/variaveis/37?localidades=N6[all]"
    # NOTE(review): verify=False turns off TLS certificate validation, so the
    # response is not authenticated. Kept as in the original because the
    # certificate chain could not be checked from here; switch to the default
    # (verify=True) or pass a CA bundle path once confirmed to work.
    # The timeout keeps a stalled connection from hanging "Run All" forever.
    response = requests.get(county_pib_url, verify=False, timeout=120)

    # Fail fast: silently skipping a year would produce a table without the
    # PIB_<year> column and nothing downstream would report it.
    if response.status_code != 200:
        raise RuntimeError(
            f"IBGE PIB request for year {year} failed with status code "
            f"{response.status_code}: {county_pib_url}"
        )

    data = response.json()
    print(f"Building IBGE {year} dataframe...")
    counties_pib_dfs.append(_ibge_json_to_ibge_df(data))

print("Merging IBGE dataframes...")
complete_counties_pib = reduce(lambda df1,df2: pd.merge(df1,df2,on='localidade.id', how='outer'), counties_pib_dfs)

# Rename Columns: 'serie.<year>' -> 'PIB_<year>', 'localidade.id' -> PRIMARY_KEY
new_column_names = {'serie.{}'.format(year): 'PIB_{}'.format(year) for year in years}
new_column_names["localidade.id"] = PRIMARY_KEY
complete_counties_pib = complete_counties_pib.rename(columns=new_column_names)

# Cast every column (key and yearly values) to int
complete_counties_pib = complete_counties_pib.astype(int)

print("Finished building the table with county PIB")

Getting data for year 2013




Building IBGE 2013 dataframe...
Getting data for year 2014




Building IBGE 2014 dataframe...
Getting data for year 2015




Building IBGE 2015 dataframe...
Getting data for year 2016




Building IBGE 2016 dataframe...
Getting data for year 2017




Building IBGE 2017 dataframe...
Getting data for year 2018




Building IBGE 2018 dataframe...
Getting data for year 2019




Building IBGE 2019 dataframe...
Getting data for year 2020




Building IBGE 2020 dataframe...
Getting data for year 2021




Building IBGE 2021 dataframe...
Merging IBGE dataframes...
Finished building the table with county PIB


In [94]:
# Print a dtype / non-null summary, then write the PIB table as a
# semicolon-separated CSV (without the index) in the interim data folder.
complete_counties_pib.info()
ibge_pib_csv_path = f'{INTERIM_DATA_DIR}/ibge_pib_data.csv'
complete_counties_pib.to_csv(ibge_pib_csv_path, sep=';', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5570 entries, 0 to 5569
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   CO_MUNICIPIO  5570 non-null   int64
 1   PIB_2013      5570 non-null   int64
 2   PIB_2014      5570 non-null   int64
 3   PIB_2015      5570 non-null   int64
 4   PIB_2016      5570 non-null   int64
 5   PIB_2017      5570 non-null   int64
 6   PIB_2018      5570 non-null   int64
 7   PIB_2019      5570 non-null   int64
 8   PIB_2020      5570 non-null   int64
 9   PIB_2021      5570 non-null   int64
dtypes: int64(10)
memory usage: 435.3 KB


# Dados Bolsa Família

Não conseguimos dados socioeconômicos com a granularidade, espacial e temporal, necessária. Assim, buscamos usar o percentual da população que recebe benefícios do programa Bolsa Família por município para ter algum dado de cunho social. O número de benefícios do programa Bolsa Família pode ser encontrado no [VIS DATA 3 beta](https://aplicacoes.cidadania.gov.br/vis/data3/v.php?q[]=r6JtZJCug7BtxKW25rV%2FfmdhhJFkl21kmK19ZnB1ZW6maX7KmZO20qfOnJm%2B6IianbSon7SfrrqqkpKcmcuppsK2iKextVi1mpyuwZxNzsmY2F1zyuDAk522pHa2YH9%2BaV6EkmOXbWSEm8GcobZVetufrL%2BrkbbHlNddmMnuslSqvaGmmZ67sliqkseU1rCYmOGuoK%2BtcHXfmrnBnGiS1KjXYK5%2B3q6noWisot6nbY6kksrAlNiscZqif2Rue2JqrGZ9f15Ny8mY2F1zv%2BGspbCslKDapm2zo6C8gaHfqZ994LuYXcVwoNqlwLNyk7jNps94bsPcuaehg3Ct7qZwyViQuNSYirSbwultdKmtqJnap7yKdFSJkWWbamSNqH1lY2ipot6nbY6Zk7bXn4qin9DgbaKxtKFa3qexb7RovcKf3aJuw9y5p6GDcKDapcCzcmjK1qCNuFTA3MCZXL%2Bdn%2BdZjbucoLbCodl7cIStfWZvdWVtpml%2BdVehv8ahin2Vw9rDoFytoa3eWbvDo5l3xqHOXrCY4a6gr61woNqlwLNyaL3Cn92ibpjuwqFfw1ad2qyybq6VvM9TqqqY0NquoquEcmGraX9%2FZF6HjmObZFPR47KiXHCYm%2ByebcWfksWBc8yjks7vsZOiqaJ4qVnBtpybd9Oi36uXhbuvmpu%2BoXSzp8K7nJ%2FAxGKqn5m87MGYm66Wp6Vrdm6cmcrGU9iyn8mbsqKgcVWf5ayybqWiw81Tz6uXfviImp20qJ%2B0n666qpKSnJnLqabCtoinsbVYtZqcrsGcTc7JmNhdc8rgwJOdtqR4tmB%2FfmlghJFml21khJvBnKG2VWLcmsCzV6S%2FxqGKfamPsYFnenhVruGeu26pnMzPl5J9lcPaw6B2gqOv5p6%2Ft5pcl9dloHFmia12VKG0qJ%2BZp8K6o028z5eTXZjJ7rJUqr2hppmeu7JYqpLHlNawmJjhrqCvrXB135q5wZxoktSo17l5vugQ4aixlq2Ze7K8nJPAxJwt3qXG3MBXgqmi%2FSaltq%2BqTaejeYpllNE%2B9lSLvalpq2l%2Ff2BQncKgLeqfxtzAVIyKe1qhmm2%2BmJ%2FLyqWKoZh9yK6ma3plbKxicKSYmcbTU9yio77uwJWgt1X9GaxttJiaGg6f056mfcuPelxwlq484m2drKGGk2OcblyA0a6gq7pVrN6prsGqjrvQUy3dpn3hrqH%2F9aGj2qxtnnlzd4mUiq2Uz%2B%2B2plysmlrGmr99aV2JlFyNk5TJ6r9UoLdVfN6nsrT62rrKooqq9gbftqNccJauPOJtnayhhpNjnG5cgNGuoKu6VZ7oWY%2BzpZK9JODNpqJ96BDdoLGkWqGabb6Yn8vKpYqhmH3IrqZremVsrGLJvnKp091lmm1miqt%2BYWx5iWqpc31%2BcV2Hu24%3D&ma=ano&dt1=2013-01-01&dt2=2021-01-01&ag=m&ultdisp=1&ultdisp=0). 
Já a estimativa da população de cada município está disponível na [API de dados agregados do IBGE](https://servicodados.ibge.gov.br/api/docs/agregados?versao=3#api-Variaveis-agregadosAgregadoPeriodosPeriodosVariaveisVariavelGet).

In [88]:
# Build the table with population estimate: one request per year, then one
# outer merge on the locality id so each year becomes its own column.
years = list(range(2013, 2022))
population_estimate_dfs = []

for year in years:
    print(f"Getting data for year {year}")
    population_estimate_url = f"https://servicodados.ibge.gov.br/api/v3/agregados/6579/periodos/{year}/variaveis/9324?localidades=N6[all]"
    # NOTE(review): verify=False turns off TLS certificate validation, so the
    # response is not authenticated. Kept as in the original because the
    # certificate chain could not be checked from here; switch to the default
    # (verify=True) or pass a CA bundle path once confirmed to work.
    # The timeout keeps a stalled connection from hanging "Run All" forever.
    response = requests.get(population_estimate_url, verify=False, timeout=120)

    # Fail fast: silently skipping a year would leave POP_ESTIMATE_<year> out
    # of the table and only surface later as a KeyError in another cell.
    if response.status_code != 200:
        raise RuntimeError(
            f"IBGE population estimate request for year {year} failed with status code "
            f"{response.status_code}: {population_estimate_url}"
        )

    data = response.json()
    print(f"Building IBGE {year} dataframe...")
    population_estimate_dfs.append(_ibge_json_to_ibge_df(data))

print("Merging population estimate IBGE dataframes...")
complete_population_estimate = reduce(lambda df1,df2: pd.merge(df1,df2,on='localidade.id', how='outer'), population_estimate_dfs)

# Rename Columns: 'serie.<year>' -> 'POP_ESTIMATE_<year>', 'localidade.id' -> PRIMARY_KEY
new_column_names = {'serie.{}'.format(year): 'POP_ESTIMATE_{}'.format(year) for year in years}
new_column_names["localidade.id"] = PRIMARY_KEY
complete_population_estimate = complete_population_estimate.rename(columns=new_column_names)

# Cast every column (key and yearly values) to int
complete_population_estimate = complete_population_estimate.astype(int)

# Add an auxiliary column to possibly merge with the other table.
# The last digit of the CO_MUNICIPIO is a check digit: https://medium.com/@salibi/como-validar-o-c%C3%B3digo-de-munic%C3%ADpio-do-ibge-90dc545cc533
# Integer division by 10 strips that last digit from the code.
complete_population_estimate[f'AUX_{PRIMARY_KEY}'] = (complete_population_estimate[PRIMARY_KEY]//10)

print("Finished building the table with population estimate")

Getting data for year 2013




Building IBGE 2013 dataframe...
Getting data for year 2014




Building IBGE 2014 dataframe...
Getting data for year 2015




Building IBGE 2015 dataframe...
Getting data for year 2016




Building IBGE 2016 dataframe...
Getting data for year 2017




Building IBGE 2017 dataframe...
Getting data for year 2018




Building IBGE 2018 dataframe...
Getting data for year 2019




Building IBGE 2019 dataframe...
Getting data for year 2020




Building IBGE 2020 dataframe...
Getting data for year 2021




Building IBGE 2021 dataframe...
Merging population estimate IBGE dataframes...
Finished building the table with population estimate


In [89]:
# Source column -> notebook column name for the three fields that are kept.
new_column_names = {
    'Código': PRIMARY_KEY,
    "Referência" : "ANO",
    "Famílias PBF (até Out/2021)": "NUM_FAMILIAS_PBF"
}

# Load the raw export, keep and rename the three fields, then pivot to one row
# per PRIMARY_KEY with one column per ANO value (pivot_table's default
# aggregation applies if a key/ANO pair occurs more than once) and move the
# key back from the index into a regular column.
vis_pbf_raw = pd.read_csv(f'{RAW_DATA_DIR}/vis_data_3_pbf.csv', sep=',')
vis_pbf_df = (
    vis_pbf_raw[["Código", "Referência", "Famílias PBF (até Out/2021)"]]
    .rename(columns=new_column_names)
    .pivot_table(index=PRIMARY_KEY, columns='ANO', values='NUM_FAMILIAS_PBF')
    .reset_index()
)

In [96]:
# Join the Bolsa Família table to the population estimates: the left key is
# matched against the auxiliary code column (municipality code without its
# last digit) of the population table.
pbf_population_df = vis_pbf_df.merge(
    complete_population_estimate,
    how='inner',
    left_on=PRIMARY_KEY,
    right_on=f"AUX_{PRIMARY_KEY}",
)

# Yearly ratio of PBF families to the estimated population, rounded to 5 decimals.
pbf_ratio_columns = {
    f"PERCENTUAL_FAMILIAS_PBF_{year}": (
        pbf_population_df[year] / pbf_population_df[f"POP_ESTIMATE_{year}"]
    ).round(5)
    for year in range(2013, 2022)
}

# Columns that only served the join / ratio computation: the auxiliary key,
# the left-hand key, the population estimates and the yearly family counts.
remove_columns = (
    [f"AUX_{PRIMARY_KEY}", f"{PRIMARY_KEY}_x"]
    + [f"POP_ESTIMATE_{year}" for year in range(2013, 2022)]
    + list(range(2013, 2022))
)

# Append the ratios, drop the helper columns and restore the key name from
# the right-hand (population) table.
social_info_df = (
    pbf_population_df
    .assign(**pbf_ratio_columns)
    .drop(columns=remove_columns)
    .rename(columns={f"{PRIMARY_KEY}_y": PRIMARY_KEY})
)

CO_MUNICIPIO,PERCENTUAL_FAMILIAS_PBF_2013,PERCENTUAL_FAMILIAS_PBF_2014,PERCENTUAL_FAMILIAS_PBF_2015,PERCENTUAL_FAMILIAS_PBF_2016,PERCENTUAL_FAMILIAS_PBF_2017,PERCENTUAL_FAMILIAS_PBF_2018,PERCENTUAL_FAMILIAS_PBF_2019,PERCENTUAL_FAMILIAS_PBF_2020,PERCENTUAL_FAMILIAS_PBF_2021
Loading ITables v2.0.1 from the internet... (need help?),,,,,,,,,


In [97]:
# Print a dtype / non-null summary, then write the Bolsa Família ratio table
# as a semicolon-separated CSV (without the index) in the interim data folder.
social_info_df.info()
social_pbf_csv_path = f'{INTERIM_DATA_DIR}/social_pbf_data.csv'
social_info_df.to_csv(social_pbf_csv_path, sep=';', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5570 entries, 0 to 5569
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   CO_MUNICIPIO                  5570 non-null   int64  
 1   PERCENTUAL_FAMILIAS_PBF_2013  5569 non-null   float64
 2   PERCENTUAL_FAMILIAS_PBF_2014  5570 non-null   float64
 3   PERCENTUAL_FAMILIAS_PBF_2015  5570 non-null   float64
 4   PERCENTUAL_FAMILIAS_PBF_2016  5570 non-null   float64
 5   PERCENTUAL_FAMILIAS_PBF_2017  5570 non-null   float64
 6   PERCENTUAL_FAMILIAS_PBF_2018  5570 non-null   float64
 7   PERCENTUAL_FAMILIAS_PBF_2019  5570 non-null   float64
 8   PERCENTUAL_FAMILIAS_PBF_2020  5570 non-null   float64
 9   PERCENTUAL_FAMILIAS_PBF_2021  5570 non-null   float64
dtypes: float64(9), int64(1)
memory usage: 435.3 KB


# Dados Externos

In [98]:
# Combine the PIB table and the Bolsa Família ratio table into a single table,
# keeping only the keys present in both (inner join on PRIMARY_KEY).
external_data = complete_counties_pib.merge(
    social_info_df,
    on=PRIMARY_KEY,
    how='inner',
)

In [99]:
# Write the combined table as a semicolon-separated CSV (without the index)
# in the interim data folder.
external_data_csv_path = f'{INTERIM_DATA_DIR}/external_data.csv'
external_data.to_csv(external_data_csv_path, sep=';', index=False)