This script merges the data downloaded in steps 2 and 3 into a single file, keeping only the relevant columns. Columns are also renamed for convenience.

In [1]:
import geopandas as gpd
from functools import reduce
import pandas as pd
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')

In [2]:
def read_census_tracts():
    """
    Load the 2010 census-tract result tables, keep only the variables
    of interest and attach them to the tract polygons.

    Every CSV under ``../data/brazil/censo/resultados/`` listed in
    ``information`` is reduced to the tract code plus its selected
    columns (renamed, or derived for the household-income table). The
    reduced tables are merged on ``code_tract`` and the result is merged
    with the geometries read from ``../data/brazil/censo/malha/setores.zip/``.

    Returns the geometries merged with the selected census variables.
    """
    tract_key = 'code_tract'
    income_table = "setor_censitario_domicilio_renda_2010"

    # Source table -> {raw column: descriptive name}.
    # NOTE: the mapping of the household-income table is not applied as a
    # rename; that table only contributes the two derived totals below.
    information = {
        'setor_censitario_basico_2010': {
            'v001': 'total_permanent_households',
            'v009': 'permanent_household_nominal_mean_income',
        },
        'setor_censitario_domicilio_renda_2010': {
            'v005': 'households_1/8_minimum_wage',
            'v006': 'households_1/4_minimum_wage',
            'v007': 'households_1/2_minimum_wage',
            'v008': 'households_minimum_wage',
        },
        'setor_censitario_raca_idade_genero_2010': {
            'v001': 'total_residents',
            'v002': 'white_residents',
            'v003': 'black_residents',
            'v004': 'yellow_residents',
            'v005': 'pardo_residents',
            'v006': 'indigenous_residents',
        },
        'setor_censitario_alfabetizacao_total_2010': {
            'v001': 'literate_residents',
        },
    }

    def add_columns(frame, names):
        # Left-to-right element-wise sum, same as chaining `+` by hand
        # (missing values propagate instead of being skipped).
        return reduce(lambda acc, col: acc + col, (frame[name] for name in names))

    tables = []
    for table_name, renames in information.items():
        table = pd.read_csv(
            f"../data/brazil/censo/resultados/{table_name}.csv",
            dtype={'id_setor_censitario': str},
        ).rename(columns={'id_setor_censitario': tract_key})

        if table_name == income_table:
            every_bracket = [
                'v005', 'v006', 'v007', 'v008', 'v009',
                'v010', 'v011', 'v012', 'v013', 'v014',
            ]
            under_minimum = ['v005', 'v006', 'v007', 'v008', 'v014']
            table['total_private_households'] = add_columns(table, every_bracket)
            table['private_households_under_minimum_wage'] = add_columns(table, under_minimum)
            wanted = [
                tract_key,
                'private_households_under_minimum_wage',
                'total_private_households',
            ]
        else:
            table = table.rename(columns=renames)
            wanted = [tract_key, *renames.values()]

        # Keep only the wanted columns, in their existing order.
        tables.append(table.loc[:, table.columns.isin(wanted)])

    # Fold all tables into one, joining pairwise on the tract code.
    merged = tables[0]
    for other in tables[1:]:
        merged = pd.merge(merged, other, on=tract_key)

    # Attach the census variables to the tract polygons.
    polygons = gpd.read_file("../data/brazil/censo/malha/setores.zip/", dtype={'code_tract': str})
    return polygons.merge(merged, on=tract_key)
        

In [3]:
def main():
    """Build the combined census-tract table and write it out as a Feather file."""
    output_path = "../data/brazil/censo/combined/combined.feather"
    read_census_tracts().to_feather(output_path)

In [4]:
# Run the merge only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  This is separate from the ipykernel package so we can avoid doing imports until
