In [187]:
import pandas as pd
import numpy as np
import pathfinder as pf

In [188]:
filename = "censo_1872-003_p1.csv"
path = pf.find_file(filename)
df_1872_003_br = pd.read_csv(path)

In [189]:
prov = df_1872_003_br[['Unnamed: 0']]

In [216]:
def format_prov(df:pd.DataFrame) -> pd.DataFrame:
    prov = df[['Unnamed: 0']].copy()
    prov.rename(columns={'Unnamed: 0':"Províncias e Município Neutro"}, inplace=True)
    prov = prov[3:]
    prov.reset_index(drop=True, inplace=True)
    prov.reset_index(drop=False, inplace=True)
    return prov

def format_gen(df:pd.DataFrame) -> pd.DataFrame:
    gen = df[['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3']]

    gen_header = gen.iloc[2]
    gen.columns = gen_header
    gen = gen[3:]

    columns_multiindex = pd.MultiIndex.from_tuples([
    ('Gêneros', 'Homens'),
    ('Gêneros', 'Mulheres'),
    ('Gêneros', 'Total')])

    gen.columns = columns_multiindex
    gen.reset_index(drop=True, inplace=True)
    # retorna um df 22x4
    return gen

def format_e_civis(df:pd.DataFrame) -> pd.DataFrame:
    estados_civis = df[['Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9']]
    estados_civis_header = estados_civis.iloc[0]
    estados_civis.columns = estados_civis_header
    estados_civis = estados_civis[3:]

    columns_multiindex = pd.MultiIndex.from_tuples([
        ('Homens', 'Solteiros'),
        ('Homens', 'Casados'),
        ('Homens', 'Viúvos'),
        ('Mulheres', 'Solteiras'),
        ('Mulheres', 'Casadas'),
        ('Mulheres', 'Viúvas')
    ])

    estados_civis.columns = columns_multiindex
    estados_civis.reset_index(drop=True, inplace=True)
    return estados_civis

def format_racas(df:pd.DataFrame) -> pd.DataFrame:
    racas = df[['Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17']]
    racas_header = racas.iloc[0]
    racas.columns = racas_header
    racas = racas[3:]

    columns_multindex = pd.MultiIndex.from_tuples([
        ('Homens', 'Brancos'),
        ('Homens', 'Pardos'),
        ('Homens', 'Pretos'),
        ('Homens', 'Caboclos'),
        ('Mulheres', 'Brancas'),
        ('Mulheres', 'Pardas'),
        ('Mulheres', 'Pretas'),
        ('Mulheres', 'Caboclas')
    ])

    racas.columns = columns_multindex
    racas.reset_index(drop=True, inplace=True)
    return racas

def format_relig(df:pd.DataFrame) -> pd.DataFrame:
    religiao = df[['Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21']]
    religiao_header = religiao.iloc[0]
    religiao.columns = religiao_header
    religiao = religiao[3:]
    
    columns_multindex = pd.MultiIndex.from_tuples([
        ('Homens', 'Católicos'),
        ('Homens', 'Acatólicos'),
        ('Mulheres', 'Católicos'),
        ('Mulheres', 'Acatólicos')
    ])

    religiao.columns = columns_multindex
    religiao.reset_index(drop=True, inplace=True)
    return religiao

def format_nacional(df:pd.DataFrame) -> pd.DataFrame:
    nacionalidade = df[['Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25']]
    nacionalidade_header = nacionalidade.iloc[0]
    nacionalidade.columns = nacionalidade_header
    nacionalidade = nacionalidade[3:]
    
    columns_multindex = pd.MultiIndex.from_tuples([
        ('Homens', 'Brasileiros'),
        ('Homens', 'Estrangeiros'),
        ('Mulheres', 'Brasileiras'),
        ('Mulheres', 'Estrangeiras')
    ])

    nacionalidade.columns = columns_multindex
    nacionalidade.reset_index(drop=True, inplace=True)
    return nacionalidade

def format_inst(df:pd.DataFrame) -> pd.DataFrame:
    instrucao = df[['Instrução', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35']]
    instrucao_header = instrucao.iloc[0]
    instrucao.columns = instrucao_header
    instrucao = instrucao[3:]

    columns_multindex = pd.MultiIndex.from_tuples([
        ('Homens', 'Sabem ler e escrever'),
        ('Homens', 'Analfabetos'),
        ('Mulheres', 'Sabem ler e escrever'),
        ('Mulheres', 'Analfabetos'),
        ('Meninos', 'Frequentam escolas'),
        ('Meninos', 'Não frequentam'),
        ('Meninos', 'Total'),
        ('Meninas', 'Frequentam escolas'),
        ('Meninas', 'Não frequentam'),
        ('Meninas', 'Total')
    ])

    instrucao.columns = columns_multindex
    instrucao.reset_index(drop=True, inplace=True)
    return instrucao

def format_casas(df:pd.DataFrame) -> pd.DataFrame:
    casas = df[['Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38']]
    casas_header = casas.iloc[1]
    casas.columns = casas_header
    casas = casas[3:]

    columns_multindex = pd.MultiIndex.from_tuples([
        ('Casas', 'Habitadas'),
        ('Casas', 'Desabitadas'),
        ('Casas', 'Total'),
    ])

    casas.columns = columns_multindex
    casas.reset_index(drop=True, inplace=True)
    return casas

def format_fogos(df:pd.DataFrame) -> pd.DataFrame:
    fogos = df[[ 'Unnamed: 39']]
    fogos_header = fogos.iloc[2]
    fogos.columns = fogos_header
    fogos = fogos[3:]
    fogos = fogos.iloc[:, 1:]
    return fogos

def format_br_003(df:pd.DataFrame) -> pd.DataFrame:
    gen = format_gen(df)
    e_civis = format_e_civis(df)
    racas = format_racas(df)
    rel = format_relig(df)
    nacional = format_nacional(df)
    inst = format_inst(df)
    casas = format_casas(df)
    #fogos não tem multiindex
    # fogos = format_fogos(df)

    df_concat = pd.concat([gen, e_civis, racas, rel, nacional, inst, casas], axis=1)
    return df_concat

In [191]:
prov = format_prov(df_1872_003_br)
print(prov.shape)
prov.to_csv('./csv/br-1872-003-provincias.csv', index=False)

(22, 2)


In [192]:
prov_csv = pd.read_csv('./csv/br-1872-003-provincias.csv')
prov_csv

Unnamed: 0,index,Províncias e Município Neutro
0,0,Amazonas
1,1,Pará
2,2,Maranhão
3,3,Piauí
4,4,Ceará
5,5,Rio Grande do Norte
6,6,Paraíba
7,7,Pernambuco
8,8,Alagoas
9,9,Sergipe


In [193]:
gen = format_gen(df_1872_003_br)
gen.columns = gen.columns.droplevel(0)
print(gen.shape)
gen.to_csv('./csv/br-1872-003-generos.csv', index=True)

(22, 3)


Unnamed: 0,Homens,Mulheres,Total
0,30983,2564,56631
1,128589,11919,247779
2,141942,14215,284101
3,90322,8810,178427
4,350906,33886,689773
5,112721,10823,220959
6,179433,17526,354700
7,381565,37094,752511
8,155584,15667,312268
9,74739,7888,153620


In [194]:
gen_csv = pd.read_csv('./csv/br-1872-003-generos.csv')
gen_csv

Unnamed: 0.1,Unnamed: 0,Homens,Mulheres,Total
0,0,30983,2564,56631
1,1,128589,11919,247779
2,2,141942,14215,284101
3,3,90322,8810,178427
4,4,350906,33886,689773
5,5,112721,10823,220959
6,6,179433,17526,354700
7,7,381565,37094,752511
8,8,155584,15667,312268
9,9,74739,7888,153620


In [217]:
e_civis = format_e_civis(df_1872_003_br)
e_civis.columns = e_civis.columns.droplevel(0)
print(e_civis.shape)
e_civis.to_csv('./csv/br-1872-003-e_civis.csv', index=True)
e_civis

(22, 6)


Unnamed: 0,Solteiros,Casados,Viúvos,Solteiras,Casadas,Viúvas
0,24959,5105,919,19214,4714,1720
1,94848,28935,4806,86825,26280,6085
2,103283,33644,5015,102302,32337,7520
3,63294,23323,3705,59547,23371,5187
4,241692,99915,9299,224509,99849,14509
5,80803,26832,5086,75915,26735,5588
6,119403,53474,6556,112332,53274,9661
7,258936,111189,11440,240239,111465,19242
8,102729,47865,4990,99772,46743,10169
9,48013,24027,2699,50746,24795,3340


In [196]:
racas = format_racas(df_1872_003_br)
racas.columns = racas.columns.droplevel(0)
print(racas.shape)
racas.to_csv('./csv/br-1872-003-racas.csv', index=True)
racas

(22, 8)


In [213]:
relig = format_relig(df_1872_003_br)
relig.columns = relig.columns.droplevel(0)
print(relig.shape)
relig.to_csv('./csv/br-1872-003-relig.csv', index=True)
relig

(22, 4)


Unnamed: 0,Católicos,Acatólicos,Católicos.1,Acatólicos.1
0,30903,80,25641,7
1,128396,193,119169,21
2,141865,77,142100,59
3,90313,9,88105,0
4,350868,38,338857,10
5,112717,4,108238,0
6,179429,4,175266,1
7,381361,204,370906,40
8,155557,27,156683,1
9,74739,0,78881,0


In [198]:
nac = format_nacional(df_1872_003_br)
nac.columns = nac.columns.droplevel(0)
print(nac.shape)
nac.to_csv('./csv/br-1872-003-nacionalidade.csv', index=True)

(22, 4)


In [210]:
inst = format_inst(df_1872_003_br)
inst.columns = inst.columns.droplevel(0)
print(inst.shape)
inst.to_csv('./csv/br-1872-003-instrucao.csv', index=True)
inst

(22, 10)


Unnamed: 0,Sabem ler e escrever,Analfabetos,Sabem ler e escrever.1,Analfabetos.1,Frequentam escolas,Não frequentam,Total,Frequentam escolas.1,Não frequentam.1,Total.1
0,6160,24823,1453,24195,824,5056,5880,371,3269,3640
1,39718,88871,20677,98513,5768,25753,31521,3441,25020,28461
2,44375,97567,24196,117963,8739,28142,36881,4844,29567,34411
3,17677,72645,10093,78012,1777,15960,17737,1024,16968,17992
4,58657,292249,20903,317964,10021,83808,93829,5399,85087,90486
5,23602,39119,16220,92018,2643,18662,21305,2058,19857,21915
6,29224,150209,11988,163279,6630,44061,50691,3894,45830,49724
7,92664,288901,54659,316287,14069,82133,96202,9301,80960,90261
8,26046,129538,15814,140870,5455,34261,39716,4028,34726,38754
9,18687,56052,10447,68434,3402,14934,18336,1832,17333,19165


In [200]:
casas = format_casas(df_1872_003_br)
casas.columns = casas.columns.droplevel(0)
print(casas.shape)
casas.to_csv('./csv/br-1872-003-casas.csv', index=True)

(22, 3)


In [201]:
fogos = format_fogos(df_1872_003_br)
print(fogos.shape)
# fogos

(22, 0)


In [202]:
format_br_003(df_1872_003_br)

Unnamed: 0_level_0,Gêneros,Gêneros,Gêneros,Homens,Homens,Homens,Mulheres,Mulheres,Mulheres,Homens,...,Mulheres,Meninos,Meninos,Meninos,Meninas,Meninas,Meninas,Casas,Casas,Casas
Unnamed: 0_level_1,Homens,Mulheres,Total,Solteiros,Casados,Viúvos,Solteiras,Casadas,Viúvas,Brancos,...,Analfabetos,Frequentam escolas,Não frequentam,Total,Frequentam escolas,Não frequentam,Total,Habitadas,Desabitadas,Total
0,30983.0,2564.0,56631.0,Solteiros,Casados,Viúvos,Solteiras,Casadas,Viúvas,6911.0,...,24195.0,824.0,5056.0,5880.0,371.0,3269.0,3640.0,7661.0,229.0,7890.0
1,128589.0,11919.0,247779.0,24959,5105,919,19214,4714,1720,49663.0,...,98513.0,5768.0,25753.0,31521.0,3441.0,25020.0,28461.0,38978.0,2077.0,41055.0
2,141942.0,14215.0,284101.0,94848,28935,4806,86825,26280,6085,52267.0,...,117963.0,8739.0,28142.0,36881.0,4844.0,29567.0,34411.0,47166.0,1405.0,48571.0
3,90322.0,8810.0,178427.0,103283,33644,5015,102302,32337,7520,22208.0,...,78012.0,1777.0,15960.0,17737.0,1024.0,16968.0,17992.0,29208.0,223.0,29431.0
4,350906.0,33886.0,689773.0,63294,23323,3705,59547,23371,5187,136940.0,...,317964.0,10021.0,83808.0,93829.0,5399.0,85087.0,90486.0,99901.0,2989.0,102890.0
5,112721.0,10823.0,220959.0,241692,99915,9299,224509,99849,14509,52835.0,...,92018.0,2643.0,18662.0,21305.0,2058.0,19857.0,21915.0,36710.0,706.0,37416.0
6,179433.0,17526.0,354700.0,80803,26832,5086,75915,26735,5588,73475.0,...,163279.0,6630.0,44061.0,50691.0,3894.0,45830.0,49724.0,43767.0,522.0,44289.0
7,381565.0,37094.0,752511.0,119403,53474,6556,112332,53274,9661,149930.0,...,316287.0,14069.0,82133.0,96202.0,9301.0,80960.0,90261.0,133920.0,4381.0,188301.0
8,155584.0,15667.0,312268.0,258936,111189,11440,240239,111465,19242,45343.0,...,140870.0,5455.0,34261.0,39716.0,4028.0,34726.0,38754.0,55894.0,2030.0,57924.0
9,74739.0,7888.0,153620.0,102729,47865,4990,99772,46743,10169,24358.0,...,68434.0,3402.0,14934.0,18336.0,1832.0,17333.0,19165.0,25542.0,567.0,26109.0


In [203]:
df_save = format_br_003(df_1872_003_br)
df_save.to_csv('br-1872-003.csv', index=False)

In [204]:
filename2 = "br-1872-003.csv"
df_csv = pd.read_csv(filename2)
df_csv

Unnamed: 0,Gêneros,Gêneros.1,Gêneros.2,Homens,Homens.1,Homens.2,Mulheres,Mulheres.1,Mulheres.2,Homens.3,...,Mulheres.12,Meninos,Meninos.1,Meninos.2,Meninas,Meninas.1,Meninas.2,Casas,Casas.1,Casas.2
0,Homens,Mulheres,Total,Solteiros,Casados,Viúvos,Solteiras,Casadas,Viúvas,Brancos,...,Analfabetos,Frequentam escolas,Não frequentam,Total,Frequentam escolas,Não frequentam,Total,Habitadas,Desabitadas,Total
1,30983,2564,56631,Solteiros,Casados,Viúvos,Solteiras,Casadas,Viúvas,6911,...,24195,824,5056,5880,371,3269,3640,7661,229,7890
2,128589,11919,247779,24959,5105,919,19214,4714,1720,49663,...,98513,5768,25753,31521,3441,25020,28461,38978,2077,41055
3,141942,14215,284101,94848,28935,4806,86825,26280,6085,52267,...,117963,8739,28142,36881,4844,29567,34411,47166,1405,48571
4,90322,8810,178427,103283,33644,5015,102302,32337,7520,22208,...,78012,1777,15960,17737,1024,16968,17992,29208,223,29431
5,350906,33886,689773,63294,23323,3705,59547,23371,5187,136940,...,317964,10021,83808,93829,5399,85087,90486,99901,2989,102890
6,112721,10823,220959,241692,99915,9299,224509,99849,14509,52835,...,92018,2643,18662,21305,2058,19857,21915,36710,706,37416
7,179433,17526,354700,80803,26832,5086,75915,26735,5588,73475,...,163279,6630,44061,50691,3894,45830,49724,43767,522,44289
8,381565,37094,752511,119403,53474,6556,112332,53274,9661,149930,...,316287,14069,82133,96202,9301,80960,90261,133920,4381,188301
9,155584,15667,312268,258936,111189,11440,240239,111465,19242,45343,...,140870,5455,34261,39716,4028,34726,38754,55894,2030,57924


In [205]:
df_csv.columns

Index(['Gêneros', 'Gêneros.1', 'Gêneros.2', 'Homens', 'Homens.1', 'Homens.2',
       'Mulheres', 'Mulheres.1', 'Mulheres.2', 'Homens.3', 'Homens.4',
       'Homens.5', 'Homens.6', 'Mulheres.3', 'Mulheres.4', 'Mulheres.5',
       'Mulheres.6', 'Homens.7', 'Homens.8', 'Mulheres.7', 'Mulheres.8',
       'Homens.9', 'Homens.10', 'Mulheres.9', 'Mulheres.10', 'Homens.11',
       'Homens.12', 'Mulheres.11', 'Mulheres.12', 'Meninos', 'Meninos.1',
       'Meninos.2', 'Meninas', 'Meninas.1', 'Meninas.2', 'Casas', 'Casas.1',
       'Casas.2'],
      dtype='object')