In [2]:
# import beautifulsoup4, zip, pandas
import os
import requests
import zipfile
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup
# Root of the project workspace. Set the CENSUS_BASE_DIR environment variable to run
# the notebook on another machine; without it the original cluster path is used.
# os.path.join(..., "") guarantees the trailing separator that the
# `base_dir + "data/..."` concatenations in the rest of the notebook rely on.
base_dir = os.path.join(
    os.environ.get("CENSUS_BASE_DIR", "/pfs/work7/workspace/scratch/tu_zxobe27-master_thesis/"),
    "",
)

In [13]:
def fetch():
    """Download and unpack the IBGE census microdata archives for each configured year.

    For every year in ``dl_config`` the directory listing at the configured URL is
    fetched and parsed, every link that ends in ``.zip`` and is exactly six
    characters long (a two-character name plus the extension) is downloaded into
    ``<base_dir>data/misc/raw/census/<year>/`` and extracted into that same folder.

    Raises:
        requests.HTTPError: if the listing or a download returns an HTTP error status.
        zipfile.BadZipFile: if a downloaded file is not a valid zip archive.
    """
    dl_config = {2000: "https://ftp.ibge.gov.br/Censos/Censo_Demografico_2000/Microdados/",
                 }#2010: "https://ftp.ibge.gov.br/Censos/Censo_Demografico_2010/Resultados_Gerais_da_Amostra/Microdados/"
    # (connect, read) timeouts in seconds so a stalled server cannot hang the cell forever
    timeout = (10, 300)
    for year, base_url in dl_config.items():
        target_dir = base_dir + "data/misc/raw/census/" + str(year)
        # the year folder does not exist on a fresh workspace; open(..., "wb") would fail
        os.makedirs(target_dir, exist_ok=True)

        # parse directory for files with .zip extension and two-character names
        listing = requests.get(base_url, timeout=timeout)
        # fail here instead of silently parsing an error page
        listing.raise_for_status()
        soup = BeautifulSoup(listing.text, "html.parser")
        # href=True skips anchors that carry no href attribute (a["href"] would raise KeyError)
        ftp_list = [a["href"] for a in soup.find_all("a", href=True)
                    if a["href"].endswith(".zip") and len(a["href"]) == 6]

        # download and extract files
        for file in ftp_list:
            archive_path = target_dir + "/" + file
            # stream to disk in 1 MiB chunks rather than holding the whole archive in memory
            with requests.get(base_url + file, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                with open(archive_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1 << 20):
                        f.write(chunk)
            with zipfile.ZipFile(archive_path, "r") as z:
                z.extractall(target_dir)
fetch()

In [4]:
def preprocess():
    """Parse the raw census person files and write one combined table to parquet.

    For each year in ``positions`` every extracted state directory under
    ``<base_dir>data/misc/raw/census/<year>/`` is scanned for its person-level
    fixed-width file, the listed character ranges are sliced out of every line
    and converted to numbers. Afterwards the 2010 municipality code is combined
    with the state code, the 2000 answer code 2 of ``from_municipality`` /
    ``from_state`` is recoded to 3 so both years share one 1/2/3 mapping, and
    the columns municipality, year, weight, race, from_municipality and
    from_state are written to ``<base_dir>data/misc/census.parquet``.

    Raises:
        FileNotFoundError: if a year folder is missing, contains no state
            directory, or a state directory contains no person-level file.
    """
    raw_root = base_dir + "data/misc/raw/census/"

    # official column indexes (1-based) and names
    positions = {
        2000: {"state": (1, 2), "municipality": (12, 18), "weight": (335, 345), "race": (87, 87), "from_municipality": (108, 108), "always_lived_in_municipality": (103, 103), "from_state": (110, 110),  "municipality_5y": (130, 136)},
        2010: {"state": (1, 2), "municipality": (3, 7), "weight": (29, 44), "race": (68, 68), "from_municipality": (74, 74), "from_state": (75, 75), "municipality_5y": (132, 138)}
        }
    # single source of truth for the years to process
    years = list(positions)

    def list_state_directories(year):
        """Return the extracted state directories of one census year, sorted by path."""
        year_dir = raw_root + f"{year}"
        # Skip the downloaded archives and any other plain file: worker() lists the
        # contents of each entry, which only works for directories.
        directories = sorted(
            year_dir + "/" + entry
            for entry in os.listdir(year_dir)
            if not entry.endswith(".zip") and os.path.isdir(year_dir + "/" + entry)
        )
        if not directories:
            raise FileNotFoundError(f"no extracted state directories found in {year_dir}")
        return directories

    def worker(state_directory, year):
        """Read the person-level file of one state directory into a numeric DataFrame."""
        # in state_directory, find the first file whose name marks it as the person file
        person_file = next(
            (name for name in os.listdir(state_directory)
             if "Amostra_Pessoas" in name or "PES" in name or "Pes" in name),
            None,
        )
        if person_file is None:
            raise FileNotFoundError(f"no person-level file found in {state_directory}")

        # Slice the fixed-width fields line by line into one list per column; this
        # avoids keeping both the raw lines and a dict per row in memory.
        layout = positions[year]
        columns = {key: [] for key in layout}
        # decode latin-1
        with open(state_directory + "/" + person_file, "r", encoding="latin-1") as f:
            for line in f:
                for key, (start, end) in layout.items():
                    columns[key].append(line[start - 1:end].strip())

        # convert to pandas dataframe (object dtype keeps the .str accessor usable even if the file is empty)
        df = pd.DataFrame(columns, dtype=object)
        # place decimal point in weight after third digit from left
        df["weight"] = df.weight.str[:3] + "." + df.weight.str[3:]
        # convert all columns to numeric, coerce errors to NaN
        df = df.apply(pd.to_numeric, errors='coerce')
        # add year
        df["year"] = year

        return df

    # one progress bar per year; a single flat concat avoids an intermediate per-year copy
    frames = [
        worker(state_directory, y)
        for y in years
        for state_directory in tqdm(list_state_directories(y), desc=str(y))
    ]
    census_data = pd.concat(frames).reset_index(drop=True)

    # pre-process data
    # 2010 rows: combine the state code with the municipality code (state * 1e5 + municipality)
    census_data.loc[census_data.year == 2010, "municipality"] = census_data["municipality"] + census_data.state * 1e5

    census_data["race"] = census_data.race.map({1: "white", 2: "black", 3: "asian", 4: "brown", 5: "indigenous", 9: "unknown"}).astype("category")

    # 2000 rows: answer code 2 is recoded to 3 so that it maps to "no" below
    census_data.loc[((census_data.year == 2000) & (census_data.from_municipality == 2)), "from_municipality"] = 3
    # Rows without a from_municipality answer but with always_lived_in_municipality == 1
    # are set to 1 ("yes"). Only the 2000 layout has that column, so 2010 rows are unaffected.
    census_data.loc[(census_data.from_municipality.isna() & (census_data.always_lived_in_municipality == 1)), "from_municipality"] = 1
    census_data["from_municipality"] = census_data.from_municipality.map({1: "yes", 2: "yes, lived elsewhere", 3: "no"}).astype("category")

    # NOTE(review): from_state gets no fill analogous to the always_lived_in_municipality
    # rule above, so rows with a blank from_state field stay missing — confirm this is intended.
    census_data.loc[((census_data.year == 2000) & (census_data.from_state == 2)), "from_state"] = 3
    census_data["from_state"] = census_data.from_state.map({1: "yes", 2: "yes, lived elsewhere", 3: "no"}).astype("category")

    census_data[["municipality", "year", "weight", "race", "from_municipality", "from_state"]].to_parquet(base_dir + "data/misc/census.parquet", index=False)

preprocess()

100%|██████████| 27/27 [02:18<00:00,  5.11s/it]
100%|██████████| 26/26 [01:56<00:00,  4.47s/it]


In [5]:
# Reload the table from the parquet file so the cells below work on the stored result.
census_path = base_dir + "data/misc/census.parquet"
census_data = pd.read_parquet(census_path)

In [None]:
# Distribution of the from_municipality categories within each census year.
census_data.groupby("year")["from_municipality"].value_counts()

In [8]:
# Rows flagged with always_lived_in_municipality == 1.
# Chain .from_municipality.value_counts() to tabulate their from_municipality categories.
# NOTE(review): preprocess() as written does not export always_lived_in_municipality,
# so this cell only works on a parquet file that still contains that column — confirm.
always_lived_mask = census_data["always_lived_in_municipality"] == 1
census_data[always_lived_mask]

Unnamed: 0,municipality,year,weight,race,from_municipality,from_state,always_lived_in_municipality
2,5300108.0,2000,9.049340,brown,,,1.0
18,5300108.0,2000,10.898449,white,,,1.0
25,5300108.0,2000,9.522366,white,,,1.0
75,5300108.0,2000,14.211130,white,,,1.0
83,5300108.0,2000,13.722883,white,,,1.0
...,...,...,...,...,...,...,...
19175312,,2000,,black,,,1.0
19175336,,2000,,,,,1.0
19175505,,2000,,,,,1.0
19175792,,2000,,,,,1.0
