In [1]:
from pathlib import Path
import sys

# Base directory (relative to the working directory) for everything this notebook stores;
# change it here to keep the data somewhere else.
data_path = Path('data')
# Create the directory together with any missing parents; an existing one is left alone.
data_path.mkdir(parents=True, exist_ok=True)

#### Hierarchy

In [2]:
# Numeric level of each kind of territorial division, from 'regioes' (0) down to
# 'setores' (4). The position in the tuple is the level number.
LEVELS = {
    division: depth
    for depth, division in enumerate(
        ('regioes', 'estados', 'municipios', 'distritos', 'setores')
    )
}

#### Initializing the database

In [4]:
import sqlite3

# The SQLite database file sits directly inside the data directory.
sqlite_path = data_path.joinpath('censo2010.sqlite')
# Connection used by the database cells below; the path is handed over as a plain string.
con = sqlite3.connect(str(sqlite_path))

In [5]:
# DDL for the single `geopart` table. A row may reference another `geopart` row through
# `id_parent` (self-referencing foreign key). `level` presumably holds a value from
# LEVELS -- confirm against the cells that insert rows.
create_geopart_sql = '''
    CREATE TABLE IF NOT EXISTS geopart (
        id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
        id_parent INTEGER,
        level INTEGER NOT NULL DEFAULT 0,
        code TEXT NOT NULL DEFAULT '',
        name TEXT NOT NULL DEFAULT '',
        abbr TEXT NOT NULL DEFAULT '',
        lat REAL DEFAULT 0.0,
        lng REAL DEFAULT 0.0,
        lat_min REAL DEFAULT 0.0,
        lat_max REAL DEFAULT 0.0,
        lng_min REAL DEFAULT 0.0,
        lng_max REAL DEFAULT 0.0,
        area REAL DEFAULT 0.0,
        polygon TEXT NOT NULL DEFAULT '',
        FOREIGN KEY (id_parent) REFERENCES geopart(id)
    )
'''

# IF NOT EXISTS lets this cell run again against a database file that already has the table.
con.execute(create_geopart_sql)

<sqlite3.Cursor at 0x7fa94c21aea0>

#### Seeding initial data

In [None]:
# Names of the regions inserted as the top level of the hierarchy.
REGIONS = (
    'Centro-Oeste',
    'Sul',
    'Nordeste',
    'Norte',
    'Sudeste',
)

In [None]:
# Seed one row per region at the 'regioes' level.
# All rows go through a single executemany() call. The NOT EXISTS guard skips any region
# that is already stored with the same level and name, so running this cell again does not
# insert duplicates.
con.executemany('''
    INSERT INTO geopart (level, name)
    SELECT :level, :name
    WHERE NOT EXISTS (
        SELECT 1 FROM geopart WHERE level = :level AND name = :name
    )
''', [{'level': LEVELS['regioes'], 'name': region} for region in REGIONS])

In [None]:
# Commit the current transaction on `con` so the rows inserted so far are saved.
con.commit()

In [6]:
# Maps each state label, written as 'Name (XX)' with XX the two-letter abbreviation, to the
# name of the region it belongs to. Pair order is the dictionary's iteration order.
STATES = dict([
    ('Acre (AC)', 'Norte'),
    ('Alagoas (AL)', 'Nordeste'),
    ('Amapá (AP)', 'Norte'),
    ('Amazonas (AM)', 'Norte'),
    ('Bahia (BA)', 'Nordeste'),
    ('Ceará (CE)', 'Nordeste'),
    ('Distrito Federal (DF)', 'Centro-Oeste'),
    ('Espírito Santo (ES)', 'Sudeste'),
    ('Goiás (GO)', 'Centro-Oeste'),
    ('Maranhão (MA)', 'Nordeste'),
    ('Mato Grosso (MT)', 'Centro-Oeste'),
    ('Mato Grosso do Sul (MS)', 'Centro-Oeste'),
    ('Minas Gerais (MG)', 'Sudeste'),
    ('Pará (PA)', 'Norte'),
    ('Paraíba (PB)', 'Nordeste'),
    ('Paraná (PR)', 'Sul'),
    ('Pernambuco (PE)', 'Nordeste'),
    ('Piauí (PI)', 'Nordeste'),
    ('Rio de Janeiro (RJ)', 'Sudeste'),
    ('Rio Grande do Norte (RN)', 'Nordeste'),
    ('Rio Grande do Sul (RS)', 'Sul'),
    ('Rondônia (RO)', 'Norte'),
    ('Roraima (RR)', 'Norte'),
    ('Santa Catarina (SC)', 'Sul'),
    ('São Paulo (SP)', 'Sudeste'),
    ('Sergipe (SE)', 'Nordeste'),
    ('Tocantins (TO)', 'Norte'),
])

In [7]:
import re  # needed here: the notebook's other `import re` sits in a later cell

# Splits a label such as 'Acre (AC)' into the state name and its two-letter abbreviation.
state_label_pattern = re.compile(r'(.*?) \(([A-Z]{2})\)')

# One parameter dict per state, consumed by the INSERT ... SELECT that stores the states.
sql_rows = []

for state_info, region in STATES.items():
    label_match = state_label_pattern.match(state_info)
    if label_match is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError('Unexpected state label: {!r}'.format(state_info))
    name, abbr = label_match.groups()
    sql_rows.append({
        'region': region,                   # name of the parent region row to look up
        'parent_level': LEVELS['regioes'],  # level the parent row must have
        'level': LEVELS['estados'],         # level stored on the new state row
        'name': name,
        'abbr': abbr,
    })

In [8]:
# Insert every state as a child of its region: the SELECT finds the region row by name and
# level and uses its id as the new row's id_parent.
# The NOT EXISTS guard skips a state that is already stored (same level and abbreviation),
# so running this cell again does not insert duplicates.
con.executemany('''
    INSERT INTO geopart
        (id_parent, level, name, abbr)
    SELECT
        id, :level, :name, :abbr
    FROM geopart
    WHERE
        name = :region AND level = :parent_level
        AND NOT EXISTS (
            SELECT 1 FROM geopart AS existing
            WHERE existing.level = :level AND existing.abbr = :abbr
        )
''', sql_rows)
con.commit()

<sqlite3.Cursor at 0x7fa94c21af10>

#### URLs for the SHP files on IBGE's FTP server

In [57]:
import contextlib
import ftputil
import re

from pathlib import PurePosixPath  # remote FTP paths use '/' whatever the local OS is

HOST = 'geoftp.ibge.gov.br'
URL = ('organizacao_do_territorio/malhas_territoriais/'
       'malhas_de_setores_censitarios__divisoes_intramunicipais/'
       'censo_2010/setores_censitarios_shp')

# One [abbr, level, remote_path, size] entry per matching file found on the server.
STATES_ZIP_FILES = []

with ftputil.FTPHost(HOST, user='anonymous') as host:

    for entry in host.listdir(URL):
        # Build remote paths with PurePosixPath: the OS-dependent Path would turn them into
        # backslash-separated strings on Windows, which the FTP server cannot resolve.
        state_dir = PurePosixPath(URL) / entry
        abbr = state_dir.name.upper()

        # Keep only entries whose upper-cased name is exactly two letters A-Z
        # (presumably the per-state folders -- confirm against the server listing).
        if not re.fullmatch('[A-Z]{2}', abbr):
            continue

        for filename in host.listdir(str(state_dir)):
            fzip = state_dir / filename
            # A file is relevant when its name contains one of the LEVELS keys; the first
            # matching key (in LEVELS order) decides the level.
            levels = [level for name, level in LEVELS.items() if name in fzip.name]
            if not levels:
                continue
            STATES_ZIP_FILES.append([abbr, levels[0], str(fzip), host.path.getsize(str(fzip))])

  if sys.path[0] == '':


#### Actually downloading the ZIP files

In [60]:
from tqdm import tqdm_notebook

# Download every listed file to <data_path>/<abbr>/<level>.zip, advancing one progress bar
# whose total is the sum of the remote file sizes.
# `closing` is reached through the `contextlib` module: the bare name was never imported,
# so the original call raised NameError.
with contextlib.closing(tqdm_notebook(total=sum(s[-1] for s in STATES_ZIP_FILES))) as pbar:
    with ftputil.FTPHost(HOST, user='anonymous') as host:
        for abbr, level, url, size in STATES_ZIP_FILES:
            dirpath = data_path / abbr
            dirpath.mkdir(exist_ok=True, parents=True)
            target = dirpath / '{}.zip'.format(level)
            output = str(target)
            # Skip a file already on disk with the size the server reported, so re-running
            # the cell does not fetch it again; an incomplete file has a different size and
            # is downloaded again.
            if target.is_file() and target.stat().st_size == size:
                pbar.update(size)
                continue
            host.download(url, output)
            pbar.update(size)

HBox(children=(IntProgress(value=0, max=608620781), HTML(value='')))

  after removing the cwd from sys.path.



