In [1]:
import requests
from bs4 import BeautifulSoup
import json
import os
import pandas as pd

In [2]:
# Directory where the per-orgao JSON files and the combined TODOS_ORGAOS.json are written and read.
PATH_DADOS = 'original_data/dados_transparencia/'

In [3]:
def pegar_dados(codigo_orgao, page_num = 0, timeout = 60):
    """Fetch one page of results from the transparency portal search endpoint.

    The form asks for 'ATIVOS' servers of a single orgao, 500 rows per page.

    Args:
        codigo_orgao: Code of the orgao to query; sent as ``cd_secretaria``.
        page_num: Page to request; sent as ``pageNum``.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response. This can be ``None`` when the
        body is JSON ``null``; ``get_all_data_orgao`` treats that as "no data".

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    form = {
        'tipoPesquisa' : 'direta',
        'nm_servidor' : '',
        'cd_secretaria' : str(codigo_orgao),
        'tp_servidor' : 'ATIVOS',
        'cd_cargo' : "0",
        'cd_tipo' : "1",
        'Valor' : "0",
        'rowsPerPage' : '500',
        'pageNum' : str(page_num),
        'orderType' : "0",
        'colNum' : "0"
    }

    with requests.post('http://transparencia.prefeitura.sp.gov.br/phpconsultainterna',
                       data = form, timeout = timeout) as r:
        # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
        r.raise_for_status()
        dados = r.json()

    return dados

In [4]:
def get_all_data_orgao(codigo_orgao):
    """Download every page of rows for one orgao.

    Pages are requested in order starting at 0 and accumulated until the
    number of collected rows reaches the ``total`` value reported in the first
    row of the latest page (presumably the row count for the orgao -- confirm
    against the endpoint).

    Args:
        codigo_orgao: Code of the orgao, forwarded to ``pegar_dados``.

    Returns:
        A list with all rows collected; empty when the endpoint returns no data.
    """
    page_num = 0
    todos_dados = []
    while True:
        dados = pegar_dados(codigo_orgao, page_num)
        # Stop on None *or* an empty page: indexing dados[0] on an empty list
        # would raise IndexError, and an endless run of empty pages would
        # otherwise never reach the reported total.
        if not dados:
            print(f'Orgao {codigo_orgao} sem dados')
            break
        page_num += 1
        todos_dados.extend(dados)
        total = int(dados[0]['total'])
        if len(todos_dados) >= total:
            break
    return todos_dados

In [5]:
def pegar_ids_orgaos():
    """Scrape the orgao codes and names from the portal's search form.

    Reads the ``<select id="orgao">`` element of the funcionalismo page and
    skips its first ``<option>`` (presumably a placeholder entry -- confirm
    against the page).

    Returns:
        dict mapping each option's ``value`` attribute to its text.

    Raises:
        requests.HTTPError: If the page request returns a 4xx/5xx status.
        requests.Timeout: If the server does not answer within 60 seconds.
    """
    with requests.get('http://transparencia.prefeitura.sp.gov.br/funcionalismo/',
                      timeout = 60) as r:
        r.raise_for_status()
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser is installed (and warns), so results can differ across machines.
        sopa = BeautifulSoup(r.text, 'html.parser')

    vals = sopa.find('select', {'id' : 'orgao'}).find_all('option')[1:]

    return {val['value'] : val.text for val in vals}

In [6]:
def gen_file_name(orgao_name):
    """Return the path of the JSON file for an orgao inside PATH_DADOS.

    Every '/' in the name is swapped for '-' so the name remains a single
    path component.
    """
    safe_name = '-'.join(orgao_name.split('/'))
    return os.path.join(PATH_DADOS, safe_name + '.json')

In [7]:
def save_as_json(orgao_data, orgao_name):
    """Write the rows of one orgao to its JSON file.

    Args:
        orgao_data: JSON-serialisable data to store.
        orgao_name: Orgao name; the file path is derived by ``gen_file_name``.
    """
    f_name = gen_file_name(orgao_name)
    # Create the target directory on demand so the first run on a fresh
    # checkout does not fail with FileNotFoundError.
    target_dir = os.path.dirname(f_name)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with open(f_name, 'w', encoding='utf-8') as f:
        json.dump(orgao_data, f)

In [8]:
def find_json(orgao_name):
    """Load the previously saved JSON file of an orgao, if there is one.

    Args:
        orgao_name: Orgao name; the file path is derived by ``gen_file_name``.

    Returns:
        The decoded JSON content, or ``None`` when no such file exists.
    """
    f_name = gen_file_name(orgao_name)
    # Test the exact path instead of listing the whole directory: this avoids
    # a scan per lookup and also works when the directory does not exist yet.
    if not os.path.isfile(f_name):
        return None
    with open(f_name, 'r', encoding='utf-8') as f:
        return json.load(f)

In [9]:
def pull_cache(orgao_name, all_data):
    """Copy the cached rows of an orgao into ``all_data`` when a cache file exists.

    Args:
        orgao_name: Orgao name used to locate the cache file and as the dict key.
        all_data: dict that receives ``all_data[orgao_name] = cached rows``.

    Returns:
        ``True`` when a cache file was found and used, ``False`` otherwise.
    """
    cached = find_json(orgao_name)
    # Compare against None rather than truthiness: an orgao without rows is
    # cached as an empty list, and treating that as a miss makes it be
    # downloaded again on every run.
    if cached is not None:
        print(f'{orgao_name} - cached')
        all_data[orgao_name] = cached
        return True
    return False

In [10]:
def download_data(id_orgao, orgao_name, all_data, save_json=True):
    """Download all rows of one orgao and register them in ``all_data``.

    Args:
        id_orgao: Orgao code passed to ``get_all_data_orgao``.
        orgao_name: Orgao name, used as the dict key and for the JSON file name.
        all_data: dict updated in place with the downloaded rows.
        save_json: When truthy, the rows are also written via ``save_as_json``.
    """
    fetched_rows = get_all_data_orgao(id_orgao)
    print(f'{orgao_name} data downloaded')

    if save_json:
        save_as_json(fetched_rows, orgao_name)

    all_data[orgao_name] = fetched_rows

In [11]:
def get_all_data_por_orgao(save_json=True, cache=True):
    """Collect the rows of every orgao listed on the portal.

    Args:
        save_json: Forwarded to ``download_data``; when truthy, freshly
            downloaded rows are written to disk.
        cache: When truthy, an existing JSON file for an orgao is used
            instead of downloading it again.

    Returns:
        dict mapping each orgao name to its rows.
    """
    all_data = {}

    for id_orgao, orgao_name in pegar_ids_orgaos().items():
        # Short-circuit: the cache is only consulted when caching is enabled.
        served_from_cache = cache and pull_cache(orgao_name, all_data)
        if not served_from_cache:
            download_data(id_orgao, orgao_name, all_data, save_json)

    return all_data

In [12]:
# Build the {orgao name: rows} mapping with the defaults (save_json=True, cache=True):
# orgaos with a JSON file on disk are loaded from it, the rest are downloaded.
data = get_all_data_por_orgao()

CONTROLADORIA GERAL DO MUNICIPIO - cached
GABINETE DO PREFEITO - cached
Orgao 00 sem dados
PREFEITURA DO MUNICIPIO DE SAO PAULO data downloaded
PROCURADORIA GERAL DO MUNICIPIO - cached
SECRETARIA MUNICIPAL DA FAZENDA - cached
SECRETARIA MUNICIPAL DA PESSOA COM DEFICIENCIA - cached
SECRETARIA MUNICIPAL DA SAUDE - cached
SECRETARIA MUNICIPAL DAS SUBPREFEITURAS - cached
SECRETARIA MUNICIPAL DE ASSIST E DESENV SOCIAL - cached
SECRETARIA MUNICIPAL DE CULTURA - cached
SECRETARIA MUNICIPAL DE DESENVOLVIMENTO ECONOMICO - cached
SECRETARIA MUNICIPAL DE DIREITOS HUMANOS E CIDADAN - cached
SECRETARIA MUNICIPAL DE EDUCACAO - cached
SECRETARIA MUNICIPAL DE ESPORTES E LAZER - cached
SECRETARIA MUNICIPAL DE HABITACAO - cached
SECRETARIA MUNICIPAL DE INFRAESTRUTURA URBANA E OB - cached
SECRETARIA MUNICIPAL DE INOVACAO E TECNOLOGIA - cached
SECRETARIA MUNICIPAL DE JUSTICA - cached
SECRETARIA MUNICIPAL DE MOBILIDADE E TRANSPORTES - cached
SECRETARIA MUNICIPAL DE RELACOES INTERNACIONAIS - cached
SECRETAR

In [13]:
# Persist the combined {orgao name: rows} mapping as a single JSON file.
# Create the directory first so this cell also works when nothing has been
# written there yet (e.g. a fresh checkout run with save_json=False).
os.makedirs(PATH_DADOS, exist_ok=True)
with open(os.path.join(PATH_DADOS,'TODOS_ORGAOS.json'), 'w') as f:
    json.dump(data, f)

In [14]:
def open_all_salary_data(path=None):
    """Load the combined JSON file and stack every orgao into one DataFrame.

    Args:
        path: JSON file holding a ``{orgao name: list of row dicts}`` mapping.
            Defaults to ``TODOS_ORGAOS.json`` inside ``PATH_DADOS``.

    Returns:
        A DataFrame with the rows of all orgaos, a fresh 0..n-1 index, and the
        ``total`` column renamed to ``qtd_servidores_orgao``. An empty
        DataFrame is returned when the file holds no rows at all.
    """
    if path is None:
        path = os.path.join(PATH_DADOS, 'TODOS_ORGAOS.json')

    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Orgaos without rows contribute nothing, so leave them out of the concat.
    frames = [pd.DataFrame(rows) for rows in data.values() if rows]
    if not frames:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()

    # ignore_index avoids repeated index labels (each per-orgao frame starts at 0).
    df_final = pd.concat(frames, ignore_index=True)

    return df_final.rename(columns={'total' : 'qtd_servidores_orgao'})

In [15]:
# Load the combined JSON written above into a single DataFrame covering all orgaos.
df_remuneracao = open_all_salary_data()

In [16]:
# Export the combined table as a ';'-separated CSV without the index column.
# Create the output directory first; to_csv does not create missing folders.
os.makedirs('original_data/aggregated', exist_ok=True)
df_remuneracao.to_csv(os.path.join('original_data/aggregated', 'remuneracao_transparencia_jul_2021.csv')
                      ,index=False,sep=';')