In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


# Apply seaborn's default theme to every matplotlib figure drawn in this notebook.
sns.set()
# IPython magic: draw matplotlib figures inline, right below the cell that produces them.
%matplotlib inline

In [5]:
import unidecode

def normalizeSentence(sentence):
    """Normalise every space-separated piece of ``sentence``.

    The text is cut on single space characters, each piece is passed through
    ``normalizeWord`` and the pieces are put back together with single
    spaces.

    Parameters
    ----------
    sentence : str
        Text to normalise.

    Returns
    -------
    str
        The re-assembled, normalised sentence.
    """
    return ' '.join(normalizeWord(piece) for piece in sentence.split(' '))

def normalizeWord(word):
    """Lower-case ``word`` and strip its accents.

    Parameters
    ----------
    word : str
        Text to normalise.

    Returns
    -------
    str
        The lower-cased text after being run through ``unidecode.unidecode``
        (used here to replace accented characters).
    """
    lowered = word.lower()
    return unidecode.unidecode(lowered)

def joinSeparetedWord(word):
    """Remove apostrophes from ``word`` and hyphenate its parts.

    Every ``'`` character is dropped, the remaining text is split on runs of
    whitespace and the resulting parts are joined with ``-``.

    Parameters
    ----------
    word : str
        Text that may contain apostrophes and whitespace.

    Returns
    -------
    str
        The hyphen-joined text; an empty string when nothing but apostrophes
        and whitespace was given.
    """
    without_quotes = word.replace("'", '')
    return '-'.join(without_quotes.split())

In [6]:
def selectByWord(word, sentences):
    """Return the positions of the sentences that contain ``word``.

    The word is normalised with ``normalizeWord`` and each sentence with
    ``normalizeSentence``; a sentence is selected when the normalised word
    occurs anywhere inside the normalised sentence (plain substring test).

    Parameters
    ----------
    word : str
        Text to look for.
    sentences : iterable of str
        Candidate sentences.

    Returns
    -------
    list of int
        Zero-based positions, in iteration order, of the matching sentences.
    """
    target = normalizeWord(word)
    return [
        position
        for position, sentence in enumerate(sentences)
        if target in normalizeSentence(sentence)
    ]

In [7]:
def selectDfRowsByWord(data, word, column):
    '''
    Select the rows of ``data`` whose ``column`` value contains ``word``.

    The distinct values of ``column`` are searched with ``selectByWord``
    (normalised substring match). The rows of every matching value are then
    returned, grouped value by value in the order the values first appear in
    ``data``; the original row index is kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    word : str
        Text to look for inside the values of ``column``.
    column : str
        Name of the column of ``data`` to search.

    Returns
    -------
    pandas.DataFrame
        The matching rows. When no value matches, ``word + ': Not found'`` is
        printed and an empty DataFrame without columns is returned.
    '''

    sentences = np.array(data[column].unique())

    # positions, within the distinct values, of the values containing the word
    idxs = selectByWord(word, sentences)
    obj_sentences = sentences[idxs]

    # Explicit "no match" branch: the message used to be triggered by catching
    # whatever exception indexing the empty result raised.
    if len(obj_sentences) == 0:
        print(word + ': Not found')
        return pd.DataFrame()

    # One frame per matching value, concatenated once instead of growing a
    # DataFrame inside the loop.
    matches = [data[data[column] == sentence] for sentence in obj_sentences]
    return pd.concat(matches)

In [8]:
class Municipio:
    '''
    Expenses ("despesas") of one municipality, read from local CSV files.

    Creating an instance reads the population estimate and one expenses file
    per requested year (``readData``) and then cleans the result
    (``prepareData``). Progress messages are printed along the way.

    Parameters
    ----------
    name : str
        Municipality name. It is compared, after ``normalizeWord``, with the
        'Município' column of datasets/municipios.csv and is used verbatim
        when building the expenses file names.
    years : iterable of int, optional
        Years to load. Every entry is read from its own file, so
        ``[2014, 2019]`` loads those two years only, not the interval.
        Defaults to ``[2018]``.

    Attributes
    ----------
    keep_columns_ : list of str
        Columns kept from the raw expenses files.
    numeric_columns_ : list of str
        Columns that are not run through ``normalizeWord``.
    years_ : list
        Copy of the requested years.
    name_ : str
        The municipality name as given.
    df_ : pandas.DataFrame
        The loaded expenses, plus the ``vl_despesa_per_capita`` column.
    population_ : number
        Value of the 'Estimativa 2019' column for the matching municipality.

    Raises
    ------
    ValueError
        If ``years`` is empty or no municipality matches ``name``.
    '''

    def __init__(self, name, years = None):
        '''Load and prepare the data of ``name``; see the class docstring.'''

        self.keep_columns_ = ['ano_exercicio', 'mes_ref_extenso', 'tp_despesa', 'vl_despesa',
                              'ds_funcao_governo', 'ds_subfuncao_governo', 'ds_programa', 'ds_acao']
        self.numeric_columns_ = ['ano_exercicio', 'vl_despesa']

        # None stands for the default [2018]. The years are copied so the
        # instance never aliases a shared default or the caller's list.
        self.years_ = [2018] if years is None else list(years)
        if not self.years_:
            raise ValueError('years must contain at least one year')

        self.name_ = name
        self.df_ = pd.DataFrame()
        self.population_ = None
        print('Reading data...')
        self.readData()
        print('Preparing data...')
        self.prepareData()
        print('\nBase for {} is ready!!'.format(self.name_))

    def readData(self):
        '''
        Read the population estimate and the yearly expenses files.

        Sets ``population_`` and appends the rows of every year in ``years_``
        (restricted to ``keep_columns_``) to ``df_``.

        Raises
        ------
        ValueError
            If no row of datasets/municipios.csv matches ``name_``.
        '''

        print('> reading population...')
        municipios_base = pd.read_csv('datasets/municipios.csv')
        target = normalizeWord(self.name_)
        for i, municipio in enumerate(municipios_base['Município'].values):
            # no break on purpose: when several rows match, the last one wins
            if normalizeWord(municipio) == target:
                self.population_ = municipios_base['Estimativa 2019'].values[i]
        if self.population_ is None:
            # Fail here with a clear message; continuing would only crash
            # later in prepareData on the still-empty frame.
            raise ValueError('Erro: population not found for ' + repr(self.name_))

        print('> reading despesas...')
        # rows already in df_ come first, so repeated calls keep accumulating
        frames = [] if self.df_.empty else [self.df_]
        for year in self.years_:
            print('>> year: '+str(year))
            path = 'datasets/despesas/'+ str(year) +'/despesas-' + self.name_ + '-'+str(year)+'.csv'
            yearly = pd.read_csv(path, sep = ';',  encoding = "ISO-8859-1")
            frames.append(yearly[self.keep_columns_])

        # single concatenation instead of one per year
        if frames:
            self.df_ = pd.concat(frames, ignore_index = True)

    def prepareData(self):
        '''
        Clean ``df_`` in place.

        Converts ``vl_despesa`` from decimal-comma text to float, normalises
        every non-numeric column with ``normalizeWord``, adds
        ``vl_despesa_per_capita`` and removes the dots from ``ds_programa``
        and ``ds_acao``.
        '''

        # text with a decimal comma -> float
        self.df_['vl_despesa'] = self.df_['vl_despesa'].apply(lambda x : float(x.replace(',', '.')))

        # Normalise every column not listed in numeric_columns_. This must
        # run before the per-capita column is added, otherwise that float
        # column would be passed to normalizeWord as well.
        for column in self.df_.columns:
            if column in self.numeric_columns_:
                continue
            self.df_[column] = self.df_[column].apply(normalizeWord)

        self.df_['vl_despesa_per_capita'] = self.df_['vl_despesa'] / self.population_

        # drop the dots that appear inside programme and action names
        for column in ['ds_programa', 'ds_acao']:
            self.df_[column] = self.df_[column].apply(lambda x : x.replace('.', ''))
        
        
    


In [10]:
# Load the expenses of Guarulhos. `years` lists individual years (one CSV
# file each), so this asks for 2014 and 2019 only, not the whole interval.
# NOTE(review): the recorded run below stops with FileNotFoundError because
# datasets/despesas/2014/despesas-guarulhos-2014.csv is missing.
guarulhos = Municipio('guarulhos', years = [2014, 2019])

Reading data...
> reading population...
> reading despesas...
>> year: 2014


FileNotFoundError: [Errno 2] File b'datasets/despesas/2014/despesas-guarulhos-2014.csv' does not exist: b'datasets/despesas/2014/despesas-guarulhos-2014.csv'

In [59]:
# Show the population stored on the instance.
# NOTE(review): `exemplo` is not created in any cell visible in this part of
# the notebook (the only constructor call shown assigns `guarulhos`, and that
# run failed) - presumably a Municipio built in a cell that is not shown;
# confirm the notebook survives Restart & Run All.
exemplo.population_

1379182

In [60]:
# Explore on a copy so the steps below cannot modify exemplo.df_.
test = exemplo.df_.copy()

In [61]:
# Distinct values found in the tp_despesa column.
test['tp_despesa'].unique()

array(['empenhado', 'reforco', 'anulacao', 'valor pago',
       'valor liquidado'], dtype=object)

In [62]:
# Values of the ds_funcao_governo column to look at.
# Only the first entry is used by the cells visible below.
funcoes_governo = ['gestao ambiental', 'saneamento','administracao']

In [63]:
# Keep only the rows whose tp_despesa is 'valor liquidado', then preview them.
test_liq = test[test['tp_despesa'] == 'valor liquidado']
test_liq.head()

Unnamed: 0,ano_exercicio,mes_ref_extenso,tp_despesa,vl_despesa,ds_funcao_governo,ds_subfuncao_governo,ds_programa,ds_acao,vl_despesa_per_capita
2614,2018,janeiro,valor liquidado,20761.29,urbanismo,administracao geral,melhoria do transito e do transporte coletivo,gestao e modernizacao dos servicos de transpor...,0.015053
2615,2018,janeiro,valor liquidado,389859.89,saneamento,administracao geral,gestao administrativa operacional sob supervis...,obrigacoes contributivas diversas,0.282675
2616,2018,janeiro,valor liquidado,618.53,saneamento,administracao geral,gestao administrativa operacional sob supervis...,obrigacoes contributivas diversas,0.000448
2617,2018,janeiro,valor liquidado,954.0,encargos especiais,outros encargos especiais,encargos especiais,"encargos, restituicoes e indenizacoes diversas",0.000692
2618,2018,janeiro,valor liquidado,12966.55,administracao,protecao e beneficios ao trabalhador,gestao administrativa operacional sob supervis...,beneficios ao trabalhador - encargos,0.009402


In [64]:
# Narrow test_liq down to the first function of interest (funcoes_governo[0]).
test_liq_f_governo = test_liq[test_liq['ds_funcao_governo'] == funcoes_governo[0]]

In [65]:
# Distinct ds_programa values recorded for that function, as a plain list.
list(test_liq_f_governo['ds_programa'].unique())

['gestao administrativa operacional da secretaria de meio ambiente',
 'manutencao e implantacao de parques, pracas, arborizacao, licenciamento e fiscalizacao ambiental',
 'gestao e conservacao da biodiversidade e protecao aos animais',
 'gestao do fundo municipal de meio ambiente']