In [1]:
import pandas as pd

In [2]:
# Raise the notebook display limits so wide/long frames are shown up to 100 columns and rows.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# Coleta de Dados

In [3]:
# Tax-return data (selection and preprocessing were done upstream, in Receita Data).
# The file is read as ';'-separated, Latin-1 encoded text.
df = pd.read_csv(
    "Dados/declaracoes_2019_1_4.csv",
    sep=";",
    encoding="latin1",
)

In [4]:
df.shape

(50000, 53)

In [5]:
# Gini index per state (source: IBGE), read as ';'-separated UTF-8 text.
df_gini = pd.read_csv(
    "Dados/gini_2019.csv",
    sep=";",
    encoding="UTF-8",
)

In [6]:
df_gini.shape

(27, 3)

In [7]:
df_gini.head()

Unnamed: 0,estado,nome,gini
0,RO,Rondônia,0.471957
1,AC,Acre,0.556493
2,AM,Amazonas,0.568123
3,RR,Roraima,0.576438
4,PA,Pará,0.528684


In [8]:
# Population per municipality, read as ';'-separated UTF-8 text.
df_pop = pd.read_csv(
    "Dados/populacao_2019.csv",
    sep=";",
    encoding="UTF-8",
)

In [9]:
df_pop.shape

(5570, 5)

In [10]:
df_pop.head(17)

Unnamed: 0,UF,COD_UF,COD_MUNIC,NOME_MUNICIPIO,POPULACAO
0,RO,11,15,Alta Floresta D'Oeste,22945
1,RO,11,23,Ariquemes,107863
2,RO,11,31,Cabixi,5312
3,RO,11,49,Cacoal,85359
4,RO,11,56,Cerejeiras,16323
5,RO,11,64,Colorado do Oeste,15882
6,RO,11,72,Corumbiara,7391
7,RO,11,80,Costa Marques,18331
8,RO,11,98,Espigão D'Oeste,32374
9,RO,11,106,Guajará-Mirim,46174


In [11]:
# Lookup table relating the municipality code used in the tax return to the IBGE code.
df_mun = pd.read_csv(
    "Dados/municipios_TOM.csv",
    sep=";",
    encoding="latin1",
)

In [12]:
df_mun.shape

(5578, 6)

In [13]:
df_mun.head()

Unnamed: 0,nr_municipio,nb_municipio,latitude_municipio,longitude_municipio,cd_municipio_ibge,ae_municipio
0,-7.0,Inválido,-7.0,-7.0,IN,-7.0
1,-9.0,Não se aplica,-9.0,-9.0,,-9.0
2,2125.0,Barrocas - BA,-11.529,-39.078,2903276,200.965
3,5564.0,Itanhangá - MT,-12.219,-56.638,5104542,2898.075
4,5565.0,Aroeiras do Itaim - PI,-7.077,-41.467,2200954,257.137


# Processamento / Tratamento dos Dados

## Declaração

### Excluir Coluna ID

In [14]:
# Remove the `id` column.
df = df.drop(columns=['id'])

### Ocupação Principal

In [15]:
# Shape of the subset whose occupation is one of the non-numeric markers 'NI' or 'IN'.
df.query("ocupacao=='NI' or ocupacao=='IN'").shape

(9100, 52)

In [16]:
# Convert the occupation column from text to numbers; entries that cannot be
# parsed become NaN (errors='coerce').
df = df.assign(ocupacao=pd.to_numeric(df["ocupacao"], errors="coerce"))

In [17]:
df.ocupacao.isna().sum()

9100

In [18]:
# Replace missing occupation codes with 999 ("not informed"),
# then show the shape of the subset carrying that code.
df.loc[df["ocupacao"].isna(), "ocupacao"] = 999
df[df["ocupacao"] == 999].shape

(9100, 52)

In [19]:
df.ocupacao.isna().sum()

0

### Óbito

In [20]:
# The `obito` column holds the year of death, or -9 when the taxpayer is alive.
# Recode it as a 0/1 flag:
#   0 - alive    : value less than or equal to 0
#   1 - deceased : value greater than 0
# A vectorised comparison is used instead of a row-wise lambda. Besides being
# faster, it keeps missing values (NaN) at 0: the lambda `0 if x<=0 else 1`
# sent NaN to the `else` branch and flagged it as deceased, which contradicts
# the "greater than 0" rule above.
df['obito'] = (df['obito'] > 0).astype('int64')

In [21]:
df.obito.value_counts()

0    49397
1      603
Name: obito, dtype: int64

In [22]:
df.shape

(50000, 52)

In [23]:
# Remove declarations flagged as deceased, since they may not cover the whole year.
df = df.drop(index=df.index[df['obito'] == 1])

In [24]:
df.shape

(49397, 52)

In [25]:
# Remove the `obito` column.
df = df.drop(columns=['obito'])

### Criar Variável Alvo

In [26]:
def define_alvo(i, frame=None):
    """Classify the declaration at index label ``i`` as normal or anomalous.

    A declaration is an anomaly when at least one of these holds for its row:

    * ``auto == 1``    - tax audit finished with a change in the tax due
    * ``malha == 2``   - declaration held in the tax audit ("malha fiscal")
    * ``dif_dirf > 0`` - declared income below the DIRF amount
    * ``malha == 1``   - declaration held for filling review ("malha preenchimento")
    * ``errop == 1``   - income data with a filling error
    * ``errom == 1``   - medical data with a filling error

    Parameters
    ----------
    i
        Index label of the row to classify.
    frame : pandas.DataFrame, optional
        Table holding the columns listed above. When omitted, the notebook-level
        ``df`` is used, so existing calls such as ``df.index.map(define_alvo)``
        keep working unchanged.

    Returns
    -------
    int
        ``1`` for a normal declaration, ``-1`` for an anomaly.
    """
    # Fall back to the notebook-global frame only when no table is passed in.
    dados = df if frame is None else frame
    # The checks are evaluated in the original order and short-circuit on the
    # first match; every match leads to the same label.
    anomalia = (
        dados.loc[i, 'auto'] == 1
        or dados.loc[i, 'malha'] == 2
        or dados.loc[i, 'dif_dirf'] > 0
        or dados.loc[i, 'malha'] == 1
        or dados.loc[i, 'errop'] == 1
        or dados.loc[i, 'errom'] == 1
    )
    return -1 if anomalia else 1

In [27]:
# Build the target column used to identify anomalies: one label per index entry.
df['alvo'] = [define_alvo(rotulo) for rotulo in df.index]

In [28]:
df.alvo.value_counts()

 1    39534
-1     9863
Name: alvo, dtype: int64

In [29]:
# Remove the columns that were used to derive the anomaly label.
df = df.drop(columns=['malha', 'auto', 'errop', 'errom', 'dif_dirf'])

### Corrigir alíquota
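Corrigir os valores de alíquota que estão com erro (por exemplo, 0.5, que não corresponde a nenhuma faixa da tabela), recalculando a alíquota a partir da base de cálculo.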

In [30]:
df.aliquota.value_counts()

0.0     15574
7.5      9467
27.5     9239
0.5      6686
15.0     5122
22.5     3309
Name: aliquota, dtype: int64

In [31]:
def define_aliquota(num):
    """Return the nominal tax rate (in percent) for a taxable base ``num``.

    Brackets (lower bound inclusive, upper bound exclusive):
        below 22847.77            -> 0
        22847.77 up to 33919.81   -> 7.5
        33919.81 up to 45012.61   -> 15
        45012.61 up to 55976.17   -> 22.5
        55976.17 and above        -> 27.5

    A value that satisfies none of the comparisons (NaN) yields ``None``.
    """
    # (exclusive upper bound, rate) pairs in ascending order.
    faixas = (
        (22847.77, 0),
        (33919.81, 7.5),
        (45012.61, 15),
        (55976.17, 22.5),
    )
    for limite, taxa in faixas:
        if num < limite:
            return taxa
    if num >= 55976.17:
        return 27.5
    return None

In [32]:
# Recompute the rate of every declaration from its taxable base.
df = df.assign(aliquota=df['base_calculo'].map(define_aliquota))

In [33]:
df.aliquota.value_counts()

0.0     16592
27.5    12207
7.5     10884
15.0     5835
22.5     3879
Name: aliquota, dtype: int64

### Excluir Exterior 
Os dados do IBGE são referentes ao Brasil, logo foi decidido retirar as declarações de residentes no exterior

In [34]:
df.query('estado=="EX"').estado.count()

39

In [35]:
df.query('estado!="EX"').estado.count()

49358

In [36]:
df.estado.count()

49397

In [37]:
# Remove the declarations whose state code is 'EX'.
df = df.drop(index=df.index[df['estado'] == 'EX'])

In [38]:
df.estado.count()

49358

### Idade
Para melhorar os gráficos e análises, agrupar idades em que existem poucas declarações

In [39]:
# Number of declarations per age for ages above 90, in ascending age order
# (context for grouping the ages above 100).
df.query('idade>90').idade.value_counts().sort_index()

91.0     37
92.0     45
93.0     32
94.0     16
95.0     16
96.0     10
97.0      7
98.0      8
99.0      2
100.0     1
101.0     1
102.0     5
103.0     1
112.0     1
Name: idade, dtype: int64

In [40]:
# Cap the age at 100: any value above 100 becomes 100.
df['idade'] = df['idade'].clip(upper=100)

In [41]:
# Number of declarations per age for ages below 18, in descending age order
# (context for grouping the ages below 15).
df.query('idade<18').idade.value_counts().sort_index(ascending=False)

17.0    13
16.0    11
15.0    10
14.0     8
13.0     7
12.0     8
11.0     3
10.0     4
9.0      3
8.0      3
7.0      3
6.0      1
5.0      2
4.0      6
3.0      3
2.0      1
0.0      1
Name: idade, dtype: int64

In [42]:
# Floor the age at 15: any value below 15 becomes 15.
df['idade'] = df['idade'].clip(lower=15)

In [43]:
df.query('idade>100').idade.count()

0

In [44]:
df.query('idade<15').idade.count()

0

# Dados do IBGE

### GINI

In [45]:
df_gini.head()

Unnamed: 0,estado,nome,gini
0,RO,Rondônia,0.471957
1,AC,Acre,0.556493
2,AM,Amazonas,0.568123
3,RR,Roraima,0.576438
4,PA,Pará,0.528684


In [46]:
# Attach the state-level Gini index to each declaration (left outer join on `estado`).
df = df.merge(
    df_gini.loc[:, ['estado', 'gini']],
    how='left',
    on='estado',
)

In [47]:
# Shape of the subset left without a Gini value after the left join
# (zero rows means every declaration found its state).
df[df.gini.isnull()].shape

(0, 48)

In [48]:
df.head()

Unnamed: 0,valor_dirf,rend_total,rend_tributavel,base_calculo,trabalho,aluguel,rend_capital,autonomo,livro_caixa,rural,ganho_capital,exterior,outros_rend,rend_isento,rend_exclusivo,contrib_prev,dependentes_ded,pensao_alimenticia,pensao_alimenticia_ded,geducativos,geducativos_ded,gmedicos,gmedicos_ded,aliquota,aliquota_efetiva,imposto_devido,imposto_devido2,resultado_decl,iap,iar,tipo_decl,completa,deducao,idade,sexo,casado,dependentes,alimentandos,estado,municipio,ocupacao,natureza,bens_ant,bens,dividas_ant,dividas,alvo,gini
0,37420.29,40250.29,37420.29,29936.24,40250.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2830.0,4244.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.5,0.013208,531.63,531.63,-465.96,0.0,465.96,1,0,7484.05,55.0,1,0,0.0,0.0,MG,2863.0,290.0,42,0.0,0.0,0.0,0.0,1,0.488047
1,25553.8,27693.23,25553.8,20443.04,27671.48,0.0,21.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2139.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-202.88,0.0,202.88,1,0,5110.76,61.0,1,0,0.0,0.0,SP,4128.0,410.0,1,0.0,11311.68,0.0,0.0,1,0.525704
2,0.0,31580.0,31580.0,25264.0,0.0,0.0,0.0,31580.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.5,0.005738,181.22,181.22,181.22,181.22,0.0,2,0,6316.0,30.0,1,1,1.0,0.0,DF,5418.0,0.0,11,0.0,0.0,0.0,0.0,1,0.553359
3,69134.11,78585.79,69134.11,51618.01,73859.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4725.84,0.0,9451.68,7125.31,9100.32,0.0,0.0,0.0,0.0,1290.47,1290.47,22.5,0.055703,3980.54,3980.54,-1382.92,0.0,1382.92,2,1,17516.1,31.0,0,1,4.0,0.0,PE,1832.0,212.0,1,15923.23,33390.45,0.0,0.0,1,0.572641
4,254170.79,165590.1,165590.1,148835.76,0.0,0.0,0.0,165590.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2235.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.5,0.184175,30497.51,30497.51,5674.88,5674.88,0.0,3,0,16754.34,42.0,1,1,0.0,0.0,ES,3418.0,225.0,11,121332.0,121332.0,0.0,0.0,-1,0.51892


### População

In [49]:
df_pop.head()

Unnamed: 0,UF,COD_UF,COD_MUNIC,NOME_MUNICIPIO,POPULACAO
0,RO,11,15,Alta Floresta D'Oeste,22945
1,RO,11,23,Ariquemes,107863
2,RO,11,31,Cabixi,5312
3,RO,11,49,Cacoal,85359
4,RO,11,56,Cerejeiras,16323


In [50]:
# Build cd_municipio_ibge by concatenating COD_UF with COD_MUNIC,
# the latter left-padded with zeros and limited to its last 5 characters.
df_pop['cd_municipio_ibge'] = (
    df_pop['COD_UF'].map(str)
    + df_pop['COD_MUNIC'].map(str).str.rjust(5, '0').str[-5:]
)

In [51]:
df_pop.head()

Unnamed: 0,UF,COD_UF,COD_MUNIC,NOME_MUNICIPIO,POPULACAO,cd_municipio_ibge
0,RO,11,15,Alta Floresta D'Oeste,22945,1100015
1,RO,11,23,Ariquemes,107863,1100023
2,RO,11,31,Cabixi,5312,1100031
3,RO,11,49,Cacoal,85359,1100049
4,RO,11,56,Cerejeiras,16323,1100056


In [52]:
df_mun.head()

Unnamed: 0,nr_municipio,nb_municipio,latitude_municipio,longitude_municipio,cd_municipio_ibge,ae_municipio
0,-7.0,Inválido,-7.0,-7.0,IN,-7.0
1,-9.0,Não se aplica,-9.0,-9.0,,-9.0
2,2125.0,Barrocas - BA,-11.529,-39.078,2903276,200.965
3,5564.0,Itanhangá - MT,-12.219,-56.638,5104542,2898.075
4,5565.0,Aroeiras do Itaim - PI,-7.077,-41.467,2200954,257.137


In [53]:
# Attach the RFB municipality number to the IBGE population table
# (left outer join on cd_municipio_ibge).
df_pop = df_pop.merge(
    df_mun.loc[:, ['cd_municipio_ibge', 'nr_municipio']],
    how='left',
    on='cd_municipio_ibge',
)

In [54]:
df_pop.head()

Unnamed: 0,UF,COD_UF,COD_MUNIC,NOME_MUNICIPIO,POPULACAO,cd_municipio_ibge,nr_municipio
0,RO,11,15,Alta Floresta D'Oeste,22945,1100015,33.0
1,RO,11,23,Ariquemes,107863,1100023,7.0
2,RO,11,31,Cabixi,5312,1100031,37.0
3,RO,11,49,Cacoal,85359,1100049,9.0
4,RO,11,56,Cerejeiras,16323,1100056,27.0


In [55]:
# Shape of the subset of population rows left without a municipality number after the join.
df_pop[df_pop.nr_municipio.isnull()].shape

(0, 7)

In [56]:
# Add the municipality population to each declaration
# (left outer join: df.municipio against df_pop.nr_municipio).
df = df.merge(
    df_pop.loc[:, ['nr_municipio', 'POPULACAO']],
    how='left',
    left_on='municipio',
    right_on='nr_municipio',
)

In [57]:
# Shape of the subset of declarations left without a population value after the join.
df[df.POPULACAO.isnull()].shape

(0, 50)

In [58]:
# Remove the location columns that were used as join keys.
df = df.drop(columns=['municipio', 'nr_municipio', 'estado'])

In [59]:
df.head()

Unnamed: 0,valor_dirf,rend_total,rend_tributavel,base_calculo,trabalho,aluguel,rend_capital,autonomo,livro_caixa,rural,ganho_capital,exterior,outros_rend,rend_isento,rend_exclusivo,contrib_prev,dependentes_ded,pensao_alimenticia,pensao_alimenticia_ded,geducativos,geducativos_ded,gmedicos,gmedicos_ded,aliquota,aliquota_efetiva,imposto_devido,imposto_devido2,resultado_decl,iap,iar,tipo_decl,completa,deducao,idade,sexo,casado,dependentes,alimentandos,ocupacao,natureza,bens_ant,bens,dividas_ant,dividas,alvo,gini,POPULACAO
0,37420.29,40250.29,37420.29,29936.24,40250.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2830.0,4244.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.5,0.013208,531.63,531.63,-465.96,0.0,465.96,1,0,7484.05,55.0,1,0,0.0,0.0,290.0,42,0.0,0.0,0.0,0.0,1,0.488047,279885
1,25553.8,27693.23,25553.8,20443.04,27671.48,0.0,21.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2139.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-202.88,0.0,202.88,1,0,5110.76,61.0,1,0,0.0,0.0,410.0,1,0.0,11311.68,0.0,0.0,1,0.525704,460671
2,0.0,31580.0,31580.0,25264.0,0.0,0.0,0.0,31580.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.5,0.005738,181.22,181.22,181.22,181.22,0.0,2,0,6316.0,30.0,1,1,1.0,0.0,0.0,11,0.0,0.0,0.0,0.0,1,0.553359,3015268
3,69134.11,78585.79,69134.11,51618.01,73859.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4725.84,0.0,9451.68,7125.31,9100.32,0.0,0.0,0.0,0.0,1290.47,1290.47,22.5,0.055703,3980.54,3980.54,-1382.92,0.0,1382.92,2,1,17516.1,31.0,0,1,4.0,0.0,212.0,1,15923.23,33390.45,0.0,0.0,1,0.572641,392482
4,254170.79,165590.1,165590.1,148835.76,0.0,0.0,0.0,165590.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2235.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.5,0.184175,30497.51,30497.51,5674.88,5674.88,0.0,3,0,16754.34,42.0,1,1,0.0,0.0,225.0,11,121332.0,121332.0,0.0,0.0,-1,0.51892,173555


# Dataset Final

In [60]:
df.isna().sum()

valor_dirf                0
rend_total                0
rend_tributavel           0
base_calculo              0
trabalho                  0
aluguel                   0
rend_capital              0
autonomo                  0
livro_caixa               0
rural                     0
ganho_capital             0
exterior                  0
outros_rend               0
rend_isento               0
rend_exclusivo            0
contrib_prev              0
dependentes_ded           0
pensao_alimenticia        0
pensao_alimenticia_ded    0
geducativos               0
geducativos_ded           0
gmedicos                  0
gmedicos_ded              0
aliquota                  0
aliquota_efetiva          0
imposto_devido            0
imposto_devido2           0
resultado_decl            0
iap                       0
iar                       0
tipo_decl                 0
completa                  0
deducao                   0
idade                     0
sexo                      0
casado              

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49358 entries, 0 to 49357
Data columns (total 47 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   valor_dirf              49358 non-null  float64
 1   rend_total              49358 non-null  float64
 2   rend_tributavel         49358 non-null  float64
 3   base_calculo            49358 non-null  float64
 4   trabalho                49358 non-null  float64
 5   aluguel                 49358 non-null  float64
 6   rend_capital            49358 non-null  float64
 7   autonomo                49358 non-null  float64
 8   livro_caixa             49358 non-null  float64
 9   rural                   49358 non-null  float64
 10  ganho_capital           49358 non-null  float64
 11  exterior                49358 non-null  float64
 12  outros_rend             49358 non-null  float64
 13  rend_isento             49358 non-null  float64
 14  rend_exclusivo          49358 non-null