# Tratamento de dados

In [1]:
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

### Statements

In [71]:
df_statements = pd.read_csv('statements.csv')

In [72]:
# df_statements['Type'] = 'Statement'
df_statements["Date"] = df_statements["Data"].apply(pd.to_datetime, format='%Y-%m-%d %H:%M:%S.%f').dt.strftime('%Y/%m/%d')
df_statements.drop(['Data', 'Id', 'Titulo', 'DataAtualizacao', 'mostrarCabecalho', 'mostrarRodape'], axis=1, inplace=True)

In [73]:
df_statements.rename(columns={'Conteudo': 'Statements'}, inplace=True)
df_statements.sort_values('Date', inplace=True)
df_statements.set_index('Date', inplace=True)

In [74]:
df_statements['Statements'] = df_statements['Statements'].apply(lambda x: BeautifulSoup(x, features="html.parser").get_text())

In [75]:
df_statements.head(5)

Unnamed: 0_level_0,Statements
Date,Unnamed: 1_level_1
2006/03/08,"In the March Meeting, the Banco Central do Br..."
2006/04/19,"In the April Meeting, the Monetary Policy Com..."
2006/05/31,"In the May Meeting, the Monetary Policy Commi..."
2006/07/19,"In the July Meeting, the Copom unanimously de..."
2006/08/30,"In the August Meeting, the Copom unanimously ..."


### Minutes

In [76]:
df_minutes = pd.read_csv('minutes.csv')

In [77]:
# df_minutes['Type'] = 'Minutes'
df_minutes["Minutes_Date"] = df_minutes["data"].apply(pd.to_datetime).dt.strftime('%Y/%m/%d')
df_minutes["Date"] = df_minutes["DataReferencia"].apply(pd.to_datetime).dt.strftime('%Y/%m/%d')

In [78]:
df_minutes.drop(['data', 'DataReferencia', 'BoxDestaque', 'Edicao', 'EsconderCapa', 
                 'EsconderDataReferencia', 'Grafico', 'Id', 
                 'ImagemCapa', 'ImagemCapa', 'LinkPagina', 'Url',
                 'Volume', 'Volume', 'Introducao', 'conteudo', 'Titulo'], axis=1, inplace=True)

In [79]:
df_minutes.rename(columns={'data': 'Date', 'DataReferencia': 'Meeting_Date', 'OutrasInformacoes': 'Minutes'}, inplace=True)
df_minutes.sort_values('Date', inplace=True)
df_minutes.set_index('Date', inplace=True)

In [80]:
df_minutes = df_minutes[df_minutes['Minutes'].notna()]

In [81]:
df_minutes['Minutes'] = df_minutes['Minutes'].apply(lambda x: BeautifulSoup(x, features="html.parser").get_text())

In [82]:
df_minutes.head(5)

Unnamed: 0_level_0,Minutes,Minutes_Date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020/08/05,"1. Regarding the global outlook, the Covid-19 ...",2020/08/11
2020/09/16,A) Update of economic outlook and Copom’s base...,2020/09/22
2020/10/28,A) Update of economic outlook and Copom’s base...,2020/12/15
2020/12/09,A) Update of economic outlook and Copom’s base...,2020/12/15
2021/01/20,A) Update of economic outlook and Copom’s base...,2021/01/26


### Decisions

In [46]:
df_decisions = pd.read_csv('decisions.csv')

In [47]:
df_decisions.drop(['DataFimVigencia', 'DataInicioVigencia', 'TaxaSelicEfetivaAnualizada',
                 'TaxaSelicEfetivaVigencia', 'TaxaTban', 'UsoMetaSelic', 'ReuniaoExtraordinaria',
                 'Vies'], axis=1, inplace=True)

In [48]:
df_decisions["DataReuniaoCopom"] = df_decisions["DataReuniaoCopom"].apply(pd.to_datetime).dt.strftime('%Y/%m/%d')

In [49]:
df_decisions.rename(columns={'DataReuniaoCopom': 'Date', 
                             'MetaSelic': 'Selic', 
                             'NumeroReuniaoCopom': 'Meeting_Number'
                            }, inplace=True)

In [50]:
df_decisions = df_decisions[df_decisions['Date'] != 'NaT']

In [51]:
df_decisions.sort_values('Date', inplace=True)
df_decisions.set_index('Date', inplace=True)

In [52]:
i = 0
df_decisions['Decision'] = np.nan
df_decisions['Decision_txt'] = ''

In [53]:
for i in range(len(df_decisions)-1):
    df_decisions['Decision'].iloc[i+1] = df_decisions['Selic'].iloc[i+1]-df_decisions['Selic'].iloc[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [54]:
def decision_txt(num):
    if num == 0:
        return 'mantain'
    elif num >= 0:
        return 'increase'
    elif num <= 0:
        return 'decrease'
    else:
        return ''

In [55]:
for i in range(len(df_decisions)-1):
    df_decisions['Decision_txt'].iloc[i+1] = decision_txt(df_decisions['Decision'].iloc[i+1])

In [56]:
df_decisions.tail(5)

Unnamed: 0_level_0,Selic,Meeting_Number,Decision,Decision_txt
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022/10/26,13.75,250.0,0.0,mantain
2022/12/07,13.75,251.0,0.0,mantain
2023/02/01,13.75,252.0,0.0,mantain
2023/03/22,13.75,253.0,0.0,mantain
2023/05/03,13.75,254.0,0.0,mantain


### Merge columns

In [85]:
df1 = pd.merge(df_statements, df_minutes, how="outer", on="Date")

In [87]:
df = pd.merge(df1, df_decisions, how="outer", on="Date")

In [89]:
df.sort_values('Date', inplace=True)

In [90]:
df

Unnamed: 0_level_0,Statements,Minutes,Minutes_Date,Selic,Meeting_Number,Decision,Decision_txt
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1996/06/26,,,,1.90,1.0,,
1996/07/30,,,,1.90,2.0,0.00,mantain
1996/08/21,,,,1.88,3.0,-0.02,decrease
1996/09/23,,,,1.82,4.0,-0.06,decrease
1996/10/23,,,,1.78,5.0,-0.04,decrease
1996/11/27,,,,1.74,6.0,-0.04,decrease
1996/12/18,,,,1.70,7.0,-0.04,decrease
1997/01/22,,,,1.66,8.0,-0.04,decrease
1997/02/19,,,,1.62,9.0,-0.04,decrease
1997/03/19,,,,1.58,10.0,-0.04,decrease
