# Tratamento de dados

In [2]:
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

### Statements

In [3]:
df_statements = pd.read_csv('statements.csv')

In [4]:
df_statements['Type'] = 'Statement'
df_statements["Date"] = df_statements["Data"].apply(pd.to_datetime, format='%Y-%m-%d %H:%M:%S.%f').dt.strftime('%Y/%m/%d')
df_statements.drop(['Data', 'Id', 'Titulo', 'DataAtualizacao', 'mostrarCabecalho', 'mostrarRodape'], axis=1, inplace=True)

In [5]:
df_statements.rename(columns={'Conteudo': 'Text'}, inplace=True)
df_statements.sort_values('Date', inplace=True)
df_statements.set_index('Date', inplace=True)

In [6]:
df_statements['Text'] = df_statements['Text'].apply(lambda x: BeautifulSoup(x, features="html.parser").get_text())

In [7]:
df_statements.head(5)

Unnamed: 0_level_0,Text,Type
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2006/03/08,"In the March Meeting, the Banco Central do Br...",Statement
2006/04/19,"In the April Meeting, the Monetary Policy Com...",Statement
2006/05/31,"In the May Meeting, the Monetary Policy Commi...",Statement
2006/07/19,"In the July Meeting, the Copom unanimously de...",Statement
2006/08/30,"In the August Meeting, the Copom unanimously ...",Statement


### Minutes

In [8]:
df_minutes = pd.read_csv('minutes.csv')

In [9]:
df_minutes['Type'] = 'Minutes'
df_minutes["Minutes_Date"] = df_minutes["data"].apply(pd.to_datetime).dt.strftime('%Y/%m/%d')
df_minutes["Date"] = df_minutes["DataReferencia"].apply(pd.to_datetime).dt.strftime('%Y/%m/%d')

In [10]:
df_minutes.drop(['data', 'DataReferencia', 'BoxDestaque', 'Edicao', 'EsconderCapa', 
                 'EsconderDataReferencia', 'Grafico', 'Id', 
                 'ImagemCapa', 'ImagemCapa', 'LinkPagina', 'Url',
                 'Volume', 'Volume', 'Introducao', 'conteudo', 'Titulo'], axis=1, inplace=True)

In [11]:
df_minutes.drop(['Minutes_Date'], axis=1, inplace=True)

In [12]:
df_minutes.rename(columns={'data': 'Date', 'DataReferencia': 'Meeting_Date', 'OutrasInformacoes': 'Text'}, inplace=True)
df_minutes.sort_values('Date', inplace=True)
df_minutes.set_index('Date', inplace=True)

In [13]:
df_minutes = df_minutes[df_minutes['Text'].notna()]

In [14]:
df_minutes['Text'] = df_minutes['Text'].apply(lambda x: BeautifulSoup(x, features="html.parser").get_text())

In [15]:
df_minutes.head(5)

Unnamed: 0_level_0,Text,Type
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020/08/05,"1. Regarding the global outlook, the Covid-19 ...",Minutes
2020/09/16,A) Update of economic outlook and Copom’s base...,Minutes
2020/10/28,A) Update of economic outlook and Copom’s base...,Minutes
2020/12/09,A) Update of economic outlook and Copom’s base...,Minutes
2021/01/20,A) Update of economic outlook and Copom’s base...,Minutes


### Decisions

In [16]:
df_decisions = pd.read_csv('decisions.csv')

In [17]:
df_decisions.drop(['DataFimVigencia', 'DataInicioVigencia', 'TaxaSelicEfetivaAnualizada',
                 'TaxaSelicEfetivaVigencia', 'TaxaTban', 'UsoMetaSelic', 'ReuniaoExtraordinaria',
                 'Vies'], axis=1, inplace=True)

In [18]:
df_decisions["DataReuniaoCopom"] = df_decisions["DataReuniaoCopom"].apply(pd.to_datetime).dt.strftime('%Y/%m/%d')

In [19]:
df_decisions.rename(columns={'DataReuniaoCopom': 'Date', 
                             'MetaSelic': 'Selic', 
                             'NumeroReuniaoCopom': 'Meeting_Number'
                            }, inplace=True)

In [20]:
df_decisions = df_decisions[df_decisions['Date'] != 'NaT']

In [21]:
df_decisions.sort_values('Date', inplace=True)
df_decisions.set_index('Date', inplace=True)

In [22]:
i = 0
df_decisions['Decision'] = np.nan
df_decisions['Decision_txt'] = ''

In [23]:
for i in range(len(df_decisions)-1):
    df_decisions['Decision'].iloc[i+1] = df_decisions['Selic'].iloc[i+1]-df_decisions['Selic'].iloc[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [24]:
def decision_txt(num):
    if num == 0:
        return 'mantain'
    elif num >= 0:
        return 'increase'
    elif num <= 0:
        return 'decrease'
    else:
        return ''

In [25]:
for i in range(len(df_decisions)-1):
    df_decisions['Decision_txt'].iloc[i+1] = decision_txt(df_decisions['Decision'].iloc[i+1])

In [26]:
df_decisions.head()

Unnamed: 0_level_0,Selic,Meeting_Number,Decision,Decision_txt
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1996/06/26,1.9,1.0,,
1996/07/30,1.9,2.0,0.0,mantain
1996/08/21,1.88,3.0,-0.02,decrease
1996/09/23,1.82,4.0,-0.06,decrease
1996/10/23,1.78,5.0,-0.04,decrease


### Labels

In [30]:
df_decisions['label_hawk_dove'] = None
df_decisions['label_next_meet'] = None

In [45]:
for i in range(len(df_decisions)-1):
    if df_decisions['Decision_txt'][i+1] == 'decrease':
        if df_decisions['Decision'][i+1] == df_decisions['Decision'][i]:
            df_decisions['label_hawk_dove'][i+1] = 'dovish'
        elif df_decisions['Decision'][i+1] > df_decisions['Decision'][i]:
            df_decisions['label_hawk_dove'][i+1] = 'dovish' # less dovish
        elif df_decisions['Decision'][i+1] < df_decisions['Decision'][i]:
            df_decisions['label_hawk_dove'][i+1] = 'dovish'
    elif df_decisions['Decision_txt'][i+1] == 'increase':
        if df_decisions['Decision'][i+1] == df_decisions['Decision'][i]:
            df_decisions['label_hawk_dove'][i+1] = 'hawkish'
        elif df_decisions['Decision'][i+1] > df_decisions['Decision'][i]:
            df_decisions['label_hawk_dove'][i+1] = 'hawkish'
        elif df_decisions['Decision'][i+1] < df_decisions['Decision'][i]:
            df_decisions['label_hawk_dove'][i+1] = 'hawkish' # less hawkish
    elif df_decisions['Decision_txt'][i+1] == 'mantain':
        if df_decisions['Decision'][i] < 0:
            df_decisions['label_hawk_dove'][i+1] = 'hawkish'
        elif df_decisions['Decision'][i] > 0:
            df_decisions['label_hawk_dove'][i+1] = 'dovish'
        elif df_decisions['Decision'][i] == 0:
            df_decisions['label_hawk_dove'][i+1] = 'neutral'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org

In [46]:
for i in range(len(df_decisions)-1):
    df_decisions['label_next_meet'][i] = df_decisions['Decision_txt'][i+1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [47]:
df_decisions.head()

Unnamed: 0_level_0,Selic,Meeting_Number,Decision,Decision_txt,label_hawk_dove,label_next_meet
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1996/06/26,1.9,1.0,,,,mantain
1996/07/30,1.9,2.0,0.0,mantain,,decrease
1996/08/21,1.88,3.0,-0.02,decrease,dovish,decrease
1996/09/23,1.82,4.0,-0.06,decrease,dovish,decrease
1996/10/23,1.78,5.0,-0.04,decrease,dovish,decrease


### Merge columns

In [48]:
df1 = pd.merge(df_statements, df_minutes, how="outer", on=['Date', 'Text', 'Type'])

In [49]:
df1.sort_values('Date', inplace=True)

In [50]:
df = pd.merge(df_decisions, df1, how="outer", on=['Date'])

In [51]:
df = df[df['Selic'].notna()]

In [52]:
df = df[df['Text'].notna()]

In [53]:
df.tail(20)

Unnamed: 0_level_0,Selic,Meeting_Number,Decision,Decision_txt,label_hawk_dove,label_next_meet,Text,Type
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022/03/16,11.75,245.0,1.0,increase,hawkish,increase,"​In its 245th meeting, the Copom unanimously d...",Statement
2022/03/16,11.75,245.0,1.0,increase,hawkish,increase,A) Update of economic outlook and Copom’s scen...,Minutes
2022/05/04,12.75,246.0,1.0,increase,hawkish,increase,"​In its 246th meeting, the Copom unanimously d...",Statement
2022/05/04,12.75,246.0,1.0,increase,hawkish,increase,A) Update of economic outlook and Copom’s scen...,Minutes
2022/06/15,13.25,247.0,0.5,increase,hawkish,increase,"In its 247th meeting, the Copom unanimously de...",Statement
2022/06/15,13.25,247.0,0.5,increase,hawkish,increase,A) Update of economic outlook and Copom’s scen...,Minutes
2022/08/03,13.75,248.0,0.5,increase,hawkish,mantain,"In its 248th meeting, the Copom unanimously de...",Statement
2022/08/03,13.75,248.0,0.5,increase,hawkish,mantain,A) Update of economic outlook and Copom’s scen...,Minutes
2022/09/21,13.75,249.0,0.0,mantain,dovish,mantain,A) Update of economic outlook and Copom’s scen...,Minutes
2022/09/21,13.75,249.0,0.0,mantain,dovish,mantain,"In its 249th meeting, the Copom decided to mai...",Statement


In [54]:
df.to_csv('df_copom_label.csv', index=True)