# Tratamento de dados

In [82]:
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [83]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ftrav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ftrav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [84]:
def count_words(text):
    words = text.split()
    return len(words)

In [85]:
def cut_voters(text):
    regex = re.compile('(?P<voters>.+) voted for this decision')
    try:
        text_cut = regex.search(str(text)).group('voters')
    except:
        text_cut = text
    return text_cut

In [86]:
def extract_meeting_number(sentence):
    match = re.search(r'\d+', sentence)
    if match:
        return match.group()
    else:
        return None

In [87]:
def preprocess_text(text):

    # Remove newline characters
    text = re.sub(r'\n', ' ', text)

    # Remove tab characters
    text = re.sub(r'\t', ' ', text)

    # Remove Unicode characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Convert text to lowercase
    text = text.lower()

    # Tokenize text into individual words
    words = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Join the words back into a single string
    processed_text = ' '.join(words)

    return processed_text

### Statements

In [88]:
df_statements = pd.read_csv('statements.csv')

In [89]:
df_statements.tail()

Unnamed: 0,Conteudo,Data,DataAtualizacao,Id,Titulo,mostrarCabecalho,mostrarRodape
135,"<div> <p>In the August Meeting, the Copom unan...",2006-08-30T03:00:00Z,,2120,121st Meeting,False,False
136,"<div> <p>In the July Meeting, the Copom unanim...",2006-07-19T03:00:00Z,,2119,120th Meeting,False,False
137,"<div> <p>In the May Meeting, the Monetary Poli...",2006-05-31T03:00:00Z,,2118,119th Meeting,False,False
138,"<div> <p>In the April Meeting, the Monetary Po...",2006-04-19T03:00:00Z,,2117,118th Meeting,False,False
139,"<div> <p>In the March Meeting, the Banco Centr...",2006-03-08T03:00:00Z,,2116,117th Meeting,False,False


In [90]:
df_statements.loc[30:,'Titulo']

30                 Copom lowers Selic rate to 6.00% p.a.
31     223rd Meeting of the Monetary Policy Committee...
32     222nd Meeting of the Monetary Policy Committee...
33     221st Meeting of the Monetary Policy Committee...
34     220th Meeting of the Monetary Policy Committee...
                             ...                        
135                                        121st Meeting
136                                        120th Meeting
137                                        119th Meeting
138                                        118th Meeting
139                                        117th Meeting
Name: Titulo, Length: 110, dtype: object

In [91]:
df_statements['meeting_number'] = df_statements['Titulo'].apply(extract_meeting_number).astype(int)

In [92]:
df_statements.head()

Unnamed: 0,Conteudo,Data,DataAtualizacao,Id,Titulo,mostrarCabecalho,mostrarRodape,meeting_number
0,"<p style=""text-align&#58;justify;"">The global ...",2023-05-03T21:49:02Z,,2475,Copom maintains the Selic rate at 13.75% p.a.,False,False,13
1,"<p style=""text-align&#58;justify;"">Since its p...",2023-03-22T21:33:49Z,,2467,Copom maintains the Selic rate at 13.75% p.a.,False,False,13
2,"<p style=""text-align&#58;justify;"">In its 252<...",2023-02-01T21:37:00Z,,2463,Copom maintains the Selic rate at 13.75% p.a.,False,False,13
3,"<p style=""text-align&#58;justify;"">In its 251<...",2022-12-07T21:31:38Z,,2456,Copom maintains the Selic rate at 13.75% p.a.,False,False,13
4,"<p style=""text-align&#58;justify;"">In its 250<...",2022-10-26T21:39:32Z,,2451,Copom maintains the Selic rate at 13.75% p.a.,False,False,13


In [93]:
df_statements.loc[30:,'meeting_number']

30       6
31     223
32     222
33     221
34     220
      ... 
135    121
136    120
137    119
138    118
139    117
Name: meeting_number, Length: 110, dtype: int32

In [94]:
for i in range(31, 0, -1):
    number = df_statements.iloc[i, 7]
    df_statements.iloc[i-1, 7] = number + 1

In [95]:
df_statements.loc[:,'meeting_number']

0      254
1      253
2      252
3      251
4      250
      ... 
135    121
136    120
137    119
138    118
139    117
Name: meeting_number, Length: 140, dtype: int32

In [96]:
df_statements['type'] = 'statement'
df_statements["date"] = df_statements["Data"].apply(pd.to_datetime, format='%Y-%m-%d %H:%M:%S.%f').dt.strftime('%Y/%m/%d')
df_statements.drop(['Data', 'Id', 'Titulo', 'DataAtualizacao', 'mostrarCabecalho', 'mostrarRodape'], axis=1, inplace=True)

In [97]:
df_statements.rename(columns={'Conteudo': 'text_raw'}, inplace=True)

In [98]:
# df_statements.sort_values('Date', inplace=True)
# df_statements.set_index('Date', inplace=True)
df_statements = df_statements.sort_values('meeting_number').copy()

In [99]:
df_statements['text_raw'] = df_statements['text_raw'].apply(lambda x: BeautifulSoup(x, features="html.parser").get_text())

In [100]:
df_statements.head()

Unnamed: 0,text_raw,meeting_number,type,date
139,"In the March Meeting, the Banco Central do Br...",117,statement,2006/03/08
138,"In the April Meeting, the Monetary Policy Com...",118,statement,2006/04/19
137,"In the May Meeting, the Monetary Policy Commi...",119,statement,2006/05/31
136,"In the July Meeting, the Copom unanimously de...",120,statement,2006/07/19
135,"In the August Meeting, the Copom unanimously ...",121,statement,2006/08/30


In [101]:
# df_statements['Text_raw'][120]

In [102]:
df_statements['text'] = df_statements['text_raw'].apply(cut_voters).apply(preprocess_text)

In [103]:
# df_statements['Text'][120]

In [104]:
df_statements['num_words'] = df_statements['text'].apply(count_words).astype(int)
df_statements['num_words_raw'] = df_statements['text_raw'].apply(count_words).astype(int)

In [105]:
df_statements.tail(5)

Unnamed: 0,text_raw,meeting_number,type,date,text,num_words,num_words_raw
4,"In its 250th meeting, the Copom decided to mai...",250,statement,2022/10/26,"250th meeting , copom decided maintain selic r...",498,759
3,"In its 251st meeting, Copom decided to maintai...",251,statement,2022/12/07,"251st meeting , copom decided maintain selic r...",512,776
2,"In its 252nd meeting, Copom decided to maintai...",252,statement,2023/02/01,"252nd meeting , copom decided maintain selic r...",484,842
1,Since its previous meeting of the Monetary Pol...,253,statement,2023/03/22,since previous meeting monetary policy committ...,444,807
0,The global environment remains challenging. Th...,254,statement,2023/05/03,global environment remains challenging . episo...,436,802


In [106]:
df_statements.shape

(140, 7)

In [107]:
# df_statements.drop(['text_raw', 'num_words_raw'], axis=1, inplace=True)

In [108]:
df_statements

Unnamed: 0,text_raw,meeting_number,type,date,text,num_words,num_words_raw
139,"In the March Meeting, the Banco Central do Br...",117,statement,2006/03/08,"march meeting , banco central brasil 's moneta...",55,67
138,"In the April Meeting, the Monetary Policy Com...",118,statement,2006/04/19,"april meeting , monetary policy committee ( co...",35,40
137,"In the May Meeting, the Monetary Policy Commi...",119,statement,2006/05/31,"may meeting , monetary policy committee ( copo...",35,40
136,"In the July Meeting, the Copom unanimously de...",120,statement,2006/07/19,"july meeting , copom unanimously decided reduc...",30,37
135,"In the August Meeting, the Copom unanimously ...",121,statement,2006/08/30,"august meeting , copom unanimously decided red...",34,44
...,...,...,...,...,...,...,...
4,"In its 250th meeting, the Copom decided to mai...",250,statement,2022/10/26,"250th meeting , copom decided maintain selic r...",498,759
3,"In its 251st meeting, Copom decided to maintai...",251,statement,2022/12/07,"251st meeting , copom decided maintain selic r...",512,776
2,"In its 252nd meeting, Copom decided to maintai...",252,statement,2023/02/01,"252nd meeting , copom decided maintain selic r...",484,842
1,Since its previous meeting of the Monetary Pol...,253,statement,2023/03/22,since previous meeting monetary policy committ...,444,807


### Minutes

In [109]:
df_minutes = pd.read_csv('minutes.csv')

In [110]:
df_minutes.tail(1)

Unnamed: 0,BoxDestaque,DataReferencia,Edicao,EsconderCapa,EsconderDataReferencia,Grafico,Id,ImagemCapa,Introducao,LinkPagina,OutrasInformacoes,Titulo,Url,Volume,conteudo,data
212,,2000-01-10T02:00:00Z,,False,,,1.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,42nd Copom minutes,/content/copom/copomminutes/MIN200042-COPOM200...,,,2000-01-24T02:00:00Z


In [111]:
df_minutes.loc[:,'Titulo']

0                      254th Meeting - May 2-3, 2023
1                  253rd Meeting - March 21-22, 2023
2      252nd Meeting - January 31 - February 1, 2023
3                 251st Meeting - December 6-7, 2022
4                250th Meeting - October 25-26, 2022
                           ...                      
208                               45th Copom minutes
209                               44th Copom minutes
210                               43nd Copom minutes
211                        Changes in Copom meetings
212                               42nd Copom minutes
Name: Titulo, Length: 213, dtype: object

In [112]:
df_minutes['meeting_number'] = df_minutes['Titulo'].apply(extract_meeting_number)

In [113]:
df_minutes.tail()

Unnamed: 0,BoxDestaque,DataReferencia,Edicao,EsconderCapa,EsconderDataReferencia,Grafico,Id,ImagemCapa,Introducao,LinkPagina,OutrasInformacoes,Titulo,Url,Volume,conteudo,data,meeting_number
208,,2000-03-30T03:00:00Z,,False,,,5.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,45th Copom minutes,/content/copom/copomminutes/MIN200045-COPOM200...,,,2000-03-30T03:00:00Z,45.0
209,,2000-02-15T03:00:00Z,,False,,,4.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,44th Copom minutes,/content/copom/copomminutes/MIN200044-COPOM200...,,,2000-02-15T03:00:00Z,44.0
210,,2000-02-04T02:00:00Z,,False,,,3.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,43nd Copom minutes,/content/copom/copomminutes/MIN200043-COPOM200...,,,2000-02-04T02:00:00Z,43.0
211,,2000-01-19T02:00:00Z,,False,,,2.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,Changes in Copom meetings,/content/copom/copomminutes/CHANG200042-COPOM2...,,,2001-08-06T03:00:00Z,
212,,2000-01-10T02:00:00Z,,False,,,1.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,42nd Copom minutes,/content/copom/copomminutes/MIN200042-COPOM200...,,,2000-01-24T02:00:00Z,42.0


In [114]:
df_minutes = df_minutes[df_minutes['meeting_number'].notna()]

In [115]:
df_minutes['meeting_number'] = df_minutes['meeting_number'].astype(int).copy()

In [116]:
df_minutes.tail()

Unnamed: 0,BoxDestaque,DataReferencia,Edicao,EsconderCapa,EsconderDataReferencia,Grafico,Id,ImagemCapa,Introducao,LinkPagina,OutrasInformacoes,Titulo,Url,Volume,conteudo,data,meeting_number
207,,2000-05-04T03:00:00Z,,False,,,6.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,46th Copom minutes,/content/copom/copomminutes/MIN200046-COPOM200...,,,2000-05-04T03:00:00Z,46
208,,2000-03-30T03:00:00Z,,False,,,5.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,45th Copom minutes,/content/copom/copomminutes/MIN200045-COPOM200...,,,2000-03-30T03:00:00Z,45
209,,2000-02-15T03:00:00Z,,False,,,4.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,44th Copom minutes,/content/copom/copomminutes/MIN200044-COPOM200...,,,2000-02-15T03:00:00Z,44
210,,2000-02-04T02:00:00Z,,False,,,3.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,43nd Copom minutes,/content/copom/copomminutes/MIN200043-COPOM200...,,,2000-02-04T02:00:00Z,43
212,,2000-01-10T02:00:00Z,,False,,,1.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,42nd Copom minutes,/content/copom/copomminutes/MIN200042-COPOM200...,,,2000-01-24T02:00:00Z,42


In [117]:
df_minutes['type'] = 'minutes'
# df_minutes["Minutes_Date"] = df_minutes["data"].apply(pd.to_datetime).dt.strftime('%Y/%m/%d')
df_minutes["date"] = df_minutes["DataReferencia"].apply(pd.to_datetime).dt.strftime('%Y/%m/%d')

In [118]:
df_minutes.tail()

Unnamed: 0,BoxDestaque,DataReferencia,Edicao,EsconderCapa,EsconderDataReferencia,Grafico,Id,ImagemCapa,Introducao,LinkPagina,OutrasInformacoes,Titulo,Url,Volume,conteudo,data,meeting_number,type,date
207,,2000-05-04T03:00:00Z,,False,,,6.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,46th Copom minutes,/content/copom/copomminutes/MIN200046-COPOM200...,,,2000-05-04T03:00:00Z,46,minutes,2000/05/04
208,,2000-03-30T03:00:00Z,,False,,,5.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,45th Copom minutes,/content/copom/copomminutes/MIN200045-COPOM200...,,,2000-03-30T03:00:00Z,45,minutes,2000/03/30
209,,2000-02-15T03:00:00Z,,False,,,4.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,44th Copom minutes,/content/copom/copomminutes/MIN200044-COPOM200...,,,2000-02-15T03:00:00Z,44,minutes,2000/02/15
210,,2000-02-04T02:00:00Z,,False,,,3.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,43nd Copom minutes,/content/copom/copomminutes/MIN200043-COPOM200...,,,2000-02-04T02:00:00Z,43,minutes,2000/02/04
212,,2000-01-10T02:00:00Z,,False,,,1.0,/content/publications/PublishingImages/Capas/c...,The minutes of the Monetary Policy Committee’s...,/en/publications/copomminutes/cronologicos,,42nd Copom minutes,/content/copom/copomminutes/MIN200042-COPOM200...,,,2000-01-24T02:00:00Z,42,minutes,2000/01/10


In [119]:
df_minutes.drop(['BoxDestaque', 'DataReferencia', 'Edicao', 'EsconderCapa', 
                 'EsconderDataReferencia', 'Grafico', 'Id', 
                 'ImagemCapa', 'Introducao', 'LinkPagina', 'Titulo',
                 'Url', 'Volume', 'conteudo', 'data'], axis=1, inplace=True)

In [120]:
df_minutes.rename(columns={'OutrasInformacoes': 'text_raw'}, inplace=True)

In [121]:
# df_minutes.sort_values('Date', inplace=True)
# df_minutes.set_index('Date', inplace=True)
df_minutes = df_minutes.sort_values('meeting_number').copy()

In [122]:
df_minutes = df_minutes[df_minutes['text_raw'].notna()]

In [123]:
df_minutes

Unnamed: 0,text_raw,meeting_number,type,date
22,"<div id=""atacompleta""><div id=""ataconteudo""><p...",232,minutes,2020/08/05
21,"<div id=""atacompleta""><div id=""ataconteudo""><h...",233,minutes,2020/09/16
20,"<div id=""atacompleta""><div id=""ataconteudo""><h...",234,minutes,2020/10/28
19,"<div id=""atacompleta""><div id=""ataconteudo""><h...",235,minutes,2020/12/09
18,"<div id=""atacompleta""><div id=""ataconteudo""><h...",236,minutes,2021/01/20
17,"<div id=""atacompleta""><div id=""ataconteudo""><h...",237,minutes,2021/03/17
16,"<div id=""atacompleta""><div id=""ataconteudo""><h...",238,minutes,2021/05/05
15,"<div id=""atacompleta""><div id=""ataconteudo""><h...",239,minutes,2021/06/16
14,"<div id=""atacompleta""><div id=""ataconteudo""><h...",240,minutes,2021/08/04
13,"<div id=""atacompleta""><div id=""ataconteudo""><h...",241,minutes,2021/09/22


In [124]:
df_minutes['text_raw'] = df_minutes['text_raw'].apply(lambda x: BeautifulSoup(x, features="html.parser").get_text())

In [125]:
df_minutes['text'] = df_minutes['text_raw'].apply(cut_voters)

In [126]:
df_minutes['num_words'] = df_minutes['text'].apply(count_words).astype(int)
df_minutes['num_words_raw'] = df_minutes['text_raw'].apply(count_words).astype(int)

In [127]:
df_minutes.head(5)

Unnamed: 0,text_raw,meeting_number,type,date,text,num_words,num_words_raw
22,"1. Regarding the global outlook, the Covid-19 ...",232,minutes,2020/08/05,"1. Regarding the global outlook, the Covid-19 ...",1811,2114
21,A) Update of economic outlook and Copom’s base...,233,minutes,2020/09/16,A) Update of economic outlook and Copom’s base...,1750,2049
20,A) Update of economic outlook and Copom’s base...,234,minutes,2020/10/28,A) Update of economic outlook and Copom’s base...,1563,1887
19,A) Update of economic outlook and Copom’s base...,235,minutes,2020/12/09,A) Update of economic outlook and Copom’s base...,1529,1861
18,A) Update of economic outlook and Copom’s base...,236,minutes,2021/01/20,A) Update of economic outlook and Copom’s base...,1478,1810


In [128]:
df_minutes.shape

(23, 7)

In [129]:
# df_minutes.drop(['text_raw', 'num_words_raw'], axis=1, inplace=True)

In [130]:
df_minutes.head()

Unnamed: 0,text_raw,meeting_number,type,date,text,num_words,num_words_raw
22,"1. Regarding the global outlook, the Covid-19 ...",232,minutes,2020/08/05,"1. Regarding the global outlook, the Covid-19 ...",1811,2114
21,A) Update of economic outlook and Copom’s base...,233,minutes,2020/09/16,A) Update of economic outlook and Copom’s base...,1750,2049
20,A) Update of economic outlook and Copom’s base...,234,minutes,2020/10/28,A) Update of economic outlook and Copom’s base...,1563,1887
19,A) Update of economic outlook and Copom’s base...,235,minutes,2020/12/09,A) Update of economic outlook and Copom’s base...,1529,1861
18,A) Update of economic outlook and Copom’s base...,236,minutes,2021/01/20,A) Update of economic outlook and Copom’s base...,1478,1810


### Decisions

In [131]:
df_decisions = pd.read_csv('decisions.csv')

In [132]:
df_decisions.loc[:,'NumeroReuniaoCopom']

0      254.0
1      253.0
2      252.0
3      251.0
4      250.0
       ...  
257      5.0
258      4.0
259      3.0
260      2.0
261      1.0
Name: NumeroReuniaoCopom, Length: 262, dtype: float64

In [133]:
df_decisions['NumeroReuniaoCopom'].value_counts()

34.0     4
33.0     3
48.0     2
35.0     2
45.0     2
        ..
165.0    1
164.0    1
163.0    1
162.0    1
1.0      1
Name: NumeroReuniaoCopom, Length: 254, dtype: int64

In [134]:
df_decisions.drop(['DataFimVigencia', 'DataInicioVigencia', 'TaxaSelicEfetivaAnualizada',
                 'TaxaSelicEfetivaVigencia', 'TaxaTban', 'UsoMetaSelic', 'ReuniaoExtraordinaria',
                 'Vies'], axis=1, inplace=True)

In [135]:
df_decisions["DataReuniaoCopom"] = df_decisions["DataReuniaoCopom"].apply(pd.to_datetime).dt.strftime('%Y/%m/%d')

In [136]:
df_decisions.rename(columns={'DataReuniaoCopom': 'date', 
                             'MetaSelic': 'selic', 
                             'NumeroReuniaoCopom': 'meeting_number'
                            }, inplace=True)

In [137]:
df_decisions['meeting_number'] = df_decisions['meeting_number'].astype(int).copy()

In [138]:
df_decisions

Unnamed: 0,date,selic,meeting_number
0,2023/05/03,13.75,254
1,2023/03/22,13.75,253
2,2023/02/01,13.75,252
3,2022/12/07,13.75,251
4,2022/10/26,13.75,250
...,...,...,...
257,1996/10/23,1.78,5
258,1996/09/23,1.82,4
259,1996/08/21,1.88,3
260,1996/07/30,1.90,2


In [139]:
# df_decisions = df_decisions[df_decisions['Date'] != 'NaT']

In [140]:
# df_decisions = df_decisions.sort_values('Date').copy()
# df_decisions.set_index('Date', inplace=True)
df_decisions = df_decisions.sort_values('meeting_number').copy()

In [141]:
df_decisions['decision'] = np.nan
df_decisions['decision_txt'] = ''

In [142]:
df_decisions['meeting_number'].count()

262

In [143]:
for i in range(len(df_decisions)-1):
    df_decisions.iloc[i+1,3] = df_decisions.iloc[i+1,1] - df_decisions.iloc[i,1]

In [144]:
def decision_txt(num):
    if num == 0:
        return 'mantain'
    elif num >= 0:
        return 'increase'
    elif num <= 0:
        return 'decrease'
    else:
        return ''

In [145]:
for i in range(len(df_decisions)-1):
    df_decisions.iloc[i+1, 4] = decision_txt(df_decisions.iloc[i+1, 3])

In [146]:
df_decisions.head()

Unnamed: 0,date,selic,meeting_number,decision,decision_txt
261,1996/06/26,1.9,1,,
260,1996/07/30,1.9,2,0.0,mantain
259,1996/08/21,1.88,3,-0.02,decrease
258,1996/09/23,1.82,4,-0.06,decrease
257,1996/10/23,1.78,5,-0.04,decrease


### Labels

In [147]:
df_decisions['label_hawk_dove'] = None
df_decisions['label_next_meet'] = None

In [148]:
i = 0

for i in range(len(df_decisions)-1):
    decisao_atual = df_decisions.iloc[i, 3]
    decisao_seguite =  df_decisions.iloc[i+1, 3]
    decisao_seguite_txt = df_decisions.iloc[i+1, 4]    
        
    if decisao_seguite_txt == 'decrease':
        if decisao_seguite == decisao_atual:
            resultado = 'dovish'
        elif decisao_seguite > decisao_atual:
            resultado = 'dovish' # less dovish
        else:
            resultado = 'dovish' # more dovish
    elif decisao_seguite_txt == 'increase':
        if decisao_seguite == decisao_atual:
            resultado = 'hawkish'
        elif decisao_seguite > decisao_atual:
            resultado = 'hawkish' # more hawkish
        else:
            resultado = 'hawkish' # less hawkish
    elif decisao_seguite_txt == 'mantain':
        if decisao_atual < 0: 
            resultado = 'hawkish' # parou de cortar
        elif decisao_atual > 0:
            resultado = 'dovish' # parou de elevar
        else:
            resultado = 'neutral' # manteve
    
    df_decisions.iloc[i, 5] = resultado 

In [149]:
df_decisions.head()

Unnamed: 0,date,selic,meeting_number,decision,decision_txt,label_hawk_dove,label_next_meet
261,1996/06/26,1.9,1,,,neutral,
260,1996/07/30,1.9,2,0.0,mantain,dovish,
259,1996/08/21,1.88,3,-0.02,decrease,dovish,
258,1996/09/23,1.82,4,-0.06,decrease,dovish,
257,1996/10/23,1.78,5,-0.04,decrease,dovish,


In [150]:
for i in range(len(df_decisions)-1):
    df_decisions.iloc[i, 6] = df_decisions.iloc[i+1, 4]

In [151]:
df_decisions.head()

Unnamed: 0,date,selic,meeting_number,decision,decision_txt,label_hawk_dove,label_next_meet
261,1996/06/26,1.9,1,,,neutral,mantain
260,1996/07/30,1.9,2,0.0,mantain,dovish,decrease
259,1996/08/21,1.88,3,-0.02,decrease,dovish,decrease
258,1996/09/23,1.82,4,-0.06,decrease,dovish,decrease
257,1996/10/23,1.78,5,-0.04,decrease,dovish,decrease


### Merge columns

In [152]:
df1 = pd.merge(df_statements, df_minutes, how="outer")

In [153]:
df1 = df1.sort_values('meeting_number').copy()

In [154]:
df1

Unnamed: 0,text_raw,meeting_number,type,date,text,num_words,num_words_raw
0,"In the March Meeting, the Banco Central do Br...",117,statement,2006/03/08,"march meeting , banco central brasil 's moneta...",55,67
1,"In the April Meeting, the Monetary Policy Com...",118,statement,2006/04/19,"april meeting , monetary policy committee ( co...",35,40
2,"In the May Meeting, the Monetary Policy Commi...",119,statement,2006/05/31,"may meeting , monetary policy committee ( copo...",35,40
3,"In the July Meeting, the Copom unanimously de...",120,statement,2006/07/19,"july meeting , copom unanimously decided reduc...",30,37
4,"In the August Meeting, the Copom unanimously ...",121,statement,2006/08/30,"august meeting , copom unanimously decided red...",34,44
...,...,...,...,...,...,...,...
137,"In its 252nd meeting, Copom decided to maintai...",252,statement,2023/02/01,"252nd meeting , copom decided maintain selic r...",484,842
161,A) Update of economic outlook and Copom’s scen...,253,minutes,2023/03/22,A) Update of economic outlook and Copom’s scen...,2748,3230
138,Since its previous meeting of the Monetary Pol...,253,statement,2023/03/22,since previous meeting monetary policy committ...,444,807
139,The global environment remains challenging. Th...,254,statement,2023/05/03,global environment remains challenging . episo...,436,802


In [155]:
df2 = pd.merge(df1, df_decisions, how="outer", on=['meeting_number'])

In [156]:
df = df2.sort_values('meeting_number').copy()

In [157]:
df.head()

Unnamed: 0,text_raw,meeting_number,type,date_x,text,num_words,num_words_raw,date_y,selic,decision,decision_txt,label_hawk_dove,label_next_meet
163,,1,,,,,,1996/06/26,1.9,,,neutral,mantain
164,,2,,,,,,1996/07/30,1.9,0.0,mantain,dovish,decrease
165,,3,,,,,,1996/08/21,1.88,-0.02,decrease,dovish,decrease
166,,4,,,,,,1996/09/23,1.82,-0.06,decrease,dovish,decrease
167,,5,,,,,,1996/10/23,1.78,-0.04,decrease,dovish,decrease


In [158]:
df = df[df['selic'].notna()]

In [159]:
df = df[df['text'].notna()]

In [160]:
df = df[['meeting_number', 'date_x', 'date_y', 'selic', 'decision', 'decision_txt', 
         'type', 'text', 'num_words', 'text_raw', 'num_words_raw', 'label_hawk_dove', 'label_next_meet']]

In [161]:
df

Unnamed: 0,meeting_number,date_x,date_y,selic,decision,decision_txt,type,text,num_words,text_raw,num_words_raw,label_hawk_dove,label_next_meet
0,117,2006/03/08,2006/03/08,16.50,-0.75,decrease,statement,"march meeting , banco central brasil 's moneta...",55.0,"In the March Meeting, the Banco Central do Br...",67.0,dovish,decrease
1,118,2006/04/19,2006/04/19,15.75,-0.75,decrease,statement,"april meeting , monetary policy committee ( co...",35.0,"In the April Meeting, the Monetary Policy Com...",40.0,dovish,decrease
2,119,2006/05/31,2006/05/31,15.25,-0.50,decrease,statement,"may meeting , monetary policy committee ( copo...",35.0,"In the May Meeting, the Monetary Policy Commi...",40.0,dovish,decrease
3,120,2006/07/19,2006/07/19,14.75,-0.50,decrease,statement,"july meeting , copom unanimously decided reduc...",30.0,"In the July Meeting, the Copom unanimously de...",37.0,dovish,decrease
4,121,2006/08/30,2006/08/30,14.25,-0.50,decrease,statement,"august meeting , copom unanimously decided red...",34.0,"In the August Meeting, the Copom unanimously ...",44.0,dovish,decrease
...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,252,2023/02/01,2023/02/01,13.75,0.00,mantain,statement,"252nd meeting , copom decided maintain selic r...",484.0,"In its 252nd meeting, Copom decided to maintai...",842.0,neutral,mantain
159,253,2023/03/22,2023/03/22,13.75,0.00,mantain,minutes,A) Update of economic outlook and Copom’s scen...,2748.0,A) Update of economic outlook and Copom’s scen...,3230.0,neutral,mantain
160,253,2023/03/22,2023/03/22,13.75,0.00,mantain,statement,since previous meeting monetary policy committ...,444.0,Since its previous meeting of the Monetary Pol...,807.0,neutral,mantain
162,254,2023/05/03,2023/05/03,13.75,0.00,mantain,minutes,A) Update of economic outlook and Copom’s scen...,2501.0,A) Update of economic outlook and Copom’s scen...,2977.0,,


In [162]:
df.to_csv('df_copom_label_en.csv', index=True)