# Deep

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df_copom = pd.read_csv('df_copom_label.csv')

In [3]:
df_copom.head()

Unnamed: 0,Date,Selic,Meeting_Number,Decision,Decision_txt,label_hawk_dove,label_next_meet,Text,Type
0,2006/03/08,16.5,117.0,-0.75,decrease,dovish,decrease,"In the March Meeting, the Banco Central do Br...",Statement
1,2006/04/19,15.75,118.0,-0.75,decrease,dovish,decrease,"In the April Meeting, the Monetary Policy Com...",Statement
2,2006/05/31,15.25,119.0,-0.5,decrease,dovish,decrease,"In the May Meeting, the Monetary Policy Commi...",Statement
3,2006/07/19,14.75,120.0,-0.5,decrease,dovish,decrease,"In the July Meeting, the Copom unanimously de...",Statement
4,2006/08/30,14.25,121.0,-0.5,decrease,dovish,decrease,"In the August Meeting, the Copom unanimously ...",Statement


In [4]:
texts = df_copom['Text'].tolist()
labels = df_copom['label_hawk_dove'].tolist()

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Create a tokenizer object
tokenizer = Tokenizer()

# Fit the tokenizer on the text data
tokenizer.fit_on_texts(texts)
tokenizer.fit_on_texts(labels)

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence

In [17]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Tokenize the text and convert to sequences (using the same tokenizer as before)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

sequences_train = [torch.tensor(seq) for seq in sequences_train]
sequences_test = [torch.tensor(seq) for seq in sequences_test]

# Pad sequences to have the same length
padded_sequences_train = pad_sequence(sequences_train, batch_first=True)
padded_sequences_test = pad_sequence(sequences_test, batch_first=True)

# Convert the padded sequences back to numpy arrays if needed
padded_sequences_train = padded_sequences_train.numpy()
padded_sequences_test = padded_sequences_test.numpy()

# Convert labels to one-hot encoded tensors
label_classes = list(set(labels))
num_classes = len(label_classes)

label_to_index = {label: index for index, label in enumerate(label_classes)}
index_to_label = {index: label for label, index in label_to_index.items()}

labels_encoded_train = np.array([label_to_index[label] for label in y_train])
labels_encoded_test = np.array([label_to_index[label] for label in y_test])

labels_one_hot_train = torch.eye(num_classes)[labels_encoded_train]
labels_one_hot_test = torch.eye(num_classes)[labels_encoded_test]

# Convert data to PyTorch tensors and create DataLoader for training and testing datasets
X_train_tensor = torch.from_numpy(padded_sequences_train).long()
y_train_tensor = torch.tensor(labels_one_hot_train).float()

X_test_tensor = torch.from_numpy(padded_sequences_test).long()
y_test_tensor = torch.tensor(labels_one_hot_test).float()

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)



In [18]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        output = self.fc(hidden.squeeze(0))
        return output


In [19]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
hidden_dim = 100
output_dim = num_classes

model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())


In [None]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.argmax(dim=1))
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for inputs, labels in test_loader:
            outputs = model(inputs)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels.argmax(dim=1)).sum().item()

    accuracy = correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}], Test Accuracy: {accuracy}")


Epoch [1/10], Test Accuracy: 0.375
Epoch [2/10], Test Accuracy: 0.40625
Epoch [3/10], Test Accuracy: 0.21875
Epoch [4/10], Test Accuracy: 0.21875
Epoch [5/10], Test Accuracy: 0.21875
Epoch [6/10], Test Accuracy: 0.21875
