# **RNN Model Notebook**

@authors: miguelrocha and Grupo 03

In [45]:
# Notebook Imports
import numpy as np
import pandas as pd
import re
from collections import Counter
import pickle
import random
import time
import os
import requests
import zipfile

from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix
from helpers.dataset import Dataset
from helpers.activation import TanhActivation
from helpers.losses import BinaryCrossEntropy
from helpers.metrics import accuracy
from helpers.activation import ReLUActivation
from models.rnn_model import RNN

**Modifica√ß√£o na classe Optimizer**

In [46]:
class Optimizer:
    def __init__(self, learning_rate=0.01, momentum=0.9):
        self.velocity = {}  # Dicion√°rio para armazenar velocidades dos gradientes
        self.learning_rate = learning_rate
        self.momentum = momentum

    def update(self, param, grad):
        """Atualiza os pesos usando Gradient Descent com Momentum"""

        param_id = id(param)  # Usar ID √∫nico do numpy array

        if param_id not in self.velocity:
            self.velocity[param_id] = np.zeros_like(grad)

        # Atualiza√ß√£o com momentum
        self.velocity[param_id] = self.momentum * self.velocity[param_id] + (1 - self.momentum) * grad
        return param - self.learning_rate * self.velocity[param_id]  # Retorna os novos pesos



### **Tratamento de Dados**

**An√°lise Inicial dos Datasets e Jun√ß√£o dos mesmos para tratamento simult√¢neo**

In [47]:
# Definir os caminhos dos arquivos de TREINO
input_csv1 = "../tarefa_1/test_input_dataset/merged_inputs.csv"
output_csv1 = "../tarefa_1/test_output_dataset/merged_outputs.csv"

# Definir os caminhos dos arquivos de TESTE FINAL
input_csv2 = "../tarefa_2/classify_input_datasets/submission3_inputs.csv"
output_csv2 = "../tarefa_2/layout_datasets/submission3_layout_outputs.csv" # dataset apenas utilizado para adicionar o layout ID Label
 
# Carregar os datasets de treino
df_input1 = pd.read_csv(input_csv1, sep="\t")  
df_output1 = pd.read_csv(output_csv1, sep="\t")

# Carregar os datasets de teste
df_input2 = pd.read_csv(input_csv2, sep="\t")
df_output2 = pd.read_csv(output_csv2, sep="\t")

# Jun√ß√£o com coluna ID
df_train = pd.merge(df_input1, df_output1, on="ID")
df_test = pd.merge(df_input2, df_output2, on="ID")

# Concatenar treino e teste para aplicar as altera√ß√µes simultaneamente
df_dataset1_merged = pd.concat([df_train, df_test], ignore_index=True)

# Mostrar as primeiras 5 linhas do dataset completo
print("\nDataset Completo - Primeiras 5 linhas:")
print(df_dataset1_merged.head())

print("\nDataset Completo - Ultimas 5 linhas:")
print(df_dataset1_merged.tail())


Dataset Completo - Primeiras 5 linhas:
     ID                                               Text  Label
0  3035  We present a general numerical scheme for the ...  Human
1   925  The present paper aims at introducing the inno...  Human
2   470  This research paper investigates the phenomeno...     AI
3  2060  This research paper explores the concept of th...     AI
4  3167  The paper explores the concept of reference fr...     AI

Dataset Completo - Ultimas 5 linhas:
          ID                                               Text Label
1979   D3-96  The relationship between Darwin's theory of ev...   Nap
1980   D3-97  Charles Darwin's historic visit to the Gal√°pag...   Nap
1981   D3-98  The Gal√°pagos Islands (Spanish: Islas Gal√°pago...   Nap
1982   D3-99  The Galapagos Islands played a pivotal role in...   Nap
1983  D3-100  The Galapagos‚Äô natural environment was substan...   Nap


**Remover caracteres especiais e pontua√ß√£o e Converter em min√∫sculas**

In [48]:
# Fun√ß√£o para limpar texto
def clean_text(text):
    text = text.lower()  # Converter para min√∫sculas
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remover pontua√ß√£o
    return text

df_dataset1_merged["clean_text"] = df_dataset1_merged["Text"].apply(clean_text)

# Manter apenas as colunas desejadas e renomear clean_text para Text
df_dataset1_merged = df_dataset1_merged[["ID", "clean_text", "Label"]].rename(columns={"clean_text": "Text"})

print("Texto limpo - primeiras 5 linhas:")
print(df_dataset1_merged.head())

Texto limpo - primeiras 5 linhas:
     ID                                               Text  Label
0  3035  we present a general numerical scheme for the ...  Human
1   925  the present paper aims at introducing the inno...  Human
2   470  this research paper investigates the phenomeno...     AI
3  2060  this research paper explores the concept of th...     AI
4  3167  the paper explores the concept of reference fr...     AI


**Remover stopwords**

In [49]:
# Lista de stopwords comuns
stopwords = {
    "the", "of", "and", "in", "to", "is", "a", "that", "for", "are", "on", "with", 
    "as", "at", "by", "from", "this", "it", "an", "be", "or", "which", "was", "were"
}

# Fun√ß√£o para remover stopwords
def remove_stopwords(text):
    words = text.split()  # Dividir em palavras
    filtered_words = [word for word in words if word not in stopwords]  # Remover stopwords
    return " ".join(filtered_words)  # Juntar as palavras de novo

# Aplicar ao dataset
df_dataset1_merged["Text"] = df_dataset1_merged["Text"].apply(remove_stopwords)

# Exibir as primeiras 5 linhas ap√≥s remo√ß√£o de stopwords
print("Texto sem stopwords - primeiras 5 linhas:")
print(df_dataset1_merged.head())



Texto sem stopwords - primeiras 5 linhas:
     ID                                               Text  Label
0  3035  we present general numerical scheme practical ...  Human
1   925  present paper aims introducing innovative tech...  Human
2   470  research paper investigates phenomenon softeni...     AI
3  2060  research paper explores concept accelerated ex...     AI
4  3167  paper explores concept reference frames specia...     AI


**Criar Embeddings e Label Encoding**

In [50]:
# Mapear labels para valores num√©ricos
label_map = {"Human": 0, "AI": 1}
df_dataset1_merged["Label"] = df_dataset1_merged["Label"].map(label_map)

# Carregar o GloVe
EMBEDDING_DIM = 50  # Dimens√£o do embedding

# Diret√≥rio e nome do ficheiro GloVe
glove_dir = "helpers"
glove_filename = "glove.6B.50d.txt"
glove_zip_url = "http://nlp.stanford.edu/data/glove.6B.zip"  # URL do GloVe oficial

# Criar diret√≥rio se n√£o existir
os.makedirs(glove_dir, exist_ok=True)

# Caminho completo do ficheiro
glove_path = os.path.join(glove_dir, glove_filename)
glove_zip_path = os.path.join(glove_dir, "glove.6B.zip")

# Verificar se o ficheiro j√° existe
if not os.path.exists(glove_path):
    print("Ficheiro GloVe n√£o encontrado. A fazer download...")

    # Download do ficheiro ZIP do GloVe
    response = requests.get(glove_zip_url, stream=True)
    with open(glove_zip_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    
    print("Download conclu√≠do. A extrair ficheiros...")

    # Extrair apenas o ficheiro necess√°rio
    with zipfile.ZipFile(glove_zip_path, "r") as zip_ref:
        zip_ref.extract(glove_filename, path=glove_dir)

    print("Extra√ß√£o conclu√≠da!")

# Agora podemos carregar o GloVe
embedding_dict = {}
with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.array(values[1:], dtype="float32")
        embedding_dict[word] = vector

print(f"Total de palavras carregadas do GloVe: {len(embedding_dict)}")

# Converter palavras para embeddings
def text_to_embedding(text, embedding_dict, embedding_dim=50):
    words = text.split()
    embeddings = [embedding_dict.get(word, np.zeros(embedding_dim)) for word in words]  # Usa vetor do GloVe ou vetor zerado
    
    # Se a lista estiver vazia, retorna um vetor de zeros
    if len(embeddings) == 0:
        embeddings = [np.zeros(embedding_dim)]

    return embeddings

df_dataset1_merged["Embedding"] = df_dataset1_merged["Text"].apply(lambda x: text_to_embedding(x, embedding_dict, EMBEDDING_DIM))


Total de palavras carregadas do GloVe: 400000


**Padronizar o comprimento das sequ√™ncias**


In [51]:
# Padronizar comprimento das sequ√™ncias
MAX_SEQUENCE_LENGTH = 130  # foram testados v√°rios valores sendo o melhor 130

def pad_embedding_sequence(seq, max_length, embedding_dim):
    seq = np.array(seq)  # Garante que a sequ√™ncia √© um array NumPy
    
    if seq.shape[0] == 0:  # Se for uma sequ√™ncia vazia, criar um array de zeros
        seq = np.zeros((1, embedding_dim))

    if seq.shape[0] > max_length:  # Truncar se for maior
        return seq[:max_length]
    
    padding = np.zeros((max_length - seq.shape[0], embedding_dim))  # Criar padding
    return np.vstack([seq, padding])  # Adicionar padding no final

# Aplicar padding √†s sequ√™ncias de embeddings
df_dataset1_merged["Embedding"] = df_dataset1_merged["Embedding"].apply(
    lambda x: pad_embedding_sequence(x, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
)

# Converter para array NumPy para alimentar o modelo
X = np.array(df_dataset1_merged["Embedding"].tolist())
y = np.array(df_dataset1_merged["Label"])  # Labels num√©ricos

print("Formato final dos dados para o modelo:", X.shape)  # Deve ser (n_amostras, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)

# Manter apenas as colunas desejadas e renomear "Embedding" para "Text"
df_dataset1_merged = df_dataset1_merged[["ID", "Embedding", "Label"]].rename(columns={"Embedding": "Text"})

print("Dataset ap√≥s embedding - primeiras 5 linhas:")
print(df_dataset1_merged.head())


Formato final dos dados para o modelo: (1984, 130, 50)
Dataset ap√≥s embedding - primeiras 5 linhas:
     ID                                               Text  Label
0  3035  [[0.5738700032234192, -0.32728999853134155, 0....    0.0
1   925  [[0.7249799966812134, 0.5722399950027466, -0.2...    0.0
2   470  [[0.7125800251960754, 0.6449199914932251, 0.05...    1.0
3  2060  [[0.7125800251960754, 0.6449199914932251, 0.05...    1.0
4  3167  [[-0.7121599912643433, 0.028648000210523605, 0...    1.0


**Normaliza√ß√£o dos Embeddings**


In [52]:
# Fun√ß√£o para normalizar cada embedding (zero mean, unit variance)
def normalize_embedding(emb):
    mean = np.mean(emb, axis=0)  # M√©dia por dimens√£o do embedding
    std = np.std(emb, axis=0) + 1e-8  # Desvio padr√£o (evita divis√£o por zero)
    return (emb - mean) / std

# Aplicar normaliza√ß√£o alternativa aos embeddings
df_dataset1_merged["Text"] = df_dataset1_merged["Text"].apply(normalize_embedding)

# Converter para array NumPy para treinar o modelo
X = np.array(df_dataset1_merged["Text"].tolist())
y = np.array(df_dataset1_merged["Label"])  # Labels num√©ricos

print("Formato final dos dados para o modelo:", X.shape)  # Deve ser (n_amostras, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)

# Print do dataset atualizado
print("\nDataset ap√≥s normaliza√ß√£o dos embeddings:")
print(df_dataset1_merged.head())


Formato final dos dados para o modelo: (1984, 130, 50)

Dataset ap√≥s normaliza√ß√£o dos embeddings:
     ID                                               Text  Label
0  3035  [[0.6229451367824401, -0.6616349872681468, 0.2...    0.0
1   925  [[1.3730016267611846, 1.4058269465314142, -0.6...    0.0
2   470  [[0.9887738921291833, 1.5442610704979023, 0.16...    1.0
3  2060  [[1.5017445407917538, 1.7339764749495419, 0.17...    1.0
4  3167  [[-2.187671688541481, -0.14573981075688355, 0....    1.0


**Drop da coluna ID**

In [53]:
if "ID" in df_dataset1_merged.columns:
    df_dataset1_merged = df_dataset1_merged.drop(columns=["ID"])

print("Formato final dos dados para o modelo:", X.shape)  # Deve ser (n_amostras, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)

# Print do dataset atualizado
print("\nDataset ap√≥s drop:")
print(df_dataset1_merged.head())

Formato final dos dados para o modelo: (1984, 130, 50)

Dataset ap√≥s drop:
                                                Text  Label
0  [[0.6229451367824401, -0.6616349872681468, 0.2...    0.0
1  [[1.3730016267611846, 1.4058269465314142, -0.6...    0.0
2  [[0.9887738921291833, 1.5442610704979023, 0.16...    1.0
3  [[1.5017445407917538, 1.7339764749495419, 0.17...    1.0
4  [[-2.187671688541481, -0.14573981075688355, 0....    1.0


**Divis√£o do Dataset**

Dataset de Treino:

- 70% : Treino
- 15% : Valida√ß√£o
- 15% : Teste

Dataset de Avalia√ß√£o:

- 100% : Teste Final


In [54]:
# Definir seed global para garantir reprodutibilidade
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

######################################################### dataset de teste
# Separar as √∫ltimas linhas para avalia√ß√£o final
df_eval_final = df_dataset1_merged.tail(100)

# Remover essas linhas do dataset antes de embaralhar
df_remaining = df_dataset1_merged.iloc[:-100]
#########################################################

# Embaralhar o dataset restante
df_remaining = df_remaining.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Definir propor√ß√µes de treino (70%), valida√ß√£o (15%) e teste (15%)
train_ratio = 0.7
val_ratio = 0.15  # 15% valida√ß√£o
test_ratio = 0.15  # 15% teste

# Definir √≠ndices para divis√£o
train_index = int(len(df_remaining) * train_ratio)
val_index = train_index + int(len(df_remaining) * val_ratio)

# Separar os conjuntos de treino, valida√ß√£o e teste
df_train = df_remaining.iloc[:train_index]
df_val = df_remaining.iloc[train_index:val_index]
df_test = df_remaining.iloc[val_index:]

# Print dos tamanhos dos datasets
print(f"Tamanho do conjunto de treino: {df_train.shape}")
print(f"Tamanho do conjunto de valida√ß√£o: {df_val.shape}")
print(f"Tamanho do conjunto de teste: {df_test.shape}")
print(f"Tamanho do conjunto de avalia√ß√£o final: {df_eval_final.shape}")

# Converter para arrays NumPy
X_train, y_train = np.array(df_train["Text"].tolist()), np.array(df_train["Label"])
X_val, y_val = np.array(df_val["Text"].tolist()), np.array(df_val["Label"])
X_test, y_test = np.array(df_test["Text"].tolist()), np.array(df_test["Label"])
X_eval_final, y_eval_final = np.array(df_eval_final["Text"].tolist()), np.array(df_eval_final["Label"])

# Print dos formatos dos dados
print(f"Formato dos dados:")
print(f"   Treino: {X_train.shape}")
print(f"   Valida√ß√£o: {X_val.shape}")
print(f"   Teste: {X_test.shape}")
print(f"   Avalia√ß√£o final: {X_eval_final.shape}")



Tamanho do conjunto de treino: (1318, 2)
Tamanho do conjunto de valida√ß√£o: (282, 2)
Tamanho do conjunto de teste: (284, 2)
Tamanho do conjunto de avalia√ß√£o final: (100, 2)
Formato dos dados:
   Treino: (1318, 130, 50)
   Valida√ß√£o: (282, 130, 50)
   Teste: (284, 130, 50)
   Avalia√ß√£o final: (100, 130, 50)


**Verifica√ß√£o Final do Dataset**

In [55]:
print("\n Primeiras 5 entradas do conjunto de TREINO:")
print(df_train.head())

print("\n Primeiras 5 entradas do conjunto de VALIDA√á√ÉO:")
print(df_val.head())

print("\n Primeiras 5 entradas do conjunto de TESTE:")
print(df_test.head())

print("\n Primeiras 5 entradas do conjunto de AVALIA√á√ÉO FINAL:")
print(df_eval_final.head())



 Primeiras 5 entradas do conjunto de TREINO:
                                                Text  Label
0  [[1.2648330130038024, -1.3120997287976162, 0.0...    0.0
1  [[1.923546133922557, -2.3442699269752563, 0.43...    0.0
2  [[1.3654664319093903, 2.0011569695179863, 0.10...    1.0
3  [[1.3464225567418249, 1.2965997871220445, 0.15...    1.0
4  [[1.3212941873089101, 1.9241303852529341, -1.3...    1.0

 Primeiras 5 entradas do conjunto de VALIDA√á√ÉO:
                                                   Text  Label
1318  [[1.632567589969771, 1.3259334388866952, 0.095...    1.0
1319  [[-1.9020010963581078, 0.13419086271709868, 0....    1.0
1320  [[-0.5495222804483586, -0.04835099471844644, -...    0.0
1321  [[1.0487307372902497, 1.4485013158732332, -0.1...    1.0
1322  [[0.38219893342825073, -0.3728841095297984, -1...    0.0

 Primeiras 5 entradas do conjunto de TESTE:
                                                   Text  Label
1600  [[0.5849216259806982, 1.0337079910141158, 0.02...  

### **Constru√ß√£o do modelo RNN com c√≥digo raiz (Sem TensorFlow/SKLearn)**

**Inicializa√ß√£o de Pesos**

Antes de tudo, vamos definir os pesos da rede:

- W_xh: Pesa a entrada para os neur√¥nios recorrentes.
- W_hh: Pesa as conex√µes recorrentes.
- W_hy: Pesa a sa√≠da do neur√¥nio recorrente para a previ√ß√£o final.
- b_h e b_y: Bias da camada oculta e da sa√≠da.

In [56]:
# Definir hiperpar√¢metros
input_size = 50    # Dimens√£o dos embeddings
hidden_size = 64   # N√∫mero de neur√¥nios na camada oculta
output_size = 1    # Sa√≠da bin√°ria (0 ou 1)
learning_rate = 0.01  

# Inicializar pesos
np.random.seed(42)  # Para reprodutibilidade
W_xh = np.random.randn(input_size, hidden_size) * 0.01  # Pesos da entrada para a camada oculta
W_hh = np.random.randn(hidden_size, hidden_size) * 0.01 # Pesos da camada oculta para ela mesma
W_hy = np.random.randn(hidden_size, output_size) * 0.01 # Pesos da camada oculta para sa√≠da

# Bias
b_h = np.zeros((1, hidden_size))
b_y = np.zeros((1, output_size))

print("Pesos e Biases inicializados!")

Pesos e Biases inicializados!


**Fun√ß√£o de Custo (Binary Cross-Entropy)**

In [57]:
def binary_cross_entropy(y_true, y_pred):
    y_pred = np.clip(y_pred, 1e-8, 1 - 1e-8)  # üîπ Evita log(0) ou log(1)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)) / y_pred.shape[0]

**Mini-Batches**

In [58]:
def get_mini_batches(X, y, batch_size=16, shuffle=True):
    """Divide os dados em mini-batches."""
    n_samples = X.shape[0]
    indices = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(indices)
    
    for start in range(0, n_samples, batch_size):
        end = min(start + batch_size, n_samples)
        yield X[indices[start:end]], y[indices[start:end]]


**Otimiza√ß√£o de Hiperpar√¢metros (Inicial)**

In [59]:
# Fun√ß√£o de ativa√ß√£o Sigmoid
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Definir pesos corretamente (Xavier Initialization)
W_xh = np.random.randn(input_size, hidden_size) * np.sqrt(1. / input_size)
W_hh = np.random.randn(hidden_size, hidden_size) * np.sqrt(1. / hidden_size)
W_hy = np.random.randn(hidden_size, output_size) * np.sqrt(1. / hidden_size)

HYPERPARAMS = [
    {"epochs": 10, "batch_size": 16, "learning_rate": 0.001, "momentum": 0.9, "bptt_trunc": 2},
    {"epochs": 10, "batch_size": 16, "learning_rate": 0.005, "momentum": 0.95, "bptt_trunc": 3},
    {"epochs": 10, "batch_size": 16, "learning_rate": 0.007, "momentum": 0.8, "bptt_trunc": 2},
]

best_accuracy = 0
best_params = None
best_model = None

# Testando hiperpar√¢metros
for params in HYPERPARAMS:
    print(f"\nTestando hiperpar√¢metros: {params}")

    rnn = RNN(
        n_units=20,
        # activation=ReLUActivation(),
        activation=TanhActivation(),
        bptt_trunc=params["bptt_trunc"],
        input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM),
        epochs=params["epochs"],
        batch_size=params["batch_size"],
        learning_rate=params["learning_rate"],
        momentum=params["momentum"],
        loss=BinaryCrossEntropy,
        metric=accuracy
    )

    optimizer = Optimizer(learning_rate=params["learning_rate"])
    rnn.initialize(optimizer)

    for epoch in range(params["epochs"]):
        total_loss = 0
        for X_batch, y_batch in get_mini_batches(X_train, y_train, params["batch_size"]):
            y_pred = rnn.forward_propagation(X_batch)
            y_pred_final = sigmoid(y_pred[:, -1, :])  # Aplica Sigmoid na √∫ltima sa√≠da

            loss = binary_cross_entropy(y_batch.reshape(-1, 1), y_pred_final)
            grad_loss = (y_pred_final - y_batch.reshape(-1, 1)) / y_batch.shape[0]

            grad_loss_expanded = np.zeros_like(y_pred)
            grad_loss_expanded[:, -1, :] = grad_loss

            rnn.backward_propagation(grad_loss_expanded)

            total_loss += loss

        print(f"√âpoca {epoch+1}/{params['epochs']} - Loss: {total_loss:.4f}")

    # Avalia√ß√£o
    preds = rnn.predict(X_val)
    
    # Debug do formato de `preds`
    print(f"Formato de preds: {preds.shape}")

    # Corrigir caso `preds` seja 1D
    if preds.ndim == 1:
        preds = preds[:, np.newaxis]

    acc = accuracy(y_val, preds)

    print(f"Accuracy com esses hiperpar√¢metros: {acc:.4f}")

    if acc > best_accuracy:
        best_accuracy = acc
        best_params = params
        best_model = rnn

print(f"\nMelhor combina√ß√£o encontrada: {best_params} com accuracy {best_accuracy:.4f}")


Testando hiperpar√¢metros: {'epochs': 10, 'batch_size': 16, 'learning_rate': 0.001, 'momentum': 0.9, 'bptt_trunc': 2}
√âpoca 1/10 - Loss: 3.6829
√âpoca 2/10 - Loss: 3.6783
√âpoca 3/10 - Loss: 3.6765
√âpoca 4/10 - Loss: 3.6730
√âpoca 5/10 - Loss: 3.6698
√âpoca 6/10 - Loss: 3.6665
√âpoca 7/10 - Loss: 3.6636
√âpoca 8/10 - Loss: 3.6614
√âpoca 9/10 - Loss: 3.6579
√âpoca 10/10 - Loss: 3.6556
Formato de preds: (282,)
Accuracy com esses hiperpar√¢metros: 0.5355

Testando hiperpar√¢metros: {'epochs': 10, 'batch_size': 16, 'learning_rate': 0.005, 'momentum': 0.95, 'bptt_trunc': 3}
√âpoca 1/10 - Loss: 3.6713
√âpoca 2/10 - Loss: 3.6669
√âpoca 3/10 - Loss: 3.6627
√âpoca 4/10 - Loss: 3.6570
√âpoca 5/10 - Loss: 3.6518
√âpoca 6/10 - Loss: 3.6441
√âpoca 7/10 - Loss: 3.6336
√âpoca 8/10 - Loss: 3.6187
√âpoca 9/10 - Loss: 3.5934
√âpoca 10/10 - Loss: 3.5415
Formato de preds: (282,)
Accuracy com esses hiperpar√¢metros: 0.5355

Testando hiperpar√¢metros: {'epochs': 10, 'batch_size': 16, 'learning_rate': 0.0

**Treinar o Modelo Final com melhor accuracy (obtido no passo anterior)**

In [60]:
final_rnn = RNN(
    n_units=20,
    # activation=ReLUActivation(),
    activation=TanhActivation(),
    bptt_trunc=best_params["bptt_trunc"],
    input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM),
    epochs=best_params["epochs"],
    batch_size=best_params["batch_size"],
    learning_rate=best_params["learning_rate"],
    momentum=best_params["momentum"],
    loss=BinaryCrossEntropy,
    metric=accuracy
)

final_optimizer = Optimizer(learning_rate=best_params["learning_rate"])
final_rnn.initialize(final_optimizer)

for epoch in range(best_params["epochs"]):
    total_loss = 0
    for X_batch, y_batch in get_mini_batches(X_train, y_train, best_params["batch_size"]):
        y_pred = final_rnn.forward_propagation(X_batch)
        y_pred_final = sigmoid(y_pred[:, -1, :])  # Aplica Sigmoid na √∫ltima sa√≠da

        loss = binary_cross_entropy(y_batch.reshape(-1, 1), y_pred_final)

        # Calcular o gradiente correto
        grad_loss = (y_pred_final - y_batch.reshape(-1, 1)) / y_batch.shape[0]

        # Expandir para 3 dimens√µes para ser compat√≠vel com a RNN
        grad_loss_expanded = np.zeros_like(y_pred)  # (batch_size, timesteps, output_size)
        grad_loss_expanded[:, -1, :] = grad_loss  # Apenas o √∫ltimo timestep recebe gradiente

        # Passar o gradiente expandido
        final_rnn.backward_propagation(grad_loss_expanded)

        total_loss += loss

    print(f"Treino final - √âpoca {epoch+1}/{best_params['epochs']} - Loss: {total_loss:.4f}")

# Testar Modelo Final
y_test_pred = final_rnn.predict(X_test)

print(f"Formato de y_test_pred: {y_test_pred.shape}")  # Debug

# Se for 1D, expandimos para 2D
if y_test_pred.ndim == 1:
    y_test_pred = y_test_pred[:, np.newaxis]

# Se for 2D (batch_size, timesteps), pegamos o √∫ltimo timestep
if y_test_pred.ndim == 2:
    y_test_pred_final = y_test_pred[:, -1]  #  Sem `:` no final, pois j√° √© 1D
else:
    y_test_pred_final = y_test_pred[:, -1, :]  #  Apenas se for 3D

y_test_pred_labels = (y_test_pred_final > 0.5).astype(int)

y_test_true = y_test.flatten()
accuracy = np.mean(y_test_pred_labels == y_test_true)
print(f"\nAccuracy final no conjunto de teste: {accuracy:.4f}")

# Garantir que y_test_true e y_test_pred_labels tenham o mesmo tamanho
if len(y_test_true) != len(y_test_pred_labels):
    print(f"Warning: Tamanhos diferentes! y_test_true={len(y_test_true)}, y_test_pred_labels={len(y_test_pred_labels)}")
    min_size = min(len(y_test_true), len(y_test_pred_labels))
    y_test_true = y_test_true[:min_size]
    y_test_pred_labels = y_test_pred_labels[:min_size]

# Calcular as metricas
accuracy = np.mean(y_test_pred_labels == y_test_true)
f1 = f1_score(y_test_true, y_test_pred_labels)
recall = recall_score(y_test_true, y_test_pred_labels)
precision = precision_score(y_test_true, y_test_pred_labels)
cm = confusion_matrix(y_test_true, y_test_pred_labels)

# Exibir os resultados
print(f"\nAccuracy final no conjunto de teste: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print("Confusion Matrix:\n", cm)


Treino final - √âpoca 1/10 - Loss: 3.6844
Treino final - √âpoca 2/10 - Loss: 3.6825
Treino final - √âpoca 3/10 - Loss: 3.6817
Treino final - √âpoca 4/10 - Loss: 3.6793
Treino final - √âpoca 5/10 - Loss: 3.6770
Treino final - √âpoca 6/10 - Loss: 3.6760
Treino final - √âpoca 7/10 - Loss: 3.6742
Treino final - √âpoca 8/10 - Loss: 3.6732
Treino final - √âpoca 9/10 - Loss: 3.6709
Treino final - √âpoca 10/10 - Loss: 3.6688
Formato de y_test_pred: (284,)

Accuracy final no conjunto de teste: 0.4754

Accuracy final no conjunto de teste: 0.4754
F1 Score: 0.0000
Recall: 0.0000
Precision: 0.0000
Confusion Matrix:
 [[135   0]
 [149   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Previs√£o para o Dataset2 (disponibilizado pelo professor)**

In [61]:
# Testar Modelo Final
y_test_pred2 = final_rnn.predict(X_eval_final)

print(f"Formato de y_test_pred2: {y_test_pred2.shape}")  # üõ†Ô∏è Debug

# Se for 1D, expandimos para 2D
if y_test_pred2.ndim == 1:
    y_test_pred2 = y_test_pred2[:, np.newaxis]

# Se for 2D (batch_size, timesteps), pegamos o √∫ltimo timestep
if y_test_pred2.ndim == 2:
    y_test_pred_final2 = y_test_pred2[:, -1]  #  Sem `:` no final, pois j√° √© 1D
else:
    y_test_pred_final2 = y_test_pred2[:, -1, :]  #  Apenas se for 3D

y_test_pred_labels2 = (y_test_pred_final2 > 0.5).astype(int)

######################################################################### cria√ß√£o do ficheiro csv com a previs√£o

# Criar IDs para cada amostra com o formato "D2-1", "D2-2", etc.
id_column = [f"D2-{i}" for i in range(1, len(y_test_pred_labels2) + 1)]

# Converter labels para "Human" e "AI"
labels = np.where(y_test_pred_labels2.flatten() == 1, "AI", "Human")

# Criar DataFrame com ID e LABEL
df_output = pd.DataFrame({
    "ID": id_column,
    "Label": labels
})


Formato de y_test_pred2: (100,)


### **An√°lise de resultados**

**Treino com dataset: gpt_vs_human**

- Durante o treino: 0.87 - 0.9

- Para dataset1: 0.66

- Para dataset2: 0.8 - 1.0

- Para ai_human: 0.51

**Treino com dataset: ai_human**

- Durante o treino: 0.81 - 0.84

- Para gpt_vs_human: 0.49

### **Hypertuning com base no modelo anterior - teste com 3600 combina√ß√µes diferentes**

Foi feito o loop apresentado abaixo, com 3600 combina√ß√µes, por√©m por uma quest√£o de brevidade, estamos neste momento a rodar o c√≥digo apenas com o melhor resultado obtido:

**Melhor combina√ß√£o encontrada: {'epochs': 5, 'batch_size': 8, 'learning_rate': 0.01, 'momentum': 0.8, 'bptt_trunc': 6} com accuracy 0.8929**

In [62]:
print(f"Tipo de accuracy antes da chamada: {type(accuracy)}")
if not callable(accuracy):  # Se n√£o for mais uma fun√ß√£o
    del accuracy  # Remover a vari√°vel sobrescrita
    from helpers.metrics import accuracy  # Reimporte 


# Apenas com os melhores hiperpar√¢metros calculados anteriormente
HYPERPARAMS = [
    {"epochs": ep, "batch_size": bs, "learning_rate": lr, "momentum": mo, "bptt_trunc": bt}
    for ep in [5]
    for bs in [8]
    for lr in [0.01]
    for mo in [0.8]
    for bt in [6]
]

best_accuracy = 0
best_params = None
best_model = None

start_time = time.time()
MAX_TIME = 21600 #6 horas em segundos

# Teste de hiperpar√¢metros 
for params in HYPERPARAMS:
    if time.time() - start_time > MAX_TIME:
        break
    
    print(f"\nA testar hiperpar√¢metros: {params}")
    
    rnn = RNN(
        n_units=20,
        activation=TanhActivation(),
        bptt_trunc=params["bptt_trunc"],
        input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM),
        epochs=params["epochs"],
        batch_size=params["batch_size"],
        learning_rate=params["learning_rate"],
        momentum=params["momentum"],
        loss=BinaryCrossEntropy,
        metric=accuracy
    )
    
    optimizer = Optimizer(learning_rate=params["learning_rate"])
    rnn.initialize(optimizer)
    
    for epoch in range(params["epochs"]):
        total_loss = 0
        for X_batch, y_batch in get_mini_batches(X_train, y_train, params["batch_size"]):
            y_pred = rnn.forward_propagation(X_batch)
            y_pred_final = sigmoid(y_pred[:, -1, :])

            loss = binary_cross_entropy(y_batch.reshape(-1, 1), y_pred_final)
            grad_loss = (y_pred_final - y_batch.reshape(-1, 1)) / y_batch.shape[0]
            
            grad_loss_expanded = np.zeros_like(y_pred)
            grad_loss_expanded[:, -1, :] = grad_loss
            
            rnn.backward_propagation(grad_loss_expanded)
            total_loss += loss
        
        print(f"√âpoca {epoch+1}/{params['epochs']} - Loss: {total_loss:.4f}")
    
    preds = rnn.predict(X_val)
    if preds.ndim == 1:
        preds = preds[:, np.newaxis]
    acc_value = accuracy(y_val, preds)
    
    print(f"Accuracy com esses hiperpar√¢metros: {acc_value:.4f}")
    
    if acc_value > best_accuracy:
        best_accuracy = acc_value
        best_params = params
        best_model = rnn

print(f"\nMelhor combina√ß√£o encontrada: {best_params} com accuracy {best_accuracy:.4f}")

Tipo de accuracy antes da chamada: <class 'numpy.float64'>

A testar hiperpar√¢metros: {'epochs': 5, 'batch_size': 8, 'learning_rate': 0.01, 'momentum': 0.8, 'bptt_trunc': 6}
√âpoca 1/5 - Loss: 14.3326
√âpoca 2/5 - Loss: 14.2567
√âpoca 3/5 - Loss: 14.0455
√âpoca 4/5 - Loss: 12.5264
√âpoca 5/5 - Loss: 10.3488
Accuracy com esses hiperpar√¢metros: 0.8794

Melhor combina√ß√£o encontrada: {'epochs': 5, 'batch_size': 8, 'learning_rate': 0.01, 'momentum': 0.8, 'bptt_trunc': 6} com accuracy 0.8794


**Treino do modelo final, com os melhores hiperpar√¢metros**

In [63]:
final_rnn = RNN(
    n_units=20,
    activation=TanhActivation(),
    bptt_trunc=best_params["bptt_trunc"],
    input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM),
    epochs=best_params["epochs"],
    batch_size=best_params["batch_size"],
    learning_rate=best_params["learning_rate"],
    momentum=best_params["momentum"],
    loss=BinaryCrossEntropy,
    metric=accuracy
)

final_optimizer = Optimizer(learning_rate=best_params["learning_rate"])
final_rnn.initialize(final_optimizer)

for epoch in range(best_params["epochs"]):
    total_loss = 0
    for X_batch, y_batch in get_mini_batches(X_train, y_train, best_params["batch_size"]):
        y_pred = final_rnn.forward_propagation(X_batch)
        y_pred_final = sigmoid(y_pred[:, -1, :])  # Aplica Sigmoid na √∫ltima sa√≠da

        loss = binary_cross_entropy(y_batch.reshape(-1, 1), y_pred_final)

        # Calcular o gradiente correto
        grad_loss = (y_pred_final - y_batch.reshape(-1, 1)) / y_batch.shape[0]

        # Expandir para 3 dimens√µes para ser compat√≠vel com a RNN
        grad_loss_expanded = np.zeros_like(y_pred)  # (batch_size, timesteps, output_size)
        grad_loss_expanded[:, -1, :] = grad_loss  # Apenas o √∫ltimo timestep recebe gradiente

        # Passar o gradiente expandido
        final_rnn.backward_propagation(grad_loss_expanded)

        total_loss += loss

    print(f"Treino final - √âpoca {epoch+1}/{best_params['epochs']} - Loss: {total_loss:.4f}")

# Testar Modelo Final
y_test_pred = final_rnn.predict(X_test)

print(f"Formato de y_test_pred: {y_test_pred.shape}")  # Debug

# Se for 1D, expandimos para 2D
if y_test_pred.ndim == 1:
    y_test_pred = y_test_pred[:, np.newaxis]

# Se for 2D (batch_size, timesteps), pegamos o √∫ltimo timestep
if y_test_pred.ndim == 2:
    y_test_pred_final = y_test_pred[:, -1]  #  Sem `:` no final, pois j√° √© 1D
else:
    y_test_pred_final = y_test_pred[:, -1, :]  #  Apenas se for 3D

y_test_pred_labels = (y_test_pred_final > 0.5).astype(int)

y_test_true = y_test.flatten()
accuracy = np.mean(y_test_pred_labels == y_test_true)
print(f"\nAccuracy final no conjunto de teste: {accuracy:.4f}")

# Criar DataFrame com Expected vs Predicted
df_results = pd.DataFrame({
    "expected_value": y_test_true,
    "predicted_value_raw": y_test_pred_final.flatten(),  # Valor original antes do arredondamento
    "predicted_value": y_test_pred_labels.flatten()  # Valor final bin√°rio (0 ou 1)
})

# Mostrar as previs√µes para compara√ß√£o
print("\nCompara√ß√£o entre valores esperados e previstos:")
print(df_results)

Treino final - √âpoca 1/5 - Loss: 14.2591
Treino final - √âpoca 2/5 - Loss: 13.2834
Treino final - √âpoca 3/5 - Loss: 11.1703
Treino final - √âpoca 4/5 - Loss: 8.4915
Treino final - √âpoca 5/5 - Loss: 7.4557
Formato de y_test_pred: (284,)

Accuracy final no conjunto de teste: 0.8768

Compara√ß√£o entre valores esperados e previstos:
     expected_value  predicted_value_raw  predicted_value
0               1.0                  0.0                0
1               1.0                  1.0                1
2               1.0                  1.0                1
3               0.0                  0.0                0
4               0.0                  0.0                0
..              ...                  ...              ...
279             0.0                  0.0                0
280             1.0                  1.0                1
281             1.0                  1.0                1
282             0.0                  0.0                0
283             0.0        

**Previs√£o do melhor modelo para o dataset disponibilizado pelo professor**

In [64]:
# Testar Modelo Final
y_test_pred2 = final_rnn.predict(X_eval_final)

print(f"Formato de y_test_pred2: {y_test_pred2.shape}")  # Debug

# Se for 1D, expandimos para 2D
if y_test_pred2.ndim == 1:
    y_test_pred2 = y_test_pred2[:, np.newaxis]

# Se for 2D (batch_size, timesteps), pegamos o √∫ltimo timestep
if y_test_pred2.ndim == 2:
    y_test_pred_final2 = y_test_pred2[:, -1]  #  Sem `:` no final, pois j√° √© 1D
else:
    y_test_pred_final2 = y_test_pred2[:, -1, :]  #  Apenas se for 3D

y_test_pred_labels2 = (y_test_pred_final2 > 0.5).astype(int)


Formato de y_test_pred2: (100,)


**Cria√ß√£o do Ficheiro CSV com a previs√£o final para o dataset disponibilizado pelo professor**

In [65]:
# Generate IDs for each prediction in the format D2-1, D2-2, ...
ids = [f"D3-{i+1}" for i in range(len(y_test_pred_labels2))]

# Map 0 to "Human" and 1 to "AI"
labels = ["Human" if pred == 0 else "AI" for pred in y_test_pred_labels2.flatten()]

# Create a DataFrame with ID and Label columns
df_predictions = pd.DataFrame({
    "ID": ids,
    "Label": labels
})

# Save the predictions to a CSV file using a tab separator to match the exact format
df_predictions.to_csv("classify_output_datasets/submission3_outputs_rnn_model.csv", sep="\t", index=False)

print("\nPredictions saved to submission3_outputs_rnn_model.csv successfully!")

# Load the validation dataset
df_true = pd.read_csv("../tarefa_1/validation_dataset/dataset3_disclosed_output.csv", sep="\t")

# Merge the datasets on the "ID" column, adding suffixes to distinguish the identical column names
df_merged = pd.merge(df_true, df_predictions, on="ID", suffixes=('_true', '_pred'))

# Calculate the number of correct predictions by comparing the "Label" columns
num_correct = (df_merged["Label_true"] == df_merged["Label_pred"]).sum()

# Calculate the percentage of correct predictions
accuracy_percentage = (num_correct / len(df_merged)) * 100

print(f"Accuracy: {accuracy_percentage:.2f}%")


Predictions saved to submission3_outputs_rnn_model.csv successfully!
Accuracy: 57.00%


### **An√°lise de resultados da melhor combina√ß√£o encontrada**

**Melhor combina√ß√£o encontrada: {'epochs': 5, 'batch_size': 8, 'learning_rate': 0.01, 'momentum': 0.8, 'bptt_trunc': 6} com accuracy 0.8929**

**Treino com dataset: gpt_vs_human**

- Durante o treino: 0.87 - 0.9

- Para dataset1: 0.60

- Para dataset2: 0.8 - 1