In [1]:
import numpy as np
import pandas as pd
import os
import random

def set_seed(seed: int):
    random.seed(seed) # Python
    np.random.seed(seed)  # Numpy, é o gerador utilizado pelo sklearn
    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo

set_seed(25)

In [2]:
import numpy as np
import pandas as pd
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout, Input, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split


# Load dos dados
csv_path = '../../datasets/human_or_ai_dataset_small.csv'  # Change this to your file path
df = pd.read_csv(csv_path)
# Sanity check!
print("Dataset shape:", df.shape)
print("Columns:", df.columns)

2025-03-17 12:14:39.567988: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742213679.912528   16648 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742213680.004996   16648 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742213680.614566   16648 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742213680.614600   16648 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742213680.614603   16648 computation_placer.cc:177] computation placer alr

Dataset shape: (5051, 2)
Columns: Index(['text', 'source'], dtype='object')


In [3]:
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# Parameters
max_length = 600
max_tokens = 20000

# Extract texts and labels
texts = df['text'].values
labels = df['source'].values

# Convert labels to numeric values
label_map = {'human': 0, 'ai': 1}
y_data = np.array([label_map[label] for label in labels])

# Define TextVectorization layer
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)

# Adapt to the text dataset
text_vectorization.adapt(texts)

# Transform text data into tokenized sequences
x_data = text_vectorization(texts).numpy()  # Convert TensorFlow tensor to NumPy array

# Split data
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Check shapes
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

# Print a sample
print("Sample sequence:", x_train[0])
print("Sample label:", y_train[0])


W0000 00:00:1742213695.457247   16648 gpu_device.cc:2341] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


x_train shape: (4040, 600)
y_train shape: (4040,)
x_test shape: (1011, 600)
y_test shape: (1011,)
Sample sequence: [   15    20     2   680     3  4751   121  6079  3152     6     2   624
  1302   524     6     5   216   432     3     1  1533     1  3011   104
    15   954     2  2998   505 18556    23  5046  6078     6    38   104
     4    56     9   121  6078     8   224    10     2   583   144  8081
  3011    44    15   129     9 18556  7232    91     2   627   378     8
   150  3530    15    56     9   121  6078     8    31   224     6     5
  6757    44    11    24   489   463     7   246    23  1891     6   627
    47    67     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0   

In [4]:
import tensorflow as tf
import numpy as np

# Set parameters
batch_size = 16
val_fraction = 0.15
seed = 25

# Shuffle the entire dataset with a fixed seed
dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
dataset = dataset.shuffle(buffer_size=len(x_train), seed=seed)

# Calculate split sizes
val_size = int(len(x_train) * val_fraction)
train_size = len(x_train) - val_size

# Create training and validation datasets
train_ds = dataset.skip(val_size).batch(batch_size)
val_ds = dataset.take(val_size).batch(batch_size)

# Create test dataset
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size)

train_ds = dataset.skip(val_size).batch(batch_size)

# Transforme com Encoder

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

In [6]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

#    def compute_mask(self, inputs, mask=None):
#        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        config = super().get_config()
        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        })
        return config

In [10]:
vocab_size = 20000
sequence_length = 600
embed_dim = 256
num_heads = 2
dense_dim = 32

# Early stopping 
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True
)

inputs = keras.Input(shape=(None,), dtype="int64")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(inputs)
x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

callbacks = [
    keras.callbacks.ModelCheckpoint("full_transformer_encoder.keras", save_best_only=True),
    early_stopping
]

model.fit(train_ds, validation_data=val_ds, epochs=10, callbacks=callbacks)
model = keras.models.load_model(
    "full_transformer_encoder.keras",
    custom_objects={"TransformerEncoder": TransformerEncoder,
                    "PositionalEmbedding": PositionalEmbedding})
print(f"Test acc: {model.evaluate(test_ds)[1]:.3f}")

Epoch 1/10
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 881ms/step - accuracy: 0.7038 - loss: 0.7271 - val_accuracy: 0.9307 - val_loss: 0.1811
Epoch 2/10
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 765ms/step - accuracy: 0.9041 - loss: 0.2148 - val_accuracy: 0.9092 - val_loss: 0.2314
Epoch 3/10
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 771ms/step - accuracy: 0.9435 - loss: 0.1393 - val_accuracy: 0.9620 - val_loss: 0.0958
Epoch 4/10
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 784ms/step - accuracy: 0.9532 - loss: 0.1146 - val_accuracy: 0.9719 - val_loss: 0.0753
Epoch 5/10
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 712ms/step - accuracy: 0.9609 - loss: 0.1018 - val_accuracy: 0.9158 - val_loss: 0.1768
Epoch 6/10
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 728ms/step - accuracy: 0.9742 - loss: 0.0687 - val_accuracy: 0.9818 - val_loss: 0.0520
Epoc



[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 324ms/step - accuracy: 0.9486 - loss: 0.1672
Test acc: 0.950


In [11]:
print(f"Test acc: {model.evaluate(test_ds)[1]:.3f}")

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 336ms/step - accuracy: 0.9486 - loss: 0.1672
Test acc: 0.950


In [19]:
import pandas as pd
import numpy as np

dataset_path = '../../datasets/dataset1_inputs.csv'
df_input = pd.read_csv(dataset_path, sep='\t')

texts_input = df_input['Text'].values
x_input = text_vectorization(texts_input).numpy()

predictions = model.predict(x_input)
predicted_labels = np.where(predictions.flatten() >= 0.5, 'AI', 'Human')

df_output = pd.DataFrame({
    'ID': df_input['ID'],
    'Prediction': predicted_labels
})

output_path = 'output_predictions.csv'
df_output.to_csv(output_path, index=False)

print(f'Previsões guardadas em {output_path}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 707ms/step
Previsões guardadas em output_predictions.csv
