In [None]:
import numpy as np
import pandas as pd
import os
import random

def set_seed(seed: int):
    random.seed(seed) 
    np.random.seed(seed)  
    os.environ["PYTHONHASHSEED"] = str(seed)  

set_seed(25)

# Making our own dnn

In [17]:
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Flatten, Dense, Embedding, Input

# Parameters
max_features = 10000  # Tamanho do vocabulario
maxlen = 120 

# Load dos dados
csv_path = '../../human_or_ai_dataset_small.csv'
df = pd.read_csv(csv_path)

# Sanity check!
print("Dataset shape:", df.shape)
print("Columns:", df.columns)

# Separar os textos das labels
texts = df['text'].values
labels = df['source'].values

# Criar um tokenizer
tokenizer = preprocessing.text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)

# Converter os textos para sequencias de inteiros
sequences = tokenizer.texts_to_sequences(texts)

# Padding para uniformizar tamanhos
x_data = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

# Garantir label numerica
if not np.issubdtype(labels.dtype, np.number):
    label_map = {label: i for i, label in enumerate(np.unique(labels))}
    y_data = np.array([label_map[label] for label in labels])
else:
    y_data = labels

# Data split !
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Check shapes
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

# Print a sample
print("Sample sequence:", x_train[0])
print("Sample label:", y_train[0])

"""
# If you want to save the tokenizer for later use
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
"""

Dataset shape: (5051, 2)
Columns: Index(['text', 'source'], dtype='object')
x_train shape: (4040, 120)
y_train shape: (4040,)
x_test shape: (1011, 120)
y_test shape: (1011,)
Sample sequence: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0   14   19    1  738    2 4758
  109 3150    5    1  619 1381  564    5    4  220  428    2  187   45
 2567  724 3261  111   14 1026    1 3273  482 1822   22 5021    5   38
  111    3   57    8  109    7  244    9    1  623  158  803 7358 3261
   44   14  143    8 1822 7520   96    1  531  373    7  130 3668   14
   57    8  109    7   30  244    5    4 6789   44   10   23  433  472
    6  266   22 1935    5  531   45   66]
Sample label: 1


"\n# If you want to save the tokenizer for later use\nimport pickle\nwith open('tokenizer.pickle', 'wb') as handle:\n    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)\n"

## Define and train the model

In [18]:
from tensorflow.keras import models, layers, Input

# Build the model
def build_model():
    model = models.Sequential()
    model.add(Input(shape=(maxlen,)))
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(64, activation='sigmoid'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(32, activation='sigmoid'))
    model.add(layers.Dense(1)) 
    model.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])
    return model 

model = build_model()

model.summary()



In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Compile with better optimizer
model.compile(
    optimizer='adam', 
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Early stopping 
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True
)

# Save best model
model_checkpoint = ModelCheckpoint(
    'best_model_embedding.h5',
    monitor='val_accuracy',
    save_best_only=True
)

# Train with callbacks
history = model.fit(
    x_train, y_train,
    epochs=15,  
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint]
)

# Evaluate on test set
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Test accuracy: {test_acc:.4f}")

Epoch 1/15
[1m 92/101[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 1ms/step - accuracy: 0.5021 - loss: 1.6952  



[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5025 - loss: 1.6508 - val_accuracy: 0.5297 - val_loss: 0.7168
Epoch 2/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5093 - loss: 0.8060 - val_accuracy: 0.5173 - val_loss: 0.7085
Epoch 3/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5259 - loss: 0.7495 - val_accuracy: 0.5136 - val_loss: 0.7374
Epoch 4/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5085 - loss: 0.7234 - val_accuracy: 0.5161 - val_loss: 0.7154
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5561 - loss: 0.7052 
Test accuracy: 0.5401


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras import models, layers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Geração de features (TF-IDF)
tfidf = TfidfVectorizer(max_features=max_features)
x_data = tfidf.fit_transform(texts).toarray()


# Divisão treino/teste
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Construção do modelo
def build_model(input_dim):
    model = models.Sequential()
    model.add(layers.Input(shape=(input_dim,)))
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

model = build_model(x_train.shape[1])

history = model.fit(
    x_train, y_train,
    epochs=15,
    batch_size=32,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)]
)

test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Test accuracy: {test_acc:.4f}")

Epoch 1/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.6435 - loss: 0.6304 - val_accuracy: 0.9394 - val_loss: 0.1572
Epoch 2/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9595 - loss: 0.1169 - val_accuracy: 0.9220 - val_loss: 0.1568
Epoch 3/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9827 - loss: 0.0528 - val_accuracy: 0.9381 - val_loss: 0.1550
Epoch 4/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9871 - loss: 0.0321 - val_accuracy: 0.9431 - val_loss: 0.1786
Epoch 5/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9950 - loss: 0.0165 - val_accuracy: 0.9356 - val_loss: 0.1923
Epoch 6/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9959 - loss: 0.0149 - val_accuracy: 0.9394 - val_loss: 0.2015
Epoch 7/15
[1m101/101[0m 