In [1]:
import numpy as np
import pandas as pd
import os
import random

def set_seed(seed: int):
    random.seed(seed) # Python
    np.random.seed(seed)  # Numpy, é o gerador utilizado pelo sklearn
    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo

set_seed(25)

# Making our own rnn

In [2]:
import numpy as np
import pandas as pd
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout, Input, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split

# Parameters
max_features = 10000  # Tamanho do vocabulário
maxlen = 120  # Tamanho máximo das sequências

# Load dos dados
csv_path = '../../datasets/human_or_ai_dataset_small_research_only.csv'  # Change this to your file path
df = pd.read_csv(csv_path)
# Sanity check!
print("Dataset shape:", df.shape)
print("Columns:", df.columns)

Dataset shape: (5051, 2)
Columns: Index(['text', 'source'], dtype='object')


In [3]:
import pickle

# Separar os textos das labels
texts = df['text'].values
labels = df['source'].values

# Criar um tokenizer
tokenizer = preprocessing.text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)

# Converter os textos para sequências de inteiros
sequences = tokenizer.texts_to_sequences(texts)

# Padding para uniformizar tamanhos
x_data = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

# Garantir label numérica
if not np.issubdtype(labels.dtype, np.number):
    label_map = {'human': 0, 'ai': 1}
    y_data = np.array([label_map[label] for label in labels])
else:
    y_data = labels
print(y_data)
# Data split !
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Check shapes
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

# Print a sample
print("Sample sequence:", x_train[0])
print("Sample label:", y_train[0])

# Save tokenizer to a file
with open('tokenizerRNN.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)


[0 1 0 ... 0 0 0]
x_train shape: (4040, 120)
y_train shape: (4040,)
x_test shape: (1011, 120)
y_test shape: (1011,)
Sample sequence: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0   14   19    1  738    2 4758
  109 3150    5    1  619 1381  564    5    4  220  428    2  187   45
 2567  724 3261  111   14 1026    1 3273  482 1822   22 5021    5   38
  111    3   57    8  109    7  244    9    1  623  158  803 7358 3261
   44   14  143    8 1822 7520   96    1  531  373    7  130 3668   14
   57    8  109    7   30  244    5    4 6789   44   10   23  433  472
    6  266   22 1935    5  531   45   66]
Sample label: 0


## Define and train the model

In [None]:
model = Sequential()

model.add(Input((maxlen,))) 
model.add(Embedding(input_dim=max_features, output_dim=128, input_length=maxlen))
model.add(SimpleRNN(128, dropout=0.2, recurrent_dropout=0.2))  
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  

model.summary()

In [None]:
# Compilar o modelo
model.compile(
    optimizer='adam',  
    loss='binary_crossentropy',  
    metrics=['accuracy']
)

# Early stopping 
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True
)

# Salvar o melhor modelo
model_checkpoint = ModelCheckpoint(
    'best_model_rnn.h5',
    monitor='val_accuracy',
    save_best_only=True
)

# Treinar o modelo com callbacks
history = model.fit(
    x_train, y_train,
    epochs=10, 
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint]
)

# Avaliar no conjunto de teste
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Accuracy no teste: {test_acc:.4f}")

### LSMT

In [4]:
from tensorflow.keras.layers import LSTM

model = Sequential()

model.add(Input((maxlen,))) 
model.add(Embedding(input_dim=max_features, output_dim=128, input_length=maxlen))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))  
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  

model.summary()



In [5]:
# Compilar o modelo
model.compile(
    optimizer='adam',  
    loss='binary_crossentropy',  
    metrics=['accuracy']
)

# Early stopping 
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True
)

# Salvar o melhor modelo
model_checkpoint = ModelCheckpoint(
    'best_model_rnn_lstm_epochs.h5',
    monitor='val_accuracy',
    save_best_only=True
)

# Treinar o modelo com callbacks
history = model.fit(
    x_train, y_train,
    epochs=50, 
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint]
)

# Avaliar no conjunto de teste
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Accuracy no teste: {test_acc:.4f}")

Epoch 1/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 306ms/step - accuracy: 0.5781 - loss: 0.6631



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 339ms/step - accuracy: 0.5815 - loss: 0.6608 - val_accuracy: 0.8899 - val_loss: 0.3435
Epoch 2/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 311ms/step - accuracy: 0.9045 - loss: 0.2710



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 339ms/step - accuracy: 0.9052 - loss: 0.2691 - val_accuracy: 0.9084 - val_loss: 0.2173
Epoch 3/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 307ms/step - accuracy: 0.9604 - loss: 0.1169



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 333ms/step - accuracy: 0.9605 - loss: 0.1164 - val_accuracy: 0.9332 - val_loss: 0.2598
Epoch 4/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 336ms/step - accuracy: 0.9798 - loss: 0.0669 - val_accuracy: 0.9307 - val_loss: 0.1989
Epoch 5/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 336ms/step - accuracy: 0.9912 - loss: 0.0288 - val_accuracy: 0.9245 - val_loss: 0.2325
Epoch 6/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 350ms/step - accuracy: 0.9920 - loss: 0.0268 - val_accuracy: 0.9307 - val_loss: 0.2230
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - accuracy: 0.9400 - loss: 0.2317
Accuracy no teste: 0.9387


### GRU

In [None]:
from tensorflow.keras.layers import GRU

model = Sequential()

model.add(Input((maxlen,))) 
model.add(Embedding(input_dim=max_features, output_dim=128, input_length=maxlen))
model.add(GRU(128, dropout=0.2, recurrent_dropout=0.2))  
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  

model.summary()

In [None]:
# Compilar o modelo
model.compile(
    optimizer='adam',  
    loss='binary_crossentropy',  
    metrics=['accuracy']
)

# Early stopping 
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True
)

# Salvar o melhor modelo
model_checkpoint = ModelCheckpoint(
    'best_model_rnn_gru.h5',
    monitor='val_accuracy',
    save_best_only=True
)

# Treinar o modelo com callbacks
history = model.fit(
    x_train, y_train,
    epochs=10, 
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint]
)

# Avaliar no conjunto de teste
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Accuracy no teste: {test_acc:.4f}")

# Predicting for the Competition

In [None]:
from tensorflow.keras.models import load_model

# Load the saved model
model = load_model('best_model_rnn_gru.h5')

print("Model loaded successfully!")

# Carregar o CSV com dados para prever
prediction_csv_path = '../../datasets/dataset1_inputs.csv'
df_predict = pd.read_csv(prediction_csv_path,sep="\t")

# Verificar os dados carregados
print("Prediction dataset shape:", df_predict.shape)
print("Columns:", df_predict.columns)
print("Sample IDs:", df_predict['ID'].head())

# Pré-processar os dados de texto para corresponder ao formato de treinamento
# Converter textos para sequências
sequences = tokenizer.texts_to_sequences(df_predict['Text'].values)

# Padding das sequências para o mesmo tamanho usado no treinamento
x_predict = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

# Fazer previsões com o modelo treinado
predictions = model.predict(x_predict)

# Converter probabilidades para labels binários (0 = Human, 1 = AI)
# Usando 0.5 como threshold - você pode ajustar isso conforme necessário
labels = (predictions > 0.5).astype(int)

# Mapear labels para "AI" e "Human"
label_mapping = {1: "AI", 0: "Human"}
labels_mapped = [label_mapping[label] for label in labels.flatten()]

# Criar um DataFrame com os resultados
results_df = pd.DataFrame({
    'ID': df_predict['ID'],
    'Label': labels_mapped
})

# Exibir uma amostra dos resultados
print("\nAmostra dos resultados de previsão:")
print(results_df.head())

# Salvar em CSV
output_csv_path = 'prediction_results3.csv'
results_df.to_csv(output_csv_path, sep="\t", index=False)
print(f"\nResultados salvos em {output_csv_path}")