In [2]:
import numpy as np
import pandas as pd
import os
import random
import tensorflow as tf


def set_seed(seed: int):
    random.seed(seed) # Python
    np.random.seed(seed)  # Numpy
    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo
    tf.random.set_seed(seed)  # TensorFlow

set_seed(25)

In [3]:
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Flatten, Dense, Embedding, Input

# Parameters
max_features = 20000  # Tamanho do vocabulario
maxlen = 500  # sequencia maxima, posso testar com mais se o computador aguentar

# Load dos dados
csv_path = '../datasets/human_or_ai_dataset_sub3.csv'  # Change this to your file path
df = pd.read_csv(csv_path)
df.dropna(inplace=True)
# Sanity check!
print("Dataset shape:", df.shape)
print("Columns:", df.columns)
print(df.info())


# Separar os textos das labels
texts = df['text'].values
labels = df['source'].values

# Criar um tokenizer
tokenizer = preprocessing.text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)

# Converter os textos para sequencias de inteiros
sequences = tokenizer.texts_to_sequences(texts)

# Padding para uniformizar tamanhos
x_data = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

# Garantir label numerica
if not np.issubdtype(labels.dtype, np.number):
    label_map = {'human': 0, 'ai': 1}
    y_data = np.array([label_map[label] for label in labels])
else:
    y_data = labels
    
print(y_data)
x_train = x_data
y_train = y_data

Dataset shape: (5246, 2)
Columns: Index(['text', 'source'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 5246 entries, 0 to 5246
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5246 non-null   object
 1   source  5246 non-null   object
dtypes: object(2)
memory usage: 123.0+ KB
None
[1 0 0 ... 0 1 0]


In [4]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, Dropout, GlobalMaxPooling1D, Conv1D, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import initializers
from tensorflow.keras.layers import LSTM

embedding_dims = 256  # Size of embedding vector


model = Sequential()

model.add(Input((maxlen,))) 
model.add(Embedding(max_features, embedding_dims))
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2))  
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  

# Compile with better optimizer
model.compile(
    optimizer='adam',  # Adam typically works better than rmsprop
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()


# Early stopping 
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=2,
    restore_best_weights=True
)

# Save best model
model_checkpoint = ModelCheckpoint(
    'best_model_sub3_s1.h5',
    monitor='val_accuracy',
    save_best_only=True
)

# Train with callbacks
history = model.fit(
    x_train, y_train,
    epochs=50,  # More epochs, early stopping will prevent overfitting
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint]
)

Epoch 1/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - accuracy: 0.6395 - loss: 0.6320



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 3s/step - accuracy: 0.6425 - loss: 0.6292 - val_accuracy: 0.8733 - val_loss: 0.3126
Epoch 2/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - accuracy: 0.9133 - loss: 0.2262



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 3s/step - accuracy: 0.9137 - loss: 0.2250 - val_accuracy: 0.8962 - val_loss: 0.2716
Epoch 3/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - accuracy: 0.9643 - loss: 0.0979



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 3s/step - accuracy: 0.9646 - loss: 0.0972 - val_accuracy: 0.9552 - val_loss: 0.1827
Epoch 4/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 3s/step - accuracy: 0.9872 - loss: 0.0357 - val_accuracy: 0.9352 - val_loss: 0.2308
Epoch 5/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 3s/step - accuracy: 0.9868 - loss: 0.0366 - val_accuracy: 0.9019 - val_loss: 0.3202


In [10]:
import pickle
from tensorflow.keras import preprocessing

competition_input = pd.read_csv('submission3_inputs.csv', sep=';')
print(f"Loaded competition input data with shape: {competition_input.shape}")
print(f"Columns: {competition_input.columns}")


# Separar os textos das labels
texts = competition_input['Text'].values
# Converter os textos para sequências de inteiros
sequences = tokenizer.texts_to_sequences(texts)
# Padding para uniformizar tamanhos
t_data = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)


Loaded competition input data with shape: (100, 2)
Columns: Index(['ID', 'Text'], dtype='object')


In [11]:
import tensorflow as tf
from tensorflow import keras  # Optional, but good for structured access
predictor = tf.keras.models.load_model('best_model_sub3_s1.h5')



In [12]:
# Make predictions
raw_predictions = predictor.predict(t_data)

# Convert probabilities to class labels (0 or 1)
predicted_labels = (raw_predictions > 0.5).astype(int).flatten()

# Map numerical predictions to text labels
label_map = {0: "Human", 1: "AI"}
predictions = [label_map[label] for label in predicted_labels]

# Create output dataframe
output_df = pd.DataFrame({
    'ID': competition_input['ID'],
    'Label': predictions
})


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 454ms/step


In [None]:
output_df.to_csv('submissao3-grupo1-s1.csv', sep='\t', index=False)
print("\nPredictions saved to submissao3-grupo1-s1.csv")


Predictions saved to competition_predictions.csv


In [14]:
# Optional: Verify against the provided dataset1_outputs.csv
try:
    ground_truth = pd.read_csv('dataset2_disclosed_complete_outputs.csv', sep='\t')
    merged = output_df.merge(ground_truth, on='ID', suffixes=('_pred', '_true'))
    accuracy = (merged['Label_pred'] == merged['Label_true']).mean()
    print(f"\nAccuracy on dataset1: {accuracy:.4f}")
    
    # Print confusion matrix
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(merged['Label_true'], merged['Label_pred'], labels=['Human', 'AI'])
    print("\nConfusion Matrix:")
    print("              Predicted")
    print("             Human    AI")
    print(f"True Human:  {cm[0][0]:5d}  {cm[0][1]:5d}")
    print(f"     AI:     {cm[1][0]:5d}  {cm[1][1]:5d}")
    
except Exception as e:
    print(f"Could not verify against ground truth: {e}")


Accuracy on dataset1: nan

Confusion Matrix:
              Predicted
             Human    AI
True Human:      0      0
     AI:         0      0
