In [21]:
import numpy as np
import pandas as pd
import os
import random

def set_seed(seed: int):
    random.seed(seed) # Python
    np.random.seed(seed)  # Numpy, é o gerador utilizado pelo sklearn
    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo

set_seed(25)

# Making our own embedding

In [22]:
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Flatten, Dense, Embedding, Input

# Parameters
max_features = 10000  # Tamanho do vocabulario
maxlen = 500  # sequencia maxima, posso testar com mais se o computador aguentar

# Load dos dados
csv_path = '../../datasets/human_or_ai_dataset_small.csv'  # Change this to your file path
df = pd.read_csv(csv_path)

# Sanity check!
print("Dataset shape:", df.shape)
print("Columns:", df.columns)

# Separar os textos das labels
texts = df['text'].values
labels = df['source'].values

# Criar um tokenizer
tokenizer = preprocessing.text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)

# Converter os textos para sequencias de inteiros
sequences = tokenizer.texts_to_sequences(texts)

# Padding para uniformizar tamanhos
x_data = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

# Garantir label numerica
if not np.issubdtype(labels.dtype, np.number):
    label_map = {'human': 0, 'ai': 1}
    y_data = np.array([label_map[label] for label in labels])
else:
    y_data = labels
    
print(y_data)
# Data split !
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Check shapes
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

# Print a sample
print("Sample sequence:", x_train[0])
print("Sample label:", y_train[0])

"""
# If you want to save the tokenizer for later use
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
"""

Dataset shape: (5051, 2)
Columns: Index(['text', 'source'], dtype='object')
[0 1 0 ... 0 0 0]
x_train shape: (4040, 500)
y_train shape: (4040,)
x_test shape: (1011, 500)
y_test shape: (1011,)
Sample sequence: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0

"\n# If you want to save the tokenizer for later use\nimport pickle\nwith open('tokenizer.pickle', 'wb') as handle:\n    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)\n"

## Define and train the model

In [23]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, Dropout, GlobalMaxPooling1D, Conv1D, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

embedding_dims = 100  # Size of embedding vector

# Model with more sophisticated architecture
model = Sequential()

# Embedding layer
model.add(Input((maxlen,)))
model.add(Embedding(max_features, embedding_dims))
model.add(SpatialDropout1D(0.2))  # Spatial dropout evita overfitting nos embeddings


model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())

# Dense layers
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# Compile with better optimizer
model.compile(
    optimizer='adam',  # Adam typically works better than rmsprop
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()


# Early stopping 
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True
)

# Save best model
model_checkpoint = ModelCheckpoint(
    'best_model_embedding.h5',
    monitor='val_accuracy',
    save_best_only=True
)

# Train with callbacks
history = model.fit(
    x_train, y_train,
    epochs=15,  # More epochs, early stopping will prevent overfitting
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint]
)

# Evaluate on test set
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Test accuracy: {test_acc:.4f}")

Epoch 1/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.6218 - loss: 0.5826



[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 40ms/step - accuracy: 0.6229 - loss: 0.5813 - val_accuracy: 0.8874 - val_loss: 0.2668
Epoch 2/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.9239 - loss: 0.1997



[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 37ms/step - accuracy: 0.9240 - loss: 0.1994 - val_accuracy: 0.9307 - val_loss: 0.1655
Epoch 3/15
[1m100/101[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 34ms/step - accuracy: 0.9781 - loss: 0.0689



[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 36ms/step - accuracy: 0.9782 - loss: 0.0688 - val_accuracy: 0.9369 - val_loss: 0.1597
Epoch 4/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.9987 - loss: 0.0159



[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.9987 - loss: 0.0159 - val_accuracy: 0.9381 - val_loss: 0.1576
Epoch 5/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 1.0000 - loss: 0.0050 - val_accuracy: 0.9381 - val_loss: 0.1764
Epoch 6/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.9983 - loss: 0.0039



[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.9983 - loss: 0.0039 - val_accuracy: 0.9431 - val_loss: 0.1938
Epoch 7/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 1.0000 - loss: 0.0013 - val_accuracy: 0.9381 - val_loss: 0.1969
Epoch 8/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 36ms/step - accuracy: 1.0000 - loss: 7.4815e-04 - val_accuracy: 0.9394 - val_loss: 0.2013
Epoch 9/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 1.0000 - loss: 5.2761e-04 - val_accuracy: 0.9394 - val_loss: 0.2110
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9492 - loss: 0.1717
Test accuracy: 0.9456


# Using Glove

In [27]:

glove_dir = '../../'  # Change this to your GloVe path
embeddings_index = {}
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create embedding matrix
embedding_matrix = np.zeros((max_features, embedding_dims))
for word, i in tokenizer.word_index.items():
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

## Training a model

In [28]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, Dropout, GlobalMaxPooling1D, Conv1D, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Embedding layer ( with glove!)
model = Sequential()
model.add(Input((maxlen,)))
model.add(Embedding(
    max_features, 
    embedding_dims,
    weights=[embedding_matrix],
    trainable=False  # Set to True to fine-tune embeddings
))


model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())

# Dense layers
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# Compile with better optimizer
model.compile(
    optimizer='adam',  # Adam typically works better than rmsprop
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()


# Early stopping 
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True
)

# Save best model
model_checkpoint = ModelCheckpoint(
    'best_model_glove.h5',
    monitor='val_accuracy',
    save_best_only=True
)

# Train with callbacks
history = model.fit(
    x_train, y_train,
    epochs=15,  # More epochs, early stopping will prevent overfitting
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint]
)

# Evaluate on test set
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Test accuracy: {test_acc:.4f}")

Epoch 1/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.7264 - loss: 0.5477



[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.7275 - loss: 0.5459 - val_accuracy: 0.9196 - val_loss: 0.2120
Epoch 2/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.9409 - loss: 0.1515 - val_accuracy: 0.8923 - val_loss: 0.2158
Epoch 3/15
[1m 99/101[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.9767 - loss: 0.0758



[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.9765 - loss: 0.0759 - val_accuracy: 0.9319 - val_loss: 0.1676
Epoch 4/15
[1m 99/101[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 24ms/step - accuracy: 0.9908 - loss: 0.0394



[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.9906 - loss: 0.0397 - val_accuracy: 0.9332 - val_loss: 0.1776
Epoch 5/15
[1m 99/101[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 24ms/step - accuracy: 0.9955 - loss: 0.0226



[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.9955 - loss: 0.0225 - val_accuracy: 0.9344 - val_loss: 0.1860
Epoch 6/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.9998 - loss: 0.0102 - val_accuracy: 0.9344 - val_loss: 0.2207
Epoch 7/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.9995 - loss: 0.0074



[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.9995 - loss: 0.0074 - val_accuracy: 0.9381 - val_loss: 0.2143
Epoch 8/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.9998 - loss: 0.0041 - val_accuracy: 0.9319 - val_loss: 0.2652
Epoch 9/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 1.0000 - loss: 0.0028 - val_accuracy: 0.9319 - val_loss: 0.2268
Epoch 10/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 1.0000 - loss: 0.0017 - val_accuracy: 0.9282 - val_loss: 0.2343
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9423 - loss: 0.1722
Test accuracy: 0.9436


# Predicting for the Competition

In [None]:
from tensorflow.keras import preprocessing

# Load the CSV with data to predict
prediction_csv_path = '../../datasets/dataset1_inputs.csv'
df_predict = pd.read_csv(prediction_csv_path,sep="\t")

# Check the loaded data
print("Prediction dataset shape:", df_predict.shape)
print("Columns:", df_predict.columns)
print("Sample IDs:", df_predict['ID'].head())

# Preprocess the text data to match the training data format
# Note: Use the same tokenizer you used for training
# If you saved it with pickle, load it:
"""
import pickle
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
"""

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(df_predict['Text'].values)

# Pad sequences to the same length as during training
x_predict = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

# Make predictions with your trained model
predictions = model.predict(x_predict)

# Convert probabilities to binary labels (0 = Human, 1 = AI)
# Using 0.5 as the threshold - you might want to tune this based on your model's performance
labels = (predictions > 0.5).astype(int)

# Map labels to "AI" and "Human"
label_mapping = {1: "AI", 0: "Human"}
labels_mapped = [label_mapping[label] for label in labels.flatten()]

# Create a DataFrame with the results
results_df = pd.DataFrame({
    'ID': df_predict['ID'],
    'Label': labels_mapped
})

# Print sample of results
print("\nSample of prediction results:")
print(results_df.head())

# Save to CSV
output_csv_path = 'prediction_results.csv'
results_df.to_csv(output_csv_path,sep="\t",index=False)
print(f"\nResults saved to {output_csv_path}")

Prediction dataset shape: (30, 2)
Columns: Index(['ID', 'Text'], dtype='object')
Sample IDs: 0    D1-1
1    D1-2
2    D1-3
3    D1-4
4    D1-5
Name: ID, dtype: object
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step

Sample of prediction results:
     ID  Label
0  D1-1     AI
1  D1-2     AI
2  D1-3  Human
3  D1-4  Human
4  D1-5     AI

Results saved to prediction_results.csv
