In [46]:
import numpy as np
import pandas as pd
import os
import random


def set_seed(seed: int):
    random.seed(seed) # Python
    np.random.seed(seed)  # Numpy, é o gerador utilizado pelo sklearn
    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo

set_seed(25)

In [47]:
import tensorflow as tf
from tensorflow import keras  # Optional, but good for structured access

# Load the saved predictor
predictor = tf.keras.models.load_model('../code/nn_tensorflow/best_model_rnn_lstm.h5')




In [48]:
import pickle
from tensorflow.keras import preprocessing
# Load competition data
max_features = 10000  # Tamanho do vocabulário
maxlen = 120  # Tamanho máximo das sequências

competition_input = pd.read_csv('dataset3_inputs.csv', sep=';')
print(f"Loaded competition input data with shape: {competition_input.shape}")
print(f"Columns: {competition_input.columns}")

# Load tokenizer from file
with open('../code/nn_tensorflow/tokenizerRNN.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

print("Tokenizer loaded successfully!")

# Separar os textos das labels
texts = competition_input['Text'].values
# Converter os textos para sequências de inteiros
sequences = tokenizer.texts_to_sequences(texts)
# Padding para uniformizar tamanhos
x_data = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)


Loaded competition input data with shape: (100, 2)
Columns: Index(['ID', 'Text'], dtype='object')
Tokenizer loaded successfully!


In [49]:
# Make predictions
raw_predictions = predictor.predict(x_data)

# Convert probabilities to class labels (0 or 1)
predicted_labels = (raw_predictions > 0.5).astype(int).flatten()

# Map numerical predictions to text labels
label_map = {0: "Human", 1: "AI"}
predictions = [label_map[label] for label in predicted_labels]

# Print some sample predictions
print("Sample Predictions:", predictions[:10])


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
Sample Predictions: ['AI', 'AI', 'Human', 'Human', 'AI', 'AI', 'Human', 'Human', 'Human', 'Human']


In [50]:
# Create output dataframe
output_df = pd.DataFrame({
    'ID': competition_input['ID'],
    'Label': predictions
})

print("\nSample predictions:")
print(output_df.head())

# Save predictions to CSV
output_df.to_csv('submissao2-grupo1-s2.csv', sep='\t', index=False)
print("\nPredictions saved to competition_predictions_2.csv")


Sample predictions:
     ID  Label
0  D3-1     AI
1  D3-2     AI
2  D3-3  Human
3  D3-4  Human
4  D3-5     AI

Predictions saved to competition_predictions_2.csv


In [51]:
# Optional: Verify against the provided dataset1_outputs.csv
try:
    ground_truth = pd.read_csv('dataset2_disclosed_outputs.csv', sep='\t')
    merged = output_df.merge(ground_truth, on='ID', suffixes=('_pred', '_true'))
    accuracy = (merged['Label_pred'] == merged['Label_true']).mean()
    print(f"\nAccuracy on dataset1: {accuracy:.4f}")
    
    # Print confusion matrix
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(merged['Label_true'], merged['Label_pred'], labels=['Human', 'AI'])
    print("\nConfusion Matrix:")
    print("              Predicted")
    print("             Human    AI")
    print(f"True Human:  {cm[0][0]:5d}  {cm[0][1]:5d}")
    print(f"     AI:     {cm[1][0]:5d}  {cm[1][1]:5d}")
    
except Exception as e:
    print(f"Could not verify against ground truth: {e}")


Accuracy on dataset1: nan

Confusion Matrix:
              Predicted
             Human    AI
True Human:      0      0
     AI:         0      0
