# Imports

In [None]:
import os
import pickle
import re
from collections import Counter

import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Auxiliary Functions

In [18]:
# Words dropped by clean() when stopword removal is enabled. Stored once as a
# frozenset so membership checks are constant-time and the collection is not
# rebuilt on every call.
_STOPWORDS = frozenset({
    "the", "of", "and", "in", "to", "is", "a", "that", "for", "are", "on", "with",
    "as", "at", "by", "from", "this", "it", "an", "be", "or", "which", "was", "were",
})


def clean(text, stopwords = True):
    """Normalise raw text to lowercase, ASCII-letters-only, single-spaced form.

    Args:
        text: The string to clean.
        stopwords: When truthy (the default), every word found in the built-in
            stopword list is removed after normalisation.

    Returns:
        The cleaned string. It is empty when the input contains nothing but
        filtered characters and/or stopwords.
    """
    # Lowercase first so the character filter below only has to allow a-z.
    text = text.lower()
    # Drop everything that is not a lowercase ASCII letter or whitespace:
    # digits, punctuation, the apostrophe and accented letters all disappear.
    text = re.sub(r"[^a-z\s]", "", text)
    # Collapse any run of whitespace (including newlines and tabs) to one space.
    text = re.sub(r'\s+', ' ', text).strip()

    if stopwords:
        text = ' '.join(word for word in text.split() if word not in _STOPWORDS)
    return text

In [19]:

class Tokenizer:
    """Word-level tokenizer mapping whitespace-separated words to integer ids.

    Ids 0 and 1 are reserved for the padding and unknown-word tokens. Vocabulary
    words receive ids from 2 upwards, ordered by descending frequency with
    alphabetical order breaking ties.

    Instance attributes (names are kept unchanged so instances pickled with the
    earlier version of this class can still be loaded and used):
        n_words: Optional cap on the number of vocabulary words.
        texts: The iterable of strings the vocabulary is built from.
        word2idx: dict mapping token -> id.
        idx2word: dict mapping id -> token.
        vocab: list of vocabulary words in id order (special tokens excluded).
    """

    PAD_TOKEN = '<pad>'
    UNK_TOKEN = '<unk>'

    def __init__(self, texts, n_words=None):
        """Store the corpus and build the vocabulary from it immediately.

        Args:
            texts: Iterable of strings; each one is split on whitespace.
            n_words: Keep only this many most frequent words. ``None`` (or any
                falsy value such as 0) keeps every word.
        """
        self.n_words = n_words
        self.texts = texts
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = []
        self.create_index()

    def create_index(self):
        """(Re)build ``vocab``, ``word2idx`` and ``idx2word`` from ``self.texts``."""
        word_counter = Counter()
        for caption in self.texts:
            word_counter.update(caption.split())

        # A corpus word that happens to equal a special token must not enter the
        # vocabulary: it would overwrite the reserved id and leave id 0 or 1
        # without a reverse mapping.
        for reserved in (self.PAD_TOKEN, self.UNK_TOKEN):
            word_counter.pop(reserved, None)

        # Keep the n_words most frequent words (all of them when n_words is
        # falsy), then order by descending count and alphabetically for ties.
        most_common = word_counter.most_common(self.n_words) if self.n_words else word_counter.items()
        self.vocab = [word for word, _ in sorted(most_common, key=lambda x: (-x[1], x[0]))]

        # Start from fresh mappings so a repeated call cannot keep stale entries.
        self.word2idx = {self.PAD_TOKEN: 0, self.UNK_TOKEN: 1}
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 2

        self.idx2word = {index: word for word, index in self.word2idx.items()}

    def encode(self, caption):
        """Return the list of ids for ``caption``; unseen words map to '<unk>'."""
        unk_id = self.word2idx[self.UNK_TOKEN]
        return [self.word2idx.get(word, unk_id) for word in caption.split()]

    def decode(self, tokens):
        """Return the space-joined words for ``tokens``; unknown ids become '<unk>'."""
        return ' '.join([self.idx2word.get(token, self.UNK_TOKEN) for token in tokens])

    def __len__(self):
        """Number of ids in use: vocabulary words plus the two special tokens."""
        return len(self.vocab) + 2

In [20]:
def prepare_data(df, tokenizer, max_length=120):
    """Turn a labelled frame into padded token-id sequences plus its labels.

    Args:
        df: Frame providing a 'Text' column (strings) and a 'Label' column.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        max_length: Length every sequence is padded (at the front) to;
            forwarded to ``pad_sequences`` as ``maxlen``.

    Returns:
        A tuple ``(X, y)``: the padded id sequences returned by
        ``pad_sequences`` and the untouched 'Label' column.
    """
    texts = df['Text']
    labels = df['Label']

    token_ids = texts.apply(tokenizer.encode)
    padded = pad_sequences(token_ids, maxlen=max_length, padding='pre')

    return padded, labels

# Load best model

In [6]:
# Load the saved hyperparameters and the fitted tokenizer (presumably written by
# the training run that also produced the model weights -- confirm).
#
# NOTE: pickle.load can execute arbitrary code while deserialising, so only open
# artifacts that come from a trusted source.
# The tokenizer is stored as a `__main__.Tokenizer` instance, so the Tokenizer
# class must already be defined in this kernel before this cell runs.
with open('best_model_params.pkl', 'rb') as params_file:
    hiperparams = pickle.load(params_file)

with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

print(hiperparams)
print(tokenizer)

{'epochs': 20, 'batch_size': 64, 'learning_rate': 0.001, 'lstm_units': [256, 256], 'dropout_rate': 0.7, 'embed_dim': 50}
<__main__.Tokenizer object at 0x774ef3d47fa0>


In [11]:
# Sequence length fed to the network; same value as the default `max_length`
# of prepare_data / prepare_data_test.
max_length = 120
input_shape = (max_length,)
# Vocabulary words plus the two special tokens (see Tokenizer.__len__).
vocab_size = len(tokenizer)
# epochs, batch_size and learning_rate are read from the saved hyperparameters
# but are not referenced again in the cells visible here.
epochs = hiperparams['epochs']
batch_size = hiperparams['batch_size']
learning_rate = hiperparams['learning_rate']
lstm_units = hiperparams['lstm_units']
dropout_rate = hiperparams['dropout_rate']
embed_dim = hiperparams['embed_dim']

# Build LSTM model
# NOTE(review): this architecture presumably has to mirror the one used when
# the weights loaded later were saved -- keep the layer order unchanged.
inputs = keras.Input(shape=input_shape)  # one integer token id per position, max_length positions

# Map every token id to an embed_dim-sized vector.
x = layers.Embedding(vocab_size, embed_dim)(inputs)

# Every recurrent layer except the last returns the full sequence so the next
# recurrent layer still receives one vector per timestep.
for units in lstm_units[:-1]:
    x = layers.Bidirectional(layers.LSTM(units, return_sequences=True))(x)
    x = layers.Dropout(dropout_rate)(x)

# Last recurrent layer: return_sequences is left at its default here.
x = layers.Bidirectional(layers.LSTM(lstm_units[-1]))(x)
x = layers.Dropout(dropout_rate)(x)
x = layers.Dense(128, activation='relu')(x)


# Single sigmoid unit; the prediction cell thresholds it at 0.5 and the output
# cell writes class 1 as 'AI' and class 0 as 'Human'.
outputs = layers.Dense(1, activation='sigmoid')(x)

model = keras.Model(inputs=inputs, outputs=outputs)

2025-03-22 15:31:24.480905: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [12]:
# Restore the saved weights into the model built in the previous cell, then
# print the layer summary as a sanity check.
# NOTE(review): assumes 'best_model.h5' was saved from a model with this same
# layer stack -- confirm against the training notebook.
model.load_weights('best_model.h5')
model.summary()

# Prepare test data

In [14]:
def prepare_data_test(X, tokenizer, max_length=120):
    """Encode unlabelled texts and pad them to a fixed length.

    Args:
        X: pandas Series of strings (its ``apply`` method is used).
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        max_length: Length every sequence is padded (at the front) to;
            forwarded to ``pad_sequences`` as ``maxlen``.

    Returns:
        The padded id sequences returned by ``pad_sequences``.
    """
    token_ids = X.apply(tokenizer.encode)
    return pad_sequences(token_ids, maxlen=max_length, padding='pre')

In [51]:
# Parse the input file once; the texts and their IDs come from the same rows.
input_csv = 'classify_input_datasets/dataset3_inputs.csv'
test_df = pd.read_csv(input_csv, sep=';', index_col=False)
ids = test_df['ID']

# Cast to str BEFORE using the .str accessor so a column that pandas did not
# parse as text cannot make .str raise; missing values become the string 'nan'
# exactly as before.
texts = test_df['Text'].astype(str).str.replace('\n', ' ')
texts = texts.apply(clean)

# Encode and pad with the same tokenizer and sequence length the model expects.
X_test = prepare_data_test(texts, tokenizer, max_length)
print(X_test.shape)

(100, 120)


In [52]:
# The model emits one sigmoid score per input row.
probs = model.predict(X_test)
# Scores strictly above 0.5 become class 1, everything else class 0; flatten
# first so each row contributes a single plain Python int.
preds = (probs.ravel() > 0.5).astype(int).tolist()
# Show how many inputs fell into each class.
print(np.unique(preds, return_counts=True))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step 
(array([0, 1]), array([66, 34]))


In [53]:
output_csv = 'classify_output_datasets/dataset3_output.csv'

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
output_dir = os.path.dirname(output_csv)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Class 1 is written as 'AI', class 0 as 'Human'.
preds_str = ['AI' if pred == 1 else 'Human' for pred in preds]
df_out = pd.DataFrame({"ID": ids, "Label": preds_str})
df_out.to_csv(output_csv, sep=';', index=False)
print(f"LSTM Model Predictions saved to {output_csv}")

LSTM Model Predictions saved to classify_output_datasets/dataset3_output.csv
