# Imports

In [13]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Input, RNN, Dropout, Bidirectional
from tensorflow.keras.backend import clear_session
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.backend as ops
from enum import Enum

2025-03-22 16:57:27.091186: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-22 16:57:27.105942: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742662647.123364   86994 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742662647.128332   86994 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742662647.141208   86994 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [14]:
# Sequence length passed as `maxlen` to pad_sequences when encoding texts
max_length = 120
n = 30000  # Set the number of most frequent words to keep

In [15]:
# Model run enum
class ModelRunMode(Enum):
    """Execution modes of this notebook.

    The members' string values are what the `mode` variable stores and what
    the guarded cells compare against.
    """

    # Run the training / hyperparameter-search cells
    TRAIN = "train"
    # Load the saved model and label the texts of an input CSV
    CLASSIFY = "classify"

In [16]:
# Model run mode: the cells guarded by `if mode == ...` only run for the matching mode.
# Options:
#   ModelRunMode.TRAIN.value            (Train the model)
#   ModelRunMode.CLASSIFY.value         (Classify data)
# The plain string value (not the Enum member) is stored here, which is why
# the guards compare against ModelRunMode.<MEMBER>.value.
mode = ModelRunMode.CLASSIFY.value

In [17]:
# Parameters cell
if mode == ModelRunMode.CLASSIFY.value:
    # CLASSIFY mode: Set parameters for classification
    input_csv = "classify_input_datasets/dataset3_inputs.csv"               # CSV file with texts for prediction (ID, Text)
    output_csv = "classify_output_datasets/dataset3_outputs_lstm_model.csv"  # CSV file to store prediction result
elif mode == ModelRunMode.TRAIN.value:
    # TRAIN mode: nothing to configure here
    pass
else:
    # Fail loudly. The previous `SystemExit()` only built the exception object
    # without raising it, so an invalid mode went unnoticed until a later cell broke.
    raise ValueError("The selected option is not valid. Options: \"train\" or \"classify\"!")

# Defining global functions

In [18]:
def clean(text, stopwords = True):
    """Normalize a raw text and optionally drop common English stopwords.

    Args:
        text: String to clean.
        stopwords: When truthy (default), the words of the fixed stopword
            list below are removed after normalization.

    Returns:
        The lower-cased text reduced to the letters a-z, with words separated
        by single spaces and no leading or trailing whitespace.
    """
    # Words filtered out when `stopwords` is enabled
    stopword_set = frozenset((
        "the", "of", "and", "in", "to", "is", "a", "that", "for", "are", "on", "with",
        "as", "at", "by", "from", "this", "it", "an", "be", "or", "which", "was", "were",
    ))

    # Lower-case, then delete (not replace) everything that is not a-z or
    # whitespace: digits, punctuation, apostrophes and accented letters vanish,
    # so "cell-division" becomes "celldivision".
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())

    # Splitting on whitespace runs and re-joining with one space both collapses
    # repeated whitespace and strips the ends.
    words = letters_only.split()
    if stopwords:
        words = [word for word in words if word not in stopword_set]
    return " ".join(words)

In [19]:

class Tokenizer:
    """Word-level tokenizer built from a corpus of whitespace-separated texts.

    Index 0 is reserved for '<pad>' and index 1 for '<unk>'. Vocabulary words
    receive indices from 2 upwards, ordered by descending frequency and then
    alphabetically.
    """

    def __init__(self, texts, n_words=None):
        """Store the corpus and build the vocabulary right away.

        Args:
            texts: Iterable of strings used to build the vocabulary.
            n_words: Optional cap on the number of most frequent words kept;
                a falsy value (None or 0) keeps every word.
        """
        self.n_words = n_words
        self.texts = texts
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()
        self.create_index()

    def create_index(self):
        """Count the words of ``self.texts`` and fill vocab, word2idx and idx2word."""
        frequencies = Counter(
            token for text in self.texts for token in text.split()
        )

        # Optionally keep only the n_words most frequent entries, then order
        # them by descending count with alphabetical order breaking ties.
        if self.n_words:
            candidates = frequencies.most_common(self.n_words)
        else:
            candidates = frequencies.items()
        ranked = sorted(candidates, key=lambda pair: (-pair[1], pair[0]))
        self.vocab = [token for token, _count in ranked]

        # Special tokens take ids 0 and 1; real words start at 2
        self.word2idx.update({'<pad>': 0, '<unk>': 1})
        self.word2idx.update(
            (token, position) for position, token in enumerate(self.vocab, start=2)
        )

        # Reverse mapping used by decode()
        self.idx2word.update(
            (position, token) for token, position in self.word2idx.items()
        )

    def encode(self, caption):
        """Map each whitespace-separated word of ``caption`` to its id ('<unk>' id if unseen)."""
        return [
            self.word2idx[token] if token in self.word2idx else self.word2idx['<unk>']
            for token in caption.split()
        ]

    def decode(self, tokens):
        """Turn an iterable of ids back into a space-joined string ('<unk>' for unknown ids)."""
        words = (self.idx2word.get(token, '<unk>') for token in tokens)
        return ' '.join(words)

    def __len__(self):
        """Vocabulary size including the two special tokens."""
        return 2 + len(self.vocab)

In [20]:
def prepare_data(df, tokenizer, max_length=120):
    """Turn a labelled DataFrame into padded token-id sequences plus labels.

    Args:
        df: DataFrame with a 'Text' column and a 'Label' column.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        max_length: Value passed as ``maxlen`` to ``pad_sequences``.

    Returns:
        Tuple ``(X, y)``: the result of ``pad_sequences`` with 'pre' padding
        and the unmodified 'Label' column.
    """
    texts = df['Text']
    labels = df['Label']
    token_ids = texts.apply(tokenizer.encode)
    # Padding is added at the front of each sequence
    padded = pad_sequences(token_ids, maxlen=max_length, padding='pre')
    return padded, labels

# TRAIN MODE

## Opening and cleaning data

In [None]:
# Load the training texts and labels (separate tab-separated files) and join them column-wise
X = pd.read_csv('../tarefa_1/clean_input_datasets/gpt_vs_human_data_set_inputs.csv',sep='\t', index_col=False)['Text']
y = pd.read_csv('../tarefa_1/clean_output_datasets/gpt_vs_human_data_set_outputs.csv',sep='\t', index_col=False)['Label']
df = pd.concat([X, y], axis=1)

# Flatten newlines, force every entry to str, then apply the shared text cleaning
df['Text'] = df['Text'].str.replace('\n', ' ')
df['Text'] = df['Text'].astype(str)
# Binary target: 1 for "AI", 0 for any other label value
df['Label'] = df['Label'].apply(lambda x: 1 if x == "AI" else 0)
df['Text'] = df['Text'].apply(clean)
# Shuffle with a fixed seed so Restart & Run All reproduces the same row order
# (the unseeded shuffle produced a different order on every run)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df

Unnamed: 0,Text,Label
0,paper we enumerate prime graphs respect cartes...,0
1,research paper investigates resummed cross sec...,1
2,red supergiant stars represent key phase evolu...,0
3,we investigate effects all orders planck lengt...,0
4,we introduce novel extension gutzwiller variat...,0
...,...,...
4048,effect li substitution mg lic cosubstitution s...,0
4049,research paper titled energy efficiency perspe...,1
4050,research paper presents results optical spectr...,1
4051,using molecular simulations we show aperiodic ...,0


In [93]:
# First validation set: texts and labels live in separate tab-separated files
X_val_1 = pd.read_csv('../tarefa_1/clean_input_datasets/dataset1_enh_inputs.csv', sep='\t', index_col=False)['Text']
y_val_1 = pd.read_csv('../tarefa_1/clean_output_datasets/dataset1_enh_outputs.csv', sep='\t', index_col=False)['Label']
df_val_1 = pd.concat([X_val_1, y_val_1], axis=1)
# Flatten newlines, force str, then apply the shared text cleaning
df_val_1['Text'] = df_val_1['Text'].str.replace('\n', ' ').astype(str).apply(clean)
# 1 for "AI", 0 for any other label value
df_val_1['Label'] = df_val_1['Label'].apply(lambda label: int(label == "AI"))
df_val_1

Unnamed: 0,Text,Label
0,cell cycle celldivision cycle sequential serie...,0
1,cell cycle process cell grows duplicates its d...,1
2,photons many atomic models physics particles t...,0
3,photon fundamental particle light other electr...,1
4,according theory plate tectonics earths lithos...,0
...,...,...
95,research paper explores concept topological in...,1
96,we report experimental realization oneway quan...,0
97,research paper presents experimental realizati...,1
98,airwater interface alkylbiphenylcarbonitrile c...,0


In [94]:
# Second validation set: a single ';'-separated file holding both text and label
val_df = pd.read_csv('dataset2_disclosed.csv', sep=';')
# Flatten newlines, force str, then apply the shared text cleaning
val_df['Text'] = val_df['Text'].str.replace('\n', ' ').astype(str).apply(clean)
# 1 for "AI", 0 for any other label value
val_df['Label'] = val_df['Label'].apply(lambda label: int(label == "AI"))
# Keep only the two model columns and append the first validation set
val_df = pd.concat([val_df[['Text', 'Label']], df_val_1], axis=0).reset_index(drop=True)
val_df


Unnamed: 0,Text,Label
0,approximation useful chemistry but not strictl...,0
1,these nutrients needed keep bones teeth muscle...,0
2,vitamin d essential maintaining healthy bones ...,1
3,within million years pressure density hydrogen...,0
4,there estimated trillion galaxies known univer...,0
...,...,...
145,research paper explores concept topological in...,1
146,we report experimental realization oneway quan...,0
147,research paper presents experimental realizati...,1
148,airwater interface alkylbiphenylcarbonitrile c...,0


## Preparing data

In [95]:
# Fit the vocabulary on the training texts only, capped at the `n` most frequent words
tokenizer = Tokenizer(df['Text'], n)
# len(tokenizer) counts the vocabulary words plus the '<pad>' and '<unk>' entries
vocab_size = len(tokenizer)
print('Number of words in the vocabulary:', vocab_size)

Number of words in the vocabulary: 19081


In [None]:
# Encode and pad both splits with the tokenizer fitted on the training texts
X_train, y_train = prepare_data(df, tokenizer, max_length)
X_val, y_val = prepare_data(val_df, tokenizer, max_length)

In [97]:
# Sanity check: show the last validation sample as padded ids and decoded back to words
print(X_val[-1])
print(tokenizer.decode(X_val[-1]))

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     5     3    64   355  1801    48  3266  1085   795  2334  7598   914
    18   146  6353   168 18248    46    78   355  1801   258   873  1085
   795   752    10   141   914  2334  3193   176   180    65 11532  3980
   219  2390  3266    55    46   413   355  1801   729   252   100    58
   244  2334    10    33    41   797    20    24  1085   795    19  1748]
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

## Fine tune function

In [None]:
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

def _build_bilstm_classifier(timesteps, vocab_size, embed_dim, lstm_units, dropout_rate):
    """Build the uncompiled Embedding -> stacked BiLSTM -> Dense binary classifier."""
    inputs = keras.Input(shape=(timesteps,))  # one integer token id per timestep

    x = layers.Embedding(vocab_size, embed_dim)(inputs)

    # Every LSTM but the last returns the full sequence so the next one can consume it
    for units in lstm_units[:-1]:
        x = layers.Bidirectional(layers.LSTM(units, return_sequences=True))(x)
        x = layers.Dropout(dropout_rate)(x)

    x = layers.Bidirectional(layers.LSTM(lstm_units[-1]))(x)
    x = layers.Dropout(dropout_rate)(x)
    x = layers.Dense(128, activation='relu')(x)

    outputs = layers.Dense(1, activation='sigmoid')(x)
    return keras.Model(inputs=inputs, outputs=outputs)


def hyperparameter_optimization_lstm(train_ds, validation_ds,
                                     epochs_list, batch_size_list,
                                     learning_rate_list, lstm_units_list,
                                     embed_dim_list,
                                     dropout_list, n_iter=10):
    """Random search over the hyperparameters of a stacked bidirectional LSTM classifier.

    For each of ``n_iter`` randomly drawn combinations a model is built,
    trained on ``train_ds`` and scored by its best validation accuracy.

    Note: the embedding size is taken from the notebook-level ``vocab_size``
    variable, which must be defined before this function is called.

    Args:
        train_ds: Batched ``tf.data.Dataset`` of ``(token_ids, label)`` pairs;
            the second dimension of its first element spec is the sequence length.
        validation_ds: Batched ``tf.data.Dataset`` used for validation.
        epochs_list: Candidate maximum numbers of epochs.
        batch_size_list: Candidate batch sizes.
        learning_rate_list: Candidate Adam learning rates.
        lstm_units_list: Candidate lists of per-layer LSTM unit counts
            (one bidirectional LSTM layer per entry).
        embed_dim_list: Candidate embedding dimensions.
        dropout_list: Candidate dropout rates.
        n_iter: Number of random combinations to try.

    Returns:
        Tuple ``(best_params, best_model)``. When ``n_iter`` is 0 nothing is
        trained and ``({}, None)`` is returned.
    """
    keras.backend.clear_session()
    best_acc = 0.0
    best_params = {}
    # Initialised up front so the final `return` can never hit an unbound name
    best_model = None

    # Prepare random combinations
    param_combinations = [
        {
            'epochs': random.choice(epochs_list),
            'batch_size': random.choice(batch_size_list),
            'learning_rate': random.choice(learning_rate_list),
            'lstm_units': random.choice(lstm_units_list),
            'dropout_rate': random.choice(dropout_list),
            'embed_dim': random.choice(embed_dim_list)
        }
        for _ in range(n_iter)
    ]

    # Element spec of a batched dataset is (batch, timesteps)
    input_shape = train_ds.element_spec[0].shape
    print("Input Shape:", input_shape)

    for i, params in enumerate(param_combinations):
        print(f"\nIteration {i+1}/{n_iter}", end=' ')
        # Unpack parameters
        epochs = params['epochs']
        batch_size = params['batch_size']
        learning_rate = params['learning_rate']
        lstm_units = params['lstm_units']
        dropout_rate = params['dropout_rate']
        embed_dim = params['embed_dim']

        model = _build_bilstm_classifier(input_shape[1], vocab_size, embed_dim,
                                         lstm_units, dropout_rate)
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        # Fresh callbacks per trial: they keep state between epochs
        callbacks = [
            EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-6)
        ]

        model.compile(optimizer=optimizer,
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        # `fit` must not receive `batch_size` together with a tf.data.Dataset:
        # the dataset's own batching wins, so the sampled value had no effect.
        # Re-batch the datasets instead so the hyperparameter is really applied.
        train_batches = train_ds.unbatch().batch(batch_size)
        validation_batches = validation_ds.unbatch().batch(batch_size)

        # Train the model
        history = model.fit(train_batches, epochs=epochs,
                            verbose=0, validation_data=validation_batches, callbacks=callbacks)

        # Get validation accuracy
        val_acc = max(history.history.get('val_accuracy', [0]))
        print(f"Validation Accuracy: {val_acc:.4f}", end=' ')
        print(f"Train Accuracy: {max(history.history.get('accuracy', [0])):.4f}")

        # Keep the first trial unconditionally, then only strict improvements
        if best_model is None or val_acc > best_acc:
            best_model = model
            best_acc = val_acc
            best_params = params

    print("\nBest Hyperparameters Found:", best_params)
    print(f"Best Accuracy: {best_acc:.4f}")
    return best_params, best_model


## Model fine tune

In [None]:
if mode == ModelRunMode.TRAIN.value:
    # Wrap the padded arrays in tf.data datasets, batched in groups of 64
    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(64)
    val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(64)

    # Hyperparameter search
    # Each list holds the candidate values the random search draws from.
    epochs_list = [10, 20, 30]
    batch_size_list = [64]
    learning_rate_list = [1e-2, 1e-3, 1e-4]
    # One inner list per candidate: unit counts of the stacked bidirectional LSTM layers
    lstm_units_list = [[64, 64], [128, 128], [256, 256]]
    # NOTE(review): 10 sits oddly between 50 and 150 - possibly meant to be 100; confirm
    embed_dim_list = [50, 10, 150]
    dropout_list = [0.3, 0.5, 0.7]
    # Returns the best parameter combination and the model trained with it
    hiperparams, model = hyperparameter_optimization_lstm(train_ds, val_ds,
                                                        epochs_list, batch_size_list,
                                                        learning_rate_list, lstm_units_list,
                                                        embed_dim_list, dropout_list, n_iter=10)

Input Shape: (None, 120)

Iteration 1/10 Validation Accuracy: 0.7867 Train Accuracy: 0.9931

Iteration 2/10 Validation Accuracy: 0.7333 Train Accuracy: 0.9998

Iteration 3/10 Validation Accuracy: 0.7333 Train Accuracy: 1.0000

Iteration 4/10 Validation Accuracy: 0.7467 Train Accuracy: 1.0000

Iteration 5/10 Validation Accuracy: 0.7333 Train Accuracy: 1.0000

Iteration 6/10 Validation Accuracy: 0.7667 Train Accuracy: 0.9958

Iteration 7/10 Validation Accuracy: 0.7867 Train Accuracy: 0.9998

Iteration 8/10 Validation Accuracy: 0.7800 Train Accuracy: 0.9998

Iteration 9/10 Validation Accuracy: 0.7733 Train Accuracy: 1.0000

Iteration 10/10 Validation Accuracy: 0.8133 Train Accuracy: 0.9995

Best Hyperparameters Found: {'epochs': 20, 'batch_size': 64, 'learning_rate': 0.001, 'lstm_units': [256, 256], 'dropout_rate': 0.7, 'embed_dim': 50}
Best Accuracy: 0.8133


In [None]:
if mode == ModelRunMode.TRAIN.value:
    # Print the layer-by-layer architecture of the model returned by the search
    model.summary()

In [None]:
if mode == ModelRunMode.TRAIN.value:
    import os
    import pickle

    # Create the target folder first so saving does not fail when it is missing
    os.makedirs('lstm_model_weights', exist_ok=True)

    model.save('lstm_model_weights/best_lstm_model.h5')
    # save params and tokenizer (both are read back by the CLASSIFY cells)
    with open('lstm_model_weights/best_lstm_model_params.pkl', 'wb') as f:
        pickle.dump(hiperparams, f)

    with open('lstm_model_weights/tokenizer_lstm.pkl', 'wb') as f:
        pickle.dump(tokenizer, f)



# CLASSIFY MODE

In [26]:
def prepare_data_test(X, tokenizer, max_length=120):
    """Encode and pad unlabelled texts for prediction.

    Args:
        X: pandas Series of texts.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        max_length: Value passed as ``maxlen`` to ``pad_sequences``.

    Returns:
        The result of ``pad_sequences`` with 'pre' padding.
    """
    token_ids = X.apply(tokenizer.encode)
    return pad_sequences(token_ids, maxlen=max_length, padding='pre')

In [27]:
if mode == ModelRunMode.CLASSIFY.value:
    # Parse the input file once and take both columns from the same frame
    # (it used to be read and parsed twice, once per column)
    input_df = pd.read_csv(input_csv, sep=';', index_col=False)
    ids = input_df['ID']
    X_test = input_df['Text']
    # Same preprocessing as the training texts: flatten newlines, force str, clean
    X_test = X_test.str.replace('\n', ' ')
    X_test = X_test.astype(str)
    X_test = X_test.apply(clean)
    print("CSV loaded successfully")

CSV loaded successfully


In [28]:
if mode == ModelRunMode.CLASSIFY.value:
    import pickle
    from tensorflow import keras
    from tensorflow.keras import layers
    # open best params and tokenizer
    # NOTE: pickle.load can execute arbitrary code - only load files produced by
    # this notebook's TRAIN mode, never files from an untrusted source.
    with open('lstm_model_weights/best_lstm_model_params.pkl', 'rb') as f:
        hiperparams = pickle.load(f)

    # Unpickling needs the Tokenizer class to be defined in this session
    with open('lstm_model_weights/tokenizer_lstm.pkl', 'rb') as f:
        tokenizer = pickle.load(f)

    # print(hiperparams)
    # print(tokenizer)

    # NOTE(review): this re-assigns max_length, overriding the value set in the
    # configuration cell - keep both in sync.
    max_length = 120
    input_shape = (max_length,)
    vocab_size = len(tokenizer)
    # epochs, batch_size and learning_rate are unpacked but not used in this cell
    epochs = hiperparams['epochs']
    batch_size = hiperparams['batch_size']
    learning_rate = hiperparams['learning_rate']
    lstm_units = hiperparams['lstm_units']
    dropout_rate = hiperparams['dropout_rate']
    embed_dim = hiperparams['embed_dim']

    # Build LSTM model
    # The layers are created in the same order as in the training function so
    # that the saved weights can be loaded into this freshly built model.
    inputs = keras.Input(shape=input_shape)  # input_shape is (max_length,): one token id per timestep

    x = layers.Embedding(vocab_size, embed_dim)(inputs)

    # Every LSTM but the last returns the full sequence for the next layer
    for units in lstm_units[:-1]:
        x = layers.Bidirectional(layers.LSTM(units, return_sequences=True))(x)
        x = layers.Dropout(dropout_rate)(x)

    x = layers.Bidirectional(layers.LSTM(lstm_units[-1]))(x)
    x = layers.Dropout(dropout_rate)(x)
    x = layers.Dense(128, activation='relu')(x)


    # Single sigmoid unit: one probability per input text
    outputs = layers.Dense(1, activation='sigmoid')(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.load_weights('lstm_model_weights/best_lstm_model.h5')
    # print(model.summary())
    print("Model loaded successfully")

Model loaded successfully


In [29]:
if mode == ModelRunMode.CLASSIFY.value:
    # Store the encoded data under its own name: overwriting X_test with the
    # padded array made this cell fail on a second run (an array has no .apply).
    X_test_padded = prepare_data_test(X_test, tokenizer, max_length)
    print("X_test shape:", X_test_padded.shape)
    # The model ends in a single sigmoid unit, so predict yields one probability
    # per row; flatten to 1-D so each item compared below is a plain scalar.
    probabilities = model.predict(X_test_padded).ravel()
    # Probabilities above 0.5 are labelled 1 (AI), everything else 0 (Human)
    preds = [1 if probability > 0.5 else 0 for probability in probabilities]
    print(np.unique(preds, return_counts=True))
    preds_str = ['AI' if pred == 1 else 'Human' for pred in preds]
    result = pd.DataFrame({'ID': ids, 'Label': preds_str})
    # The output file is tab-separated
    result.to_csv(output_csv, index=False, sep='\t')
    print("Prediction saved successfully")

X_test shape: (100, 120)
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 241ms/step
(array([0, 1]), array([66, 34]))
Prediction saved successfully
