# Imports

In [55]:
import pandas as pd
import numpy as np
import re
import random
import tensorflow as tf
import pickle
import os

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.sequence import pad_sequences
from enum import Enum

In [56]:
max_length = 120
n = 50000  # Set the number of most frequent words to keep


In [57]:
# Model run enum
class ModelRunMode(Enum):
    """
    Enumeration of Model Run Mode.
    """
    TRAIN           = "train"           # Train Mode
    CLASSIFY        = "classify"        # Classify Mode

In [58]:
# Model run mode
# Options: 
#   ModelRunMode.TRAIN.value            (Train the model)
#   ModelRunMode.CLASSIFY.value         (Classify data)
mode = ModelRunMode.TRAIN.value

In [59]:
# Parameters cell
if mode == ModelRunMode.CLASSIFY.value:
    # CLASSIFY mode: Set parameters for classification
    input_csv = "classify_input_datasets/submission3_inputs.csv"               # CSV file with texts for prediction (ID, Text)
    output_csv = "classify_output_datasets/submission3_outputs_lstm_model.csv" # CSV file to store prediction result
elif mode == ModelRunMode.TRAIN.value:
    seed = 42                                                                                   # Global seed for reproducibility across Python, NumPy, and TensorFlow
    os.environ['PYTHONHASHSEED'] = str(seed)                                                        # Set environment variable for Python's internal hash seed for determinism
    os.environ["TF_CUDNN_DETERMINISTIC"] = "1"                                                      # Attempt to force cuDNN to be deterministic
    random.seed(seed)                                                                               # Set Python's built-in random module seed
    np.random.seed(seed)                                                                            # Set NumPy random seed
    tf.random.set_seed(seed)                                                                        # Set TensorFlow random seed
else:
    print("The selected option is not valid. Options: \"train\" or \"classify\"!")
    SystemExit()

# Defining global fuctions

In [60]:
def clean(text, stopwords = True):
    def normalize(text):
        # Convert to lowercase
        text = text.lower()
        # Remove numbers, special characters, e o caractere '
        text = re.sub(r"[^a-z\s]", "", text)
        # Replace multiple spaces with a single space
        return re.sub(r'\s+', ' ', text).strip()

    def remove_stopwords(text):
        stopwords = [
        "the", "of", "and", "in", "to", "is", "a", "that", "for", "are", "on", "with", 
        "as", "at", "by", "from", "this", "it", "an", "be", "or", "which", "was", "were"
        ]
        text = ' '.join([word for word in text.split() if word not in stopwords])
        return text
    
    text = normalize(text)
    if stopwords:
        text = remove_stopwords(text)
    return text

In [61]:

class Tokenizer:
    def __init__(self, texts, n_words=None):
        self.n_words = n_words
        self.texts = texts
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()
        self.create_index()
        

    def create_index(self):
        word_counter = Counter()
        for caption in self.texts:
            for word in caption.split():
                word_counter[word] += 1

        # Sort words by frequency and alphabetically for ties
        most_common = word_counter.most_common(self.n_words) if self.n_words else word_counter.items()
        self.vocab = [word for word, _ in sorted(most_common, key=lambda x: (-x[1], x[0]))]

        # Add special tokens
        self.word2idx['<pad>'] = 0
        self.word2idx['<unk>'] = 1
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 2

        for word, index in self.word2idx.items():
            self.idx2word[index] = word


    def encode(self, caption):
        tokens = []
        for word in caption.split():
            if word in self.word2idx:
                tokens.append(self.word2idx[word])
            else:
                tokens.append(self.word2idx['<unk>'])
        return tokens

    def decode(self, tokens):
        return ' '.join([self.idx2word.get(token, '<unk>') for token in tokens])

    def __len__(self):
        return len(self.vocab) + 2

In [62]:
def prepare_data(df, tokenizer,max_length=120):
    X = df['Text']
    y = df['Label']
    X = X.apply(lambda x: tokenizer.encode(x))
    X = pad_sequences(X, maxlen=max_length, padding='pre')
    return X, y

# TRAIN MODE

## Opening and cleaning data

In [63]:
X = pd.read_csv('../tarefa_1/test_input_dataset/merged_inputs.csv', sep='\t')
y = pd.read_csv('../tarefa_1/test_output_dataset/merged_outputs.csv', sep='\t')
df = pd.merge(X, y, on="ID", how="inner")

df['Text'] = df['Text'].str.replace('\n', ' ')
df['Text'] = df['Text'].astype(str)
df['Label'] = df['Label'].apply(lambda x: 1 if x == "AI" else 0)
df['Text'] = df['Text'].apply(clean)

# Divisão em treino (70%) e validação (30%)
train_df, val_df = train_test_split(df, test_size=0.3, random_state=42, shuffle=True)

## Preparing data

In [64]:
tokenizer = Tokenizer(df['Text'], n)
vocab_size = len(tokenizer)
print('Number of words in the vocabulary:', len(tokenizer))

Number of words in the vocabulary: 12582


In [65]:
X_train, y_train = prepare_data(df, tokenizer, max_length)
X_val, y_val = prepare_data(val_df, tokenizer, max_length)

In [66]:
print(X_val[-1])
print(tokenizer.decode(X_val[-1]))

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    4    3  130 1663 2043   17
    7 1162  401 3614  911  271    5  851   81   37  155  192  248   13
   33   58   70  932  153 2043   45    8  459 2043   34    7 1162  172
  750  191  331 3614  309  271  434    5 1337 2043  102    6  931 1610
 1279  897   38   11   29    4   40  143   43   20   59  333  248    7
 1162   18    6  315   97  110    7  219]
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> research paper focuses investigating charging behavior quantum dot coupled luttinger liqui

## Fine tune function

In [67]:
def hyperparameter_optimization_lstm(X_train, y_train, X_val, y_val,
                                     epochs_list, batch_size_list, 
                                     learning_rate_list, lstm_units_list,
                                     embed_dim_list, 
                                     dropout_list, n_iter=10):

    keras.backend.clear_session()
    best_acc = 0.0
    best_params = {}

    # Split entire dataset into train_val vs test
    X_combined = np.concatenate([X_train, X_val])
    y_combined = np.concatenate([y_train, y_val])
    X_train_val, X_test, y_train_val, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)


    # Prepare random combinations
    param_combinations = []
    for _ in range(n_iter):
        param_combinations.append({
            'epochs': random.choice(epochs_list),
            'batch_size': random.choice(batch_size_list),
            'learning_rate': random.choice(learning_rate_list),
            'lstm_units': random.choice(lstm_units_list),
            'dropout_rate': random.choice(dropout_list),
            'embed_dim': random.choice(embed_dim_list)
        })

    for i,params in enumerate(param_combinations):
        print(f"\nIteration {i+1}/{n_iter}", end=' ')
        # Unpack parameters
        epochs = params['epochs']
        batch_size = params['batch_size']
        learning_rate = params['learning_rate']
        lstm_units = params['lstm_units']
        dropout_rate = params['dropout_rate']
        embed_dim = params['embed_dim']

        # Batch the datasets
        train_ds = tf.data.Dataset.from_tensor_slices((X_train_val, y_train_val)).batch(batch_size)
        validation_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)

        # Extract input shape and number of classes
        input_shape = train_ds.element_spec[0].shape
        print("Input Shape:", input_shape)

        # Build LSTM model
        inputs = keras.Input(shape=(input_shape[1],))  # Assuming (timesteps, features)

        x = layers.Embedding(vocab_size, embed_dim)(inputs)

        for units in lstm_units[:-1]:
            x = layers.Bidirectional(layers.LSTM(units, return_sequences=True))(x)
            x = layers.Dropout(dropout_rate)(x)
        
        x = layers.Bidirectional(layers.LSTM(lstm_units[-1]))(x)
        x = layers.Dropout(dropout_rate)(x)
        x = layers.Dense(128, activation='relu')(x)

        outputs = layers.Dense(1, activation='sigmoid')(x)

        model = keras.Model(inputs=inputs, outputs=outputs)
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        callbacks = [
            EarlyStopping(monitor='val_accuracy', patience=2, restore_best_weights=True),
            ReduceLROnPlateau(monitor='val_loss', factor=0.9, patience=2, verbose=0, min_lr=1e-2)
        ]

        model.compile(optimizer=optimizer,
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        # Train the model
        history = model.fit(train_ds, epochs=epochs, batch_size=batch_size,
                            verbose=0, validation_data=validation_ds, callbacks=callbacks)

        # Evaluate final val accuracy
        val_loss, val_acc = model.evaluate(X_val, y_val, verbose=True)

        print("Params:", params, "| Val Acc = {:.4f}".format(val_acc))

        # Keep track of best accuracy (if two combinations yield the same accuracy, we pick the one with the lower validation loss)
        if (val_acc > best_acc) or (val_acc == best_acc and val_loss < best_loss):
            best_acc = val_acc
            best_loss = val_loss
            best_params = params
            best_model = model
    
    print("Best val acc = {:.4f}".format(best_acc))
    print("Best hyperparams:", best_params)

    # Evaluate the best model
    loss, accuracy = best_model.evaluate(validation_ds)
    
    # Get predictions
    y_true = np.concatenate([y.numpy() for _, y in validation_ds], axis=0)
    y_pred_probs = best_model.predict(validation_ds)
    y_pred = (y_pred_probs > 0.5).astype(int)

    # Calculate metrics
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Print results
    print("\nBest Model Evaluation:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Loss: {loss:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

    return best_params, best_model


## Model fine tune

In [68]:
if mode == ModelRunMode.TRAIN.value:
    # Hyperparameter search
    epochs_list = [4, 8]
    batch_size_list = [16]
    learning_rate_list = [1e-2, 1e-3]
    lstm_units_list = [[32, 32], [64, 32]]
    embed_dim_list = [100, 150]
    dropout_list = [0.1, 0.2]
    hiperparams, model = hyperparameter_optimization_lstm(X_train, y_train, X_val, y_val,
                                                        epochs_list, batch_size_list,
                                                        learning_rate_list, lstm_units_list,
                                                        embed_dim_list, dropout_list, n_iter=15)


Iteration 1/15 Input Shape: (None, 120)
Params: {'epochs': 4, 'batch_size': 16, 'learning_rate': 0.001, 'lstm_units': [32, 32], 'dropout_rate': 0.1, 'embed_dim': 100} | Val Acc = 0.9947

Iteration 2/15 Input Shape: (None, 120)
Params: {'epochs': 4, 'batch_size': 16, 'learning_rate': 0.001, 'lstm_units': [32, 32], 'dropout_rate': 0.1, 'embed_dim': 100} | Val Acc = 0.9965

Iteration 3/15 Input Shape: (None, 120)
Params: {'epochs': 4, 'batch_size': 16, 'learning_rate': 0.01, 'lstm_units': [32, 32], 'dropout_rate': 0.2, 'embed_dim': 100} | Val Acc = 0.9894

Iteration 4/15 Input Shape: (None, 120)
Params: {'epochs': 8, 'batch_size': 16, 'learning_rate': 0.01, 'lstm_units': [32, 32], 'dropout_rate': 0.2, 'embed_dim': 150} | Val Acc = 0.9876

Iteration 5/15 Input Shape: (None, 120)
Params: {'epochs': 8, 'batch_size': 16, 'learning_rate': 0.01, 'lstm_units': [64, 32], 'dropout_rate': 0.1, 'embed_dim': 100} | Val Acc = 0.9947

Iteration 6/15 Input Shape: (None, 120)
Params: {'epochs': 8, 'batc

In [69]:
if mode == ModelRunMode.TRAIN.value:
    model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 120)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 120, 100)          1258200   
                                                                 
 bidirectional_2 (Bidirectio  (None, 120, 64)          34048     
 nal)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 120, 64)           0         
                                                                 
 bidirectional_3 (Bidirectio  (None, 64)               24832     
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 64)                0   

In [70]:
if mode == ModelRunMode.TRAIN.value:
    model.save('lstm_model_weights/best_lstm_model.h5')
    # save params and tokenizer
    import pickle
    with open('lstm_model_weights/best_lstm_model_params.pkl', 'wb') as f:
        pickle.dump(hiperparams, f)

    with open('lstm_model_weights/tokenizer_lstm.pkl', 'wb') as f:
        pickle.dump(tokenizer, f)

# CLASSIFY MODE

In [71]:
def prepare_data_test(X, tokenizer,max_length=max_length):
    X = X.apply(lambda x: tokenizer.encode(x))
    X = pad_sequences(X, maxlen=max_length, padding='pre')

    return X

In [72]:
if mode == ModelRunMode.CLASSIFY.value:
    X_test = pd.read_csv(input_csv,sep='\t', index_col=False)['Text']
    ids = pd.read_csv(input_csv,sep='\t', index_col=False)['ID']
    X_test = X_test.str.replace('\n', ' ')
    X_test = X_test.astype(str)
    X_test = X_test.apply(clean)
    print("CSV loaded successfully")

In [73]:
if mode == ModelRunMode.CLASSIFY.value:
    # open best params and tokenizer
    with open('lstm_model_weights/best_lstm_model_params.pkl', 'rb') as f:
        hiperparams = pickle.load(f)

    with open('lstm_model_weights/tokenizer_lstm.pkl', 'rb') as f:
        tokenizer = pickle.load(f)

    input_shape = (max_length,)
    vocab_size = len(tokenizer)
    epochs = hiperparams['epochs']
    batch_size = hiperparams['batch_size']
    learning_rate = hiperparams['learning_rate']
    lstm_units = hiperparams['lstm_units']
    dropout_rate = hiperparams['dropout_rate']
    embed_dim = hiperparams['embed_dim']

    # Build LSTM model
    inputs = keras.Input(shape=input_shape)  # Assuming (timesteps, features)

    x = layers.Embedding(vocab_size, embed_dim)(inputs)

    for units in lstm_units[:-1]:
        x = layers.Bidirectional(layers.LSTM(units, return_sequences=True))(x)
        x = layers.Dropout(dropout_rate)(x)

    x = layers.Bidirectional(layers.LSTM(lstm_units[-1]))(x)
    x = layers.Dropout(dropout_rate)(x)
    x = layers.Dense(128, activation='relu')(x)

    outputs = layers.Dense(1, activation='sigmoid')(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.load_weights('lstm_model_weights/best_lstm_model.h5')
    #print(model.summary())
    print("Model loaded successfully")

In [74]:
if mode == ModelRunMode.CLASSIFY.value:
    X_test = prepare_data_test(X_test, tokenizer, max_length)
    print("X_test shape:", X_test.shape)
    preds = model.predict(X_test)
    preds = [1 if pred > 0.5 else 0 for pred in preds]
    print(np.unique(preds, return_counts=True))
    preds_str = ['AI' if pred == 1 else 'Human' for pred in preds]
    result = pd.DataFrame({'ID': ids, 'Label': preds_str})
    result.to_csv(output_csv, index=False, sep='\t')
    print("Prediction saved successfully")

    # Load the validation dataset
    df_true = pd.read_csv("../tarefa_1/validation_dataset/dataset3_disclosed_output.csv", sep="\t")

    # Merge the datasets on the "ID" column, adding suffixes to distinguish the identical column names
    df_merged = pd.merge(df_true, result, on="ID", suffixes=('_true', '_pred'))

    # Calculate the number of correct predictions by comparing the "Label" columns
    num_correct = (df_merged["Label_true"] == df_merged["Label_pred"]).sum()

    # Calculate the percentage of correct predictions
    accuracy_percentage = (num_correct / len(df_merged)) * 100

    print(f"Accuracy: {accuracy_percentage:.2f}%")