# Bidirectional Encoder Representations from Transformers (BERT) Notebook

```md
@authors: Grupo 03
```

In [None]:
# Notebook Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import json
import tensorflow as tf
import shutil
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from enum import Enum

2025-03-23 23:25:22.779449: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-23 23:25:22.792779: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742772322.807279   51676 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742772322.811841   51676 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742772322.823207   51676 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# Model run enum
class ModelRunMode(Enum):
    """
    Execution modes this notebook can run in.

    The string values are what the ``mode`` setting is compared against:

    * ``TRAIN``    -> ``"train"``    (fit and save a model)
    * ``CLASSIFY`` -> ``"classify"`` (load a saved model and label new texts)
    """

    TRAIN = "train"
    CLASSIFY = "classify"

In [3]:
# Run configuration
# `mode` selects which branch the parameter and execution cells take.
# Options:
#   ModelRunMode.TRAIN.value            (Train the model)
#   ModelRunMode.CLASSIFY.value         (Classify data)
mode = ModelRunMode.CLASSIFY.value
# Prefix used to build the names of the saved model, tokenizer and config files
model_prefix = "llm_bert_model"
# Directory the model artifacts are written to (training) and read from (classification)
file_path = "llm_bert_model_weights"
# Column separator used when reading and writing the CSV files (tab-separated)
separator_char = "\t"

In [None]:
# Parameters cell
# Chooses the input/output CSV paths (and, for training, the split settings)
# according to `mode`. An unknown mode aborts the cell instead of letting the
# notebook continue with `input_csv` / `output_csv` undefined.
if mode == ModelRunMode.TRAIN.value:
    # TRAIN mode: Set parameters for training
    input_csv = "../Tarefas/tarefa_1/clean_input_datasets/dataset1_enh_inputs_v2.csv"           # CSV file with training inputs (ID, Text)
    output_csv = "../Tarefas/tarefa_1/clean_output_datasets/dataset1_enh_outputs_v2.csv"        # CSV file with training outputs (ID, Label)
    test_size = 0.3                                                                             # Proportion of the dataset to use as test data
    random_state = 42                                                                           # Seed for reproducible dataset splitting
elif mode == ModelRunMode.CLASSIFY.value:
    # CLASSIFY mode: Set parameters for classification
    input_csv = "classify_input_datasets/dataset3_inputs.csv"                                   # CSV file with texts for prediction (ID, Text)
    output_csv = "classify_output_datasets/dataset3_outputs_llm_bert_model-s1.csv"              # CSV file to store prediction result
else:
    # The original code built a SystemExit instance without raising it, so
    # execution silently continued. Raise so the cell (and "Run All") stops here.
    raise ValueError(
        f'The selected option {mode!r} is not valid. Options: "train" or "classify"!'
    )

In [5]:
# Method to load and merge two datasets by ID column
def merge_data_by_id(input_csv, output_csv, sep="\t"):
    """
    Load the text file and the label file and inner-join them on ``ID``.

    Rows missing ``ID`` or the value column (``Text`` / ``Label``) are dropped,
    and only the first row of each repeated ``ID`` is kept, before joining.

    Args:
        input_csv: Path to the CSV holding the ``ID`` and ``Text`` columns.
        output_csv: Path to the CSV holding the ``ID`` and ``Label`` columns.
        sep: Field separator of both files.

    Returns:
        DataFrame containing only the IDs present in both files.
    """
    texts_raw = pd.read_csv(input_csv, sep=sep)
    labels_raw = pd.read_csv(output_csv, sep=sep)

    # Discard incomplete rows first, then collapse repeated IDs
    texts_complete = texts_raw.dropna(subset=["ID", "Text"])
    labels_complete = labels_raw.dropna(subset=["ID", "Label"])
    texts_unique = texts_complete.drop_duplicates(subset=["ID"])
    labels_unique = labels_complete.drop_duplicates(subset=["ID"])

    return texts_unique.merge(labels_unique, on="ID", how="inner")

# Method for text cleaning
def text_cleaning(text, stopwords = False):
    """
    Normalize a text and optionally strip a small fixed list of English stopwords.

    Normalization lowercases the text, deletes every character that is not a
    lowercase ASCII letter or whitespace (digits, punctuation and apostrophes
    included), and collapses whitespace runs into single spaces.

    Args:
        text: String to clean.
        stopwords: When truthy, also remove the words in the built-in stopword list.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    # Keep only a-z and whitespace
    letters_only = re.sub(r"[^a-z\s]", "", lowered)
    # Squash whitespace runs and trim both ends
    cleaned = re.sub(r'\s+', ' ', letters_only).strip()

    if not stopwords:
        return cleaned

    ignored_words = {
        "the", "of", "and", "in", "to", "is", "a", "that", "for", "are", "on", "with",
        "as", "at", "by", "from", "this", "it", "an", "be", "or", "which", "was", "were",
    }
    kept_words = [token for token in cleaned.split() if token not in ignored_words]
    return ' '.join(kept_words)

# Method to convert labels to binary
def convert_labels_to_binary_and_text(df_merged):
    """
    Turn the ``Label`` column into 0/1 targets and extract the texts.

    The ``Label`` column of ``df_merged`` is overwritten in place with its
    lowercased, whitespace-stripped form. A label equal to ``"ai"`` maps to 1,
    anything else maps to 0.

    Args:
        df_merged: DataFrame with string ``Label`` and ``Text`` columns.

    Returns:
        Tuple ``(y, texts)``: a NumPy array of 0/1 values and the list of texts.
    """
    normalized_labels = df_merged["Label"].str.lower().str.strip()
    df_merged["Label"] = normalized_labels

    is_ai = df_merged["Label"] == "ai"
    y = np.where(is_ai, 1, 0)
    return y, df_merged["Text"].tolist()

# Method to plot the learning curves
def plot_learning_curves(history):
    """
    Draw the loss and accuracy curves side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping that contains
            the ``loss`` and ``accuracy`` series; ``val_loss`` and
            ``val_accuracy`` are plotted too when present.
    """
    # (metric key, panel title, train legend label, validation legend label)
    panels = (
        ("loss", "Loss", "Train Loss", "Val Loss"),
        ("accuracy", "Accuracy", "Train Acc", "Val Acc"),
    )

    plt.figure(figsize=(12, 4))
    for position, (metric, title, train_label, val_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric], label=train_label)
        val_metric = "val_" + metric
        if val_metric in history.history:
            plt.plot(history.history[val_metric], label=val_label)
        plt.title(title)
        plt.legend()
    plt.show()

# Method to check label distribution
def check_label_distribution(df_merged):
    """
    Print how many rows carry each value of the ``Label`` column.

    Missing labels are counted as their own category.

    Args:
        df_merged: DataFrame with a ``Label`` column.
    """
    header = "Label distribution:\n"
    per_label_counts = df_merged["Label"].value_counts(dropna=False)
    print(header, per_label_counts)

# Method to print the first 5 cleaned texts
def debug_text_cleaning(df_merged):
    """
    Print up to the first five entries of the ``Text`` column, one per line.

    Args:
        df_merged: DataFrame with a ``Text`` column.
    """
    preview_size = min(len(df_merged), 5)
    if preview_size == 0:
        return
    for sample_text in df_merged["Text"].iloc[:preview_size]:
        print(sample_text)

In [6]:
def train_bert(input_csv, output_csv, model_ckpt="bert-base-uncased", output_dir=file_path, model_prefix=model_prefix, num_train_epochs=3, batch_size=8, test_size=0.2, learning_rate=1e-3, random_state=48):
    """
    Fine-tune a pretrained sequence-classification checkpoint as a binary
    "AI" vs. "Human" text classifier and save it to disk.

    Steps: merge inputs and labels by ID, clean the texts, make a stratified
    train/validation split, tokenize, train with early stopping on ``val_loss``,
    evaluate, plot the learning curves, then save the model, the tokenizer and a
    small JSON config (which records the ``max_length`` used for tokenization).

    Args:
        input_csv: CSV with ``ID`` and ``Text`` columns.
        output_csv: CSV with ``ID`` and ``Label`` columns.
        model_ckpt: Pretrained checkpoint name or path for tokenizer and model.
        output_dir: Directory for the saved artifacts. It is DELETED and
            recreated if it already exists.
        model_prefix: Prefix of the saved model/tokenizer/config names.
        num_train_epochs: Maximum number of training epochs.
        batch_size: Batch size for training and validation.
        test_size: Fraction of the data held out for validation.
        learning_rate: Adam learning rate.
        random_state: Seed for the split and for training-set shuffling.

    Returns:
        None. Results are printed, plotted and written to ``output_dir``.
    """
    print("[INFO] Loading data...")
    df_merged = merge_data_by_id(input_csv, output_csv, sep=separator_char)

    print("[INFO] Checking label distribution...")
    check_label_distribution(df_merged=df_merged)

    print("[INFO] Cleaning text...")
    df_merged["Text"] = df_merged["Text"].apply(text_cleaning)

    print("[INFO] Debugging cleaned text...")
    debug_text_cleaning(df_merged)

    print("[INFO] Converting labels and text...")
    labels, texts = convert_labels_to_binary_and_text(df_merged)

    print("[INFO] Splitting dataset...")
    X_train_texts, X_val_texts, y_train, y_val = train_test_split(texts, labels, test_size=test_size, random_state=random_state, stratify=labels)

    print("[INFO] Initializing tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    print("[INFO] Calculating token lengths...")
    raw_train_encodings = tokenizer(X_train_texts, add_special_tokens=True, truncation=False, padding=False)
    token_lengths = [len(seq) for seq in raw_train_encodings["input_ids"]]
    # Use the 90th percentile of training lengths, but never exceed the limit
    # the tokenizer reports for the checkpoint: a longer max_length would feed
    # the model sequences beyond what it supports.
    max_length = min(int(np.percentile(token_lengths, 90)), tokenizer.model_max_length)
    print(f"[INFO] Selected max_length: {max_length}")

    print("[INFO] Tokenizing train and validation datasets...")
    train_encodings = tokenizer(X_train_texts, truncation=True, padding="max_length", max_length=max_length)
    val_encodings = tokenizer(X_val_texts, truncation=True, padding="max_length", max_length=max_length)

    print("[INFO] Converting data to TensorFlow datasets...")
    # Reshuffle the training examples every epoch (seeded for reproducibility);
    # the validation set keeps its order.
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))
        .shuffle(buffer_size=len(X_train_texts), seed=random_state)
        .batch(batch_size)
    )
    val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), y_val)).batch(batch_size)

    print("[INFO] Loading model...")
    # Single output logit: trained with binary cross-entropy on logits
    model = TFAutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=1)

    print("[INFO] Compiling model...")
    # `keras` is not imported in this notebook; go through `tf.keras`, as the
    # EarlyStopping callback below already does.
    # NOTE(review): on TensorFlow >= 2.16 `tf.keras` may resolve to Keras 3,
    # which the Transformers TF models may not accept - confirm the environment
    # uses legacy Keras (tf-keras).
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    # NOTE(review): the model outputs logits; the string metric "accuracy" is
    # presumably resolved to a binary accuracy with a 0.5 threshold applied to
    # those raw logits rather than to probabilities - verify before relying on
    # the reported accuracy.
    model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

    print("[INFO] Setting up early stopping...")
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

    print("[INFO] Starting training...")
    history = model.fit(train_dataset, validation_data=val_dataset, epochs=num_train_epochs, callbacks=[early_stopping])

    print("[INFO] Evaluating model...")
    val_loss, val_acc = model.evaluate(val_dataset)
    print(f"\n[RESULT] Validation Accuracy: {val_acc:.4f} | Validation Loss: {val_loss:.4f}")

    print("[INFO] Model summary:")
    model.summary()

    print("[INFO] Plotting learning curves...")
    plot_learning_curves(history)

    print("[INFO] Preparing to save model...")
    # Start from an empty directory so stale artifacts of a previous run cannot
    # be mixed with the new ones.
    if os.path.exists(output_dir):
        print("[INFO] Removing existing output directory...")
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    model_path = os.path.join(output_dir, f"{model_prefix}_model")
    tokenizer_path = os.path.join(output_dir, f"{model_prefix}_tokenizer")
    config_path = os.path.join(output_dir, f"{model_prefix}_config.json")

    print("[INFO] Saving model...")
    model.save_pretrained(model_path)

    print("[INFO] Saving tokenizer...")
    tokenizer.save_pretrained(tokenizer_path)

    print("[INFO] Saving configuration...")
    # max_length is read back at classification time to tokenize consistently
    config_data = {
        "model_ckpt": model_ckpt,
        "num_train_epochs": num_train_epochs,
        "batch_size": batch_size,
        "max_length": max_length
    }
    with open(config_path, "w") as f:
        json.dump(config_data, f)

    print(f"[INFO] Training completed. Model, tokenizer, and config saved to {output_dir}!")


In [7]:
# Classification function
def classify_bert(input_csv, output_csv, output_dir="llm_bert_model_weights", separator_char=separator_char, model_prefix=model_prefix, batch_size=16):
    """
    Label every text of ``input_csv`` as "AI" or "Human" with a saved model and
    write the result to ``output_csv``.

    The texts go through the same ``text_cleaning`` step that training applies,
    so the model sees inputs in the form it was fitted on. Missing texts are
    treated as empty strings.

    Args:
        input_csv: CSV with ``ID`` and ``Text`` columns.
        output_csv: Destination CSV (``ID``, ``Label``); its parent directory is
            created if needed.
        output_dir: Directory holding the saved model, tokenizer and config.
        separator_char: Field separator for reading and writing the CSV files.
        model_prefix: Prefix of the saved model/tokenizer/config names.
        batch_size: Number of texts per prediction batch.

    Raises:
        ValueError: If the input CSV lacks the ``ID`` or ``Text`` column, or if
            the model does not produce exactly one score per text.
    """
    # Read and validate the input first so a malformed CSV fails before the
    # (slow) model load
    df_input = pd.read_csv(input_csv, sep=separator_char)
    if "ID" not in df_input.columns or "Text" not in df_input.columns:
        raise ValueError("Input CSV must have 'ID' and 'Text' columns for classification.")

    # Construct the file paths
    model_path = os.path.join(output_dir, f"{model_prefix}_model")
    tokenizer_path = os.path.join(output_dir, f"{model_prefix}_tokenizer")
    config_path = os.path.join(output_dir, f"{model_prefix}_config.json")

    # Load the model
    print(f"Loading model from: {output_dir}")
    model = TFAutoModelForSequenceClassification.from_pretrained(model_path)

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Loading configuration
    print("Loading configuration from:", config_path)
    with open(config_path, "r") as f:
        config_data = json.load(f)

    # Retrieve the max_length that was used when tokenizing the training data
    max_length = config_data["max_length"]

    # Extract texts: NaN becomes "" (not the literal word "nan"), then apply
    # the same cleaning as in training
    texts = df_input["Text"].fillna("").astype(str).map(text_cleaning).tolist()

    if texts:
        # Tokenize
        encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)

        # Create tf.data.Dataset
        dataset = tf.data.Dataset.from_tensor_slices(dict(encodings)).batch(batch_size)

        # Predict logits
        outputs = model.predict(dataset)
        if isinstance(outputs, dict) and "logits" in outputs:
            logits = outputs["logits"]
        else:
            logits = outputs

        # Convert logits to probabilities, one row per input text
        probs = tf.nn.sigmoid(logits).numpy().reshape(len(texts), -1)
        if probs.shape[1] != 1:
            raise ValueError(
                f"Expected one output score per text, got {probs.shape[1]}; "
                "the saved model does not look like a single-logit binary classifier."
            )

        # Threshold => "AI" vs "Human"
        pred_labels = np.where(probs[:, 0] >= 0.5, "AI", "Human").tolist()
    else:
        # Nothing to classify: still write a header-only output file
        pred_labels = []

    # Save result (make sure the destination directory exists)
    destination_dir = os.path.dirname(output_csv)
    if destination_dir:
        os.makedirs(destination_dir, exist_ok=True)
    df_out = pd.DataFrame({"ID": df_input["ID"], "Label": pred_labels})
    df_out.to_csv(output_csv, sep=separator_char, index=False)
    print(f"Predictions saved to {output_csv}")

In [8]:
# Run the step selected by `mode`
if mode == ModelRunMode.TRAIN.value:
    # Show which GPUs TensorFlow can see before the training run starts
    print("Tensorflow List of GPUs:", tf.config.list_physical_devices('GPU'))

    # Train model
    train_bert(
        input_csv=input_csv,
        output_csv=output_csv,
        model_ckpt="bert-base-uncased",
        output_dir=file_path,
        model_prefix=model_prefix,
        num_train_epochs=5,
        batch_size=8,
        test_size=test_size,
        learning_rate=1e-5,
        random_state=random_state,
    )
elif mode == ModelRunMode.CLASSIFY.value:
    # Classification
    classify_bert(
        input_csv=input_csv,
        output_csv=output_csv,
        output_dir=file_path,
        separator_char=separator_char,
    )

Loading model from: llm_bert_model_weights


2025-03-23 23:25:25.009872: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at llm_bert_model_weights/llm_bert_model_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


Loading configuration from: llm_bert_model_weights/llm_bert_model_config.json
Predictions saved to classify_output_datasets/dataset3_outputs_llm_bert_model.csv
