## BERT

In [2]:
# Importing required libraries
import pandas as pd
import numpy as np
import re
from transformers import BertTokenizer, TFBertModel  # Import BERT tokenizer and model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
import optuna
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt




### Preprocessing module

In [11]:
# Load the AG News train/test splits from the local CSV copies.
train_df = pd.read_csv('../data/agn_train.csv')
test_df = pd.read_csv('../data/agn_test.csv')

# Alternative source (same data as in the CNN-RNN model): read the CSVs straight
# from GitHub. They are read with header=None, so column names are assigned by hand.
# train_url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv'
# test_url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv'
#
# train_df = pd.read_csv(train_url, header=None)
# test_df = pd.read_csv(test_url, header=None)
#
# train_df.columns = ['Class Index', 'Title', 'Description']
# test_df.columns = ['Class Index', 'Title', 'Description']

# Combine 'Title' and 'Description' into 'clean_text'.
# Missing values are replaced with '' first: `str + NaN` would make the whole
# combined text NaN, and a NaN entry is not usable as tokenizer input later on.
train_df['clean_text'] = train_df['Title'].fillna('') + ' ' + train_df['Description'].fillna('')
test_df['clean_text'] = test_df['Title'].fillna('') + ' ' + test_df['Description'].fillna('')

In [15]:
# Tokenizer belonging to the 'bert-base-uncased' checkpoint (shared by all cells below)
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

# Function for preprocessing data using the BERT tokenizer
def preprocess_for_bert(texts, max_len):
    """Tokenize raw texts into fixed-length BERT inputs.

    Parameters
    ----------
    texts : str, pandas.Series, numpy.ndarray or iterable of str
        Texts to encode. A single string is treated as a batch of one.
        Objects exposing ``tolist()`` (Series, ndarray) are converted with it;
        any other iterable is materialized with ``list()``.
    max_len : int
        Length every sequence is truncated/padded to.

    Returns
    -------
    The output of the module-level ``tokenizer`` call, requested as NumPy
    arrays (``return_tensors='np'``). The rest of the notebook reads its
    ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    # Normalize the input to a plain Python list of strings.
    if isinstance(texts, str):
        batch = [texts]              # guard: list('abc') would split into characters
    elif hasattr(texts, 'tolist'):
        batch = texts.tolist()       # pandas Series / numpy array
    else:
        batch = list(texts)          # list, tuple, generator, ...

    return tokenizer(
        batch,
        add_special_tokens=True,       # Add [CLS] and [SEP]
        max_length=max_len,            # Max length of the sequence
        truncation=True,               # Truncate if longer than max_len
        padding='max_length',          # Pad to max length
        return_attention_mask=True,    # Create attention mask
        return_tensors='np'            # Return numpy arrays for tensors
    )

# Sequence length (in tokens) used for every BERT input in this notebook;
# 128 or 256 are common choices.
max_len = 128

# Encode the combined title/description text of both splits
X_train_bert = preprocess_for_bert(texts=train_df['clean_text'], max_len=max_len)
X_test_bert = preprocess_for_bert(texts=test_df['clean_text'], max_len=max_len)

# Labels: shift 'Class Index' down by one so the classes become [0, 1, 2, 3]
y_train = train_df['Class Index'].sub(1).values
y_test = test_df['Class Index'].sub(1).values

### Training module

In [8]:
%%time

# Pre-trained BERT encoder used as the backbone of the classifier
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Symbolic inputs of length max_len: token ids and the matching attention mask
input_ids = Input(name='input_ids', shape=(max_len,), dtype='int32')
attention_masks = Input(name='attention_masks', shape=(max_len,), dtype='int32')

# Run the encoder and keep its pooled output as the feature vector for classification
bert_output = bert_model(input_ids, attention_mask=attention_masks)
pooled_output = bert_output.pooler_output

# Classification head: Dense(128) -> Dropout(0.3) -> Dense(64) -> softmax over 4 classes
dense_1 = Dense(units=128, activation='relu')(pooled_output)
dropout_1 = Dropout(rate=0.3)(dense_1)
dense_2 = Dense(units=64, activation='relu')(dropout_1)
output = Dense(units=4, activation='softmax')(dense_2)

# Assemble the end-to-end model
model = Model(inputs=[input_ids, attention_masks], outputs=output)

# Adam with a small learning rate, as is usual when fine-tuning BERT;
# labels are integer class ids, hence the sparse cross-entropy loss
optimizer = Adam(learning_rate=3e-5)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Train for at most 10 epochs; 10% of the training data is held out for validation
model.fit(
    x=[X_train_bert['input_ids'], X_train_bert['attention_mask']],
    y=y_train,
    batch_size=16,
    epochs=10,
    validation_split=0.1,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/10
[1m3375/3375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 30ms/step - accuracy: 0.8680 - loss: 0.3847 - val_accuracy: 0.8948 - val_loss: 0.2781
Epoch 2/10
[1m  86/3375[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:31[0m 28ms/step - accuracy: 0.8995 - loss: 0.2753

KeyboardInterrupt: 

### Hyperparameter selection module

In [31]:
%%time

# Defining hyperparameters with Optuna
def model_builder(trial, num_classes=4, pretrained_name='bert-base-uncased'):
    """Build and compile a BERT classifier whose head is configured by an Optuna trial.

    Parameters
    ----------
    trial : optuna trial object
        Supplies the hyperparameters via ``suggest_float`` / ``suggest_int``:
        ``dropout_rate`` in [0.0, 0.5], ``learning_rate`` in [1e-5, 5e-5]
        (log scale) and ``num_units`` in [64, 512].
    num_classes : int, default 4
        Size of the softmax output layer.
    pretrained_name : str, default 'bert-base-uncased'
        Checkpoint passed to ``TFBertModel.from_pretrained``.

    Returns
    -------
    A compiled Keras ``Model`` taking ``[input_ids, attention_masks]`` (each of
    length ``max_len``, read from the notebook's global scope) and producing
    class probabilities. Compiled with Adam, sparse categorical cross-entropy
    and the 'accuracy' metric.

    Notes
    -----
    NOTE(review): the run recorded in this notebook failed at the
    ``bert_model(...)`` call below with ``ValueError: ... KerasTensor ... is
    not allowed``. Presumably ``tensorflow.keras`` (source of ``Input``) and the
    Keras implementation used by transformers differ in that environment --
    confirm the installed versions (setting ``TF_USE_LEGACY_KERAS=1`` before
    TensorFlow is imported is the usual remedy; verify before relying on it).
    """
    # Hyperparameters to optimize
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)  # Small range for fine-tuning
    num_units = trial.suggest_int('num_units', 64, 512)  # Width of the hidden dense layer

    # A fresh copy of the pre-trained encoder is loaded for every call
    bert_model = TFBertModel.from_pretrained(pretrained_name)

    # Inputs for BERT (input_ids, attention_masks)
    input_ids = Input(shape=(max_len,), dtype='int32', name='input_ids')
    attention_masks = Input(shape=(max_len,), dtype='int32', name='attention_masks')

    # BERT output: the pooled output is used as the classification feature vector
    bert_output = bert_model(input_ids, attention_mask=attention_masks)
    pooled_output = bert_output.pooler_output

    # Fully connected head after BERT
    dense_1 = Dense(num_units, activation='relu')(pooled_output)
    dropout_1 = Dropout(dropout_rate)(dense_1)
    output = Dense(num_classes, activation='softmax')(dropout_1)

    # Create and compile the model
    model = Model(inputs=[input_ids, attention_masks], outputs=output)
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Function for model evaluation
def objective(trial):
    """Optuna objective: train one candidate model and return its validation accuracy.

    The score is the 'val_accuracy' of the last epoch, measured on the 10%
    validation split that ``fit`` holds out from the training data. The test
    set is deliberately NOT used here: selecting hyperparameters on test
    accuracy would leak test information and make the final test metrics
    reported later optimistic.

    Parameters
    ----------
    trial : optuna trial object
        Forwarded to ``model_builder`` to pick the hyperparameters.

    Returns
    -------
    float
        Validation accuracy after the final epoch (to be maximized).
    """
    # Local import keeps this cell self-contained. Clearing the Keras session
    # before each trial releases state left over from the previous trial's
    # model, so repeated BERT instantiations do not pile up.
    from tensorflow.keras import backend as K
    K.clear_session()

    model = model_builder(trial)

    # Train the candidate with the trial's hyperparameters
    history = model.fit(
        [X_train_bert['input_ids'], X_train_bert['attention_mask']],
        y_train,
        validation_split=0.1,
        epochs=3,  # Fine-tuning BERT usually requires few epochs
        batch_size=16,  # Smaller batch sizes are often used for BERT
        verbose=0
    )

    # 'val_accuracy' exists because the model is compiled with metrics=['accuracy']
    # and fit() is given a validation split.
    return float(history.history['val_accuracy'][-1])

# Create a study that maximizes the accuracy returned by `objective`
study = optuna.create_study(direction='maximize')

# Evaluate 10 hyperparameter configurations
study.optimize(func=objective, n_trials=10)

# Show the best trial found
print(study.best_trial)

[I 2024-10-22 18:00:28,410] A new study created in memory with name: no-name-0fcc09c3-6e51-4445-acc9-d3c76e6c52c0


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

ValueError: Exception encountered when calling layer 'tf_bert_model' (type TFBertModel).

Data of type <class 'keras.src.backend.common.keras_tensor.KerasTensor'> is not allowed only (<class 'tensorflow.python.framework.tensor.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.utils.generic.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>) is accepted for attention_mask.

Call arguments received by layer 'tf_bert_model' (type TFBertModel):
  • input_ids=<KerasTensor shape=(None, 128), dtype=int32, sparse=False, name=input_ids>
  • attention_mask=<KerasTensor shape=(None, 128), dtype=int32, sparse=False, name=attention_masks>
  • token_type_ids=None
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_values=None
  • use_cache=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False

### Testing module

In [11]:
%%time

# Retrieve the best trial found by Optuna together with its hyperparameter values
best_trial = study.best_trial
best_params = best_trial.params

# Rebuild a fresh model from the best trial's hyperparameters
model = model_builder(best_trial)

# Fit it on the training data (10% held out for validation, no progress output)
model.fit(
    [X_train_bert['input_ids'], X_train_bert['attention_mask']],
    y_train,
    validation_split=0.1,
    epochs=3,
    batch_size=16,
    verbose=0,
)

# Predict class probabilities for the test set and take the most likely class per row
y_pred = model.predict([X_test_bert['input_ids'], X_test_bert['attention_mask']])
y_pred_classes = np.argmax(y_pred, axis=1)

# Overall accuracy: fraction of test rows whose predicted class equals the label
test_accuracy = np.mean(y_pred_classes == y_test)
print(f'Test Accuracy: {test_accuracy:.4f}')

# Per-class precision, recall and F1-score
print("Classification Report:")
print(classification_report(y_test, y_pred_classes))

# Confusion matrix of true vs. predicted labels
conf_matrix = confusion_matrix(y_test, y_pred_classes)

# Draw the confusion matrix as an annotated heatmap
class_ids = [0, 1, 2, 3]
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_ids,
    yticklabels=class_ids,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()


ValueError: No trials are completed yet.