## BERT

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import re
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Layer, Lambda
from tensorflow.keras.models import Model
import optuna
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

### Preprocessing module

In [None]:
# Fetch the AG News train/test CSV files (no header row) from GitHub
train_url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv'
test_url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv'

train_df, test_df = (pd.read_csv(url, header=None) for url in (train_url, test_url))

# Name the three columns and build the model input text by joining
# the title and the description with a single space.
for _frame in (train_df, test_df):
    _frame.columns = ['Class Index', 'Title', 'Description']
    _frame['clean_text'] = _frame['Title'] + ' ' + _frame['Description']

In [None]:
# Load the BERT tokenizer and define tokenization function
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_for_bert(texts, max_len):
    """Tokenize a batch of texts into fixed-length BERT inputs.

    Args:
        texts: The texts to tokenize. May be a pandas Series or NumPy array
            (anything exposing ``tolist()``), any other iterable of strings,
            or a single string, which is treated as a batch of one.
        max_len: Sequence length; longer inputs are truncated and shorter
            ones are padded to exactly this many tokens.

    Returns:
        The tokenizer output holding NumPy arrays, including ``input_ids``
        and ``attention_mask``.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be split into characters by list().
        batch = [texts]
    elif hasattr(texts, 'tolist'):
        batch = texts.tolist()
    else:
        batch = list(texts)

    return tokenizer(
        batch,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='np'
    )

# Every text is truncated/padded to this many tokens
max_len = 128

# Tokenize the train and test texts (train first, then test)
X_train_bert, X_test_bert = (
    preprocess_for_bert(_frame['clean_text'], max_len)
    for _frame in (train_df, test_df)
)

# Shift the class indices down by one so labels start at 0
y_train, y_test = (
    _frame['Class Index'].values - 1
    for _frame in (train_df, test_df)
)

### Basic training module

In [None]:
# Define custom layer for BERT embedding
class BertLayer(Layer):
    """Keras layer wrapping a pretrained Hugging Face TF BERT model.

    The layer takes ``[input_ids, attention_mask]`` and returns the model's
    ``pooler_output``.

    Args:
        model_name: Name or path of the pretrained checkpoint passed to
            ``TFBertModel.from_pretrained``. Defaults to
            ``'bert-base-uncased'``.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, model_name='bert-base-uncased', **kwargs):
        super().__init__(**kwargs)
        self.model_name = model_name
        self.bert = TFBertModel.from_pretrained(model_name)

    def call(self, inputs, training=None):
        """Run BERT on ``inputs`` and return its pooled output.

        Args:
            inputs: A pair ``(input_ids, attention_mask)``.
            training: Keras training flag; forwarded to the wrapped model so
                its internal dropout follows the training/inference phase.
        """
        input_ids, attention_mask = inputs
        output = self.bert(
            input_ids, attention_mask=attention_mask, training=training
        )
        return output.pooler_output

    def get_config(self):
        """Return the layer config, including the checkpoint name."""
        config = super().get_config()
        config['model_name'] = self.model_name
        return config

# Two integer inputs of length max_len: token ids and attention mask
input_ids, attention_masks = (
    Input(shape=(max_len,), dtype=tf.int32, name=_input_name)
    for _input_name in ('input_ids', 'attention_masks')
)

# Pooled BERT output feeds the classification head
bert_encoder = BertLayer()
bert_output = bert_encoder([input_ids, attention_masks])

# Classification head: 128-unit ReLU layer, dropout, 4-way softmax
dense_1 = Dense(units=128, activation='relu')(bert_output)
dropout_1 = Dropout(rate=0.3)(dense_1)
output = Dense(units=4, activation='softmax')(dropout_1)

# Assemble and compile; labels are integer class ids, hence the sparse loss
model = Model(inputs=[input_ids, attention_masks], outputs=output)
optimizer = Adam(learning_rate=3e-5)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Stop when validation loss has not improved for 3 epochs and
# roll back to the best weights seen
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Fit the base model, holding out 10% of the training data for validation
train_inputs = [X_train_bert['input_ids'], X_train_bert['attention_mask']]
model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.1,
    callbacks=[early_stopping],
    verbose=1,
)

### Testing module 1

In [None]:
# Predict class probabilities for the test set and take the most likely class
test_inputs = [X_test_bert['input_ids'], X_test_bert['attention_mask']]
y_pred = model.predict(test_inputs)
y_pred_classes = np.argmax(y_pred, axis=1)

# Accuracy is the fraction of predictions matching the true labels
test_accuracy = np.mean(y_pred_classes == y_test)
print(f'Base Test Accuracy: {test_accuracy:.4f}')
print("Classification Report (Base Model):")
print(classification_report(y_test, y_pred_classes))

# Plot the confusion matrix as an annotated heatmap
class_ids = [0, 1, 2, 3]
conf_matrix = confusion_matrix(y_test, y_pred_classes)
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_ids,
    yticklabels=class_ids,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Base Model)')
plt.show()

### Hyperparameter selection module

In [None]:
# Define model for Optuna hyperparameter tuning
def model_builder(trial):
    """Build and compile a BERT classifier from a trial's hyperparameters.

    Three hyperparameters are requested from ``trial``: ``dropout_rate``
    (float in [0.0, 0.5]), ``learning_rate`` (float in [1e-5, 5e-5], sampled
    on a log scale) and ``num_units`` (int in [64, 512]).

    Args:
        trial: An Optuna trial, or any object exposing ``suggest_float`` and
            ``suggest_int``.

    Returns:
        A compiled Keras ``Model`` taking ``[input_ids, attention_masks]``
        and producing a 4-way softmax output.
    """
    # Hyperparameters to optimize
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    num_units = trial.suggest_int('num_units', 64, 512)

    # Model structure with hyperparameter-optimized dense layers
    input_ids = Input(shape=(max_len,), dtype='int32', name='input_ids')
    attention_masks = Input(shape=(max_len,), dtype='int32', name='attention_masks')

    # Use the BertLayer subclass, as the base model does, instead of a
    # Lambda: Keras does not track variables used inside a Lambda layer, so
    # the BERT weights would be left out of the model's weights.
    bert_output = BertLayer()([input_ids, attention_masks])
    dense_1 = Dense(num_units, activation='relu')(bert_output)
    dropout_1 = Dropout(dropout_rate)(dense_1)
    output = Dense(4, activation='softmax')(dropout_1)

    # Compile the model with optimized learning rate
    model = Model(inputs=[input_ids, attention_masks], outputs=output)
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Objective function for Optuna
def objective(trial):
    """Train one candidate model and return its validation accuracy.

    Args:
        trial: The Optuna trial supplying the hyperparameters.

    Returns:
        Accuracy on the 10% validation split after the final epoch, as a
        float. The study maximizes this value.
    """
    # Drop Keras state left over from the previous trial so that repeated
    # model construction does not keep accumulating memory.
    tf.keras.backend.clear_session()

    model = model_builder(trial)
    history = model.fit(
        [X_train_bert['input_ids'], X_train_bert['attention_mask']], y_train,
        validation_split=0.1, epochs=3, batch_size=32, verbose=1)

    # Score on the validation split rather than the test set: choosing
    # hyperparameters by test accuracy would leak the test data into model
    # selection and inflate the reported test results.
    return float(history.history['val_accuracy'][-1])

# Search for the hyperparameters that maximize the value returned by
# `objective`, then report the best trial found.
N_TRIALS = 10
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=N_TRIALS)
print(study.best_trial)

### Testing module 2

In [None]:
# Rebuild the model from the best trial of the Optuna study.
# The finished trial is passed to model_builder so that its suggest_* calls
# return the parameter values recorded for that trial.
best_trial = study.best_trial
model = model_builder(best_trial)

# Train model with best hyperparameters.
# Use the same early-stopping settings as the base model so the 10-epoch
# budget cannot overfit unchecked, the validation split is actually used,
# and the base/optimized comparison is like-for-like.
final_early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit([X_train_bert['input_ids'], X_train_bert['attention_mask']], y_train,
          validation_split=0.1, epochs=10, batch_size=32, verbose=1,
          callbacks=[final_early_stopping])

# Test the optimized model
y_pred = model.predict([X_test_bert['input_ids'], X_test_bert['attention_mask']])
y_pred_classes = y_pred.argmax(axis=1)
test_accuracy = (y_pred_classes == y_test).mean()
print(f'Optimized Test Accuracy: {test_accuracy:.4f}')
print("Classification Report (Optimized Model):")
print(classification_report(y_test, y_pred_classes))

# Display confusion matrix for optimized model
conf_matrix = confusion_matrix(y_test, y_pred_classes)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=[0, 1, 2, 3], yticklabels=[0, 1, 2, 3])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Optimized Model)')
plt.show()