### Setting up and retrieving data

#### Imports
Perform the imports the notebook needs to run. The main libraries are:
- pandas: Working with data
- transformers: DeBERTa tokenizer and pretrained encoder
- tensorflow: Building and running the model
- sklearn: Evaluation metrics

In [12]:
import os

import pandas as pd

from transformers import AutoTokenizer, TFAutoModel

import tensorflow as tf

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score

In [13]:
# Hyperparameters
# NOTE(review): MAX_VOCAB_SIZE, EPOCHS and BATCH_SIZE are not referenced by any cell
# visible in this notebook — presumably carried over from the training notebook.
MAX_VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 100  # token length every encoded sentence pair is padded/truncated to
EPOCHS = 10
BATCH_SIZE = 16
LSTM_UNITS = 64  # units of the LSTM wrapped by the Bidirectional layer

# Seed every source of randomness in one call: Python's `random`, NumPy and
# TensorFlow. Seeding TensorFlow alone leaves the other two generators unpinned.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Pretrained checkpoint used for both the tokenizer and the encoder
MODEL_NAME = 'microsoft/deberta-v3-base'
# Artefact file names; only WEIGHTS_FILE is read by the cells in this notebook
TOKENIZER_NAME = 'bilstm.deberta-v3-base.tokenizer.json'
SAVED_NAME = 'bilstm.deberta.keras'
WEIGHTS_FILE = 'deberta-v3-base.weights.h5'

#### Load CSV files
Load the test set from `test.csv` for preprocessing.

In [14]:
# Read the test set; its `premise` and `hypothesis` columns are used when encoding the model inputs
test = pd.read_csv('test.csv')

#### Set up BERT-based Tokenizer
Instantiates the tokenizer for the model name defined above and defines the functions used to encode sentences and sentence pairs.

In [15]:
# Instantiate the tokenizer that belongs to the MODEL_NAME checkpoint (DeBERTa-v3)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Encode a single sentence into token ids
def encode_sentence(s):
    """Tokenise `s`, append a '[SEP]' token and return the matching token ids.

    Uses the module-level `tokenizer` for both tokenisation and id lookup.
    """
    return tokenizer.convert_tokens_to_ids([*tokenizer.tokenize(s), '[SEP]'])

# Encode sentence pairs for the encoder, padded/truncated to max_length tokens
def bert_encode(hypotheses, premises, tokenizer, max_length=MAX_SEQUENCE_LENGTH):
    """Tokenise paired sentences and package them as named model inputs.

    Args:
        hypotheses: Sentences tokenised as the FIRST segment of each pair.
        premises: Sentences tokenised as the SECOND segment of each pair.
        tokenizer: Callable tokenizer whose result exposes 'input_ids',
            'attention_mask' and 'token_type_ids'.
        max_length: Length every pair is padded or truncated to.

    Returns:
        dict with keys 'input_word_ids', 'input_mask' and 'input_type_ids',
        each a dense tensor built from the corresponding tokenizer output.

    NOTE(review): the notebook's call site passes premises first and hypotheses
    second, so the parameter names are swapped relative to how the function is
    actually used. Confirm against the training notebook before renaming or
    reordering anything — the saved weights depend on the segment order.
    """
    encoded = tokenizer(
        hypotheses,
        premises,
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

    # Model input name -> key in the tokenizer output
    source_keys = {
        'input_word_ids': 'input_ids',
        'input_mask': 'attention_mask',
        'input_type_ids': 'token_type_ids',
    }

    return {
        input_name: tf.ragged.constant(encoded[key]).to_tensor()
        for input_name, key in source_keys.items()
    }



In [16]:
# Encode the test pairs: premises go in as the first segment, hypotheses as the second.
# NOTE(review): bert_encode names its first two parameters (hypotheses, premises), i.e. the
# opposite order — confirm this matches how the training data was encoded.
test_input = bert_encode(
    test['premise'].tolist(),
    test['hypothesis'].tolist(),
    tokenizer,
)

### Model Architecture

In [17]:
# Set a placeholder Weights & Biases API key.
# NOTE(review): presumably this stops wandb from prompting for a login; no cell shown
# here uses wandb, so confirm it is still needed.
os.environ["WANDB_API_KEY"] = "0"

# Define function to build the model
def build_model():
    """Build the classifier: pretrained encoder -> BiLSTM -> dense head -> sigmoid.

    Returns:
        An uncompiled tf.keras.Model taking the three inputs 'input_word_ids',
        'input_mask' and 'input_type_ids' (each int32 of length
        MAX_SEQUENCE_LENGTH) and producing a single sigmoid output per example.

    The layer creation order must stay in step with whatever produced
    WEIGHTS_FILE, since the weights are loaded into this architecture later.
    """
    # Pretrained encoder loaded from MODEL_NAME; `trainable=False` is forwarded to
    # from_pretrained (presumably freezing the encoder weights — TODO confirm)
    bert_encoder = TFAutoModel.from_pretrained(MODEL_NAME, trainable=False)

    # Tokenized input sequence (word indices), Mask to indicate real tokens/padding, Type Ids
    input_word_ids = tf.keras.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name="input_type_ids")

    # Run all 3 inputs through the encoder and keep the first element of its output
    # (presumably the per-token hidden states — TODO confirm)
    output = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]

    # BiLSTM over the encoder output, followed by batch normalisation and dropout
    # for regularisation, then a 64-unit ReLU dense layer with the same treatment
    output = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(LSTM_UNITS))(output)
    output = tf.keras.layers.BatchNormalization()(output)
    output = tf.keras.layers.Dropout(0.1)(output)
    output = tf.keras.layers.Dense(64, activation='relu')(output)
    output = tf.keras.layers.BatchNormalization()(output)
    output = tf.keras.layers.Dropout(0.1)(output)

    # Output layer: one sigmoid unit, i.e. a probability for binary classification
    output = tf.keras.layers.Dense(1, activation='sigmoid')(output)

    # Returns the model
    return tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)

In [18]:
# Build the model and print its layer summary
model = build_model()
model.summary()

All model checkpoint layers were used when initializing TFDebertaV2Model.

All the layers of TFDebertaV2Model were initialized from the model checkpoint at microsoft/deberta-v3-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2Model for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_word_ids (InputLayer  [(None, 100)]                0         []                            
 )                                                                                                
                                                                                                  
 input_mask (InputLayer)     [(None, 100)]                0         []                            
                                                                                                  
 input_type_ids (InputLayer  [(None, 100)]                0         []                            
 )                                                                                                
                                                                                            

In [19]:
# Compile the model with the Adam optimizer, binary cross-entropy loss and accuracy metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [20]:
# Training callbacks. No cell visible in this notebook passes them to a training call.
checkpoint_filepath = WEIGHTS_FILE

# Early stopping watches validation loss and allows 3 epochs without improvement
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
)

# The checkpoint watches validation accuracy (not loss) and writes weights only,
# keeping just the best-scoring epoch at checkpoint_filepath
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
)

In [21]:
# Load the saved weights from WEIGHTS_FILE into the freshly built model
model.load_weights(WEIGHTS_FILE)



### Predicting using the model

In [23]:
# Use the model to predict on the encoded test input; outputs come from the sigmoid output layer
outputs = model.predict(test_input)



In [24]:
# Convert probabilities to 0/1 classes by thresholding at 0.5
preds = (outputs > 0.5).astype(int)

# Save predictions to CSV as a single `prediction` column, without the index
df_predictions = pd.DataFrame(preds, columns=['prediction'])
df_predictions.to_csv("Group_70_B.csv", index=False)

# Optional evaluation, disabled here — it needs a `label` column in the test CSV.
# Get Labels for prediction
# labels = test.label.values.reshape(-1,1)

# Evaluate results
# NOTE(review): roc_auc_score is normally given scores/probabilities (`outputs`)
# rather than thresholded `preds` — confirm before re-enabling.
# print(f"Accuracy: {accuracy_score(labels, preds):.4f}")
# print(f"F1 Score: {f1_score(labels, preds):.4f}")
# print(f"Precision: {precision_score(labels, preds):.4f}")
# print(f"Recall: {recall_score(labels, preds):.4f}")
# print(f"MCC: {matthews_corrcoef(labels, preds):.4f}")
# print(f"ROC AUC Score: {roc_auc_score(labels, preds):.4f}")