### Setting up and retrieving data

#### Imports
Perform the imports the notebook needs to run. The main libraries used are:
- pandas/numpy: Working with data
- transformers: Hugging Face tokenizer and model classes
- tensorflow: Loading the saved model and running inference
- sklearn: Evaluation metrics

In [None]:
import pandas as pd

from transformers import AutoTokenizer, TFAutoModel

import tensorflow as tf

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score

In [None]:
# Hyperparameters.
# NOTE(review): of these, only MAX_SEQUENCE_LENGTH is referenced by the cells
# visible in this notebook; the rest presumably mirror the training setup.
MAX_VOCAB_SIZE = 10_000
MAX_SEQUENCE_LENGTH = 100
EPOCHS = 10
BATCH_SIZE = 16
LSTM_UNITS = 64

# Fix TensorFlow's global random seed so runs are more repeatable.
SEED = 42
tf.random.set_seed(SEED)

# Pretrained checkpoint name and the artefact file names derived from it.
# NOTE(review): MODEL_NAME contains a '/', so TOKENIZER_NAME and WEIGHTS_FILE
# resolve to paths inside a 'bilstm.microsoft' directory — confirm intended.
MODEL_NAME = 'microsoft/deberta-v3-base'
TOKENIZER_NAME = f'bilstm.{MODEL_NAME}.tokenizer.json'
SAVED_NAME = 'bilstm.deberta.keras'
WEIGHTS_FILE = f'bilstm.{MODEL_NAME}.weights.h5'

#### Load CSV files
Load the test set from `test.csv` for preprocessing.

In [None]:
# Read the test split into a DataFrame; later cells use its `premise` and
# `hypothesis` columns.
test = pd.read_csv(filepath_or_buffer='test.csv')

#### Set up BERT-based Tokenizer
Instantiate the tokenizer for the model named above and define helper functions for encoding sentence pairs.

In [None]:
# Instantiate the pretrained tokenizer that matches the checkpoint in MODEL_NAME.
# NOTE(review): MODEL_NAME is set to a DeBERTa checkpoint, so this is a
# BERT-style tokenizer rather than the original BERT one.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Encode a single sentence into token ids
def encode_sentence(s):
    """Tokenize `s` with the module-level tokenizer and return its token ids.

    A literal '[SEP]' token is placed after the sentence tokens before the
    tokens are converted to ids.
    """
    pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)

# Encode text pairs into the three named tensors, using MAX_SEQUENCE_LENGTH as the default length
def bert_encode(hypotheses, premises, tokenizer, max_length=MAX_SEQUENCE_LENGTH):
    """Tokenize paired texts and return them as a dict of tensors.

    Args:
        hypotheses: texts passed to the tokenizer as the first element of each pair.
        premises: texts passed to the tokenizer as the second element of each pair.
        tokenizer: callable tokenizer; its result must provide 'input_ids',
            'attention_mask' and 'token_type_ids'.
        max_length: length requested from the tokenizer for padding/truncation.

    Returns:
        dict with keys 'input_word_ids', 'input_mask' and 'input_type_ids',
        each a tensor built from the corresponding tokenizer field.

    NOTE(review): the visible call site passes premises first and hypotheses
    second, i.e. the reverse of these parameter names. Only the positional
    order affects the encoding — confirm it matches what was used in training.
    """
    encoded = tokenizer(
        hypotheses,
        premises,
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

    # Output key -> tokenizer field it is built from.
    field_for_key = {
        'input_word_ids': 'input_ids',
        'input_mask': 'attention_mask',
        'input_type_ids': 'token_type_ids',
    }

    return {
        key: tf.ragged.constant(encoded[field]).to_tensor()
        for key, field in field_for_key.items()
    }

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [None]:
# Encode the test pairs for the model.
# Empty CSV cells are loaded by pandas as NaN (a float), which the tokenizer
# cannot encode, so missing text is replaced by an empty string and every value
# is cast to str before tokenizing. Rows are never dropped, so predictions stay
# aligned one-to-one with the rows of `test`.
# The argument order (premise first, hypothesis second) is kept as-is.
premise_texts = test['premise'].fillna('').astype(str).tolist()
hypothesis_texts = test['hypothesis'].fillna('').astype(str).tolist()
test_input = bert_encode(premise_texts, hypothesis_texts, tokenizer)

### Predicting using the model

In [None]:
# Restore the complete saved model from SAVED_NAME, passing TFAutoModel as a
# custom object for deserialisation.
test_model = tf.keras.models.load_model(
    SAVED_NAME,
    custom_objects={'TFAutoModel': TFAutoModel},
)

# Run the model on the encoded test input
outputs = test_model.predict(test_input)

In [None]:
# Threshold the model scores at 0.5 to obtain hard 0/1 class predictions.
preds = (outputs > 0.5).astype(int)

# Save predictions to CSV (single 'prediction' column, no index column).
df_predictions = pd.DataFrame(preds, columns=['prediction'])
df_predictions.to_csv("Group_70_B.csv", index=False)

# Evaluate only when gold labels are available: an unlabeled test file skips
# this step, while a labeled split gets metrics without editing the cell.
# NOTE(review): assumes `label` holds binary 0/1 targets — confirm.
if 'label' in test.columns:
    labels = test.label.values.reshape(-1, 1)

    print(f"Accuracy: {accuracy_score(labels, preds):.4f}")
    print(f"F1 Score: {f1_score(labels, preds):.4f}")
    print(f"Precision: {precision_score(labels, preds):.4f}")
    print(f"Recall: {recall_score(labels, preds):.4f}")
    print(f"MCC: {matthews_corrcoef(labels, preds):.4f}")
    # ROC AUC is a ranking metric, so it is computed from the continuous model
    # scores rather than from the thresholded classes.
    print(f"ROC AUC Score: {roc_auc_score(labels, outputs.ravel()):.4f}")