### Imports and hyperparameters

In [1]:
import pandas as pd
import numpy as np

import os

from transformers import AutoTokenizer, TFAutoModel

import tensorflow as tf

In [2]:
# --- Hyperparameters ---------------------------------------------------------
# Model-shape settings
MAX_SEQUENCE_LENGTH = 100
DROPOUT = 0.2

# Optimisation settings
LEARNING_RATE = 3e-5
BATCH_SIZE = 32
EPOCHS = 5

# Seed both NumPy's and TensorFlow's global generators with the same value
# so that repeated runs are less subject to random variation.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Load demo dataset

In [3]:
# Name (or path) of the CSV file holding the demo examples -- edit this to
# point at your own file.
FILE_NAME = "test.csv"

In [4]:
# Read the demo examples into a DataFrame.
demo = pd.read_csv(FILE_NAME)

# Fail early, with a clear message, if the two text columns that the encoding
# step reads are absent, instead of hitting an AttributeError further down.
if not {'premise', 'hypothesis'}.issubset(demo.columns):
    raise ValueError(
        f"{FILE_NAME!r} must contain 'premise' and 'hypothesis' columns; "
        f"found columns: {list(demo.columns)}"
    )

### Set up tokenizer for DeBERTa model

In [5]:
# Load the tokenizer associated with the "microsoft/deberta-v3-base" checkpoint.
# The same checkpoint name is used for the encoder in build_model().
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



### Tokenising and preparing inputs

In [6]:
# Encode sentence pairs into fixed-length model inputs (default length 100)
def bert_encode(hypotheses, premises, tokenizer, max_length=100):
    """Tokenise sentence pairs into fixed-width int32 tensors.

    The two lists are handed to the tokenizer in the order given, so example
    ``i`` is encoded as the pair ``(hypotheses[i], premises[i])``.

    NOTE(review): the parameter names are positional only. The call site in
    this notebook passes the premise column as the first argument and the
    hypothesis column as the second, so the premise ends up as the first
    segment. Confirm this matches how the saved weights were trained before
    changing either side.

    Args:
        hypotheses: list of strings used as the first text of each pair.
        premises: list of strings used as the second text of each pair.
        tokenizer: a Hugging Face tokenizer (called with ``padding``,
            ``truncation``, ``max_length`` and ``return_token_type_ids``).
        max_length: number of token positions every encoded pair is padded
            or truncated to.

    Returns:
        dict with keys ``'input_word_ids'``, ``'input_mask'`` and
        ``'input_type_ids'``, each an int32 tensor built from the tokenizer's
        ``input_ids``, ``attention_mask`` and ``token_type_ids`` respectively.
    """
    encoded = tokenizer(
        hypotheses,
        premises,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        # Ask for segment ids explicitly rather than relying on the
        # tokenizer's default set of outputs.
        return_token_type_ids=True,
    )

    # padding='max_length' together with truncation=True makes every row
    # exactly max_length long, so a dense constant is sufficient; no ragged
    # round-trip is needed. int32 matches the dtype of the model's inputs.
    return {
        'input_word_ids': tf.constant(encoded['input_ids'], dtype=tf.int32),
        'input_mask': tf.constant(encoded['attention_mask'], dtype=tf.int32),
        'input_type_ids': tf.constant(encoded['token_type_ids'], dtype=tf.int32),
    }

In [7]:
# Encode every (premise, hypothesis) row of the demo set. MAX_SEQUENCE_LENGTH is
# passed explicitly so the encoded width always matches the model's Input
# layers, rather than relying on bert_encode's default happening to be equal.
demo_input = bert_encode(
    demo.premise.values.tolist(),
    demo.hypothesis.values.tolist(),
    tokenizer,
    max_length=MAX_SEQUENCE_LENGTH,
)

### Define model

In [8]:
# Set a placeholder Weights & Biases API key in the process environment.
# NOTE(review): presumably this keeps any wandb integration from prompting for
# a login -- confirm it is still needed.
os.environ.update({"WANDB_API_KEY": "0"})

def build_model():
    """Assemble the DeBERTa-v3 encoder plus a single-unit sigmoid head.

    The encoder is loaded from the "microsoft/deberta-v3-base" checkpoint and
    fed three int32 inputs of width MAX_SEQUENCE_LENGTH (word ids, attention
    mask, type ids). Element 0 of the encoder output is average-pooled, passed
    through Dropout(DROPOUT) and then a Dense(1, sigmoid) layer.

    NOTE(review): no mask is passed to the pooling layer, so padded positions
    appear to be included in the average -- confirm this matches how the saved
    weights were trained before changing it.

    Returns:
        tf.keras.Model mapping [input_word_ids, input_mask, input_type_ids]
        to the sigmoid output of the final Dense layer.
    """
    encoder = TFAutoModel.from_pretrained("microsoft/deberta-v3-base")

    # Create the three named inputs in the order the encoder receives them.
    word_ids_in, mask_in, type_ids_in = [
        tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name=layer_name)
        for layer_name in ("input_word_ids", "input_mask", "input_type_ids")
    ]

    # First element of the encoder output, averaged by the pooling layer.
    encoder_output = encoder([word_ids_in, mask_in, type_ids_in])[0]
    pooled = tf.keras.layers.GlobalAveragePooling1D()(encoder_output)

    dropped = tf.keras.layers.Dropout(DROPOUT)(pooled)

    score = tf.keras.layers.Dense(1, activation='sigmoid')(dropped)

    return tf.keras.Model(inputs=[word_ids_in, mask_in, type_ids_in], outputs=score)

In [9]:
# Build the Keras model defined above and print its layer-by-layer summary.
model = build_model()
model.summary()

tf_model.h5:   0%|          | 0.00/736M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDebertaV2Model.

All the layers of TFDebertaV2Model were initialized from the model checkpoint at microsoft/deberta-v3-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2Model for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_word_ids (InputLayer  [(None, 100)]                0         []                            
 )                                                                                                
                                                                                                  
 input_mask (InputLayer)     [(None, 100)]                0         []                            
                                                                                                  
 input_type_ids (InputLayer  [(None, 100)]                0         []                            
 )                                                                                                
                                                                                              

In [10]:
# Colab-only: mount Google Drive at /content/drive so that files stored there
# (the saved model weights loaded in the next cell) are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Load the saved weights for the full model (encoder + classification head)
# from Google Drive. The path is anchored at the '/content/drive' mount point
# used above, so it resolves regardless of the current working directory.
# NOTE(review): presumably these weights come from a separate training run.
model.load_weights('/content/drive/MyDrive/NLI/deberta-v3-base.model.weights.h5')



### Compute predictions

In [12]:
# Run the model on the encoded demo set; `outputs` holds the sigmoid scores.
outputs = model.predict(demo_input)

# A score strictly above 0.5 becomes True, anything else False.
preds = np.greater(outputs, 0.5)



In [13]:
# Flatten the boolean predictions to one value per row and store them as 0/1.
demo['prediction'] = preds.ravel().astype(int)

In [None]:
# Lift pandas' column and row display limits so the whole frame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Show every row of the demo frame, including the new prediction column.
demo.iloc[:len(demo)]

### Saving predictions only

In [18]:
# Single-column frame of 0/1 labels, one row per demo example.
preds_df = pd.DataFrame(preds.ravel().astype(int), columns=['prediction'])

# Write the labels without the index column.
preds_df.to_csv('predictions.csv', index=False)