In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers

## Configuration for BERT models incorporated with masked attention

In [2]:
# Hyper-parameters shared by the cells below.
max_length = 128  # sequence length used by the tokenizer and the model inputs
batch_size = 32
epochs = 2

# Class names; a position in this list is the class index that
# `check_similarity` looks up after taking the argmax of the prediction.
labels = [
    "contradiction",
    "entailment",
    "neutral",
]

In [None]:
# Download the SNLI archive and extract it in the working directory; the
# loading cell below reads the CSV files from `SNLI_Corpus/`.
!curl -LO https://raw.githubusercontent.com/MohamadMerchant/SNLI/master/data.tar.gz
!tar -xvzf data.tar.gz

In [4]:
# Read the three splits. Only the first 100,000 rows of the training file
# are loaded; the dev and test files are read in full.
train_df = pd.read_csv(
    "SNLI_Corpus/snli_1.0_train.csv",
    nrows=100000,
)
valid_df = pd.read_csv("SNLI_Corpus/snli_1.0_dev.csv")
test_df = pd.read_csv("SNLI_Corpus/snli_1.0_test.csv")

In [None]:
# Number of rows loaded for the training split.
print("Total train samples: " + str(len(train_df)))

In [None]:
# Number of rows loaded for the validation split.
print("Total validation samples: " + str(len(valid_df)))

In [None]:
# Number of rows loaded for the test split (this previously reported the
# validation frame's row count under the "test" label).
print("Total test samples: " + str(test_df.shape[0]))

Looking at the dataset

In [None]:
# Show one example (index label 1) from the training split.
sample_row = train_df.loc[1]
print("Sentence1: " + str(sample_row["sentence1"]))
print("Sentence2: " + str(sample_row["sentence2"]))
print("Similarity: " + str(sample_row["similarity"]))

# Preprocessing

In [None]:
# Count missing values per column of the training split.
print(train_df.isna().sum())

In [15]:
# Drop every training row that contains a missing value. The result is
# rebound to the name rather than mutating the frame with `inplace = True`,
# which pandas discourages and which makes re-running cells harder to
# reason about.
train_df = train_df.dropna(axis = 0)

In [None]:
# Re-count missing values per column of the training split.
print(train_df.isna().sum())

Inspecting the label distribution of the train/validation/test (TVT) split

In [None]:
# How many training rows carry each value of the `similarity` column.
print("Train Target Distribution")
print(train_df["similarity"].value_counts())

In [None]:
# How many validation rows carry each value of the `similarity` column.
print("Validation Target Distribution")
print(valid_df["similarity"].value_counts())

In [19]:
# Keep only rows whose `similarity` is not "-", shuffle them with a fixed
# seed, and renumber the index from 0.
train_has_label = train_df["similarity"] != "-"
train_df = (
    train_df.loc[train_has_label]
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

In [20]:
# Keep only rows whose `similarity` is not "-", shuffle them with a fixed
# seed, and renumber the index from 0.
valid_has_label = valid_df["similarity"] != "-"
valid_df = (
    valid_df.loc[valid_has_label]
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Current count of training rows per `similarity` value.
print(train_df["similarity"].value_counts())

### One-hot encoding

In [25]:
# from sklearn.preprocessing import OneHotEncoder

In [26]:
# encoder = OneHotEncoder(sparse_output = False)

In [None]:
# y_train = y_train = tf.keras.utils.encoder.fit_transform(df["label"])

In [22]:
# Integer-encode the class names: contradiction -> 0, entailment -> 1,
# any other value -> 2. Then one-hot encode into three columns.
_train_known_ids = {"contradiction": 0, "entailment": 1}
train_df["label"] = train_df["similarity"].map(
    lambda name: _train_known_ids.get(name, 2)
)
y_train = tf.keras.utils.to_categorical(train_df["label"], num_classes=3)

In [23]:
# Integer-encode the class names: contradiction -> 0, entailment -> 1,
# any other value -> 2. Then one-hot encode into three columns.
_valid_known_ids = {"contradiction": 0, "entailment": 1}
valid_df["label"] = valid_df["similarity"].map(
    lambda name: _valid_known_ids.get(name, 2)
)
y_val = tf.keras.utils.to_categorical(valid_df["label"], num_classes=3)

In [24]:
# Rows whose `similarity` is "-" belong to none of the three classes. The
# train and validation frames drop them before encoding; without the same
# filter here, the fall-through branch of the encoder below would label them
# as class 2 ("neutral") and distort the test metrics.
test_df = test_df[test_df.similarity != "-"].reset_index(drop = True)

# Integer-encode the class names (contradiction -> 0, entailment -> 1,
# otherwise 2), then one-hot encode into three columns.
test_df["label"] = test_df["similarity"].apply(lambda x: 0 if x == "contradiction" else 1 if x == "entailment" else 2)
y_test = tf.keras.utils.to_categorical(test_df.label, num_classes=3)

# Data adaptation for BERT

Version 1

Parameters:
1. Sentence pairs
2. labels
3. batch_size

______________________
Version 2

Parameters:
1. Sentence pairs
2. labels
3. batch_size
4. shuffle
5. include_targets

Returns: tuple(s) ([input_ids, attention_masks, token_type_ids], labels)

In [27]:
class BERTDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that tokenizes sentence pairs into BERT inputs, one batch at a time.

    Args:
        sentence_pairs: Array of sentence pairs. It is indexed with an integer
            index array and the selected rows are passed to the tokenizer via
            ``.tolist()``, so a 2-column NumPy string array is expected.
        labels: Targets aligned row-for-row with ``sentence_pairs``. May be
            ``None`` when ``include_targets`` is ``False``.
        batch_size: Number of pairs per batch. Defaults to the notebook-level
            ``batch_size``.
        shuffle: If ``True``, the sample order is shuffled when the generator
            is created and again at the end of every epoch.
        include_targets: If ``True`` each batch is ``(inputs, labels)``;
            otherwise only ``inputs`` is returned (prediction use).

    Each batch's ``inputs`` is the list
    ``[input_ids, attention_masks, token_type_ids]`` of ``int32`` arrays,
    padded/truncated to the notebook-level ``max_length``.
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size = batch_size,
        shuffle = True,
        include_targets = True,
    ):
        # Let the Keras base class initialise itself (newer Keras versions
        # warn when a Sequence subclass skips this call).
        super().__init__()

        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets

        # Tokenizer matching the "bert-base-uncased" checkpoint used by the model.
        self.tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case = True)

        # Sample order; permuted in place by `on_epoch_end` when shuffling.
        self.indexes = np.arange(len(self.sentence_pairs))

        # One seeded generator for the lifetime of the object: the run stays
        # reproducible, but every epoch gets a fresh permutation instead of
        # re-applying the same one.
        self._rng = np.random.RandomState(42)

        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch (a trailing partial batch is dropped)."""
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        """Tokenize and return batch number ``idx``."""
        # Positions of the samples that make up this batch.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # Each pair is encoded as a single sequence with the two sentences
        # separated by [SEP]. `padding = "max_length"` replaces the deprecated
        # `pad_to_max_length = True`, and `truncation = True` guarantees that
        # every row has exactly `max_length` tokens so the batch is rectangular.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens = True,
            max_length = max_length,
            padding = "max_length",
            truncation = True,
            return_attention_mask = True,
            return_token_type_ids = True,
            return_tensors = "tf",
        )

        # Convert the encoded features to int32 NumPy arrays.
        input_ids = np.array(encoded["input_ids"], dtype = "int32")
        attention_masks = np.array(encoded["attention_mask"], dtype = "int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype = "int32")

        # Targets are returned for training/validation; omitted for prediction.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype = "int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        """Reshuffle the sample order after each epoch when ``shuffle`` is enabled.

        This must be a method of the class (not nested inside ``__getitem__``)
        for Keras and ``__init__`` to actually invoke it.
        """
        if self.shuffle:
            self._rng.shuffle(self.indexes)

# Distribution of BERT for sentence matching

In [None]:
# Alternative with an explicit device list:
# strategy = tf.distribute.MirroredStrategy(["GPU:0", "GPU:1"])
# Default construction, no explicit device list.
strategy = tf.distribute.MirroredStrategy()

In [None]:
# Build and compile the classifier inside the distribution strategy scope so
# its variables are created under that strategy.
with strategy.scope():
    # Token ids produced by the BERT tokenizer (`max_length` ids per example).
    input_ids = tf.keras.layers.Input(
        shape = (max_length,),
        dtype = tf.int32,
        name = "input_ids",
    )

    # Attention masks tell the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape = (max_length,),
        dtype = tf.int32,
        name = "attention_masks",
    )

    # Token type ids are binary masks that distinguish the two sentences.
    token_type_ids = tf.keras.layers.Input(
        shape = (max_length,),
        dtype = tf.int32,
        name = "token_type_ids",
    )

    # Plain BERT encoder (RoBERTa is not used for now).
    bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")

    # Freeze BERT so the pretrained features are reused unchanged while the
    # new layers below are trained first.
    bert_model.trainable = False

    bert_output = bert_model.bert(
        input_ids,
        attention_mask = attention_masks,
        token_type_ids = token_type_ids,
    )

    sequence_output = bert_output.last_hidden_state
    pooled_output = bert_output.pooler_output  # not used by the head below

    # Trainable layers on top of the frozen encoder adapt the pretrained
    # features to the new data.
    bi_lstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences = True)
    )(sequence_output)

    # Hybrid pooling: concatenate average- and max-pooled BiLSTM outputs.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm)
    concat = tf.keras.layers.concatenate([avg_pool, max_pool])
    dropout = tf.keras.layers.Dropout(0.3)(concat)
    # One softmax probability per class (3 classes).
    output = tf.keras.layers.Dense(3, activation = "softmax")(dropout)

    model = tf.keras.models.Model(
        inputs = [input_ids, attention_masks, token_type_ids], outputs = output
    )

    # The metric is spelled "accuracy" (not the alias "acc") so the logged
    # keys match the ones produced after the fine-tuning recompile.
    model.compile(
        optimizer = tf.keras.optimizers.Adam(),
        loss = "categorical_crossentropy",
        metrics = ["accuracy"],
    )

In [None]:
# Show the distribution strategy object, then the model's layer summary.
print("Strategy: " + str(strategy))
model.summary()

Creating training and validation data generators

In [None]:
# Training batches: sentence pairs as a 2-column string array, paired with
# the one-hot targets, shuffled between epochs.
train_pairs = train_df[["sentence1", "sentence2"]].to_numpy().astype("str")
train_data = BERTDataGenerator(
    sentence_pairs=train_pairs,
    labels=y_train,
    batch_size=batch_size,
    shuffle=True,
)

In [34]:
# Validation batches: same construction as for training, but in fixed order.
valid_pairs = valid_df[["sentence1", "sentence2"]].to_numpy().astype("str")
valid_data = BERTDataGenerator(
    sentence_pairs=valid_pairs,
    labels=y_val,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Stage 1: train with the generators defined above.
# `use_multiprocessing = True, workers = -1` was dropped: Keras documents
# `workers` as a maximum number of processes (default 1), and -1 is not a
# valid count, so the pair requested parallelism it could not deliver. Both
# arguments are also no longer accepted by `fit()` in Keras 3.
history = model.fit(
    train_data,
    validation_data = valid_data,
    epochs = epochs,
)

# Fine-Tuning

In [37]:
# Unfreeze the BERT model so its weights are updated during fine-tuning.
bert_model.trainable = True

In [41]:
# Recompile after changing `trainable`, this time with an explicit
# Adam learning rate of 1e-5.
fine_tune_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    optimizer=fine_tune_optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Print the layer summary again, now that the model has been recompiled.
model.summary()

In [None]:
# Stage 2: train again with the same generators (fine-tuning run).
# NOTE: this rebinds `history`, replacing the object returned by the
# earlier `fit` call.
# `use_multiprocessing = True, workers = -1` was dropped: Keras documents
# `workers` as a maximum number of processes (default 1), and -1 is not a
# valid count, so the pair requested parallelism it could not deliver. Both
# arguments are also no longer accepted by `fit()` in Keras 3.
history = model.fit(
    train_data,
    validation_data = valid_data,
    epochs = epochs,
)

In [48]:
# Test batches: sentence pairs as a 2-column string array with their
# one-hot targets, in fixed order.
test_pairs = test_df[["sentence1", "sentence2"]].to_numpy().astype("str")
test_data = BERTDataGenerator(
    sentence_pairs=test_pairs,
    labels=y_test,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Evaluate on the test generator; as the cell's last expression, the
# returned value is displayed.
model.evaluate(test_data, verbose = 1)

In [55]:
# Save the model to a file in the working directory.
# NOTE(review): confirm the saved .h5 file can be reloaded with the
# transformers-based layers before relying on it.
model.save('fine_tuned_bert_model.h5')

# Testing on Custom Test Cases

In [52]:
def check_similarity(sentence1, sentence2):
    """Predict the relation between two sentences with the trained model.

    Args:
        sentence1: First sentence; converted with ``str()``.
        sentence2: Second sentence; converted with ``str()``.

    Returns:
        A ``(pred, proba)`` tuple. ``pred`` is the entry of the notebook-level
        ``labels`` list with the highest predicted score; ``proba`` is that
        score formatted as a percentage string, e.g. ``"87.25%"``.
    """
    sentence_pairs = np.array([[str(sentence1), str(sentence2)]])

    # Single-pair generator without targets. It is named `pair_data` so the
    # notebook-level `test_data` generator is not shadowed inside the function.
    pair_data = BERTDataGenerator(
        sentence_pairs,
        labels = None,
        batch_size = 1,
        shuffle = False,
        include_targets = False,
    )

    # First (and only) batch -> scores for the first (and only) example.
    scores = model.predict(pair_data[0])[0]
    idx = np.argmax(scores)

    # The model ends in a softmax, so scores lie in [0, 1]; scale by 100 so
    # the "%" suffix is truthful (0.87 is reported as "87.00%", not " 0.87%").
    proba = f"{scores[idx] * 100:.2f}%"
    pred = labels[idx]

    return pred, proba

In [None]:
# Try the classifier on a hand-written pair; the returned (pred, proba)
# tuple is displayed as the cell's output.
sentence1 = "Considerations for designing a scalable data model?"
sentence2 = "A scalable data model should take the following into account: "
check_similarity(sentence1, sentence2)