## Natural language inference

### Contradictory, My Dear Watson
***

Project: https://www.kaggle.com/competitions/contradictory-my-dear-watson

### Model: BERT fine-tuning

https://huggingface.co/distilbert/distilbert-base-multilingual-cased

__Objetivo__: testear el modelo entendiendo su funcionamiento, bias y limitaciones

This model is a distilled version of the BERT base multilingual model. The code for the distillation process is linked from the model card above. This model is cased: it distinguishes between english and English.

The model is trained on the concatenation of Wikipedia in 104 different languages (listed in the model card above). The model has 6 layers, a hidden dimension of 768 and 12 heads, totaling 134M parameters (compared to 177M parameters for mBERT-base). On average, this model, referred to as DistilmBERT, is twice as fast as mBERT-base.

**Importar Librerias**

In [12]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
from tensorflow import keras
import keras_nlp
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Record the library versions this run uses.
print(f"TensorFlow version: {tf.__version__}")
print(f"KerasNLP version: {keras_nlp.__version__}")

TensorFlow version: 2.16.1
KerasNLP version: 0.14.4


__Conf__

In [None]:
# Pick the tf.distribute strategy used by the training cells: the TPU when one
# can be resolved, otherwise TensorFlow's default strategy.
try:
    # Detect and initialise the TPU.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
    print("All devices: ", tf.config.list_logical_devices('TPU'))
except ValueError as err:
    # A ValueError here is treated as "no TPU available". Report it instead of
    # falling back silently, so it is clear which hardware later cells run on.
    print("TPU not available, using the default strategy:", err)
    strategy = tf.distribute.get_strategy()  # default strategy if no TPU available

print("Replicas in sync:", strategy.num_replicas_in_sync)

INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.
INFO:tensorflow:Initializing the TPU system: local


free(): corrupted unsorted chunks
https://symbolize.stripped_domain/r/?trace=78ac5bd76e2c,78ac5bd2804f,5796fbbbe3bf,5796fbbbe3bf&map= 
*** SIGABRT received by PID 13 (TID 843) on cpu 88 from PID 13; stack trace: ***
PC: @     0x78ac5bd76e2c  (unknown)  (unknown)
    @     0x78ab69290387        928  (unknown)
    @     0x78ac5bd28050       9808  (unknown)
    @     0x5796fbbbe3c0  (unknown)  (unknown)
    @     0x5796fbbbe3c0  (unknown)  (unknown)
https://symbolize.stripped_domain/r/?trace=78ac5bd76e2c,78ab69290386,78ac5bd2804f,5796fbbbe3bf,5796fbbbe3bf&map= 
E0917 22:02:54.792257     843 coredump_hook.cc:442] RAW: Remote crash data gathering hook invoked.
E0917 22:02:54.792270     843 client.cc:269] RAW: Coroner client retries enabled (b/136286901), will retry for up to 30 sec.
E0917 22:02:54.792273     843 coredump_hook.cc:537] RAW: Sending fingerprint to remote end.
E0917 22:02:54.792300     843 coredump_hook.cc:546] RAW: Cannot send fingerprint to Coroner: [NOT_FOUND] stat failed on

In [None]:
# Maps each integer class id (0, 1, 2) to its NLI class name.
RESULT_DICT = dict(enumerate(("entailment", "neutral", "contradiction")))

#### Data

In [None]:
# Read the competition's train and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/contradictory-my-dear-watson/train.csv',
        '/kaggle/input/contradictory-my-dear-watson/test.csv',
    )
)

In [None]:
def display_pair_of_sentence(x):
    """Print the premise, hypothesis, language and label of one example.

    Args:
        x: A mapping-like row (for example a DataFrame row) with the keys
            'premise', 'hypothesis', 'language' and 'label'.

    The values are formatted rather than concatenated to a string, so a
    non-string entry (such as a missing value) is printed instead of raising
    TypeError.
    """
    print(f"Premise : {x['premise']}")
    print(f"Hypothesis: {x['hypothesis']}")
    print(f"Language: {x['language']}")
    print(f"Label: {x['label']}")
    print()

# Print the first five training pairs, then show the size of the training table.
df_train.head(5).apply(display_pair_of_sentence, axis=1)

df_train.shape

In [None]:
# Character length of every premise and hypothesis, followed by summary statistics.
df_train["premise_length"] = df_train["premise"].map(len)
df_train["hypothesis_length"] = df_train["hypothesis"].map(len)
df_train[["hypothesis_length", "premise_length"]].describe()

#### Data preprocessing

In [None]:
# Keep only the text columns (plus the label for training). `.copy()` makes each
# result an independent frame, so the columns assigned to them in later cells
# do not trigger pandas' SettingWithCopyWarning.
df_train = df_train[['premise', 'hypothesis', 'label']].copy()
df_test = df_test[['premise', 'hypothesis']].copy()

#### Train model

In [None]:
from transformers import BertTokenizer, TFBertModel, TFAutoModel,AutoTokenizer

In [None]:
# Checkpoint name and the tokenizer loaded for it.
# NOTE(review): the notebook introduction describes
# distilbert-base-multilingual-cased, while this checkpoint is (by its name) an
# XLM-RoBERTa model — confirm which one is intended.
model_name ='joeddav/xlm-roberta-large-xnli'
tokenizer = AutoTokenizer.from_pretrained(model_name)

#### Tokenize

In [None]:
def encode_premise_sentence(s, tok=None):
    """Convert a premise to token ids, prefixed with the tokenizer's start token.

    Args:
        s: Premise text.
        tok: Tokenizer to use; defaults to the notebook-level `tokenizer`. It
            must provide `cls_token`, `tokenize` and `convert_tokens_to_ids`.

    Returns:
        The result of `convert_tokens_to_ids` for `[cls_token] + tokens of s`.
    """
    tok = tokenizer if tok is None else tok
    # Take the special token from the tokenizer itself: the literal '[CLS]' is
    # a BERT convention and is not a special token of the XLM-RoBERTa
    # checkpoint loaded in this notebook.
    tokens = [tok.cls_token, *tok.tokenize(s)]
    return tok.convert_tokens_to_ids(tokens)

In [None]:
def encode_hypo_sentence(s, tok=None):
    """Convert a hypothesis to token ids, wrapped in the tokenizer's separator token.

    Args:
        s: Hypothesis text.
        tok: Tokenizer to use; defaults to the notebook-level `tokenizer`. It
            must provide `sep_token`, `tokenize` and `convert_tokens_to_ids`.

    Returns:
        The result of `convert_tokens_to_ids` for
        `[sep_token] + tokens of s + [sep_token]`.
    """
    tok = tokenizer if tok is None else tok
    # Take the separator from the tokenizer itself: the literal '[sep]' is not
    # a special token of the XLM-RoBERTa checkpoint loaded in this notebook
    # (and BERT spells it '[SEP]').
    tokens = [tok.sep_token, *tok.tokenize(s), tok.sep_token]
    return tok.convert_tokens_to_ids(tokens)

In [None]:
# Quick check of the hypothesis encoder on an arbitrary string.
encode_hypo_sentence("jsalkgfad")

In [None]:
# Quick check of the premise encoder on an arbitrary string.
encode_premise_sentence("jsalkgfad")

In [None]:
# Hand-built ids for every (premise, hypothesis) pair: premise ids followed by
# hypothesis ids. Iterating over the column values (instead of looking rows up
# by the labels 0..n-1) keeps this correct when the frame's index is not a
# default RangeIndex, e.g. after filtering or shuffling rows.
tokenized = [
    encode_premise_sentence(premise) + encode_hypo_sentence(hypothesis)
    for premise, hypothesis in zip(df_train['premise'], df_train['hypothesis'])
]
df_train['tokenized'] = tokenized
df_train.head()

#### Attention Mask and Token Type ID

In [None]:
# Encode every (premise, hypothesis) pair with the tokenizer; each result is
# later read for its `input_ids` and `attention_mask` by `input_convert`.
# Each pair is encoded in its own call, so `padding=True` (pad to the longest
# sequence of the call) adds no padding here; the sequences are padded to a
# common length later, in `input_convert`.
# Iterating over the column values avoids assuming a default 0..n-1 index.
mask = [
    tokenizer(premise, hypothesis, padding=True, add_special_tokens=True)
    for premise, hypothesis in zip(df_train['premise'], df_train['hypothesis'])
]
df_train['masked'] = mask
df_train.head(5)

#### Train Model

In [None]:
# Number of token positions per example; also used to truncate `train_input`.
max_len=237
def build_model():
    """Build and compile a 3-class classifier on top of the pretrained encoder.

    The model takes two int32 inputs of length `max_len` (token ids and
    attention mask), runs them through the encoder, and feeds the first
    position of the encoder's first output to a 3-unit softmax layer.

    Returns:
        The compiled `tf.keras.Model` (Adam with learning rate 1e-5, sparse
        categorical cross-entropy, accuracy metric).
    """
    bert_encoder = TFAutoModel.from_pretrained('joeddav/xlm-roberta-large-xnli')
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    embedding = bert_encoder([input_word_ids, input_mask])[0]
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:,0,:])

    model = tf.keras.Model(inputs=[input_word_ids, input_mask], outputs=output)
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # rejected by the Keras version bundled with TensorFlow 2.16.
    model.compile(tf.keras.optimizers.Adam(learning_rate=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
def input_convert(data):
    """Stack per-example tokenizer outputs into two dense tensors.

    Args:
        data: Iterable of mappings that each provide 'input_ids' and
            'attention_mask'.

    Returns:
        A dict with the keys 'input_word_ids' and 'input_mask'; each value is
        built from the per-example lists with
        `tf.ragged.constant(...).to_tensor()`.
    """
    ids_per_example = []
    mask_per_example = []
    for encoding in data:
        ids_per_example.append(encoding['input_ids'])
        mask_per_example.append(encoding['attention_mask'])

    return {
        'input_word_ids': tf.ragged.constant(ids_per_example).to_tensor(),
        'input_mask': tf.ragged.constant(mask_per_example).to_tensor(),
    }

In [None]:
# Convert the per-example encodings to dense tensors and keep at most
# `max_len` positions of each.
train_input = {
    key: tensor[:, :max_len]
    for key, tensor in input_convert(df_train['masked'].values).items()
}

In [None]:
# Show only the shape of each input tensor rather than dumping the tensors.
{key: tensor.shape for key, tensor in train_input.items()}

In [None]:
from transformers import TFXLMRobertaModel

In [None]:
import tensorflow as tf
from transformers import TFXLMRobertaModel

def build_model():
    """Build and compile a 3-class softmax classifier over XLM-RoBERTa.

    Two int32 inputs of length `max_len` (token ids and attention mask) go
    through the pretrained encoder; the first position of its
    `last_hidden_state` feeds a 3-unit softmax layer.

    Returns:
        The compiled `tf.keras.Model` (Adam with learning rate 1e-5, sparse
        categorical cross-entropy, accuracy metric).
    """
    max_len = 100  # Adjust as needed
    seq_shape = (max_len,)

    # Symbolic inputs: token ids and attention mask.
    input_word_ids = tf.keras.Input(name="input_word_ids", shape=seq_shape, dtype=tf.int32)
    input_mask = tf.keras.Input(name="input_mask", shape=seq_shape, dtype=tf.int32)

    # Pretrained encoder and its per-token hidden states.
    bert_encoder = TFXLMRobertaModel.from_pretrained('joeddav/xlm-roberta-large-xnli')
    hidden_states = bert_encoder(
        input_ids=input_word_ids,
        attention_mask=input_mask,
    ).last_hidden_state

    # Classify from the hidden state of the first position.
    first_token_state = hidden_states[:, 0, :]
    classification_head = tf.keras.layers.Dense(3, activation='softmax')
    output = classification_head(first_token_state)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask], outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)  # Adjust learning rate as needed
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model


In [None]:
import tensorflow as tf
from transformers import TFXLMRobertaModel

def build_model():
    """Variant of `build_model` that calls the encoder through a `tf.function`.

    Creates two int32 inputs of length `max_len`, runs the pretrained
    XLM-RoBERTa encoder on them via `get_model_outputs`, feeds the first
    position of `last_hidden_state` to a 3-unit softmax layer and compiles the
    model with Adam (learning rate 1e-5) and sparse categorical cross-entropy.

    Returns:
        The compiled `tf.keras.Model`.
    """
    max_len = 100  # Adjust as needed

    # Define input layers using tf.keras.Input
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")

    # Instantiate the transformer model
    bert_encoder = TFXLMRobertaModel.from_pretrained('joeddav/xlm-roberta-large-xnli')

    # Wrap the encoder call in a tf.function.
    # NOTE(review): the wrapper is called below with the symbolic Keras inputs
    # while the functional model is being defined. Whether a tf.function
    # accepts those depends on the installed Keras/transformers versions —
    # confirm this cell actually runs; the direct encoder call used by the
    # other `build_model` cells may be what is needed.
    @tf.function
    def get_model_outputs(input_ids, attention_mask):
        return bert_encoder(input_ids=input_ids, attention_mask=attention_mask)

    # Forward pass through the model
    encoder_outputs = get_model_outputs(input_word_ids, input_mask)
    embedding = encoder_outputs.last_hidden_state  # Use last_hidden_state for embeddings

    # Define the output layer
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:, 0, :])

    # Build and compile the model
    model = tf.keras.Model(inputs=[input_word_ids, input_mask], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),  # Adjust learning rate as needed
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


In [None]:
 import tensorflow as tf
from transformers import TFXLMRobertaModel

def build_model(max_len=100):
    """Build and compile a 3-class softmax classifier over XLM-RoBERTa.

    Args:
        max_len: Number of token positions per example. The id and mask
            tensors passed to `fit`/`predict` must have exactly this width.
            Defaults to 100, the value that used to be hard-coded here.

    Returns:
        The compiled `tf.keras.Model` (Adam with learning rate 1e-5, sparse
        categorical cross-entropy, accuracy metric).
    """
    # Define input layers using tf.keras.Input
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")

    # Instantiate the transformer model
    bert_encoder = TFXLMRobertaModel.from_pretrained('joeddav/xlm-roberta-large-xnli')

    # Call the encoder directly: the former inner helper only forwarded its
    # arguments and performed no conversion of its own.
    encoder_outputs = bert_encoder(input_ids=input_word_ids, attention_mask=input_mask)
    embedding = encoder_outputs.last_hidden_state  # Get the last hidden state

    # Define the output layer, fed with the hidden state of the first position
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:, 0, :])

    # Build and compile the model
    model = tf.keras.Model(inputs=[input_word_ids, input_mask], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),  # Adjust learning rate as needed
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


In [None]:
# Tokenize all training pairs in one call, padded/truncated to the fixed width
# that `build_model` declares for its inputs.
MODEL_MAX_LEN = 100  # must equal `max_len` used by `build_model`
train_enc = tokenizer(
    df_train['premise'].tolist(),
    df_train['hypothesis'].tolist(),
    padding='max_length',
    truncation=True,
    max_length=MODEL_MAX_LEN,
)
train_input = {
    'input_word_ids': tf.convert_to_tensor(train_enc['input_ids'], dtype=tf.int32),
    'input_mask': tf.convert_to_tensor(train_enc['attention_mask'], dtype=tf.int32)
}

# Created here so this cell does not depend on a later cell having been run.
early_stop = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

# Build the model, show its layers and train it with a 10% validation split.
model = build_model()
model.summary()
model.fit(train_input, df_train['label'].values, epochs=5, verbose=1, batch_size=128, validation_split=0.1, callbacks=[early_stop])


In [None]:
# Stop training once the monitored metric has not improved for three epochs,
# and restore the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    patience=3,
    restore_best_weights=True,
)

# Build, summarise and fit the model inside the distribution strategy scope.
with strategy.scope():
    model = build_model()
    model.summary()
    model.fit(
        train_input,
        df_train['label'].values,
        epochs=5,
        verbose=1,
        batch_size=128,
        validation_split=0.1,
        callbacks=[early_stop],
    )

#### Prediction

In [None]:
# Encode every test (premise, hypothesis) pair with the tokenizer.
# Iterating over the column values (instead of looking rows up by the labels
# 0..n-1) avoids assuming a default RangeIndex on `df_test`. Each pair is
# encoded in its own call, so `padding=True` adds no padding here.
mask = [
    tokenizer(premise, hypothesis, padding=True, add_special_tokens=True)
    for premise, hypothesis in zip(df_test['premise'], df_test['hypothesis'])
]
df_test['masked'] = mask
df_test.head()

In [None]:
# Build the model inputs for the test set from the encodings stored in
# `df_test['masked']`, padded/truncated to the width the model was built with,
# then take the most probable class for every example.
seq_len = model.inputs[0].shape[1]
test_encodings = df_test['masked'].tolist()
test_input = {
    'input_word_ids': tf.ragged.constant(
        [enc['input_ids'] for enc in test_encodings]
    ).to_tensor(shape=[None, seq_len]),
    'input_mask': tf.ragged.constant(
        [enc['attention_mask'] for enc in test_encodings]
    ).to_tensor(shape=[None, seq_len]),
}
predictions = list(np.argmax(model.predict(test_input), axis=1))

In [None]:
# Fraction of rows held out for validation, the resulting number of training
# rows, and a batch size scaled by the number of replicas of the strategy.
VALIDATION_SPLIT = 0.2
TRAIN_SIZE = int(len(df_train) * (1 - VALIDATION_SPLIT))
BATCH_SIZE = strategy.num_replicas_in_sync * 16

In [None]:
def split_labels(x, y):
    """Return the first two elements of `x` as a tuple, with `y` passed through.

    Args:
        x: Indexable features with at least two elements.
        y: Label, returned unchanged.

    Returns:
        ((x[0], x[1]), y)
    """
    first, second = x[0], x[1]
    features = (first, second)
    return features, y


# Pair every (premise, hypothesis) row with its one-hot encoded label.
training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            df_train[['premise','hypothesis']].values,
            keras.utils.to_categorical(df_train['label'], num_classes=3)
        )
    )
)

# The first TRAIN_SIZE examples train the model; the remainder validates it.
train_dataset = training_dataset.take(TRAIN_SIZE)
val_dataset = training_dataset.skip(TRAIN_SIZE)

# `split_labels` turns each [premise, hypothesis] row into a tuple of two
# features. [`tf.data.AUTOTUNE`](https://www.tensorflow.org/api_docs/python/tf/data/AUTOTUNE) and `prefetch()` are options to tune performance, see
# https://www.tensorflow.org/guide/data_performance for details.
#
# Keras does not shuffle `tf.data.Dataset` inputs, so the training pipeline
# shuffles explicitly: cache the mapped examples once, reshuffle them every
# epoch, then batch. The seed keeps the order reproducible between runs.
train_preprocessed = (
    train_dataset
    .map(split_labels, tf.data.AUTOTUNE)
    .cache()
    .shuffle(max(TRAIN_SIZE, 1), seed=42, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)
val_preprocessed = val_dataset.map(split_labels, tf.data.AUTOTUNE).batch(BATCH_SIZE, drop_remainder=True).cache().prefetch(tf.data.AUTOTUNE)

In [None]:
# Load the "distil_bert_base_multi" DistilBERT classifier preset with a 3-class
# head, then compile and summarise it inside the distribution strategy scope.
with strategy.scope():
    classifier = keras_nlp.models.DistilBertClassifier.from_preset("distil_bert_base_multi", num_classes=3)

    # In distributed training, the recommendation is to scale the batch size and
    # the learning rate with the number of workers. The loss is configured with
    # from_logits=True, i.e. it treats the classifier outputs as logits.
    classifier.compile(optimizer=keras.optimizers.Adam(1e-5*strategy.num_replicas_in_sync),
                       loss=keras.losses.CategoricalCrossentropy(from_logits=True),
                       metrics=['accuracy'])
    
    classifier.summary()

__Fine-tuning DistilBERT__

In [None]:
# Fine-tune the classifier for a fixed number of epochs, evaluating on the
# validation batches after each one; the returned history is kept.
EPOCHS = 10
history = classifier.fit(
    train_preprocessed,
    validation_data=val_preprocessed,
    epochs=EPOCHS,
)