In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from transformers import AutoTokenizer, TFRobertaForQuestionAnswering, TFBertForQuestionAnswering
from sklearn import model_selection

from chaii_config import *
from chaii_models import *
from chaii_utils import *

2021-10-16 18:31:41.228752: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2021-10-16 18:31:41.228881: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Index of the fold held out for validation in this run; rows whose "kfold"
# value differs from FOLD are used for training.
FOLD = 4
# Configuration mapping used throughout the notebook (tokenizer checkpoint,
# batch size, model name). NOTE(review): muril_config is presumably provided by
# one of the star-imports above (chaii_config) -- confirm.
MODEL_CONFIG = muril_config

In [3]:
def prepare_train_features(examples_df, tokenizer, config):
    """Tokenize question/context pairs and label each feature with its answer span.

    Args:
        examples_df: DataFrame with "question", "context", "answer_text" and
            "answer_start" (character offset of the answer in the context)
            columns. It is copied, so the caller's frame is left untouched.
        tokenizer: Callable tokenizer exposing ``cls_token_id`` whose output
            supports ``pop``, item access and ``sequence_ids(i)``.
        config: Mapping with "pad_on_right", "max_length" and "doc_stride".

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" and
        "offset_mapping" removed and "start_positions" / "end_positions"
        added (one token index per feature). Features whose window does not
        contain the whole answer are labelled with the CLS token index.
    """
    examples = examples_df.copy().reset_index(drop=True)
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # The side that gets padded decides the input order, which sequence may be
    # truncated, and which sequence id marks context tokens.
    if config["pad_on_right"]:
        first_column, second_column, truncation = "question", "context", "only_second"
        context_sequence_id = 1
    else:
        first_column, second_column, truncation = "context", "question", "only_first"
        context_sequence_id = 0

    tokenized_examples = tokenizer(
        examples[first_column].to_list(),
        examples[second_column].to_list(),
        truncation=truncation,
        max_length=config["max_length"],
        stride=config["doc_stride"],
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_token_type_ids=True,
        padding="max_length",
    )

    # A long context can yield several features; this maps feature -> example.
    feature_to_example = tokenized_examples.pop("overflow_to_sample_mapping")
    # Per-feature list of (char_start, char_end) for every token.
    all_offsets = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    for feature_idx, offsets in enumerate(all_offsets):
        input_ids = tokenized_examples["input_ids"][feature_idx]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(feature_idx)

        example_idx = feature_to_example[feature_idx]
        answer_text = examples["answer_text"].values[example_idx]
        answer_start_char = examples["answer_start"].values[example_idx]
        answer_end_char = answer_start_char + len(answer_text)

        # Locate the first and last context tokens of this feature.
        context_first = 0
        while sequence_ids[context_first] != context_sequence_id:
            context_first += 1
        context_last = len(input_ids) - 1
        while sequence_ids[context_last] != context_sequence_id:
            context_last -= 1

        # Default label is the CLS token; it is kept when the answer is not
        # fully inside this feature's context window.
        start_token = end_token = cls_index
        window_has_answer = (
            offsets[context_first][0] <= answer_start_char
            and offsets[context_last][1] >= answer_end_char
        )
        if window_has_answer:
            # Walk forward past every token starting at or before the answer
            # start; the previous token is the answer's first token.
            cursor = context_first
            while cursor < len(offsets) and offsets[cursor][0] <= answer_start_char:
                cursor += 1
            start_token = cursor - 1
            # Walk backward past every token ending at or after the answer
            # end; the following token is the answer's last token.
            cursor = context_last
            while offsets[cursor][1] >= answer_end_char:
                cursor -= 1
            end_token = cursor + 1

        if cls_index in (start_token, end_token):
            start_token = end_token = cls_index
        start_positions.append(start_token)
        end_positions.append(end_token)

    return tokenized_examples

In [4]:
def create_model_input(tokenized_features, is_train=True):
    """Convert tokenized features into the list-of-arrays layout fed to the model.

    Args:
        tokenized_features: Mapping holding "input_ids", "attention_mask" and
            "token_type_ids"; when ``is_train`` is true it must also hold
            "start_positions" and "end_positions".
        is_train: When true, return labels alongside the inputs.

    Returns:
        ``(inputs, labels)`` when ``is_train`` is true, otherwise ``inputs``.
        ``inputs`` is ``[input_ids, attention_mask, token_type_ids]`` and
        ``labels`` is ``[start_positions, end_positions]``, each entry being a
        NumPy array.
    """
    inputs = [
        np.array(tokenized_features[key])
        for key in ("input_ids", "attention_mask", "token_type_ids")
    ]
    if not is_train:
        return inputs

    labels = [
        np.array(tokenized_features[key])
        for key in ("start_positions", "end_positions")
    ]
    return inputs, labels

In [6]:
# Load the tokenizer for the checkpoint named in the model configuration so
# that feature preparation uses the same vocabulary as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG["model_checkpoint"])

In [7]:
# Read the three question-answering datasets that are combined for training:
# the competition train split plus two extra CSVs (MLQA and XQuAD).
# NOTE(review): the extra files presumably share the competition's column
# layout -- confirm before concatenating.
chaii = pd.read_csv(INPUT_DIR / "chaii-hindi-and-tamil-question-answering/train.csv")
mlqa = pd.read_csv(INPUT_DIR / "chaii-data/mlqa.csv")
xquad = pd.read_csv(INPUT_DIR / "chaii-data/xquad.csv")

In [8]:
# Stack the three sources into a single frame with a fresh 0..n-1 index, then
# tag every row with the fold in which it is held out. Folds are stratified on
# language so each one keeps the overall language mix.
df = pd.concat([chaii, mlqa, xquad], ignore_index=True)
df["kfold"] = -1  # placeholder until the row is assigned to a fold
kf = model_selection.StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=1)
for fold, (_train_idx, holdout_idx) in enumerate(kf.split(X=df, y=df["language"].values)):
    df.loc[holdout_idx, "kfold"] = fold
df

Unnamed: 0,id,context,question,answer_text,answer_start,language,kfold
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,4
1,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,1
2,29d154b56,சர் அலெக்ஸாண்டர் ஃபிளெமிங் (Sir Alexander Flem...,பென்சிலின் கண்டுபிடித்தவர் யார்?,சர் அலெக்ஸாண்டர் ஃபிளெமிங்,0,tamil,3
3,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,68,tamil,3
4,b29c82c22,சூரியக் குடும்பம் \nசூரியக் குடும்பம் (Solar S...,பூமியின் அருகில் உள்ள விண்மீன் எது?,சூரியனும்,585,tamil,1
...,...,...,...,...,...,...,...
7724,57378c9b1c456719005744aa,विद्युत आवेश के परिवर्तन की समय दर के रूप में ...,इलेक्ट्रोस्टैटिक और चुंबकीय बल के योग के रूप क...,इलेक्ट्रोस्टैटिक बल,328,hindi,4
7725,5737a25ac3c5551400e51f51,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,संरचनाओं में तनाव का कारण क्या बनता है?,तनाव टेंसर,343,hindi,3
7726,5737a25ac3c5551400e51f52,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,किसी वस्तु के आयतन में क्रॉस सेक्शन क्षेत्र की...,दबाव की शर्तें,118,hindi,2
7727,5737a25ac3c5551400e51f53,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,सामान्य ताकतों से क्या जुड़ा है?,दबाव की शर्तें,118,hindi,1


In [9]:
# Build the model, placing its variables under a TPU distribution strategy
# when USE_TPU is set and building it directly otherwise.
if USE_TPU:
    # Resolve and connect to the TPU cluster, initialize it, and only then
    # create the strategy; the model must be created inside the strategy scope.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
    with strategy.scope():
        model = create_model(MODEL_CONFIG)
else:
    model = create_model(MODEL_CONFIG)

# Print the layer table as a quick sanity check of the architecture.
model.summary()

2021-10-16 18:31:56.293391: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-10-16 18:31:56.296471: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2021-10-16 18:31:56.296514: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-10-16 18:31:56.296541: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (c47c18d74a1d): /proc/driver/nvidia/version does not exist
2021-10-16 18:31:56.300098: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operation

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 504857600   input_1[0][0]                    
                                                                 input_3[0][0]                

In [10]:
class ExactMatch(keras.callbacks.Callback):
    """Print the exact-match score on an evaluation set after every epoch.

    Arguments:
        eval_df: DataFrame with "id", "language" and "answer_text" columns
            holding the ground truth.
        eval_features: Tokenized evaluation features with "input_ids",
            "attention_mask" and "token_type_ids" entries.
        tokenizer: Tokenizer passed through to ``postprocess_predictions``.
    """

    def __init__(self, eval_df, eval_features, tokenizer):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.eval_df = eval_df
        self.eval_features = eval_features
        self.tokenizer = tokenizer
        # Model inputs are fixed for the whole run, so build the arrays once.
        self.x_eval = [
            np.array(eval_features["input_ids"]),
            np.array(eval_features["attention_mask"]),
            np.array(eval_features["token_type_ids"]),
        ]

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the evaluation set and print overall / per-language scores."""
        raw_predictions = self.model.predict(self.x_eval)
        final_predictions = postprocess_predictions(
            self.eval_df, self.eval_features, raw_predictions, self.tokenizer, verbose=False
        )
        results = pd.DataFrame(
            {
                "id": self.eval_df["id"].values,
                "language": self.eval_df["language"].values,
                "answer": self.eval_df["answer_text"].values,
            }
        )
        results["prediction"] = results["id"].apply(lambda x: final_predictions[x])
        results["exact_match"] = results["prediction"] == results["answer"]
        acc = results["exact_match"].mean()
        if results["language"].nunique() > 1:
            # .mean() yields NaN for a language with no rows instead of raising
            # ZeroDivisionError, so the report still prints.
            acc_hindi = results.loc[results["language"] == "hindi", "exact_match"].mean()
            acc_tamil = results.loc[results["language"] == "tamil", "exact_match"].mean()
            print(
                f"\nepoch={epoch+1}, exact match score={acc:.2f},",
                f"exact match score (hindi)={acc_hindi:.2f},",
                f"exact match score (tamil)={acc_tamil:.2f}"
            )
        else:
            print(f"\nepoch={epoch+1}, exact match score={acc:.2f}")
        
        
class JaccardSimilarity(keras.callbacks.Callback):
    """Print the mean Jaccard similarity on an evaluation set after every epoch.

    Arguments:
        eval_df: DataFrame with "id", "language" and "answer_text" columns
            holding the ground truth.
        eval_features: Tokenized evaluation features with "input_ids",
            "attention_mask" and "token_type_ids" entries.
        tokenizer: Tokenizer passed through to ``postprocess_predictions``.
    """

    def __init__(self, eval_df, eval_features, tokenizer):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.eval_df = eval_df
        self.eval_features = eval_features
        self.tokenizer = tokenizer
        # Model inputs are fixed for the whole run, so build the arrays once.
        self.x_eval = [
            np.array(eval_features["input_ids"]),
            np.array(eval_features["attention_mask"]),
            np.array(eval_features["token_type_ids"]),
        ]

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the evaluation set and print overall / per-language scores."""
        raw_predictions = self.model.predict(self.x_eval)
        final_predictions = postprocess_predictions(
            self.eval_df, self.eval_features, raw_predictions, self.tokenizer, verbose=False
        )
        results = pd.DataFrame(
            {
                "id": self.eval_df["id"].values,
                "language": self.eval_df["language"].values,
                "answer": self.eval_df["answer_text"].values,
            }
        )
        results["prediction"] = results["id"].apply(lambda x: final_predictions[x])
        # Pair the columns directly rather than indexing each row Series by
        # position, which pandas deprecates for label-indexed Series.
        results["jaccard"] = [
            jaccard_similarity(answer, prediction)
            for answer, prediction in zip(results["answer"], results["prediction"])
        ]
        jaccard_sim = results["jaccard"].mean()
        if results["language"].nunique() > 1:
            jaccard_sim_hindi = results.loc[results["language"] == "hindi", "jaccard"].mean()
            jaccard_sim_tamil = results.loc[results["language"] == "tamil", "jaccard"].mean()
            print(
                f"\nepoch={epoch+1}, jaccard similarity={jaccard_sim:.2f},",
                f"jaccard similarity (hindi)={jaccard_sim_hindi:.2f},",
                f"jaccard similarity (tamil)={jaccard_sim_tamil:.2f}"
            )
        else:
            print(f"\nepoch={epoch+1}, jaccard similarity={jaccard_sim:.2f}")
            
            
class EarlyStoppingAtMaxJaccardSimilarity(keras.callbacks.Callback):
    """Stop training once the validation Jaccard similarity stops improving.

    After every epoch the model is evaluated on the evaluation set; the weights
    of the best-scoring epoch are remembered and put back on the model when
    training ends, whether through early stopping or the epoch limit.

      Arguments:
          patience: Number of consecutive epochs without improvement over the
          best score after which training stops.
          eval_df, eval_features, tokenizer: Ground truth and tokenizer needed in
          order to compute the Jaccard similarity.
    """

    def __init__(self, eval_df, eval_features, tokenizer, patience=0):
        super(EarlyStoppingAtMaxJaccardSimilarity, self).__init__()
        self.patience = patience
        self.eval_df = eval_df
        self.eval_features = eval_features
        self.tokenizer = tokenizer
        # Model inputs are fixed for the whole run, so build the arrays once.
        self.x_eval = [
            np.array(eval_features["input_ids"]),
            np.array(eval_features["attention_mask"]),
            np.array(eval_features["token_type_ids"]),
        ]
        # best_weights stores the weights at which the max JS occurs.
        self.best_weights = None

    def on_train_begin(self, logs=None):
        """Reset the bookkeeping so the callback can be reused across fits."""
        # Number of epochs since the score last improved.
        self.wait = 0
        # The epoch the training stops at.
        self.stopped_epoch = 0
        # Start below any reachable score so the first epoch always records
        # its weights, even when its similarity is exactly 0.
        self.best = -np.inf
        self.best_weights = None

    def on_epoch_end(self, epoch, logs=None):
        """Score the epoch; track the best weights and stop when patience runs out."""
        raw_predictions = self.model.predict(self.x_eval)
        final_predictions = postprocess_predictions(
            self.eval_df, self.eval_features, raw_predictions, self.tokenizer, verbose=False
        )
        Y_pred = [final_predictions[id_] for id_ in self.eval_df["id"].values]
        Y_true = self.eval_df["answer_text"].values
        jaccard_sim = [jaccard_similarity(Y_true[i], Y_pred[i]) for i in range(Y_true.shape[0])]
        current = np.round(np.mean(jaccard_sim), 4)
        print(f"Validation jaccard similarity: {current}")

        if current > self.best:
            self.best = current
            self.wait = 0
            # Record the best weights if current results is better.
            self.best_weights = self.model.get_weights()
        else:
            self.wait += 1
            if self.wait >= self.patience:
                self.stopped_epoch = epoch
                self.model.stop_training = True
                # best_weights can still be None if no epoch produced a
                # comparable score (e.g. NaN); skip the restore in that case.
                if self.best_weights is not None:
                    print("Restoring model weights from the end of the best epoch.")
                    self.model.set_weights(self.best_weights)

    def on_train_end(self, logs=None):
        """Report early stopping, or restore the best weights after a full run."""
        if self.stopped_epoch > 0:
            print("Epoch %05d: early stopping" % (self.stopped_epoch + 1))
        elif self.wait > 0 and self.best_weights is not None:
            # Training hit the epoch limit while the last epoch(s) were worse
            # than the best one: hand back the best weights, not the final ones.
            print("Restoring model weights from the end of the best epoch.")
            self.model.set_weights(self.best_weights)

In [11]:
# Split by fold: rows assigned to FOLD are held out, all other rows are trained on.
train_df = df.loc[df["kfold"] != FOLD, :]
val_df = df.loc[df["kfold"] == FOLD, :]

tokenized_train = prepare_train_features(train_df, tokenizer, MODEL_CONFIG)
tokenized_val = prepare_train_features(val_df, tokenizer, MODEL_CONFIG)
X_train, Y_train = create_model_input(tokenized_train)
# Validation arrays must come from the held-out fold; building them from the
# training features would make the reported validation loss a training loss.
X_val, Y_val = create_model_input(tokenized_val)

# Features used by the callback to turn raw predictions back into answer text.
eval_features = prepare_validation_features(val_df, tokenizer, MODEL_CONFIG)

# Stop after 3 epochs without a better validation Jaccard similarity.
early_stopping_at_max_js = EarlyStoppingAtMaxJaccardSimilarity(
    patience=3, eval_df=val_df, eval_features=eval_features, tokenizer=tokenizer
)

model.fit(
    X_train, Y_train,
    epochs=10,
    batch_size=MODEL_CONFIG["batch_size"],#64
    validation_data=(X_val, Y_val),
    callbacks=[early_stopping_at_max_js]
)

# Write the SavedModel through the local host's filesystem.
# NOTE(review): presumably needed so saving works when training on a TPU worker -- confirm.
save_locally = tf.saved_model.SaveOptions(experimental_io_device="/job:localhost")

path = f"{MODEL_CONFIG.model_name}-f{FOLD}"
model.save(path, options=save_locally)
print(f"Model written to {path}")

Epoch 1/10


  num_elements)


Validation jaccard similarity: 0.6602
Epoch 2/10
Validation jaccard similarity: 0.6648
Epoch 3/10
Validation jaccard similarity: 0.6494
Epoch 4/10
Validation jaccard similarity: 0.6472
Epoch 5/10
Validation jaccard similarity: 0.6431
Restoring model weights from the end of the best epoch.
Epoch 00005: early stopping


2021-10-16 18:49:35.811777: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


Model written to muril-large-cased-f4
