In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import tensorflow as tf
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    # Print the full path of every file found below the input directory.
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/watson/pred.txt
/kaggle/input/contradictory-my-dear-watson/sample_submission.csv
/kaggle/input/contradictory-my-dear-watson/test.csv
/kaggle/input/contradictory-my-dear-watson/train.csv
/kaggle/input/test-pred/test361_0.npy
/kaggle/input/test-pred/test_1.npy
/kaggle/input/test-pred/test361_1.npy
/kaggle/input/test-pred/test361_2.npy
/kaggle/input/test-pred/test_2.npy
/kaggle/input/test-pred/test_0.npy
/kaggle/input/testpred2/test361_0.npy
/kaggle/input/testpred2/test_1.npy
/kaggle/input/testpred2/test361_1.npy
/kaggle/input/testpred2/test361_2.npy
/kaggle/input/testpred2/test_2.npy


In [2]:
# `%pip` installs into the environment of the running kernel (a bare `!pip3` may target another interpreter).
%pip install transformers



In [3]:
# Pinned to the version recorded in this cell's output so that re-runs are reproducible.
%pip install nlp==0.4.0

Collecting nlp
  Downloading nlp-0.4.0-py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 4.4 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 43.4 MB/s 
Installing collected packages: xxhash, nlp
Successfully installed nlp-0.4.0 xxhash-2.0.0


In [4]:
# Pinned to the version recorded in this cell's output so that re-runs are reproducible.
%pip install datasets==1.1.3

Collecting datasets
  Downloading datasets-1.1.3-py3-none-any.whl (153 kB)
[K     |████████████████████████████████| 153 kB 4.4 MB/s 
Installing collected packages: datasets
Successfully installed datasets-1.1.3


In [5]:
import os
os.environ["WANDB_API_KEY"] = "0" ## to silence warning

import numpy as np
import random
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import transformers
import tokenizers
import nlp

import datetime
import json
import IPython
from collections import Counter
from IPython.display import display, HTML, IFrame
from datasets import load_dataset

strategy = None

In [7]:
# Allow TensorFlow to fall back to another device when an op cannot be placed on the requested one.
tf.config.set_soft_device_placement(True)
# Log the device every op runs on (verbose: this is the source of the "Executing op ..." lines in cell outputs).
tf.debugging.set_log_device_placement(True)
# Replace the `strategy = None` placeholder from the imports cell with a mirrored (multi-GPU) strategy.
strategy = tf.distribute.MirroredStrategy()

In [8]:
# Read the competition's train and test CSV files.
original_train = pd.read_csv("/kaggle/input/contradictory-my-dear-watson/train.csv")
original_test = pd.read_csv("/kaggle/input/contradictory-my-dear-watson/test.csv")

# Language columns of the full training file (taken before shuffling and splitting).
lang_abvs = original_train['lang_abv'].values
langs = original_train['language'].values

# Shuffle once, then hold out the first 30% of the rows (at least one row) for validation.
validation_ratio = 0.3
original_train = sklearn.utils.shuffle(original_train, random_state=SEED)
nb_valid_examples = max(1, int(len(original_train) * validation_ratio))

# The right-hand side is evaluated completely before either name is rebound.
original_valid, original_train = (
    original_train.iloc[:nb_valid_examples],
    original_train.iloc[nb_valid_examples:],
)

In [None]:
# Lookup table from the dataset names accepted in a `ds_spec` to the raw DataFrames.
raw_ds_mapping = dict([
    ('original train', original_train),
    ('original valid', original_valid),
    ('original test', original_test),
])
def get_raw_dataset(ds_spec, langs=None, shuffle=False, nb_examples=None):
    """
    Build one combined `pandas.DataFrame` from the raw datasets named in `ds_spec`.

    Args:
        ds_spec: dict or list. A dict maps keys of `raw_ds_mapping` to the number of examples to take
            from the corresponding dataset (a falsy value means all of them). A list of keys is
            shorthand for taking every example of each listed dataset.
        langs: list or `None`. If given, only rows whose `lang_abv` is in this list are kept.
        shuffle: bool, if to shuffle each raw dataset before sampling from it, and the combined
            dataset afterwards (both with `random_state=SEED_2`).
        nb_examples: int or `None`. How many rows of the combined dataset to keep. A falsy value keeps all.

    Returns:
        A `pandas.DataFrame` containing the selected rows.
    """
    with tf.device('/device:GPU:0'):
        if type(ds_spec) == list:
            ds_spec = dict.fromkeys(ds_spec)

        selected_parts = []
        for ds_name, requested in ds_spec.items():

            part = raw_ds_mapping[ds_name]

            # A falsy request (`None` or 0) means "take the whole dataset".
            requested = requested if requested else len(part)

            if shuffle:
                part = sklearn.utils.shuffle(part, random_state=SEED_2)

            if langs is not None:
                part = part[part['lang_abv'].isin(langs)]
                print('languages:', part['lang_abv'].unique())

            # Clamp the request to the rows actually available, but never below 1.
            upper = max(1, min(requested, len(part)))
            selected_parts.append(part[:upper])

        ds = pd.concat(selected_parts)

        if shuffle:
            ds = sklearn.utils.shuffle(ds, random_state=SEED_2)

        ds = ds[:nb_examples if nb_examples else len(ds)]
    return ds

In [13]:
def get_unbatched_dataset(ds_spec, tokenizer_name, langs=None, shuffle=False, nb_examples=None, max_len=80, token_counter=None, return_raw_ds=False):
    """
    Get a combined `pandas.DataFrame` from the raw datasets specified in `ds_spec`, then perform tokenization
    and create an unbatched `tf.data.Dataset` dataset.

    Args:
        ds_spec: dict. Keys are the keys in `raw_ds_mapping` and values are the number of examples to use
            from the corresponding datasets. (A list of keys is also accepted, see `get_raw_dataset`.)
        tokenizer_name: str, the name of a Hugging Face tokenizer, e.g. `distilbert-base-uncased`.
        langs: list, a list of language abbreviations. Only examples in these languages will be included.
        shuffle: bool, if to shuffle the raw datasets before sampling from them.
        nb_examples: int, how many examples from the combined raw dataset to be included in the final dataset.
            If `None`, all examples are included.
        max_len: int, the maximal length for tokenization. Padding and truncation are performed.
        token_counter: `collections.Counter` or `None`. If given, it is updated in place with the token ids
            of every tokenized example (padding ids included).
        return_raw_ds: bool, if to also return the combined `pandas.DataFrame`.

    Returns:
        `(dataset, nb_examples)`, or `(dataset, nb_examples, raw_ds)` when `return_raw_ds` is true.
        Each element of `dataset` has the form `{'inputs': <tokenizer outputs>, 'labels': <label>}`.
        Examples without a `label` column (e.g. the test set) get the label -1.
    """
    with tf.device('/device:GPU:0'):
        ds = get_raw_dataset(ds_spec, langs=langs, shuffle=shuffle, nb_examples=nb_examples)
        sentence_pairs = list(zip(ds['premise'].tolist(), ds['hypothesis'].tolist()))

        if 'label' not in ds:
            # `ds` is a slice of another DataFrame. `assign` builds a new frame instead of writing into
            # that slice, which avoids pandas' SettingWithCopyWarning.
            ds = ds.assign(label=-1)
        labels = ds['label'].tolist()

        tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True)

        # `transformers.tokenization_utils_base.BatchEncoding` object -> `dict`
        # For an empty selection the tokenizer is not called and empty lists are used instead.
        r = {'input_ids': [], 'attention_mask': []}
        if len(sentence_pairs) > 0:
            r = dict(tokenizer.batch_encode_plus(batch_text_or_text_pairs=sentence_pairs, max_length=max_len, padding='max_length', truncation=True))

        if token_counter is not None:
            for tokens in r['input_ids']:
                token_counter.update(tokens)

        dataset = tf.data.Dataset.from_tensor_slices({'inputs': r, 'labels': labels})
        print(dataset)

        if return_raw_ds:
            result = (dataset, len(ds), ds)
        else:
            result = (dataset, len(ds))

    return result

def get_training_dataset(unbatched_dataset, nb_examples, batch_size=32, shuffle_buffer_size=None, repeat=False):
    """
    Shuffle, optionally repeat, batch and prefetch an unbatched dataset for training.

    Args:
        unbatched_dataset: `tf.data.Dataset` of single examples.
        nb_examples: int, the number of examples in `unbatched_dataset`. Used as the shuffle buffer size
            when `shuffle_buffer_size` is not given.
        batch_size: int, the batch size. Incomplete final batches are dropped.
        shuffle_buffer_size: int or `None`. Buffer size for shuffling. A falsy value means `nb_examples`.
        repeat: bool, if to repeat the dataset indefinitely.

    Returns:
        The batched `tf.data.Dataset`.
    """
    with tf.device('/device:GPU:0'):
        if not shuffle_buffer_size:
            shuffle_buffer_size = nb_examples

        # Shuffle BEFORE repeating: the other order lets the shuffle buffer mix examples of consecutive
        # epochs, so that within one pass some examples are seen twice and others not at all.
        dataset = unbatched_dataset.shuffle(shuffle_buffer_size)
        if repeat:
            dataset = dataset.repeat()

        dataset = dataset.batch(batch_size, drop_remainder=True)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset

def get_prediction_dataset(dataset, batch_size=32):
    """
    Batch a dataset for prediction: the order is kept and the last, possibly smaller,
    batch is not dropped. The batches are prefetched.
    """
    with tf.device('/device:GPU:0'):
        batched = dataset.batch(batch_size, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)

    return batched

In [15]:
def sample_without_replacement(prob_dist, nb_samples):
    """Draw `nb_samples` distinct integers from [0, N) following the distribution `prob_dist`,
       where `N = prob_dist.shape[0]`.

    The log-probabilities are perturbed with noise of the form `-log(-log(u))`, `u` uniform in [0, 1),
    and the indices of the `nb_samples` largest perturbed values are returned.

    Args:
        prob_dist: 1-D tf.float32 tensor.
        nb_samples: number of indices to draw.

    Returns:
        selected_indices: 1-D tf.int32 tensor
    """
    with tf.device('/device:GPU:0'):
        nb_candidates = tf.shape(prob_dist)[0]
        log_probs = tf.math.log(prob_dist)
        uniform_noise = tf.random.uniform(shape=[nb_candidates], minval=0, maxval=1)
        perturbation = -tf.math.log(-tf.math.log(uniform_noise))
        # `top_k` returns (values, indices); only the indices are needed.
        selected_indices = tf.math.top_k(log_probs + perturbation, nb_samples)[1]

    return selected_indices

In [16]:
def get_masked_lm_fn(tokenizer, mlm_mask_prob=0.15, mask_type_probs=(0.3, 0.6, 0.1), token_counts=None, predict_special_tokens=False, mlm_smoothing=0.7):
    """
    Build a function that turns a batch of token ids into a masked-language-model (MLM) training batch.

    Args:

        tokenizer: A Hugging Face tokenizer.

        mlm_mask_prob: float. The proportion of the considered tokens to be selected for prediction,
            including actually masking a token, keeping it as it is (but predicting it), and randomly
            replacing it by another token.

        mask_type_probs: Sequence of 3 floats. Among the tokens selected for prediction,
            mask_type_probs[0]: the proportion to be replaced by the mask token
            mask_type_probs[1]: the proportion to be kept as it is
            mask_type_probs[2]: the proportion to be replaced by a random token in the tokenizer's vocabulary

        token_counts: A list of integers of length `tokenizer.vocab_size`, which is the token counting in a
            dataset (usually, the huge dataset used for pretraining a LM model). This is used for giving higher
            probability for rare tokens to be masked for prediction. If `None`, each token has the same
            probability to be masked.

        predict_special_tokens: bool, if to mask special tokens, like cls, sep or mask tokens. Default: `False`.
            Padding tokens are never masked.

        mlm_smoothing: float, smoothing parameter to emphasize more rare tokens (see `XLM` paper, similar to word2vec).

    Returns:

        prepare_masked_lm_batch: a function that masks a batch of token sequences.
    """

    if token_counts is None:
        # Each token has the same probability to be masked.
        token_counts = [1] * tokenizer.vocab_size

    # Tokens with higher counts will be masked less often.
    # If some token has count 1, it will have freq 1.0 in this frequency list, which is the highest value.
    # However, since it never appears in the corpus used for pretraining, there is no effect of this high frequency.
    token_mask_freq = np.maximum(token_counts, 1) ** -mlm_smoothing

    # NEVER to mask/predict padding tokens.
    token_mask_freq[tokenizer.pad_token_id] = 0.0
    with tf.device('/device:GPU:0'):
        if not predict_special_tokens:
            # Do not predict special tokens, e.g. padding, cls, sep and mask tokens, etc.
            for special_token_id in tokenizer.all_special_ids:
                token_mask_freq[special_token_id] = 0.0

        # Convert to tensor.
        token_mask_freq = tf.constant(token_mask_freq, dtype=tf.float32)

        mlm_mask_prob = tf.constant(mlm_mask_prob)
        mask_type_probs = tf.constant(mask_type_probs)

        vocab_size = tf.constant(tokenizer.vocab_size)
        pad_token_id = tf.constant(tokenizer.pad_token_id)
        mask_token_id = tf.constant(tokenizer.mask_token_id)
    print('mask_token_id:', mask_token_id)

    def prepare_masked_lm_batch(inputs):
        """
        Prepare the batch: from the input_ids, compute the attention mask (if missing) and the masked labels for MLM.

        Args:

            inputs: a dictionary of tensors. Format is:

                {
                    'input_ids': `tf.int32` tensor of shape [batch_size, seq_len]
                }

                Optionally, it could contain extra keys 'attention_mask' and `token_type_ids` with values being
                `tf.int32` tensors of shape [batch_size, seq_len]. If 'attention_mask' is missing, it is derived
                from the padding token id. The shape of `input_ids` must be statically known.

        Returns:

            result: a dictionary. Format is as following:

                {
                    'inputs': A dictionary of tensors, the same format as the argument `inputs`
                        ('input_ids' after masking, 'attention_mask', and 'token_type_ids' if it was given).
                    'mlm_labels': shape [batch_size, seq_len]. The original token id at the places to be
                        predicted, -100 elsewhere.

                    'mask_types': shape [batch_size, seq_len]. 0: not masked, 1: masked, 2: kept as it is,
                        3: replaced by a random token; 4 is added at padding positions.
                    'original_input_ids': shape [batch_size, seq_len]
                    'nb_tokens': shape [batch_size]
                    'nb_non_padding_tokens': shape [batch_size]
                    'nb_tokens_considered': shape [batch_size]
                    'nb_tokens_masked': shape [batch_size]
                }

                The tensors associated to `number of tokens` are the token countings in the whole batch, not
                in individual examples. They are actually constants, but reshaped to [batch_size], because
                `tf.data.Dataset` requires the batch dimension to be consistent. These are used only for debugging,
                except 'nb_tokens_masked', which is used for calculating the MLM loss values.
        """
        with tf.device('/device:GPU:0'):
            input_ids = inputs['input_ids']
            print('input_ids:', input_ids)
            batch_size, seq_len = input_ids.shape

            attention_mask = None
            if 'attention_mask' in inputs:
                attention_mask = inputs['attention_mask']

            # Compute `attention_mask` if necessary
            if attention_mask is None:
                attention_mask = tf.cast(input_ids != pad_token_id, tf.int32)

            # The number of tokens in each example, excluding the padding tokens.
            # shape = [batch_size]
            lengths = tf.reduce_sum(attention_mask, axis=-1)

            # The total number of tokens, excluding the padding tokens.
            nb_non_padding_tokens = tf.math.reduce_sum(lengths)

            # For each token in the batch, get its frequency to be masked from the 1-D tensor `token_mask_freq`.
            # We keep the output to remain 1-D, since it's easier for using sampling method `sample_without_replacement`.
            # shape = [batch_size * seq_len], 1-D tensor.
            freq_to_mask = tf.gather(params=token_mask_freq, indices=tf.reshape(input_ids, [-1]))

            # Normalize the frequency to get a probability (of being masked) distribution over tokens in the batch.
            # shape = [batch_size * seq_len], 1-D tensor.
            prob_to_mask = freq_to_mask / tf.reduce_sum(freq_to_mask)

            # Tokens that are candidates for prediction: non-padding and, unless requested, non-special.
            tokens_considered = tf.cast(attention_mask, tf.bool)
            if not predict_special_tokens:
                for special_token_id in tokenizer.all_special_ids:
                    tokens_considered = tf.logical_and(tokens_considered, input_ids != special_token_id)
            nb_tokens_considered = tf.reduce_sum(tf.cast(tokens_considered, dtype=tf.int32))

            # The number of tokens to be masked (rounded up), as an integer.
            nb_tokens_to_mask = tf.math.ceil(mlm_mask_prob * tf.cast(nb_tokens_considered, dtype=tf.float32))
            nb_tokens_to_mask = tf.cast(nb_tokens_to_mask, tf.int32)

            # Sample `nb_tokens_to_mask` of different indices in the range [0, batch_size * seq_len).
            # The sampling is according to the probability distribution `prob_to_mask`, without replacement.
            # shape = [nb_tokens_to_mask]
            indices_to_mask = sample_without_replacement(prob_to_mask, nb_tokens_to_mask)

            # Create a tensor of shape [batch_size * seq_len].
            # At the indices specified in `indices_to_mask`, it is True. Otherwise, it is False.
            # This is a mask (after being reshaped to 2D tensor) for masking/prediction, where True means that,
            # at that place, the token should be masked for prediction.
            pred_mask = tf.scatter_nd(
                indices=indices_to_mask[:, tf.newaxis],  # This is necessary for making `tf.scatter_nd` work here. Check the documentation.
                updates=tf.cast(tf.ones_like(indices_to_mask), tf.bool),
                shape=[batch_size * seq_len]
            )

            # Change to 2-D tensor.
            # The mask for masking/prediction.
            # shape = [batch_size, seq_len]
            pred_mask = tf.reshape(pred_mask, [batch_size, seq_len])

            # Get token ids at the places where to mask tokens.
            # 1-D tensor, shape = [nb_tokens_to_mask].
            _input_ids_real = input_ids[pred_mask]

            # randomly select token ids from the range [0, vocab_size)
            # 1-D tensor, shape = [nb_tokens_to_mask]
            _input_ids_rand = tf.random.uniform(shape=[nb_tokens_to_mask], minval=0, maxval=vocab_size, dtype=tf.int32)

            # A constant tensor with value `mask_token_id`.
            # 1-D tensor, shape = [nb_tokens_to_mask]
            _input_ids_mask = mask_token_id * tf.ones_like(_input_ids_real, dtype=tf.int32)

            # For each token to be masked, we decide which type of transformations to apply:
            #     0: masked, 1: keep it as it is, 2: replaced by a random token

            # Detail: we need to pass log probability (logits) to `tf.random.categorical`,
            #    and it has to be 2-D. The output is also 2-D, and we just take the 1st row.
            # shape = [nb_tokens_to_mask]
            mask_types = tf.random.categorical(logits=tf.math.log([mask_type_probs]), num_samples=nb_tokens_to_mask)[0]

            # These are token ids after applying masking.
            # shape = [nb_tokens_to_mask]
            masked_input_ids = (
                _input_ids_mask * tf.cast(mask_types == 0, dtype=tf.int32) +
                _input_ids_real * tf.cast(mask_types == 1, dtype=tf.int32) +
                _input_ids_rand * tf.cast(mask_types == 2, dtype=tf.int32)
            )

            # Put the masked token ids into a 2-D tensor (initially zeros) of shape [batch_size, seq_len].
            # remark: `tf.where(pred_mask)` is of shape [nb_tokens_to_mask, 2].
            token_ids_to_updates = tf.scatter_nd(indices=tf.where(pred_mask), updates=masked_input_ids, shape=[batch_size, seq_len])

            # At the places where we don't mask, just keep the original token ids.
            # shape = [batch_size, seq_len]
            token_ids_to_keep = input_ids * tf.cast(~pred_mask, tf.int32)

            # The final masked token ids used for training
            # shape = [batch_size, seq_len]
            masked_input_ids = token_ids_to_updates + token_ids_to_keep

            # At the places where we don't predict, change the labels to -100
            # shape = [batch_size, seq_len]
            mlm_labels = input_ids * tf.cast(pred_mask, dtype=tf.int32) + -100 * tf.cast(~pred_mask, tf.int32)

            masked_lm_batch = {
                'input_ids': masked_input_ids,
                'attention_mask': attention_mask
            }
            # Only forwarding `token_type_ids` depends on the tokenizer; everything below is computed
            # for every batch so that the result always has the documented keys and shapes.
            if 'token_type_ids' in inputs:
                masked_lm_batch['token_type_ids'] = inputs['token_type_ids']

            # The total number of tokens
            nb_tokens = tf.reduce_sum(tf.cast(input_ids > -1, dtype=tf.int32))

            # Used for visualization
            # 0: not masked, 1: masked, 2: keep it as it is, 3: replaced by a random token, 4: padding - (not masked)
            # shape = [batch_size, seq_len]
            _mask_types = tf.scatter_nd(tf.where(pred_mask), updates=mask_types + 1, shape=[batch_size, seq_len])
            _mask_types = tf.cast(_mask_types, dtype=tf.int32)
            _mask_types += 4 * tf.cast(input_ids == pad_token_id, tf.int32)

            result = {
                'inputs': masked_lm_batch,
                'mlm_labels': mlm_labels,
                'mask_types': _mask_types,
                'original_input_ids': input_ids,
                'nb_tokens': nb_tokens * tf.constant(1, shape=[batch_size]),
                'nb_non_padding_tokens': nb_non_padding_tokens * tf.constant(1, shape=[batch_size]),
                'nb_tokens_considered': nb_tokens_considered * tf.constant(1, shape=[batch_size]),
                'nb_tokens_masked': nb_tokens_to_mask * tf.constant(1, shape=[batch_size])
            }

        return result

    return prepare_masked_lm_batch

In [17]:
# Smoke test of the MLM batch preparation on a small sample of the validation split.
tokenizer_name = 'distilbert-base-uncased'
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True)
print('tokenizer voacb size:', tokenizer.vocab_size)

# No `token_counts` are passed, so every non-special token is equally likely to be selected.
prepare_masked_lm_batch = get_masked_lm_fn(
    tokenizer=tokenizer,
    # token_counts=tokenizer.vocab_size,
    mlm_mask_prob=0.25,
    mask_type_probs=[0.5, 0.25, 0.25],
)

# Up to 300 validation examples, tokenized to length 80, then shuffled and batched by 32.
ds, nb_examples = get_unbatched_dataset(ds_spec={'original valid': 300}, tokenizer_name=tokenizer_name, max_len=80)    
ds_batched = get_training_dataset(ds, nb_examples, batch_size=32)
print(ds_batched)
# Only the tokenizer outputs (`batch['inputs']`) are passed to the masking function; the labels are dropped.
mlm_dataset = ds_batched.map(lambda batch: prepare_masked_lm_batch(batch['inputs']))

# Print one masked batch for inspection.
for x in mlm_dataset.take(1):
    print(x)

tokenizer voacb size: 30522
mask_token_id: tf.Tensor(103, shape=(), dtype=int32)
len of the dataset: 300
len of the dataset: (300, 6)
Executing op TensorSliceDataset in device /job:localhost/replica:0/task:0/device:CPU:0
<TensorSliceDataset shapes: {inputs: {input_ids: (80,), attention_mask: (80,)}, labels: ()}, types: {inputs: {input_ids: tf.int32, attention_mask: tf.int32}, labels: tf.int32}>
Executing op DummySeedGenerator in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ShuffleDatasetV3 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
<PrefetchDataset shapes: {inputs: {input_ids: (32, 80), attention_mask: (32, 80)}, labels: (32,)}, types: {inputs: {input_ids: tf.int32, attention_mask: tf.int32}, labels: tf.int32}>
input_ids: Tensor("args_1:0", shape=(32, 80), dtype=int32)
Executing op MapDatas

In [22]:
class Classifier(tf.keras.Model):
    """
    Three-way classification head on top of a transformer encoder: the sequence output goes through
    dropout, is averaged over the sequence axis (optionally ignoring positions where the attention
    mask is 0) and is projected to 3 logits.
    """

    def __init__(self, transformer, use_mask=True):
        """
        Args:
            transformer: the encoder layer. Element 0 of its output is used as the sequence output.
            use_mask: bool, if to pass the attention mask to the average pooling.
        """
        super().__init__()

        self.use_mask = use_mask
        self.transformer = transformer
        self.dropout = tf.keras.layers.Dropout(rate=0.15)
        self.global_pool = tf.keras.layers.GlobalAveragePooling1D()
        self.classifier = tf.keras.layers.Dense(3)

    def call(self, inputs, training=False):
        """Return the logits (last dimension 3) for `inputs`, a dict that must contain 'attention_mask'."""
        attention_mask = tf.cast(inputs['attention_mask'], tf.bool)
        pooling_mask = attention_mask if self.use_mask else None

        # Sequence outputs
        sequence_output = self.transformer(inputs, training=training)[0]
        sequence_output = self.dropout(sequence_output, training=training)
        pooled = self.global_pool(sequence_output, mask=pooling_mask)

        return self.classifier(pooled)

def get_models(model_name, lr=1e-5, verbose=False):
    '''
    Create, inside `strategy.scope()`, everything needed for MLM fine-tuning and classification training.

    Args:
        model_name: str, the name of a Hugging Face pretrained model.
        lr: float, the (constant) learning rate of the Adam optimizer.
        verbose: bool, if to print sample outputs and the summaries of both models.

    Returns:
        lm_model: the pretrained language model loaded with `TFAutoModelForPreTraining`.
        model: a `Classifier` built on `lm_model.layers[0]`, so both models share the same encoder.
        loss_fn: Sparse Categorical Crossentropy on logits with `SUM` reduction.
        optimizer: the Adam optimizer.
        metrics: dict with the keys 'mlm loss', 'mlm acc', 'train loss' and 'train acc'. The losses are
            `Sum` metrics and the accuracies are Sparse Categorical Accuracy metrics.
    '''

    with strategy.scope():

        lm_model = transformers.TFAutoModelForPreTraining.from_pretrained(model_name)

        # False = transfer learning, True = fine-tuning
        lm_model.trainable = True

        # Just run a dummy batch, not necessary
        dummy = lm_model(
            inputs={
                'input_ids': tf.constant(1, shape=[1, 8])
            }
        )

        if verbose:

            print('Sample output from the masked LM model:\n')
            print(dummy)

            print('\nMasked LM model\n')
            lm_model.summary()

        # The first layer of the LM model is used as the encoder of the classifier.
        transformer = lm_model.layers[0]
        model = Classifier(transformer, use_mask=True)
        # Just run a dummy batch, not necessary
        dummy = model(
            inputs={
                'input_ids': tf.constant(1, shape=[1, 8]),
                'attention_mask': tf.constant(1, shape=[1, 8])
            }
        )

        if verbose:

            print('Sample output from the classification model:\n')
            print(dummy)

            print('\nClassification model\n')
            model.summary()

        # Instantiate an optimizer with a constant learning rate.
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

        # Only `NONE` and `SUM` are allowed, and it has to be explicitly specified.
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
        # Instantiate metrics
        metrics = {
            'mlm loss': tf.keras.metrics.Sum(),
            'mlm acc': tf.keras.metrics.SparseCategoricalAccuracy(),
            'train loss': tf.keras.metrics.Sum(),
            'train acc': tf.keras.metrics.SparseCategoricalAccuracy()
        }

        return lm_model, model, loss_fn, optimizer, metrics

In [23]:
def get_routines(lm_model, model, loss_fn, optimizer, metrics, batch_size):
    '''
    Build the distributed training and prediction routines. They are closures over the arguments and
    over the global `strategy`.

    Args:
        lm_model: the masked language model, updated by the MLM fine-tuning step.
        model: the classification model, updated by the training step and used for prediction.
        loss_fn: called as `loss_fn(labels, logits)`; its result is divided by a global count below.
        optimizer: the optimizer applying the gradients of both steps.
        metrics: dict with the keys 'mlm loss', 'mlm acc', 'train loss' and 'train acc'.
        batch_size: the global batch size, used to normalize the classification loss.

    Returns:
        dist_mlm_fine_tune_1_epoch: runs `steps_per_epoch` MLM fine-tuning steps.
        dist_train_1_epoch: runs `steps_per_epoch` classification training steps.
        predict_fn: returns the logits of `model` for a whole (distributed) prediction dataset.
    '''

    def mlm_fine_tune_step(batch):
        """One MLM step on a replica: loss on the masked positions only, gradient update, metric update."""

        # The batch here is the batch received by each replica.
        # However, The number of masked tokens `batch['nb_tokens_masked']` is the number of masked tokens
        # in the whole batch before being distributed to the replicas.
        inputs, mlm_labels, nb_tokens_masked = batch['inputs'], batch['mlm_labels'], batch['nb_tokens_masked']
        
        with tf.GradientTape() as tape:
        
            # sequence outputs
            # shape = [batch_size, seq_len, vocab_size]
            logits = lm_model(inputs, training=True)[0]

            # get the places where the tokens should be predicted (masked / replaced / kept);
            # labels are negative at all other places
            # shape = [batch_size, seq_len]
            mlm_mask = (mlm_labels > -1)

            # shape = [nb_masked_tokens]
            labels_at_masked_tokens = tf.boolean_mask(mlm_labels, mlm_mask)

            # shape = [nb_masked_tokens, vocab_size]
            logits_at_masked_tokens = tf.boolean_mask(logits, mlm_mask)
            # the mlm loss values are calculated only for the masked tokens
            loss_mlm = loss_fn(
                labels_at_masked_tokens,
                logits_at_masked_tokens
            )

            # divide the number of masked tokens in the global batch, i.e. the whole batch that is distributed to different replicas.
            # (every entry of `nb_tokens_masked` holds that same number, so entry 0 is used)
            loss_mlm = loss_mlm / tf.cast(nb_tokens_masked[0], dtype=tf.float32)
        
        gradients = tape.gradient(loss_mlm, lm_model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, lm_model.trainable_variables))
        
        metrics['mlm loss'].update_state(loss_mlm)
        metrics['mlm acc'].update_state(labels_at_masked_tokens, logits_at_masked_tokens)
        
    def train_step(batch):
        """One classification step on a replica: loss, gradient update, metric update."""

        inputs, labels = batch['inputs'], batch['labels']
        
        with tf.GradientTape() as tape:

            # shape = [batch_size, 3]
            logits = model(inputs, training=True)

            loss = loss_fn(labels, logits)
            # divide by the global batch size, rather than the per replica batch size
            loss = loss / batch_size
        
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        
        metrics['train loss'].update_state(loss)
        metrics['train acc'].update_state(labels, logits)

    @tf.function
    def dist_mlm_fine_tune_1_epoch(data_iter, steps_per_epoch):
        """
        Run `steps_per_epoch` MLM fine-tuning steps, taking one batch from `data_iter` per step.
        Iterating inside `tf.function` to optimize training time.
        """
        
        for _ in tf.range(steps_per_epoch):
            strategy.run(mlm_fine_tune_step, args=(next(data_iter),))        
        
    @tf.function
    def dist_train_1_epoch(data_iter, steps_per_epoch):
        """
        Run `steps_per_epoch` classification training steps, taking one batch from `data_iter` per step.
        Iterating inside `tf.function` to optimize training time.
        """
        for _ in tf.range(steps_per_epoch):
            strategy.run(train_step, args=(next(data_iter),))
            
    @tf.function                
    def predict_step(batch):
        """Return the logits of `model` (in inference mode) for the inputs of one batch."""

        inputs = batch['inputs']

        logits = model(inputs, training=False)
        return logits

    def predict_fn(dist_pred_ds):
        """Return the logits for every batch of `dist_pred_ds`, concatenated along axis 0."""

        all_logits = []
        for batch in dist_pred_ds:

            # PerReplica object
            logits = strategy.run(predict_step, args=(batch,))

            # Tuple of tensors
            logits = strategy.experimental_local_results(logits)

            # tf.Tensor
            logits = tf.concat(logits, axis=0)
            all_logits.append(logits)

        # tf.Tensor
        logits = tf.concat(all_logits, axis=0)

        return logits
              
    return dist_mlm_fine_tune_1_epoch, dist_train_1_epoch, predict_fn

In [24]:
def get_datasets(
        ds_spec,
        tokenizer_name,
        batch_size,
        prediction_batch_size,
        max_len,
        mlm_fine_tuning_ds_spec=None
    ):
    '''
    Build the batched train, MLM fine-tuning, validation and test datasets.

    Args:
        ds_spec: See `get_unbatched_dataset`. The datasets used for classification training.
        tokenizer_name: str, the name of a Hugging Face tokenizer.
        batch_size: int, the batch size of the train and MLM fine-tuning datasets.
        prediction_batch_size: int, the batch size of the validation and test datasets.
        max_len: int, the maximal length for tokenization.
        mlm_fine_tuning_ds_spec: See `get_unbatched_dataset`. The datasets used for MLM fine-tuning.
            If `None`, `ds_spec` is used.

    Returns:
        datasets: dict with the batched datasets ('train', 'mlm_fine_tuning', 'valid', 'test'), the
            validation labels ('valid labels'), the numbers of examples ('nb_train_examples',
            'nb_mlm_fine_tuning_examples', 'nb_valid_examples', 'nb_test_examples') and the raw
            `pandas.DataFrame` used for MLM fine-tuning ('mlm_fine_tuning_raw_ds').
        token_counter: `collections.Counter` of the token ids seen in all four datasets.
    '''

    token_counter = Counter()

    # The train and MLM fine-tuning datasets are shuffled and repeated indefinitely.
    train_ds, nb_train_examples = get_unbatched_dataset(
        ds_spec=ds_spec,
        tokenizer_name=tokenizer_name,
        shuffle=True,
        max_len=max_len,
        token_counter=token_counter,
    )
    train_ds = get_training_dataset(
        train_ds, nb_train_examples, batch_size=batch_size, repeat=True
    )

    if mlm_fine_tuning_ds_spec is None:
        mlm_fine_tuning_ds_spec = ds_spec
    mlm_fine_tuning_ds, nb_mlm_fine_tuning_examples, mlm_fine_tuning_raw_ds = get_unbatched_dataset(
        ds_spec=mlm_fine_tuning_ds_spec,
        tokenizer_name=tokenizer_name,
        shuffle=True,
        max_len=max_len,
        token_counter=token_counter,
        return_raw_ds=True
    )
    mlm_fine_tuning_ds = get_training_dataset(
        mlm_fine_tuning_ds, nb_mlm_fine_tuning_examples, batch_size=batch_size, repeat=True
    )

    # The validation and test datasets keep their order and every example.
    valid_ds, nb_valid_examples = get_unbatched_dataset(
        ds_spec=['original valid'], tokenizer_name=tokenizer_name, max_len=max_len, token_counter=token_counter
    )
    valid_ds = get_prediction_dataset(valid_ds, prediction_batch_size)
    # Collect all validation labels into one tensor by re-batching them into a single batch.
    # `nb_valid_examples` is the size of the whole validation set, since no limit was requested above.
    valid_labels = next(iter(valid_ds.map(lambda batch: batch['labels']).unbatch().batch(nb_valid_examples)))

    test_ds, nb_test_examples = get_unbatched_dataset(
        ds_spec=['original test'], tokenizer_name=tokenizer_name, max_len=max_len, token_counter=token_counter
    )
    test_ds = get_prediction_dataset(test_ds, prediction_batch_size)

    datasets = {
        'train': train_ds,
        'mlm_fine_tuning': mlm_fine_tuning_ds,
        'valid': valid_ds,
        'test': test_ds,
        'valid labels': valid_labels,
        'nb_train_examples': nb_train_examples,
        'nb_mlm_fine_tuning_examples': nb_mlm_fine_tuning_examples,
        'nb_valid_examples': nb_valid_examples,
        'nb_test_examples': nb_test_examples,
        'mlm_fine_tuning_raw_ds': mlm_fine_tuning_raw_ds
    }

    return datasets, token_counter

In [25]:
def init_tpu():
    """Pick a `tf.distribute` strategy and publish it as the module-level `strategy`.

    A TPU strategy is attempted first. If any step of the TPU setup raises
    `ValueError` (presumably because no TPU is attached to the session), a
    `MirroredStrategy` is used instead and TensorFlow device-placement logging
    is switched on.

    Side effects:
        Rebinds the global name `strategy`; nothing is returned.
    """
    global strategy

    try:
        # Locate the TPU cluster, connect to it and (re)initialize the TPU
        # system before building the strategy on top of it.
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
    except ValueError:
        # Fallback path. `tf.distribute.get_strategy()` would be the plain
        # CPU / single-GPU alternative; MirroredStrategy is used here.
        strategy = tf.distribute.MirroredStrategy()
        # Logs the device every op runs on ("Executing op ... in device ...").
        tf.debugging.set_log_device_placement(True)

In [26]:
class Trainer:
    """Prepare the datasets once, then run MLM fine-tuning followed by classifier training.

    The class relies on helpers defined elsewhere in this notebook (`init_tpu`,
    `get_datasets`, `get_masked_lm_fn`, `get_models`, `get_routines`) and on the
    module-level `strategy` that `init_tpu` sets.
    """

    def __init__(
        self,
        ds_spec,
        tokenizer_name,
        mlm_fine_tuning_ds_spec=None,
        batch_size_per_replica=32,
        prediction_batch_size_per_replica=128,
        max_len=80,
        token_counts=None,
        count_tokens=False,
        mlm_mask_prob=0.25,
        mask_type_probs=(0.3, 0.6, 0.1),
        predict_special_tokens=False):
        """
        Args:
            ds_spec: See `get_unbatched_dataset`.
            tokenizer_name: The name for a Hugging Face tokenizer.
            mlm_fine_tuning_ds_spec: See `get_unbatched_dataset`.
            batch_size_per_replica: int
            prediction_batch_size_per_replica: int
            max_len: int, max length used for padding/truncation.
            token_counts: tf.float32 tensor of shape [vocab_size]. Could be `None`.
            count_tokens: bool, If to count tokens in the datasets when `token_counts` is None.
            mlm_mask_prob: See `get_masked_lm_fn`.
            mask_type_probs: See `get_masked_lm_fn`.
            predict_special_tokens: See `get_masked_lm_fn`.

        Side effects:
            Calls `init_tpu()` (sets the global `strategy`) and writes the token
            frequencies returned by `get_datasets` to `token_counting.json` in
            the current working directory.
        """

        init_tpu()

        # Global batch sizes: per-replica size times the number of replicas.
        self.batch_size = batch_size_per_replica * strategy.num_replicas_in_sync
        self.prediction_batch_size = prediction_batch_size_per_replica * strategy.num_replicas_in_sync

        tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True)
        # Inverted vocabulary: token id -> token string.
        self.vocab = {v: k for k, v in tokenizer.get_vocab().items()}

        self.mlm_mask_prob = mlm_mask_prob
        self.mask_type_probs = mask_type_probs

        self.datasets, token_counter = get_datasets(
            ds_spec,
            tokenizer_name,
            self.batch_size,
            self.prediction_batch_size,
            max_len,
            mlm_fine_tuning_ds_spec=mlm_fine_tuning_ds_spec
        )

        self.token_counts = token_counts
        if token_counts is None and count_tokens:
            # Build a dense per-token-id count vector from the counter that
            # `get_datasets` filled while tokenizing.
            dense_counts = [0] * tokenizer.vocab_size
            for token_id, count in token_counter.items():
                dense_counts[token_id] = count
            # Same type as a caller-supplied `token_counts` (tf.float32, [vocab_size]).
            self.token_counts = tf.constant(dense_counts, dtype=tf.float32)

        with open('token_counting.json', 'w', encoding='UTF-8') as fp:
            json.dump(token_counter.most_common(), fp, indent=4, ensure_ascii=False)

        # Pass the *resolved* counts. The raw `token_counts` argument is None
        # whenever the counts were computed above, which previously made
        # `count_tokens=True` a silent no-op for the masking function.
        prepare_masked_lm_batch = get_masked_lm_fn(
            tokenizer,
            self.mlm_mask_prob,
            self.mask_type_probs,
            token_counts=self.token_counts,
            predict_special_tokens=predict_special_tokens
        )
        # Only the `inputs` part of each batch is fed to the MLM preparation.
        self.datasets['mlm_fine_tuning'] = self.datasets['mlm_fine_tuning'].map(lambda x: prepare_masked_lm_batch(x['inputs']))

    def mlm_fine_tuning(self, dist_mlm_fine_tuning_1_epoch, epochs, metrics):
        """Run masked-language-model fine-tuning for `epochs` epochs.

        Args:
            dist_mlm_fine_tuning_1_epoch: Callable invoked once per epoch as
                `fn(iterator, steps_per_epoch)`.
            epochs: int, number of MLM fine-tuning epochs.
            metrics: Mapping holding the `'mlm loss'` and `'mlm acc'` metric
                objects; they are read with `result()` and cleared with
                `reset_states()` after every epoch.

        Returns:
            dict mapping the 0-based epoch index to a dict with keys
            `'mlm loss'`, `'mlm acc'` and `'train timing'` (seconds).
        """
        dist_train_ds_mlm = strategy.experimental_distribute_dataset(self.datasets['mlm_fine_tuning'])
        dist_train_iter_mlm = iter(dist_train_ds_mlm)
        steps_per_epoch = self.datasets['nb_mlm_fine_tuning_examples'] // self.batch_size

        print(f'\nstart mlm finetuning for {epochs} epochs ...')

        history = {}
        for epoch in range(epochs):

            s = datetime.datetime.now()

            dist_mlm_fine_tuning_1_epoch(iter(dist_train_iter_mlm), steps_per_epoch)

            # The loss metric result is divided by the number of steps to get a per-step value.
            mlm_loss = metrics['mlm loss'].result() / steps_per_epoch
            mlm_acc = metrics['mlm acc'].result()

            print(f'\nmlm finetuning epoch: {epoch + 1}\n')
            print(f'mlm loss: {mlm_loss}')
            print(f'mlm acc: {mlm_acc}')

            metrics['mlm loss'].reset_states()
            metrics['mlm acc'].reset_states()

            e = datetime.datetime.now()
            elapsed = (e - s).total_seconds()

            print('train timing: {}'.format(elapsed))
            print('-' * 40)

            history[epoch] = {
                'mlm loss': float(mlm_loss),
                'mlm acc': float(mlm_acc),
                'train timing': elapsed
            }

        return history

    def train(self, train_name, model_name, epochs, mlm_fine_tuning_epochs=3, lr=1e-5, runs=1, verbose=True):
        """Run the same configuration `runs` times and return averaged logits.

        Each run is delegated to `_train` under the name `f'{train_name}-run-{i}'`.
        Only the first run is executed with the caller's `verbose`; later runs
        use `verbose=False`.

        Returns:
            (avg_valid_logits, avg_test_logits): the per-run logits stacked along
            a new leading axis and then mean-reduced over axis 1.
        """

        all_mlm_fine_tuning_histories = []
        all_histories = []
        all_valid_logits = []
        all_test_logits = []

        for run in range(runs):

            if run > 0:
                verbose = False

            mlm_fine_tuning_history, history, valid_logits, test_logits = \
                self._train(train_name + '-' + f'run-{run+1}', model_name, epochs, mlm_fine_tuning_epochs, lr=lr, verbose=verbose)

            all_mlm_fine_tuning_histories.append(mlm_fine_tuning_history)
            all_histories.append(history)
            all_valid_logits.append(valid_logits)
            all_test_logits.append(test_logits)

        # Stacked shape is presumably [runs, epochs, nb_examples, 3], given the
        # per-run shape noted in `_train`.
        all_valid_logits = tf.stack(all_valid_logits)
        all_test_logits = tf.stack(all_test_logits)

        # NOTE(review): axis 1 of the stacked tensor is the epoch axis, so this
        # averages over epochs within each run, not over runs (that would be
        # axis 0). Kept as-is to preserve the returned shape -- confirm intent.
        avg_valid_logits = tf.reduce_mean(all_valid_logits, 1)
        avg_test_logits = tf.reduce_mean(all_test_logits, 1)

        return avg_valid_logits, avg_test_logits

    def _train(self, train_name, model_name, epochs, mlm_fine_tuning_epochs=3, lr=1e-5, verbose=True):
        """Run one complete training run: MLM fine-tuning, then classifier training.

        After every classifier epoch the model is evaluated on the validation
        set (loss and accuracy against `self.datasets['valid labels']`) and
        predictions are made on the test set.

        Args:
            train_name: Label of this run (not used by the current implementation).
            model_name: Passed to `get_models`.
            epochs: int, number of classifier training epochs.
            mlm_fine_tuning_epochs: int, number of MLM fine-tuning epochs run first.
            lr: Learning rate passed to `get_models`.
            verbose: Passed to `get_models`.

        Returns:
            (mlm_fine_tuning_history, history, valid_logits, test_logits) where
            `history` maps the 0-based epoch index to that epoch's metrics and
            test predictions, and the two logits tensors stack the per-epoch
            predictions along a new leading axis.
        """

        init_tpu()

        lm_model, model, loss_fn, optimizer, metrics = get_models(model_name, lr=lr, verbose=verbose)

        # NOTE(review): `lm_model` is saved here exactly as returned by
        # `get_models`, i.e. before any fine-tuning done in this method, and to
        # a hard-coded Drive-style path -- confirm both are intended.
        model_dir = "/content/drive/My Drive/Kaggle/Watson_NLP/model/model_{}".format(epochs)
        print('saved model dir {}'.format(model_dir))
        lm_model.save_pretrained(model_dir)

        dist_mlm_fine_tuning_1_epoch, dist_train_1_epoch, predict_fn = get_routines(lm_model, model, loss_fn, optimizer, metrics, self.batch_size)

        mlm_fine_tuning_history = self.mlm_fine_tuning(dist_mlm_fine_tuning_1_epoch, epochs=mlm_fine_tuning_epochs, metrics=metrics)

        dist_train_ds = strategy.experimental_distribute_dataset(self.datasets['train'])
        dist_valid_ds = strategy.experimental_distribute_dataset(self.datasets['valid'])
        dist_test_ds = strategy.experimental_distribute_dataset(self.datasets['test'])

        # One iterator for the whole run; each epoch consumes `steps_per_epoch` batches from it.
        dist_train_iter = iter(dist_train_ds)

        steps_per_epoch = self.datasets['nb_train_examples'] // self.batch_size

        print(f'\nstart training for {epochs} epochs ...\n')

        history = {}

        valid_logits = []
        test_logits = []

        for epoch in range(epochs):
            print(f'epoch training : {epoch}')
            s = datetime.datetime.now()

            dist_train_1_epoch(dist_train_iter, steps_per_epoch)

            train_loss = metrics['train loss'].result() / steps_per_epoch
            train_acc = metrics['train acc'].result()

            print(f'epoch: {epoch + 1}\n')
            print(f'train loss: {train_loss}')
            print(f'train acc: {train_acc}')

            metrics['train loss'].reset_states()
            metrics['train acc'].reset_states()

            # Timing covers the training step only, not the evaluation below.
            e = datetime.datetime.now()
            elapsed = (e - s).total_seconds()

            logits = predict_fn(dist_valid_ds)
            valid_logits.append(logits)

            valid_loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(self.datasets['valid labels'], logits, from_logits=True, axis=-1))
            valid_acc = tf.reduce_mean(tf.keras.metrics.sparse_categorical_accuracy(self.datasets['valid labels'], logits))

            logits_test = predict_fn(dist_test_ds)
            test_logits.append(logits_test)

            print('\nvalid loss: {}'.format(valid_loss))
            print('valid acc: {}\n'.format(valid_acc))

            # Report this epoch's predictions, not the list accumulated so far.
            print('\ntest pred: {}'.format(logits_test))

            print('train timing: {}'.format(elapsed))
            print('-' * 40)

            history[epoch] = {
                'train loss': float(train_loss),
                'train acc': float(train_acc),
                'valid loss': float(valid_loss),
                'valid acc': float(valid_acc),
                # Store only this epoch's predictions; storing the shared
                # `test_logits` list made every entry alias the same growing list.
                'test pred': logits_test,
                'train timing': elapsed
            }

        # shape = [epochs, nb_examples, 3]
        valid_logits = tf.stack(valid_logits)
        test_logits = tf.stack(test_logits)

        return mlm_fine_tuning_history, history, valid_logits, test_logits


In [27]:
# ---- Experiment configuration ------------------------------------------

# Hugging Face checkpoint name; the same value is passed as the tokenizer name.
model_name = 'jplu/tf-xlm-roberta-base'

# Max length used for padding/truncation.
max_len = 80

# Batch sizes per replica (training / prediction).
batch_size_per_replica = 32
prediction_batch_size_per_replica = 128

# Optimisation schedule.
epochs = 3
lr = 1e-5
runs = 1

# Dataset specifications: every listed split maps to None.
ds_spec = dict.fromkeys(['original train'])
mlm_fine_tuning_ds_spec = dict.fromkeys(
    ['original train', 'original valid', 'original test']
)

# Masked-language-model settings.
token_counts = None
count_tokens = False
mlm_mask_prob = 0.15
mask_type_probs = (0.3, 0.6, 0.1)
predict_special_tokens = False

In [28]:
# Build the trainer from the configuration cell above.
# `mlm_fine_tuning_ds_spec` is not passed, so the `Trainer` default (None) applies.
trainer = Trainer(
    # data / tokenizer
    tokenizer_name=model_name,
    ds_spec=ds_spec,
    max_len=max_len,
    # batching
    batch_size_per_replica=batch_size_per_replica,
    prediction_batch_size_per_replica=prediction_batch_size_per_replica,
    # masked-language-model options
    mlm_mask_prob=mlm_mask_prob,
    mask_type_probs=mask_type_probs,
    predict_special_tokens=predict_special_tokens,
    token_counts=token_counts,
    count_tokens=count_tokens,
)

len of the dataset: 8484
len of the dataset: (8484, 6)
Executing op TensorSliceDataset in device /job:localhost/replica:0/task:0/device:CPU:0
<TensorSliceDataset shapes: {inputs: {input_ids: (80,), attention_mask: (80,)}, labels: ()}, types: {inputs: {input_ids: tf.int32, attention_mask: tf.int32}, labels: tf.int32}>
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op DummySeedGenerator in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ShuffleDatasetV3 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
len of the dataset: 8484
len of the dataset: (8484, 6)
Executing op TensorSliceDataset in device /job:localhost/replica:0/task:0/device:CPU:0
<TensorSliceDataset shapes: {inputs: {input_ids: (80,), attention_mask: (80,)}, labels: ()}, types: {inputs: {input_id

In [29]:
# Number of MLM fine-tuning epochs and the label used for this run.
mlm_fine_tuning_epochs = 3
train_name = 'mlm finetuning 0'

# Launch training; the averaged validation and test logits are returned.
avg_valid, avg_test = trainer.train(
    train_name=train_name,
    model_name=model_name,
    epochs=epochs,
    mlm_fine_tuning_epochs=mlm_fine_tuning_epochs,
    lr=lr,
    runs=runs,
    verbose=True,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1885418496.0, style=ProgressStyle(descr…


Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op LogicalNot in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Assert in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op LogicalNot in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Assert in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Logical

All model checkpoint layers were used when initializing TFXLMRobertaForMaskedLM.

All the layers of TFXLMRobertaForMaskedLM were initialized from the model checkpoint at jplu/tf-xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForMaskedLM for predictions without further training.


Executing op Fill in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Shape in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Fill in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Fill in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op NotEqual in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Cast in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Cumsum in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AddV2 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Shape in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op GatherV2 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariable

  num_elements)


Executing op Assert in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Fill in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op LogicalNot in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Assert in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Fill in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op LogicalNot in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Assert in device /job:localho

In [30]:
# Turn the saved test predictions into class labels.
# The stored array is loaded, wrapped in a TensorFlow tensor, and for every row
# the index of the largest value along axis 1 is taken as the predicted class.
data_tensor = tf.convert_to_tensor(
    np.load("/kaggle/input/testpred2/test361_1.npy")
)
output_max = tf.math.argmax(data_tensor, axis=1)

# Back to NumPy so the labels can be attached to the test DataFrame.
np_arr = output_max.numpy()
original_test['prediction'] = np_arr

# Keep only the columns that go into the submission.
sample_submission = original_test[['id', 'prediction']]
sample_submission.head()

Executing op ArgMax in device /job:localhost/replica:0/task:0/device:GPU:0


Unnamed: 0,id,prediction
0,c6d58c3f69,2
1,cefcc82292,1
2,e98005252c,0
3,58518c10ba,1
4,c32b0d16df,2


In [31]:
# Write the submission file without the DataFrame index column.
sample_submission.to_csv(path_or_buf='submission.csv', index=False)