## Dependencies

In [1]:
import os
import warnings, math, json, glob

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.backend as K
from tensorflow.keras import Model
from transformers import TFAutoModelForSequenceClassification, TFAutoModel, AutoTokenizer

from commonlit_scripts import *


# Seed value passed to `seed_everything`. That helper is not defined in this
# notebook; presumably it comes from the `commonlit_scripts` star import — verify.
seed = 0
seed_everything(seed)
# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')
# Show up to 150 characters per column when a DataFrame is displayed.
pd.set_option('display.max_colwidth', 150)

### Hardware configuration

In [2]:
# `get_strategy` is not defined in this notebook (presumably provided by the
# `commonlit_scripts` star import — verify). It yields a distribution strategy
# and a `tpu` value that later cells test for truthiness before TPU init.
strategy, tpu = get_strategy()
# Shorthand for tf.data's autotune constant.
AUTO = tf.data.AUTOTUNE
# Number of replicas the strategy keeps in sync.
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

REPLICAS: 1


# Load data

In [3]:
# Root folder of the attached input datasets; the test CSV lives below it.
base_path = '/kaggle/input/'
test_filepath = f'{base_path}commonlitreadabilityprize/test.csv'

# Read the test set, report its size and preview the first rows.
test = pd.read_csv(test_filepath)
print(f'Test samples: {len(test)}')
display(test.head())

Test samples: 7


Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would keep a bright light burning in the upper story to guide me on my course. On a clear night this light w...
1,f0953f0a5,,,"Dotty continued to go to Mrs. Gray's every night with the milk. Sometimes Katie went with her, and then they always paused a while under the acorn..."
2,0df072751,,,It was a bright and cheerful scene that greeted the eyes of Captain Raymond and his son as they entered the parlor of the adjacent cottage.\nIt wa...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent cell divides into two or more daughter cells. Cell division usually occurs as part of a larger cell...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolving of defects that prevent correct operation of computer software or a system. Debugging tends to b...


# Model parameters

In [4]:
# Find the attached training-notebook output(s): every entry under `base_path`
# whose name contains '-commonlit-'. `os.listdir` returns entries in arbitrary
# order, so sort them to make the choice of the first match deterministic.
input_noteboks = sorted(x for x in os.listdir(base_path) if '-commonlit-' in x)
if not input_noteboks:
    # Fail with an explicit message instead of a bare IndexError below.
    raise FileNotFoundError(
        f"No input directory containing '-commonlit-' found under {base_path}")

# Folder of the first matching input; it holds config.json (read here) and the
# *.h5 weight files (globbed in a later cell).
input_base_path = f'{base_path}{input_noteboks[0]}/'
with open(input_base_path + 'config.json') as json_file:
    config = json.load(json_file)

config

{'BATCH_SIZE': 256,
 'LEARNING_RATE': 8e-05,
 'EPOCHS': 100,
 'ES_PATIENCE': 30,
 'N_FOLDS': 5,
 'N_USED_FOLDS': 1,
 'SEQ_LEN': 256,
 'BASE_MODEL': '/kaggle/input/huggingface-roberta/roberta-base/',
 'SEED': 0}

## Auxiliary functions

In [5]:
# Datasets utility functions
def custom_standardization(text, is_lower=True):
    """Normalize a raw text string before tokenization.

    Args:
        text: The string to normalize.
        is_lower: When true, lower-case the text (for uncased encoders).

    Returns:
        The (optionally lower-cased) text with leading and trailing
        whitespace removed.
    """
    normalized = text.lower() if is_lower else text
    return normalized.strip()

def sample_target(features, target):
    """Replace a (mean, stddev) target pair with one random draw.

    Args:
        features: Passed through unchanged.
        target: A pair ``(mean, stddev)``; both are cast to float32.

    Returns:
        A tuple ``(features, sampled_target)`` where ``sampled_target`` is a
        scalar float32 drawn from a normal distribution with the given mean
        and standard deviation.
    """
    mu, sigma = target
    mu = tf.cast(mu, dtype=tf.float32)
    sigma = tf.cast(sigma, dtype=tf.float32)
    drawn = tf.random.normal([], mean=mu, stddev=sigma, dtype=tf.float32)
    return (features, drawn)

def get_dataset(pandas_df, tokenizer, labeled=True, ordered=False, repeated=False, 
                is_sampled=False, batch_size=32, seq_len=128, is_lower=True):
    """
        Return a Tensorflow dataset ready for training or inference.

        Args:
            pandas_df: DataFrame with an 'excerpt' column; when `labeled` is
                true it must also have 'target' and 'standard_error' columns.
            tokenizer: Callable tokenizer invoked with `max_length`,
                `truncation`, `padding` and `return_tensors='tf'`; its result
                must expose 'input_ids' and 'attention_mask'.
            labeled: When true, each element is `(features, (target, standard_error))`;
                otherwise each element is the features dict only.
            ordered: When false, elements are shuffled with a 2048-element buffer.
            repeated: When true, the dataset repeats indefinitely.
            is_sampled: Only used when `labeled` is true; replaces the
                `(target, standard_error)` pair with a random draw via `sample_target`.
            batch_size: Number of elements per batch.
            seq_len: Length every excerpt is truncated/padded to.
            is_lower: Forwarded to `custom_standardization`.

        Returns:
            A batched, prefetched `tf.data.Dataset`.
    """
    excerpts = [custom_standardization(excerpt, is_lower) for excerpt in pandas_df['excerpt']]

    # Tokenize inputs
    tokenized_inputs = tokenizer(excerpts, max_length=seq_len, truncation=True,
                                 padding='max_length', return_tensors='tf')
    features = {'input_ids': tokenized_inputs['input_ids'],
                'attention_mask': tokenized_inputs['attention_mask']}

    if labeled:
        dataset = tf.data.Dataset.from_tensor_slices(
            (features, (pandas_df['target'], pandas_df['standard_error'])))
    else:
        dataset = tf.data.Dataset.from_tensor_slices(features)

    # Cache only the deterministic elements. Caching after the random target
    # sampling or the shuffle would replay the first epoch's draws and order on
    # every later epoch, and caching after `repeat()` would try to materialize
    # an infinite dataset.
    dataset = dataset.cache()

    if labeled and is_sampled:
        dataset = dataset.map(sample_target, num_parallel_calls=tf.data.AUTOTUNE)
    if repeated:
        dataset = dataset.repeat()
    if not ordered:
        dataset = dataset.shuffle(2048)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

In [6]:
# Gather every *.h5 weight file of the training run, in sorted order.
model_path_list = sorted(glob.glob(f'{input_base_path}*.h5'))

print('Models to predict:')
print('\n'.join(model_path_list))

Models to predict:
/kaggle/input/25-commonlit-roberta-base-cls-3-target-bs-32/model_0.h5


# Model

In [7]:
def model_fn(encoder, seq_len=256):
    """Wrap `encoder` in a Keras functional model with three linear heads.

    Args:
        encoder: Callable taking a dict with 'input_ids' and 'attention_mask'
            and returning a mapping that contains 'last_hidden_state'.
        seq_len: Length of the two int32 input sequences.

    Returns:
        A Keras `Model` with inputs `[input_ids, attention_mask]` and outputs
        `[output, output_stddev, output_samp]`, each a `Dense(1)` applied to
        the hidden state at position 0 of the sequence.
    """
    ids_in = L.Input(shape=(seq_len,), dtype=tf.int32, name='input_ids')
    mask_in = L.Input(shape=(seq_len,), dtype=tf.int32, name='attention_mask')

    encoder_inputs = {'input_ids': ids_in,
                      'attention_mask': mask_in}
    hidden_states = encoder(encoder_inputs)['last_hidden_state']

    # Hidden state of the first token of every sequence.
    first_token = hidden_states[:, 0, :]

    # The heads are created in this fixed order: output, output_stddev, output_samp.
    head_names = ('output', 'output_stddev', 'output_samp')
    heads = [L.Dense(1, name=head_name)(first_token) for head_name in head_names]

    return Model(inputs=[ids_in, mask_in], outputs=heads)

# Create the encoder and the full model inside the distribution strategy scope.
with strategy.scope():
    # Load the encoder from the path stored under 'BASE_MODEL' in the config.
    encoder = TFAutoModel.from_pretrained(config['BASE_MODEL'])
    # Input length comes from the same config ('SEQ_LEN').
    model = model_fn(encoder, config['SEQ_LEN'])
    
model.summary()

Some layers from the model checkpoint at /kaggle/input/huggingface-roberta/roberta-base/ were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at /kaggle/input/huggingface-roberta/roberta-base/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 256)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode TFBaseModelOutputWit 124645632   attention_mask[0][0]             
                                                                 input_ids[0][0]                  
__________________________________________________________________________________________________
tf.__operators__.getitem (Slici (None, 768)          0           tf_roberta_model[0][0]       

# Test set predictions

In [8]:
# Tokenizer loaded from the same path as the encoder ('BASE_MODEL').
tokenizer = AutoTokenizer.from_pretrained(config['BASE_MODEL'])
# One entry per weight file: the first output of `model.predict` on the test set.
test_pred = []

for model_path in model_path_list:
    print(model_path)
    # Only when `tpu` is truthy: (re)initialize the TPU system before loading weights.
    if tpu: tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): `model` was built once before this loop and is reused after
    # the session is cleared — confirm this is safe, especially on TPU.
    K.clear_session()
    model.load_weights(model_path)

    # Test predictions
    test_ds = get_dataset(test, tokenizer, labeled=False, ordered=True, 
                          batch_size=config['BATCH_SIZE'], seq_len=config['SEQ_LEN'])
    # NOTE(review): identity map — looks like a no-op; confirm it can be dropped.
    x_test = test_ds.map(lambda sample: sample)
    # Index 0 selects the first of the model's three outputs (the `output` head).
    test_pred.append(model.predict(x_test)[0])

/kaggle/input/25-commonlit-roberta-base-cls-3-target-bs-32/model_0.h5


# Submission

In [9]:
# Without any per-model predictions, np.mean would yield NaN and the notebook
# would silently write an all-NaN submission; fail with an explicit message.
if not test_pred:
    raise RuntimeError('No model predictions available: `test_pred` is empty.')

# `.copy()` makes `submission` an independent frame, so adding a column is not
# a chained assignment on a selection of `test`.
submission = test[['id']].copy()
# Average the per-model predictions element-wise (axis 0 runs over models).
submission['target'] = np.mean(test_pred, axis=0)
submission.to_csv('submission.csv', index=False)
display(submission.head(10))

Unnamed: 0,id,target
0,c0f722661,-0.322233
1,f0953f0a5,-0.289042
2,0df072751,-0.324049
3,04caf4e0c,-2.523515
4,0e63f8bea,-1.860601
5,12537fe78,-0.867214
6,965e592c0,0.402693
