## Dependencies

In [1]:
import glob
import json
import math
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
from tensorflow.keras import Model
from transformers import TFAutoModelForSequenceClassification, TFAutoModel, AutoTokenizer

# Kept last so any name it exports still takes precedence, exactly as before.
from commonlit_scripts import *


# Seed first, via the helper pulled in from commonlit_scripts.
seed = 0
seed_everything(seed)

# Notebook display tweaks: show up to 150 characters per column and hide warnings.
pd.set_option('display.max_colwidth', 150)
warnings.filterwarnings('ignore')

### Hardware configuration

In [2]:
# Resolve the distribution strategy (and the TPU handle, if any) through the shared helper.
strategy, tpu = get_strategy()
REPLICAS = strategy.num_replicas_in_sync
AUTO = tf.data.experimental.AUTOTUNE
print(f'REPLICAS: {REPLICAS}')

REPLICAS: 1


# Load data

In [3]:
# Read the competition test set and preview its first rows.
test_filepath = '/kaggle/input/commonlitreadabilityprize/test.csv'
test = pd.read_csv(filepath_or_buffer=test_filepath)
print(f'Test samples: {len(test)}')
display(test.head(5))

Test samples: 7


Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would keep a bright light burning in the upper story to guide me on my course. On a clear night this light w...
1,f0953f0a5,,,"Dotty continued to go to Mrs. Gray's every night with the milk. Sometimes Katie went with her, and then they always paused a while under the acorn..."
2,0df072751,,,It was a bright and cheerful scene that greeted the eyes of Captain Raymond and his son as they entered the parlor of the adjacent cottage.\nIt wa...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent cell divides into two or more daughter cells. Cell division usually occurs as part of a larger cell...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolving of defects that prevent correct operation of computer software or a system. Debugging tends to b...


In [4]:
# List the datasets attached under /kaggle/input/ (sanity check before loading weights).
!ls /kaggle/input/

10-commonlit-bert-base-seq-256	huggingface-bert
commonlitreadabilityprize	huggingface-roberta


# Model parameters

In [5]:
# Hyperparameters are read from the config.json stored in the same folder as the model weights.
input_base_path = '/kaggle/input/10-commonlit-bert-base-seq-256/'
with open(input_base_path + 'config.json', 'r') as json_file:
    config = json.load(json_file)

# Bare expression so the notebook renders the loaded settings.
config

{'BATCH_SIZE': 64,
 'LEARNING_RATE': 8e-05,
 'EPOCHS': 50,
 'ES_PATIENCE': 10,
 'N_FOLDS': 5,
 'N_USED_FOLDS': 1,
 'SEQ_LEN': 256,
 'BASE_MODEL': '/kaggle/input/huggingface-bert/bert-base-uncased',
 'SEED': 0}

## Auxiliary functions

In [6]:
# Datasets utility functions
def custom_standardization(text):
    """Normalize a raw excerpt: lowercase it (for uncased encoders) and trim outer whitespace."""
    return text.lower().strip()


def sample_target(features, target):
    """Replace a (mean, stddev) target pair with one draw from that normal distribution.

    Args:
        features: model inputs, passed through untouched.
        target: pair ``(mean, stddev)`` describing the label distribution.

    Returns:
        Tuple ``(features, sampled_target)`` where ``sampled_target`` is a scalar float32 tensor.
    """
    mean, stddev = target
    mean_f32 = tf.cast(mean, dtype=tf.float32)
    stddev_f32 = tf.cast(stddev, dtype=tf.float32)

    # Empty shape -> a single scalar sample per example.
    sampled_target = tf.random.normal([], mean=mean_f32, stddev=stddev_f32, dtype=tf.float32)
    return (features, sampled_target)
    

def get_dataset(pandas_df, tokenizer, labeled=True, ordered=False, repeated=False, 
                is_sampled=False, batch_size=32, seq_len=128):
    """
        Build a batched, prefetched Tensorflow dataset from a dataframe of excerpts.

        Args:
            pandas_df: dataframe with an 'excerpt' column; when `labeled` it must also
                provide 'target' and 'standard_error'.
            tokenizer: callable tokenizer returning 'input_ids', 'token_type_ids'
                and 'attention_mask' tensors.
            labeled: attach the (target, standard_error) pair to every example.
            ordered: keep the original row order (skip shuffling).
            repeated: repeat the dataset indefinitely.
            is_sampled: for labeled data, replace the label pair with a random draw
                (see `sample_target`).
            batch_size: number of examples per batch.
            seq_len: length every excerpt is truncated/padded to.

        Returns:
            A `tf.data.Dataset` yielding feature dicts, or (features, label) pairs
            when `labeled` is set.
    """
    excerpts = [custom_standardization(excerpt) for excerpt in pandas_df['excerpt']]

    # Every excerpt is padded/truncated to exactly `seq_len` tokens.
    encoded = tokenizer(excerpts, max_length=seq_len, truncation=True, 
                        padding='max_length', return_tensors='tf')
    features = {key: encoded[key] for key in ('input_ids', 'token_type_ids', 'attention_mask')}

    if not labeled:
        dataset = tf.data.Dataset.from_tensor_slices(features)
    else:
        labels = (pandas_df['target'], pandas_df['standard_error'])
        dataset = tf.data.Dataset.from_tensor_slices((features, labels))
        if is_sampled:
            dataset = dataset.map(sample_target, num_parallel_calls=tf.data.AUTOTUNE)

    # Pipeline order matters: repeat -> shuffle -> batch -> prefetch.
    if repeated:
        dataset = dataset.repeat()
    if not ordered:
        dataset = dataset.shuffle(1024)

    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [7]:
# Collect every .h5 checkpoint in the model folder, in a stable (sorted) order.
model_path_list = sorted(glob.glob(f'{input_base_path}*.h5'))

# Fail fast: with no checkpoints the prediction loop would silently produce nothing
# and the submission would be built from an empty prediction list.
if not model_path_list:
    raise FileNotFoundError(f'No .h5 model weights found in {input_base_path}')

print('Models to predict:')
print(*model_path_list, sep='\n')

Models to predict:
/kaggle/input/10-commonlit-bert-base-seq-256/model_0.h5


# Model

In [8]:
def model_fn(encoder, seq_len=256):
    """Wrap a transformer encoder with a mean-pooling regression head.

    Args:
        encoder: callable taking a dict with 'input_ids', 'token_type_ids' and
            'attention_mask' and returning an output indexable by 'last_hidden_state'.
        seq_len: length of each tokenized input sequence.

    Returns:
        A Keras `Model` mapping the three int32 inputs to one value per sample.
    """
    input_names = ('input_ids', 'token_type_ids', 'attention_mask')
    inputs = {name: L.Input(shape=(seq_len,), dtype=tf.int32, name=name) 
              for name in input_names}

    token_states = encoder(inputs)['last_hidden_state']

    # Plain average over the sequence axis: no mask is passed, so padded
    # positions contribute to the mean as well.
    pooled = L.GlobalAveragePooling1D()(token_states)
    prediction = L.Dense(1, name='output')(pooled)

    return Model(inputs=[inputs[name] for name in input_names], outputs=prediction)


# Instantiate the encoder and the regression model inside the distribution strategy scope.
with strategy.scope():
    encoder = TFAutoModel.from_pretrained(config['BASE_MODEL'])
    model = model_fn(encoder, seq_len=config['SEQ_LEN'])

model.summary()

Some layers from the model checkpoint at /kaggle/input/huggingface-bert/bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at /kaggle/input/huggingface-bert/bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 256)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 109482240   attention_mask[0][0]             
                                                                 input_ids[0][0]              

# Test set predictions

In [9]:
tokenizer = AutoTokenizer.from_pretrained(config['BASE_MODEL'])

# The test inputs do not depend on which checkpoint is loaded, so tokenize and
# build the (ordered, unlabeled) dataset once instead of once per model.
test_ds = get_dataset(test, tokenizer, labeled=False, ordered=True, 
                      batch_size=config['BATCH_SIZE'], seq_len=config['SEQ_LEN'])

# One prediction array per checkpoint; they are averaged when building the submission.
test_pred = []

for model_path in model_path_list:
    print(model_path)
    # NOTE(review): re-initializing the TPU while reusing the `model` built earlier
    # may invalidate its variables on TPU runs - confirm on TPU hardware.
    if tpu: tf.tpu.experimental.initialize_tpu_system(tpu)
    K.clear_session()
    model.load_weights(model_path)

    # Test predictions
    test_pred.append(model.predict(test_ds))

/kaggle/input/10-commonlit-bert-base-seq-256/model_0.h5


# Submission

In [10]:
# Work on an explicit copy so adding a column does not write into a slice of `test`
# (the original triggered pandas' SettingWithCopyWarning, hidden by the warning filter).
submission = test[['id']].copy()

# Average over checkpoints (axis 0), then flatten the (n_samples, 1) output of the
# single-unit Dense head into one value per row.
submission['target'] = np.mean(test_pred, axis=0).reshape(-1)

submission.to_csv('submission.csv', index=False)
display(submission.head(10))

Unnamed: 0,id,target
0,c0f722661,-0.447952
1,f0953f0a5,-0.48511
2,0df072751,-0.482398
3,04caf4e0c,-2.425107
4,0e63f8bea,-1.614083
5,12537fe78,-0.789669
6,965e592c0,0.303603
