## Import libraries

In [1]:
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import LayerNormalization
from tensorflow.keras.layers import BatchNormalization
from tensorflow_addons.layers import WeightNormalization
from tensorflow.keras.layers import Conv1D, Flatten, Dense
from tensorflow.keras.layers import Input, Dropout, Activation

from transformers import RobertaTokenizer, TFRobertaModel, RobertaConfig
from transformers import XLMRobertaTokenizer, TFXLMRobertaModel, XLMRobertaConfig

In [2]:
! mkdir "./Roberta-Base"
! mkdir "./XLM-Roberta-Base"
! mkdir "./DistilRoberta-Base"

## Load source datasets

In [3]:
train_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
train_df["excerpt_wordlen"] = train_df["excerpt"].apply(lambda x: len(str(x).split()))
train_df.drop(['url_legal','license','standard_error'], inplace=True, axis=1)
train_df.set_index("id", inplace=True)
print(f"train_df: {train_df.shape}\n")
train_df.head()

train_df: (2834, 3)



Unnamed: 0_level_0,excerpt,target,excerpt_wordlen
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c12129c31,When the young people returned to the ballroom...,-0.340259,179
85aa80a4c,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,169
b69ac6792,"As Roger had predicted, the snow departed as q...",-0.580118,166
dd1000b26,And outside before the palace a great garden w...,-1.054013,164
37c1b32fb,Once upon a time there were Three Bears who li...,0.247197,147


In [4]:
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
test_df["excerpt_wordlen"] = test_df["excerpt"].apply(lambda x: len(str(x).split()))
test_df.drop(['url_legal','license'], inplace=True, axis=1)
test_df.set_index("id", inplace=True)
print(f"test_df: {test_df.shape}\n")
test_df.head()

test_df: (7, 2)



Unnamed: 0_level_0,excerpt,excerpt_wordlen
id,Unnamed: 1_level_1,Unnamed: 2_level_1
c0f722661,My hope lay in Jack's promise that he would ke...,149
f0953f0a5,Dotty continued to go to Mrs. Gray's every nig...,181
0df072751,It was a bright and cheerful scene that greete...,174
04caf4e0c,Cell division is the process by which a parent...,180
0e63f8bea,Debugging is the process of finding and resolv...,168


## Extract target label

In [5]:
Ytrain = train_df['target'].values
Ytrain_strat = pd.qcut(train_df['target'].values, q=5, labels=range(0,5))
train_df.drop(['target'], inplace=True, axis=1)
print(f"Ytrain: {Ytrain.shape}")

Ytrain: (2834,)


## Model Hyperparameters

In [6]:
FOLD = 5
NUM_SEED = 1
VERBOSE = 1
MINI_BATCH_SIZE = 16
NUM_EPOCH = 20
MAX_LEN = max(train_df['excerpt_wordlen'].max(), 
              test_df['excerpt_wordlen'].max()) + 11

ROBERTA_BASE = "../input/huggingface-roberta-variants/roberta-base/roberta-base"
XLM_ROBERTA_BASE = "../input/huggingface-roberta-variants/tf-xlm-roberta-base/tf-xlm-roberta-base"
DISTILROBERTA_BASE = "../input/huggingface-roberta-variants/distilroberta-base/distilroberta-base"

## Helper Functions

In [7]:
def sent_encode(texts, tokenizer):
    input_ids = []
    attention_mask = []
    token_type_ids = []

    for text in tqdm(texts):
        tokens = tokenizer.encode_plus(text, max_length=MAX_LEN, truncation=True, 
                                       padding='max_length', add_special_tokens=True, 
                                       return_attention_mask=True, return_token_type_ids=True, 
                                       return_tensors='tf')
        
        input_ids.append(tokens['input_ids'])
        attention_mask.append(tokens['attention_mask'])
        token_type_ids.append(tokens['token_type_ids'])

    return np.array(input_ids), np.array(attention_mask), np.array(token_type_ids)

In [8]:
def rmse_loss(y_true, y_pred):
    y_true = tf.cast(y_true, dtype=tf.float32)
    y_pred = tf.cast(y_pred, dtype=tf.float32)
    return tf.math.sqrt(tf.math.reduce_mean((y_true - y_pred)**2))

In [9]:
def commonlit_model(transformer_model, use_tokens_type_ids=True):
    
    input_id = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")
    token_type_id = Input(shape=(MAX_LEN,), dtype=tf.int32, name="token_type_ids")

    if use_tokens_type_ids:
        embed = transformer_model(input_id, token_type_ids=token_type_id, attention_mask=attention_mask)[0]
    
    else:
        embed = transformer_model(input_id, attention_mask=attention_mask)[0]
    
    #x = embed[:, 0, :]
    embed = LayerNormalization()(embed)
    
    x = WeightNormalization(
            Conv1D(filters=384, kernel_size=5, 
                   strides=2, padding='same', 
                   kernel_regularizer=l2(0.0001),
                   kernel_initializer='he_uniform'))(embed)
    x = LayerNormalization()(x)
    x = Activation('relu')(x)
    x = SpatialDropout1D(rate=0.25)(x)
    
    x = WeightNormalization(
            Conv1D(filters=192, kernel_size=5, 
                   strides=2, padding='same', 
                   kernel_regularizer=l2(0.0001),
                   kernel_initializer='he_uniform'))(x)
    x = LayerNormalization()(x)
    x = Activation('relu')(x)
    x = SpatialDropout1D(rate=0.25)(x)
    
    x = Flatten()(x)
    x = Dropout(rate=0.5)(x)
    
    x = Dense(units=1, kernel_initializer='lecun_normal')(x)

    model = Model(inputs=[input_id, attention_mask, token_type_id], outputs=x, 
                  name='CommonLit_Readability_Model')
    return model

## Roberta-Base Model

### Generate word tokens and attention masks

In [10]:
tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_BASE)

In [11]:
Xtrain_id, Xtrain_mask, Xtrain_token = sent_encode(train_df['excerpt'].values, tokenizer)

Xtrain_id = Xtrain_id.reshape((Xtrain_id.shape[0], Xtrain_id.shape[2]))
Xtrain_mask = Xtrain_mask.reshape((Xtrain_mask.shape[0], Xtrain_mask.shape[2]))
Xtrain_token = Xtrain_token.reshape((Xtrain_token.shape[0], Xtrain_token.shape[2]))
    
print(f"Input-ids: {Xtrain_id.shape} \nAttention Mask: {Xtrain_mask.shape} \nToken-type-ids: {Xtrain_token.shape}")

100%|██████████| 2834/2834 [00:07<00:00, 384.01it/s]


Input-ids: (2834, 216) 
Attention Mask: (2834, 216) 
Token-type-ids: (2834, 216)


In [12]:
Xtest_id, Xtest_mask, Xtest_token = sent_encode(test_df['excerpt'].values, tokenizer)

Xtest_id = Xtest_id.reshape((Xtest_id.shape[0], Xtest_id.shape[2]))
Xtest_mask = Xtest_mask.reshape((Xtest_mask.shape[0], Xtest_mask.shape[2]))
Xtest_token = Xtest_token.reshape((Xtest_token.shape[0], Xtest_token.shape[2]))
    
print(f"Input-ids: {Xtest_id.shape} \nAttention Mask: {Xtest_mask.shape} \nToken-type-ids: {Xtest_token.shape}")

100%|██████████| 7/7 [00:00<00:00, 515.90it/s]

Input-ids: (7, 216) 
Attention Mask: (7, 216) 
Token-type-ids: (7, 216)





### Initialize the Bert-Base model

In [13]:
config = RobertaConfig.from_pretrained(ROBERTA_BASE)
config.output_hidden_states = False

transformer_model = TFRobertaModel.from_pretrained(ROBERTA_BASE, config=config)

Some layers from the model checkpoint at ../input/huggingface-roberta-variants/roberta-base/roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/huggingface-roberta-variants/roberta-base/roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [14]:
model = commonlit_model(transformer_model)
model.summary()

Model: "CommonLit_Readability_Model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 216)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 216)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 216)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode TFBaseModelOutputWit 124645632   input_ids[0][0]                  
                                                                 attenti

### Fit the model with K-Fold validation

In [15]:
np.random.seed(23)
seeds = np.random.randint(0, 100, size=NUM_SEED)

counter = 0
oof_score = 0
y_pred_final1 = 0


for sidx, seed in enumerate(seeds):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain_id, Ytrain_strat)):
        counter += 1

        train_x_id, train_x_mask, train_x_token = Xtrain_id[train], Xtrain_mask[train], Xtrain_token[train]
        val_x_id, val_x_mask, val_x_token = Xtrain_id[val], Xtrain_mask[val], Xtrain_token[val]
        train_y, val_y = Ytrain[train], Ytrain[val]
        
        tf.random.set_seed(seed)

        model = commonlit_model(transformer_model)
        
        model.compile(loss=rmse_loss,
                      metrics=[RootMeanSquaredError(name='rmse')],
                      optimizer=Adam(lr=8e-5))

        early = EarlyStopping(monitor="val_rmse", mode="min", 
                              restore_best_weights=True, 
                              patience=5, verbose=VERBOSE)
        
        reduce_lr = ReduceLROnPlateau(monitor="val_rmse", factor=0.5, 
                                      min_lr=1e-7, patience=2, 
                                      verbose=VERBOSE, mode='min')

        chk_point = ModelCheckpoint(f'./Roberta-Base/CLRP_Roberta_Base_{counter}C.h5', 
                                    monitor='val_rmse', verbose=VERBOSE, 
                                    save_best_only=True, mode='min',
                                    save_weights_only=True)
        
        history = model.fit(
            [train_x_id, train_x_mask, train_x_token], train_y, 
            batch_size=MINI_BATCH_SIZE,
            epochs=NUM_EPOCH, 
            verbose=VERBOSE, 
            callbacks=[reduce_lr, early, chk_point], 
            validation_data=([val_x_id, val_x_mask, val_x_token], val_y)
        )
        
        model.load_weights(f'./Roberta-Base/CLRP_Roberta_Base_{counter}C.h5')
        
        y_pred = model.predict([val_x_id, val_x_mask, val_x_token])
        y_pred_final1 += model.predict([Xtest_id, Xtest_mask, Xtest_token])
        
        score = np.sqrt(mean_squared_error(val_y, y_pred))
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


y_pred_final1 = y_pred_final1 / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Epoch 1/20

Epoch 00001: val_rmse improved from inf to 0.90316, saving model to ./Roberta-Base/CLRP_Roberta_Base_1C.h5
Epoch 2/20

Epoch 00002: val_rmse improved from 0.90316 to 0.71407, saving model to ./Roberta-Base/CLRP_Roberta_Base_1C.h5
Epoch 3/20

Epoch 00003: val_rmse improved from 0.71407 to 0.68124, saving model to ./Roberta-Base/CLRP_Roberta_Base_1C.h5
Epoch 4/20

Epoch 00004: val_rmse did not improve from 0.68124
Epoch 5/20

Epoch 00005: val_rmse improved from 0.68124 to 0.62412, saving model to ./Roberta-Base/CLRP_Roberta_Base_1C.h5
Epoch 6/20

Epoch 00006: val_rmse improved from 0.62412 to 0.62257, saving model to ./Roberta-Base/CLRP_Roberta_Base_1C.h5
Epoch 7/20

Epoch 00007: val_rmse improved from 0.62257 to 0.58189, saving model to ./Roberta-Base/CLRP_Roberta_Base_1C.h5
Epoch 8/20

Epoch 00008: val_rmse did not improve from 0.58189
Epoch 9/20

Epoch 00009: ReduceLROnPlateau reducing learning rate to 3.9999998989515007e-05.

Epoch 00009: val_rmse did not improve from 0.5

## XLM-Roberta-Base Model

### Generate word tokens and attention masks

In [16]:
tokenizer = XLMRobertaTokenizer.from_pretrained(XLM_ROBERTA_BASE)

In [17]:
Xtrain_id, Xtrain_mask, Xtrain_token = sent_encode(train_df['excerpt'].values, tokenizer)

Xtrain_id = Xtrain_id.reshape((Xtrain_id.shape[0], Xtrain_id.shape[2]))
Xtrain_mask = Xtrain_mask.reshape((Xtrain_mask.shape[0], Xtrain_mask.shape[2]))
Xtrain_token = Xtrain_token.reshape((Xtrain_token.shape[0], Xtrain_token.shape[2]))
    
print(f"Input-ids: {Xtrain_id.shape} \nAttention Mask: {Xtrain_mask.shape} \nToken-type-ids: {Xtrain_token.shape}")

100%|██████████| 2834/2834 [00:02<00:00, 1086.32it/s]


Input-ids: (2834, 216) 
Attention Mask: (2834, 216) 
Token-type-ids: (2834, 216)


In [18]:
Xtest_id, Xtest_mask, Xtest_token = sent_encode(test_df['excerpt'].values, tokenizer)

Xtest_id = Xtest_id.reshape((Xtest_id.shape[0], Xtest_id.shape[2]))
Xtest_mask = Xtest_mask.reshape((Xtest_mask.shape[0], Xtest_mask.shape[2]))
Xtest_token = Xtest_token.reshape((Xtest_token.shape[0], Xtest_token.shape[2]))
    
print(f"Input-ids: {Xtest_id.shape} \nAttention Mask: {Xtest_mask.shape} \nToken-type-ids: {Xtest_token.shape}")

100%|██████████| 7/7 [00:00<00:00, 854.36it/s]

Input-ids: (7, 216) 
Attention Mask: (7, 216) 
Token-type-ids: (7, 216)





### Initialize the Albert-V2 model

In [19]:
config = XLMRobertaConfig.from_pretrained(XLM_ROBERTA_BASE)
config.output_hidden_states = False

transformer_model = TFXLMRobertaModel.from_pretrained(XLM_ROBERTA_BASE, config=config)

Some layers from the model checkpoint at ../input/huggingface-roberta-variants/tf-xlm-roberta-base/tf-xlm-roberta-base were not used when initializing TFXLMRobertaModel: ['lm_head']
- This IS expected if you are initializing TFXLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLMRobertaModel were initialized from the model checkpoint at ../input/huggingface-roberta-variants/tf-xlm-roberta-base/tf-xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


In [20]:
model = commonlit_model(transformer_model)
model.summary()

Model: "CommonLit_Readability_Model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 216)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 216)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 216)]        0                                            
__________________________________________________________________________________________________
tfxlm_roberta_model (TFXLMRober TFBaseModelOutputWit 278043648   input_ids[0][0]                  
                                                                 attenti

### Fit the model with K-Fold validation

In [21]:
np.random.seed(29)
seeds = np.random.randint(0, 100, size=NUM_SEED)

counter = 0
oof_score = 0
y_pred_final2 = 0


for sidx, seed in enumerate(seeds):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain_id, Ytrain_strat)):
        counter += 1

        train_x_id, train_x_mask, train_x_token = Xtrain_id[train], Xtrain_mask[train], Xtrain_token[train]
        val_x_id, val_x_mask, val_x_token = Xtrain_id[val], Xtrain_mask[val], Xtrain_token[val]
        train_y, val_y = Ytrain[train], Ytrain[val]
        
        tf.random.set_seed(seed)

        model = commonlit_model(transformer_model)
        
        model.compile(loss=rmse_loss,
                      metrics=[RootMeanSquaredError(name='rmse')],
                      optimizer=Adam(lr=8e-5))

        early = EarlyStopping(monitor="val_rmse", mode="min", 
                              restore_best_weights=True, 
                              patience=5, verbose=VERBOSE)
        
        reduce_lr = ReduceLROnPlateau(monitor="val_rmse", factor=0.5, 
                                      min_lr=1e-7, patience=2, 
                                      verbose=VERBOSE, mode='min')

        chk_point = ModelCheckpoint(f'./XLM-Roberta-Base/CLRP_XLMRoberta_Base_{counter}C.h5', 
                                    monitor='val_rmse', verbose=VERBOSE, 
                                    save_best_only=True, mode='min',
                                    save_weights_only=True)
        
        history = model.fit(
            [train_x_id, train_x_mask, train_x_token], train_y, 
            batch_size=MINI_BATCH_SIZE,
            epochs=NUM_EPOCH, 
            verbose=VERBOSE, 
            callbacks=[reduce_lr, early, chk_point], 
            validation_data=([val_x_id, val_x_mask, val_x_token], val_y)
        )
        
        model.load_weights(f'./XLM-Roberta-Base/CLRP_XLMRoberta_Base_{counter}C.h5')
        
        y_pred = model.predict([val_x_id, val_x_mask, val_x_token])
        y_pred_final2 += model.predict([Xtest_id, Xtest_mask, Xtest_token])
        
        score = np.sqrt(mean_squared_error(val_y, y_pred))
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


y_pred_final2 = y_pred_final2 / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Epoch 1/20

Epoch 00001: val_rmse improved from inf to 1.03531, saving model to ./XLM-Roberta-Base/CLRP_XLMRoberta_Base_1C.h5
Epoch 2/20

Epoch 00002: val_rmse improved from 1.03531 to 0.73991, saving model to ./XLM-Roberta-Base/CLRP_XLMRoberta_Base_1C.h5
Epoch 3/20

Epoch 00003: val_rmse did not improve from 0.73991
Epoch 4/20

Epoch 00004: val_rmse improved from 0.73991 to 0.70466, saving model to ./XLM-Roberta-Base/CLRP_XLMRoberta_Base_1C.h5
Epoch 5/20

Epoch 00005: val_rmse did not improve from 0.70466
Epoch 6/20

Epoch 00006: ReduceLROnPlateau reducing learning rate to 3.9999998989515007e-05.

Epoch 00006: val_rmse did not improve from 0.70466
Epoch 7/20

Epoch 00007: val_rmse did not improve from 0.70466
Epoch 8/20

Epoch 00008: val_rmse improved from 0.70466 to 0.68365, saving model to ./XLM-Roberta-Base/CLRP_XLMRoberta_Base_1C.h5
Epoch 9/20

Epoch 00009: val_rmse did not improve from 0.68365
Epoch 10/20

Epoch 00010: val_rmse improved from 0.68365 to 0.67476, saving model to ./

## DistilRoberta-Base Model

### Generate word tokens and attention masks

In [22]:
tokenizer = RobertaTokenizer.from_pretrained(DISTILROBERTA_BASE)

In [23]:
Xtrain_id, Xtrain_mask, Xtrain_token = sent_encode(train_df['excerpt'].values, tokenizer)

Xtrain_id = Xtrain_id.reshape((Xtrain_id.shape[0], Xtrain_id.shape[2]))
Xtrain_mask = Xtrain_mask.reshape((Xtrain_mask.shape[0], Xtrain_mask.shape[2]))
Xtrain_token = Xtrain_token.reshape((Xtrain_token.shape[0], Xtrain_token.shape[2]))
    
print(f"Input-ids: {Xtrain_id.shape} \nAttention Mask: {Xtrain_mask.shape} \nToken-type-ids: {Xtrain_token.shape}")

100%|██████████| 2834/2834 [00:05<00:00, 516.04it/s]


Input-ids: (2834, 216) 
Attention Mask: (2834, 216) 
Token-type-ids: (2834, 216)


In [24]:
Xtest_id, Xtest_mask, Xtest_token = sent_encode(test_df['excerpt'].values, tokenizer)

Xtest_id = Xtest_id.reshape((Xtest_id.shape[0], Xtest_id.shape[2]))
Xtest_mask = Xtest_mask.reshape((Xtest_mask.shape[0], Xtest_mask.shape[2]))
Xtest_token = Xtest_token.reshape((Xtest_token.shape[0], Xtest_token.shape[2]))
    
print(f"Input-ids: {Xtest_id.shape} \nAttention Mask: {Xtest_mask.shape} \nToken-type-ids: {Xtest_token.shape}")

100%|██████████| 7/7 [00:00<00:00, 534.37it/s]

Input-ids: (7, 216) 
Attention Mask: (7, 216) 
Token-type-ids: (7, 216)





### Initialize the DistilBert-Base model

In [25]:
config = RobertaConfig.from_pretrained(DISTILROBERTA_BASE)
config.output_hidden_states = False

transformer_model = TFRobertaModel.from_pretrained(DISTILROBERTA_BASE, config=config)

Some layers from the model checkpoint at ../input/huggingface-roberta-variants/distilroberta-base/distilroberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/huggingface-roberta-variants/distilroberta-base/distilroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [26]:
model = commonlit_model(transformer_model, use_tokens_type_ids=False)
model.summary()

Model: "CommonLit_Readability_Model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 216)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 216)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model_1 (TFRobertaMo TFBaseModelOutputWit 82118400    input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
layer_normalization_36 (LayerNo (None, 216, 768)     1536        tf_robe

### Fit the model with K-Fold validation

In [27]:
np.random.seed(31)
seeds = np.random.randint(0, 100, size=NUM_SEED)

counter = 0
oof_score = 0
y_pred_final3 = 0


for sidx, seed in enumerate(seeds):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain_id, Ytrain_strat)):
        counter += 1

        train_x_id, train_x_mask, train_x_token = Xtrain_id[train], Xtrain_mask[train], Xtrain_token[train]
        val_x_id, val_x_mask, val_x_token = Xtrain_id[val], Xtrain_mask[val], Xtrain_token[val]
        train_y, val_y = Ytrain[train], Ytrain[val]
        
        tf.random.set_seed(seed)

        model = commonlit_model(transformer_model, use_tokens_type_ids=False)
        
        model.compile(loss=rmse_loss,
                      metrics=[RootMeanSquaredError(name='rmse')],
                      optimizer=Adam(lr=8e-5))

        early = EarlyStopping(monitor="val_rmse", mode="min", 
                              restore_best_weights=True, 
                              patience=5, verbose=VERBOSE)
        
        reduce_lr = ReduceLROnPlateau(monitor="val_rmse", factor=0.5, 
                                      min_lr=1e-7, patience=2, 
                                      verbose=VERBOSE, mode='min')

        chk_point = ModelCheckpoint(f'./DistilRoberta-Base/CLRP_DistilRoberta_Base_{counter}C.h5', 
                                    monitor='val_rmse', verbose=VERBOSE, 
                                    save_best_only=True, mode='min',
                                    save_weights_only=True)
        
        history = model.fit(
            [train_x_id, train_x_mask, train_x_token], train_y, 
            batch_size=MINI_BATCH_SIZE,
            epochs=NUM_EPOCH, 
            verbose=VERBOSE, 
            callbacks=[reduce_lr, early, chk_point], 
            validation_data=([val_x_id, val_x_mask, val_x_token], val_y)
        )
        
        model.load_weights(f'./DistilRoberta-Base/CLRP_DistilRoberta_Base_{counter}C.h5')
        
        y_pred = model.predict([val_x_id, val_x_mask, val_x_token])
        y_pred_final3 += model.predict([Xtest_id, Xtest_mask, Xtest_token])
        
        score = np.sqrt(mean_squared_error(val_y, y_pred))
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


y_pred_final3 = y_pred_final3 / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Epoch 1/20

Epoch 00001: val_rmse improved from inf to 0.85821, saving model to ./DistilRoberta-Base/CLRP_DistilRoberta_Base_1C.h5
Epoch 2/20

Epoch 00002: val_rmse improved from 0.85821 to 0.63976, saving model to ./DistilRoberta-Base/CLRP_DistilRoberta_Base_1C.h5
Epoch 3/20

Epoch 00003: val_rmse improved from 0.63976 to 0.56569, saving model to ./DistilRoberta-Base/CLRP_DistilRoberta_Base_1C.h5
Epoch 4/20

Epoch 00004: val_rmse did not improve from 0.56569
Epoch 5/20

Epoch 00005: ReduceLROnPlateau reducing learning rate to 3.9999998989515007e-05.

Epoch 00005: val_rmse did not improve from 0.56569
Epoch 6/20

Epoch 00006: val_rmse improved from 0.56569 to 0.53359, saving model to ./DistilRoberta-Base/CLRP_DistilRoberta_Base_1C.h5
Epoch 7/20

Epoch 00007: val_rmse did not improve from 0.53359
Epoch 8/20

Epoch 00008: ReduceLROnPlateau reducing learning rate to 1.9999999494757503e-05.

Epoch 00008: val_rmse did not improve from 0.53359
Epoch 9/20

Epoch 00009: val_rmse did not improv

## Create submission file

In [28]:
y_pred_final = (y_pred_final1 + y_pred_final2 + y_pred_final3) / 3.0

submit_df = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
submit_df['target'] = y_pred_final
submit_df.to_csv("./submission.csv", index=False)
submit_df.head()

Unnamed: 0,id,target
0,c0f722661,-0.597287
1,f0953f0a5,-0.472897
2,0df072751,-0.094373
3,04caf4e0c,-2.161176
4,0e63f8bea,-1.968243
