## Import libraries

In [None]:
import gc
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow_addons.optimizers import AdamW, Lookahead
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Activation, Input
from tensorflow.keras.layers import BatchNormalization

from transformers import TFAutoModel, AutoTokenizer

## Load source datasets

In [None]:
# Read the training split, drop the url_legal / license / standard_error
# columns and index the rows by their "id" column.
train_df = (
    pd.read_csv("../input/commonlitreadabilityprize/train.csv")
    .drop(columns=['url_legal', 'license', 'standard_error'])
    .set_index("id")
)
print(f"train_df: {train_df.shape}\n")
train_df.head()

In [None]:
# Read the test split, drop the url_legal / license columns and index the
# rows by their "id" column.
test_df = (
    pd.read_csv("../input/commonlitreadabilityprize/test.csv")
    .drop(columns=['url_legal', 'license'])
    .set_index("id")
)
print(f"test_df: {test_df.shape}\n")
test_df.head()

## Extract target label

In [None]:
# Two side-by-side views of the `target` column: a box plot on the left axis
# and a histogram on the right axis, each with its own title.
_, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
panels = (
    (sns.boxplot, 'Box Plot - target'),
    (sns.histplot, 'Hist Plot - target'),
)
for axis, (draw, title) in zip(ax, panels):
    draw(x='target', data=train_df, ax=axis)
    axis.title.set_text(title)

In [None]:
# Detach the regression target from the feature frame: `pop` returns the
# column and removes it from train_df in one step.
Ytrain = train_df.pop('target').values
# Quintile bin (0..4) of each target value, used only to stratify the folds.
Ytrain_strat = pd.qcut(Ytrain, q=5, labels=range(0, 5))
print(f"Ytrain: {Ytrain.shape}")

## Data Cleansing

In [None]:
# Translation table built once: every character of string.punctuation except
# the straight apostrophe, plus the typographic apostrophe, maps to a space.
_PUNCT_TO_SPACE = str.maketrans(
    {ch: ' ' for ch in string.punctuation.replace("'", "") + '’'}
)


def remove_punctuations(text):
    """Return *text* with punctuation characters replaced by single spaces.

    Every character in ``string.punctuation`` except the straight apostrophe
    (``'``) is replaced, as is the typographic apostrophe (``’``). Each
    replaced character becomes exactly one space, so the length of the
    string is unchanged.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # One pass over the text instead of one str.replace call per character.
    return text.translate(_PUNCT_TO_SPACE)


def process_excerpt(df):
    """Normalise the ``excerpt`` column of *df* in place and return *df*.

    Steps, in order: lower-case the text, turn newlines into spaces, strip
    URL-like tokens, drop the literal ``&gt;`` entity, replace punctuation
    with spaces via ``remove_punctuations``, and collapse a stand-alone
    ``" s "`` into a single space.

    Args:
        df: DataFrame with a string column named ``excerpt``.

    Returns:
        The same DataFrame object, with ``excerpt`` overwritten.
    """
    df['excerpt'] = df['excerpt'].apply(lambda x: x.lower())
    df['excerpt'] = df['excerpt'].apply(lambda x: x.replace('\n', ' '))
    # Raw string so `\S` is a regex class rather than an invalid string
    # escape; `regex=True` is explicit because the default of
    # Series.str.replace differs between pandas versions; the dot in `www.`
    # is escaped so that it only matches a literal dot.
    df['excerpt'] = df['excerpt'].str.replace(
        r'http\S+|www\.\S+', '', case=False, regex=True)
    df['excerpt'] = df['excerpt'].apply(lambda x: x.replace('&gt;', ''))
    df['excerpt'] = df['excerpt'].apply(remove_punctuations)
    df['excerpt'] = df['excerpt'].apply(lambda x: str(x).replace(" s ", " "))
    return df

In [None]:
# Run the same text-cleaning pipeline over both splits (train first, then test).
train_df, test_df = map(process_excerpt, (train_df, test_df))

## Define TPU config

In [None]:
# Select the tf.distribute strategy used for the rest of the notebook.
# The TPU path resolves the cluster, connects to it, initialises the TPU
# system and builds a TPU strategy. If any of those steps raises ValueError,
# fall back to whatever strategy tf.distribute.get_strategy() returns and
# report its replica count.
# NOTE(review): newer TensorFlow releases expose the TPU strategy as
# tf.distribute.TPUStrategy; confirm against the installed version before
# switching away from the `experimental` alias.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print("Running on TPU:", tpu.master())
except ValueError:
    strategy = tf.distribute.get_strategy()
    print(f"Running on {strategy.num_replicas_in_sync} replicas")

In [None]:
# Global batch size: 16 samples for each replica the strategy keeps in sync.
replica_count = strategy.num_replicas_in_sync
mini_batch_size = 16 * replica_count
print(f'batch size: {mini_batch_size}')

## Build and validate the model

In [None]:
def sent_encode(texts, tokenizer, max_len=512):
    """Tokenise each text on its own and stack the results into arrays.

    Every text is passed to ``tokenizer.encode_plus`` with truncation and
    padding to ``max_len``. The per-text ``input_ids`` and ``attention_mask``
    entries are collected and converted with ``np.array``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        max_len: Length every sequence is truncated/padded to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of NumPy arrays, one entry per
        text. Presumably each entry keeps the leading batch axis of size 1
        that ``return_tensors='tf'`` produces; verify against the caller's
        reshape.
    """
    encodings = [
        tokenizer.encode_plus(
            text,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='tf',
        )
        for text in texts
    ]

    ids = np.array([enc['input_ids'] for enc in encodings])
    masks = np.array([enc['attention_mask'] for enc in encodings])
    return ids, masks

In [None]:
def rmse_loss(y_true, y_pred):
    """Root-mean-squared error between *y_true* and *y_pred*.

    Both inputs are cast to float32, the squared difference is averaged over
    all elements, and the square root of that mean is returned.
    """
    target, estimate = (
        tf.cast(tensor, dtype=tf.float32) for tensor in (y_true, y_pred)
    )
    squared_error = (target - estimate) ** 2
    return tf.math.sqrt(tf.math.reduce_mean(squared_error))

In [None]:
def dnn_model(transformer_model, max_len=512):
    """Build the regression model: transformer encoder plus a dense head.

    The first output of ``transformer_model`` is sliced at sequence position
    0, batch-normalised, and passed through three
    Dense -> BatchNorm -> ReLU -> Dropout(0.5) stages of 512, 256 and 128
    units (L2-regularised kernels), followed by a single linear output unit.

    Args:
        transformer_model: Callable Keras-compatible encoder accepting
            ``input_ids`` and an ``attention_mask`` keyword argument.
        max_len: Token sequence length of both inputs. Must match the length
            the inputs were padded to when they were tokenised. Defaults to
            512, the value that was previously hard-coded.

    Returns:
        An uncompiled ``Model`` named ``CommonLit_DNN_Model`` with inputs
        ``[input_ids, attention_mask]`` and one scalar output per sample.
    """
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    embed = transformer_model(input_ids, attention_mask=attention_mask)[0]

    # Keep only the vector at sequence position 0 of the encoder output.
    x = embed[:, 0, :]
    x = BatchNormalization()(x)

    # Layers are created in the same order as the original unrolled code.
    for units in (512, 256, 128):
        x = Dense(units=units, kernel_initializer='he_uniform',
                  kernel_regularizer=l2(0.0001))(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)
        x = Dropout(rate=0.5)(x)

    x = Dense(units=1, kernel_initializer='he_uniform')(x)

    model = Model(inputs=[input_ids, attention_mask], outputs=x,
                  name='CommonLit_DNN_Model')
    return model

In [None]:
# Cross-validation layout: folds per repeat, number of repeats, Keras verbosity.
FOLD, NUM_SEED, VERBOSE = 5, 3, 1

# Seed NumPy's global generator, then draw one split seed per repeat.
np.random.seed(3)
seeds = np.random.randint(low=0, high=100, size=NUM_SEED)

# Accumulators filled by the training loop.
counter = 0                # number of (seed, fold) models trained so far
oof_score = 0              # running sum of per-fold validation RMSE
y_pred_final_dnn = 0       # running sum of test-set predictions
y_pred_meta_dnn = np.zeros(shape=(Ytrain.shape[0], 1))  # summed out-of-fold predictions

In [None]:
# Repeated stratified K-fold training of the dense head on top of a frozen
# transformer encoder. For every (seed, fold) pair a fresh head is built,
# trained with early stopping / LR reduction / best-weights checkpointing,
# and then used to predict the held-out fold and the test set.
with strategy.scope():

    transformer_model = TFAutoModel.from_pretrained("../input/tfbert-large-uncased")
    tokenizer = AutoTokenizer.from_pretrained("../input/tfbert-large-uncased")

    # Freeze the encoder by reference rather than by its position in
    # `model.layers`, which depends on how Keras orders the graph.
    transformer_model.trainable = False

    Xtrain_id, Xtrain_mask = sent_encode(train_df['excerpt'].values, tokenizer)
    Xtest_id, Xtest_mask = sent_encode(test_df['excerpt'].values, tokenizer)

    # sent_encode returns one (1, seq_len) entry per text; drop the middle axis.
    Xtrain_id = Xtrain_id.reshape((Xtrain_id.shape[0], Xtrain_id.shape[2]))
    Xtrain_mask = Xtrain_mask.reshape((Xtrain_mask.shape[0], Xtrain_mask.shape[2]))

    Xtest_id = Xtest_id.reshape((Xtest_id.shape[0], Xtest_id.shape[2]))
    Xtest_mask = Xtest_mask.reshape((Xtest_mask.shape[0], Xtest_mask.shape[2]))

    print(f"Train Data: \n   Input-ids: {Xtrain_id.shape} \n   Attention Mask: {Xtrain_mask.shape}\n")
    print(f"Test Data: \n   Input-ids: {Xtest_id.shape} \n   Attention Mask: {Xtest_mask.shape}\n\n")

    # Weights-only checkpoint of the best epoch; overwritten by every fold.
    checkpoint_path = './CommonLit_CNN_Model.h5'

    for sidx, seed in enumerate(seeds):
        seed_score = 0

        kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

        # Stratify on the binned target so every fold sees the full range.
        for idx, (train, val) in enumerate(kfold.split(Xtrain_id, Ytrain_strat)):
            counter += 1

            train_x_id, train_x_mask, train_y = Xtrain_id[train], Xtrain_mask[train], Ytrain[train]
            val_x_id, val_x_mask, val_y = Xtrain_id[val], Xtrain_mask[val], Ytrain[val]

            model = dnn_model(transformer_model)

            # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
            model.compile(loss=rmse_loss,
                          optimizer=Lookahead(AdamW(learning_rate=1e-2,
                                                    weight_decay=1e-5,
                                                    clipvalue=700),
                                              sync_period=10))

            early = EarlyStopping(monitor="val_loss", mode="min",
                                  restore_best_weights=True,
                                  patience=10, verbose=VERBOSE)

            reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.1,
                                          min_lr=1e-6, patience=5,
                                          verbose=VERBOSE, mode='min')

            chk_point = ModelCheckpoint(checkpoint_path,
                                        monitor='val_loss', verbose=VERBOSE,
                                        save_best_only=True, mode='min',
                                        save_weights_only=True)

            history = model.fit(
                [train_x_id, train_x_mask], train_y,
                batch_size=mini_batch_size,
                epochs=150,
                verbose=VERBOSE,
                callbacks=[reduce_lr, early, chk_point],
                validation_data=([val_x_id, val_x_mask], val_y)
            )

            # Predict with the best-epoch weights. The checkpoint is
            # weights-only, so it is loaded into the existing model. This
            # also covers a run that uses all epochs without early stopping
            # firing, where `restore_best_weights` may not be applied.
            model.load_weights(checkpoint_path)

            y_pred = model.predict([val_x_id, val_x_mask])
            y_pred_meta_dnn[val] += y_pred
            y_pred_final_dnn += model.predict([Xtest_id, Xtest_mask])

            score = np.sqrt(mean_squared_error(val_y, y_pred))
            oof_score += score
            seed_score += score
            print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

        print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))

In [None]:
# Turn the running sums into averages.
# Each training row is held out once per seed, so divide by the seed count.
y_pred_meta_dnn = y_pred_meta_dnn / NUM_SEED
# Test predictions and fold scores were summed once per trained model.
y_pred_final_dnn = y_pred_final_dnn / counter
oof_score = oof_score / counter
print(f"Aggregate OOF Score: {oof_score}")