## Import libraries

In [None]:
import gc
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import re
import nltk
import string
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow_addons.optimizers import AdamW
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import Activation, Input
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense, Dropout, Add
from tensorflow.keras.layers import GlobalAveragePooling1D

from transformers import BertTokenizer, TFBertModel, BertConfig

## Load source datasets

In [None]:
# Read the training split, discard the columns that are not used downstream
# and key the frame by excerpt id.
train_df = (
    pd.read_csv("../input/commonlitreadabilityprize/train.csv")
    .drop(columns=['url_legal', 'license', 'standard_error'])
    .set_index("id")
)
print(f"train_df: {train_df.shape}\n")
train_df.head()

In [None]:
# Read the test split, discard the licensing columns and key the frame by
# excerpt id so it lines up with the training frame.
test_df = (
    pd.read_csv("../input/commonlitreadabilityprize/test.csv")
    .drop(columns=['url_legal', 'license'])
    .set_index("id")
)
print(f"test_df: {test_df.shape}\n")
test_df.head()

## Extract target label

In [None]:
# Pull the regression target out of the feature frame (pop removes the column
# in place) and keep it as a NumPy array.
Ytrain = train_df.pop('target').values

# Quintile bucket (0-4) of every target value; used only to stratify the
# cross-validation folds of this regression problem.
Ytrain_strat = pd.qcut(Ytrain, q=5, labels=range(0,5))
print("Ytrain: {}".format(Ytrain.shape))

## Data Cleansing

In [None]:
def decontraction(phrase):
    """Expand common English contractions in ``phrase``.

    The rules are applied in the order listed: the two irregular forms
    ("won't", "can't") first, then the generic suffix rules. Only the
    straight apostrophe (') is matched.

    Args:
        phrase: Text to expand.

    Returns:
        The text with every matching contraction replaced.
    """
    # (regex, replacement) pairs; order matters because the specific forms
    # must be consumed before the generic "n't" / "'t" rules can see them.
    rules = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    expanded = phrase
    for pattern, replacement in rules:
        expanded = re.sub(pattern, replacement, expanded)
    return expanded


def remove_punctuations(text):
    """Delete punctuation characters from ``text``.

    Every character of ``string.punctuation`` plus the typographic
    apostrophe (’) is removed, with one exception: the double quote (")
    is left in place.

    Args:
        text: Input string.

    Returns:
        ``text`` with the unwanted characters deleted (nothing is inserted
        in their place).
    """
    unwanted = set(string.punctuation) | set('’-,.?!')
    unwanted.discard('"')  # double quotes are deliberately preserved

    # Mapping a code point to None makes str.translate drop that character.
    deletion_table = {ord(char): None for char in unwanted}
    return text.translate(deletion_table)


def lemmatize_words(text):
    """Lemmatize every whitespace-separated word of ``text``.

    Each word is POS-tagged with ``nltk.pos_tag``; the first letter of the
    tag selects the WordNet part of speech handed to the lemmatizer, and
    any tag whose first letter is not N/V/J/R is treated as a noun.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The lemmas joined by single spaces.
    """
    # First letter of the POS tag -> WordNet part-of-speech constant.
    tag_to_wordnet = {
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "J": wordnet.ADJ,
        "R": wordnet.ADV,
    }
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wordnet_pos = tag_to_wordnet.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)

In [None]:
# Stack train and test so the text cleaning below runs once over both.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and keeps the "id" index as-is.
combined_df = pd.concat([train_df, test_df], sort=False, ignore_index=False)

# The separate frames are rebuilt from combined_df after preprocessing, so
# release them now to keep peak memory down.
del train_df
del test_df
gc.collect()

combined_df.head()

In [None]:
# Lower-case the raw excerpt, drop backslashes and turn underscores into spaces
combined_df['Processed_excerpt'] = combined_df['excerpt'].apply(lambda x: str(x).lower().replace('\\', '').replace('_', ' '))

# Collapse every run of whitespace (spaces, tabs, newlines) into a single space.
# The pattern is a raw string so that "\s" reaches the regex engine verbatim
# instead of being an invalid string escape sequence.
combined_df['Processed_excerpt'] = combined_df['Processed_excerpt'].apply(lambda x: re.sub(r'\s+', ' ', x))

# Replace contractions ("don't" with "do not" and "we've" with "we have")
combined_df['Processed_excerpt'] = combined_df['Processed_excerpt'].apply(decontraction)

# Remove punctuation (double quotes are kept by remove_punctuations)
combined_df['Processed_excerpt'] = combined_df['Processed_excerpt'].apply(remove_punctuations)

# Lemmatize words
combined_df['Processed_excerpt'] = combined_df['Processed_excerpt'].apply(lemmatize_words)

# Number of space-separated words in the processed excerpt
combined_df['excerpt_wordlen'] = combined_df['Processed_excerpt'].apply(lambda x: len(str(x).split(' ')))

combined_df.head()

In [None]:
# Show the first three excerpts before and after cleaning for a visual check.
for row_pos in range(3):
    sample = combined_df.iloc[row_pos]
    print(f"\nOriginal Excerpt: \n{sample['excerpt']} \n\nProcessed Excerpt: \n{sample['Processed_excerpt']}\n")
    print("=" * 150)

In [None]:
# The training rows were stacked first, so the first len(Ytrain) rows are the
# training set and everything after them is the test set.
n_train_rows = len(Ytrain)
train_df = combined_df.iloc[:n_train_rows].copy()
test_df = combined_df.iloc[n_train_rows:].copy()
print(f"train_df: {train_df.shape} \ntest_df: {test_df.shape}")

# The combined frame is no longer needed once both copies exist.
del combined_df
gc.collect()

## Define TPU config

In [None]:
# Choose the tf.distribute strategy under which the model is built and trained.
# The try-branch resolves, connects to and initialises a TPU and wraps it in a
# TPUStrategy. If any of those steps raises ValueError (presumably because no
# TPU is attached -- TODO confirm), the current default strategy from
# tf.distribute.get_strategy() is used instead.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print("Running on TPU:", tpu.master())
    
except ValueError:
    strategy = tf.distribute.get_strategy()
    print(f"Running on {strategy.num_replicas_in_sync} replicas")

In [None]:
# Batch size passed to model.fit. It is a fixed constant; the alternative of
# scaling it by the number of replicas is left disabled in the trailing comment.
mini_batch_size = 16     #strategy.num_replicas_in_sync * 16
print(f'batch size: {mini_batch_size}')

## Build and validate the model

In [None]:
# Largest space-separated word count among the processed training excerpts.
# The model and tokenizer cells use max_len + 16 as the fixed sequence length.
# NOTE(review): this counts words, not tokenizer sub-word tokens, so an
# excerpt may still exceed the sequence length and be cut by truncation=True.
max_len = train_df['excerpt_wordlen'].max()
max_len

In [None]:
def sent_encode(texts, tokenizer, seq_len):
    """Tokenize every text to a fixed length and stack the encodings.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``encode_plus`` (e.g. a HuggingFace
            tokenizer); called once per text.
        seq_len: Length every encoding is padded or truncated to.

    Returns:
        A 3-tuple of NumPy arrays ``(input_ids, attention_mask,
        token_type_ids)``. Each array stacks the per-text encodings exactly
        as the tokenizer returned them, so with ``return_tensors='tf'`` each
        per-text entry keeps its leading batch axis of size 1.
    """
    encodings = [
        tokenizer.encode_plus(
            text,
            max_length=seq_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='tf',
        )
        for text in texts
    ]

    # One stacked array per field, in the order callers unpack them.
    field_names = ('input_ids', 'attention_mask', 'token_type_ids')
    return tuple(
        np.array([encoding[field] for encoding in encodings])
        for field in field_names
    )

In [None]:
def rmse_loss(y_true, y_pred):
    """Root-mean-squared-error loss.

    Both arguments are cast to float32; the squared differences are
    averaged over all elements and the square root of that mean is returned.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values.

    Returns:
        A scalar float32 tensor holding the RMSE.
    """
    residual = tf.cast(y_true, dtype=tf.float32) - tf.cast(y_pred, dtype=tf.float32)
    mean_squared_error_value = tf.math.reduce_mean(residual ** 2)
    return tf.math.sqrt(mean_squared_error_value)

In [None]:
def dnn_model(transformer_model, seq_len):
    """Build the regression network on top of a transformer encoder.

    The first output of ``transformer_model`` is average-pooled over the
    sequence axis, passed through a small regularised dense block
    (Dense(16) -> BatchNorm -> ReLU -> Dropout) and mapped to one linear
    output unit.

    Args:
        transformer_model: Callable encoder accepting ``input_ids`` plus the
            ``token_type_ids`` and ``attention_mask`` keyword arguments.
        seq_len: Fixed length of each of the three integer input sequences.

    Returns:
        An uncompiled Keras ``Model`` whose inputs are, in order,
        ``input_ids``, ``attention_mask`` and ``token_type_ids``.
    """
    # Three parallel int32 inputs, created in the order the model expects them.
    model_inputs = [
        Input(shape=(seq_len,), dtype=tf.int32, name=input_name)
        for input_name in ("input_ids", "attention_mask", "token_type_ids")
    ]
    ids_in, mask_in, type_in = model_inputs

    # Index 0 selects the first element of the encoder's output.
    sequence_output = transformer_model(
        ids_in, token_type_ids=type_in, attention_mask=mask_in
    )[0]
    pooled = GlobalAveragePooling1D()(sequence_output)

    hidden = Dense(
        units=16,
        kernel_initializer='he_uniform',
        kernel_regularizer=l2(0.0001),
    )(pooled)
    hidden = BatchNormalization()(hidden)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(rate=0.15)(hidden)

    # Single linear unit: the predicted target score.
    prediction = Dense(units=1, kernel_initializer='he_uniform')(hidden)

    return Model(
        inputs=model_inputs,
        outputs=prediction,
        name='CommonLit_Bert_Base_Uncased_Model',
    )

In [None]:
# Build one instance of the network only to print its layer summary.
# Both `transformer_model` and `model` are re-bound by the training cell below,
# so nothing trained depends on the objects created here.
transformer_model = TFBertModel.from_pretrained("bert-base-uncased")
model = dnn_model(transformer_model, max_len+16)
model.summary()

In [None]:
# Cross-validation settings.
FOLD = 5        # number of StratifiedKFold splits per seed
NUM_SEED = 1    # number of seeds, i.e. repetitions of the full K-fold run
VERBOSE = 1     # verbosity passed to model.fit and the Keras callbacks

# Seed NumPy so the list of fold-shuffling seeds is the same on every run;
# each seed is an integer drawn from [0, 100).
np.random.seed(3)
seeds = np.random.randint(0, 100, size=NUM_SEED)

counter = 0     # number of folds trained so far (across all seeds)
oof_score = 0   # running sum of per-fold validation RMSE

In [None]:
# Seeded, stratified K-fold fine-tuning of the BERT regression model.
# For every seed and fold: build a fresh model, train it with early stopping,
# LR reduction and best-weights checkpointing, reload the best checkpoint and
# score the held-out fold. `counter` and `oof_score` accumulate across folds.
with strategy.scope():

    # Encoder configuration; per-layer hidden states are switched off.
    config = BertConfig()
    config.output_hidden_states = False

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    Xtrain_id, Xtrain_mask, Xtrain_token = sent_encode(train_df['Processed_excerpt'].values, tokenizer, seq_len=max_len+16)

    # sent_encode keeps a size-1 batch axis per text: (n, 1, seq_len) -> (n, seq_len)
    Xtrain_id = Xtrain_id.reshape((Xtrain_id.shape[0], Xtrain_id.shape[2]))
    Xtrain_mask = Xtrain_mask.reshape((Xtrain_mask.shape[0], Xtrain_mask.shape[2]))
    Xtrain_token = Xtrain_token.reshape((Xtrain_token.shape[0], Xtrain_token.shape[2]))

    print(f"Train Data: \n   Input-ids: {Xtrain_id.shape} \n   Attention Mask: {Xtrain_mask.shape}\n   Token-type-ids: {Xtrain_token.shape}\n")

    for sidx, seed in enumerate(seeds):
        seed_score = 0

        # Folds are stratified on the quintile-binned target.
        kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

        for idx, (train, val) in enumerate(kfold.split(Xtrain_id, Ytrain_strat)):
            counter += 1
            fold_no = idx + 1  # 1-based, matches the checkpoint file name

            train_x_id, train_x_mask, train_x_token, train_y = Xtrain_id[train], Xtrain_mask[train], Xtrain_token[train], Ytrain[train]
            val_x_id, val_x_mask, val_x_token, val_y = Xtrain_id[val], Xtrain_mask[val], Xtrain_token[val], Ytrain[val]

            # Load a fresh pretrained encoder for every fold. Re-using one
            # encoder instance would carry weights fine-tuned in earlier folds
            # (which saw this fold's validation rows as training data) into
            # this fold and make the out-of-fold score optimistic.
            transformer_model = TFBertModel.from_pretrained("bert-base-uncased", config=config)
            model = dnn_model(transformer_model, max_len+16)

            # The previous fold's model was just un-referenced; reclaim it.
            gc.collect()

            model.compile(loss=rmse_loss,
                          metrics=[RootMeanSquaredError(name='rmse')],
                          optimizer=Adam(learning_rate=8e-5))

            weights_path = f'./CommonLit_Bert_Base_Uncased_Model_{seed}S_{(idx+1)}F.h5'

            early = EarlyStopping(monitor="val_rmse", mode="min",
                                  restore_best_weights=True,
                                  patience=5, verbose=VERBOSE)

            reduce_lr = ReduceLROnPlateau(monitor="val_rmse", factor=0.25,
                                          min_lr=1e-8, patience=2,
                                          verbose=VERBOSE, mode='min')

            chk_point = ModelCheckpoint(weights_path,
                                        monitor='val_rmse', verbose=VERBOSE,
                                        save_best_only=True, mode='min',
                                        save_weights_only=True)

            history = model.fit(
                [train_x_id, train_x_mask, train_x_token], train_y,
                batch_size=mini_batch_size,
                epochs=15,
                verbose=VERBOSE,
                callbacks=[reduce_lr, early, chk_point],
                validation_data=([val_x_id, val_x_mask, val_x_token], val_y)
            )

            # Score the fold with the best checkpoint, not the last epoch.
            model.load_weights(weights_path)

            y_pred = model.predict([val_x_id, val_x_mask, val_x_token])
            score = np.sqrt(mean_squared_error(val_y, y_pred))
            oof_score += score
            seed_score += score
            print("\nSeed-{} | Fold-{} | OOF Score: {}\n".format(seed, fold_no, score))

        print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))

In [None]:
# Turn the running sum of per-fold RMSE values into their mean over every
# fold trained (all seeds), then report it.
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))