## Import libraries

In [None]:
import gc
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import re
import nltk
import string
from textblob import TextBlob
from collections import Counter
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adamax
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import LSTM, GRU, Conv1D
from tensorflow.keras.layers import Activation, Input
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Embedding, Dense, Dropout
from tensorflow.keras.layers import Bidirectional, Concatenate

from transformers import BertTokenizer, TFBertModel, BertConfig

import sys
sys.path.insert(1, '../input/tf-cyclic-lr-schedule')
from clr_callback import CyclicLR

## Load source datasets

In [None]:
# Read the training split, discard the columns that are not used downstream
# and key the rows by excerpt id.
train_df = (
    pd.read_csv("../input/commonlitreadabilityprize/train.csv")
    .drop(columns=['url_legal', 'license', 'standard_error'])
    .set_index("id")
)
print(f"train_df: {train_df.shape}\n")
train_df.head()

In [None]:
# Read the test split with the same treatment as the training split
# (the test file has no 'standard_error' column to drop).
test_df = (
    pd.read_csv("../input/commonlitreadabilityprize/test.csv")
    .drop(columns=['url_legal', 'license'])
    .set_index("id")
)
print(f"test_df: {test_df.shape}\n")
test_df.head()

## Extract target label

In [None]:
# Ytrain holds the raw regression target; Ytrain_strat is the same target cut
# into five quantile bins (labels 0-4) and is used to stratify the CV folds.
Ytrain = train_df['target'].values
Ytrain_strat = pd.qcut(Ytrain, q=5, labels=range(0, 5))

# The target must not remain among the feature columns.
train_df.drop(columns=['target'], inplace=True)
print(f"Ytrain: {Ytrain.shape}")

## Feature Engineering

In [None]:
def decontraction(phrase):
    """Expand common English contractions in *phrase* and return the result.

    The substitutions are applied one after another with ``re.sub``; they are
    plain substring patterns (no word boundaries), so every occurrence in the
    string is rewritten.
    """
    # Order matters: the irregular forms ("won't", "can't") have to be
    # rewritten before the generic "n't" rule, and "n't" before the bare "'t".
    substitutions = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in substitutions:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase


# Deletion table for str.translate: every ASCII punctuation character except
# the double quote, plus the typographic apostrophe (U+2019).
_PUNCTUATION_DELETE_TABLE = str.maketrans(
    '', '', string.punctuation.replace('"', '') + '’'
)


def remove_punctuations(text):
    """Return *text* with punctuation characters removed.

    All characters in ``string.punctuation`` and the typographic apostrophe
    are deleted; the double quote character is deliberately left in place.
    The deletion happens in a single ``str.translate`` pass instead of one
    ``str.replace`` call per punctuation character.
    """
    return text.translate(_PUNCTUATION_DELETE_TABLE)


def lemmatize_words(text):
    """Lemmatize every whitespace-separated token of *text*.

    Each token is tagged with ``nltk.pos_tag``; the first character of the tag
    selects the WordNet part of speech handed to the lemmatizer, and tokens
    whose tag starts with any other character are lemmatized as nouns.
    Returns the lemmas joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    tag_initial_to_wordnet = {
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "J": wordnet.ADJ,
        "R": wordnet.ADV,
    }

    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wordnet_pos = tag_initial_to_wordnet.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)

In [None]:
def dialog_parser(text):
    """Split *text* into quoted dialog and unquoted narrative segments.

    The text is tokenized with ``nltk.word_tokenize`` and scanned once; tokens
    containing the quote markers "``" or "''" delimit dialog segments.

    Returns a 4-tuple:
        (number of dialog segments,
         number of narrative segments,
         mean number of tokens per dialog segment,
         mean number of tokens per narrative segment)
    A mean is 0 when there is no segment of that kind.
    """
    
    tokenized = nltk.word_tokenize(text)
    
    # let's set up some lists to hold our pieces of narrative and dialog
    parsed_dialog = []
    parsed_narrative = []
    
    # and this list will be a bucket for the text we're currently exploring
    current = []

    # now let's set up values that will help us loop through the text
    length = len(tokenized)
    found_q = False
    counter = 0
    # presumably the tokens nltk.word_tokenize emits for opening/closing
    # double quotes - TODO confirm against the installed NLTK version
    quote_open, quote_close = '``', "''"

    # now we'll start our loop saying that as long as our sentence is...
    while counter < length:
        word = tokenized[counter]

        # until we find a quotation mark, we're working with narrative
        if quote_open not in word and quote_close not in word:
            current.append(word)

        # here's what we do when we find a quote token (opening or closing)
        else:
            # we append the narrative we've collected & clear out our
            # current variable (this may append an empty list when the quote
            # directly follows another dialog or starts the text)
            parsed_narrative.append(current)
            current = []
            
            # now current is ready to hold dialog and we're working on
            # a piece of dialog
            current.append(word)
            found_q = True

            # while we're in the quote, we're going to increment the counter
            # and append to current in this while loop
            while found_q and counter < length-1:
                counter += 1
                if quote_close not in tokenized[counter]:
                    current.append(tokenized[counter])
                else:
                    # if we find a closing quote, we add our dialog to the
                    # appropriate list, clear current and flip our found_q
                    # variable to False
                    current.append(tokenized[counter])
                    parsed_dialog.append(current)
                    current = []
                    found_q = False

        # increment the counter to move us through the text
        counter += 1
    
    # NOTE(review): whatever is left in `current` is only recorded when no
    # narrative segment exists yet. Narrative that follows the last quote, and
    # dialog whose closing quote is never found, are therefore not counted -
    # confirm this is intended.
    if len(parsed_narrative) == 0:
        parsed_narrative.append(current)
    
    mean_dialog_word_len = 0
    
    if len(parsed_dialog) > 0:
        # the loop variable reuses the name of the `text` parameter, which is
        # no longer needed at this point
        for text in parsed_dialog:
            join_text = " ".join(text)
            # NOTE(review): only '"' and "''" are stripped; the opening "``"
            # token stays in the string and is counted as a word below.
            join_text = join_text.replace('"','')
            join_text = join_text.replace("''","")
            mean_dialog_word_len += len(join_text.split())
        
        mean_dialog_word_len /= float(len(parsed_dialog))
    
    mean_narrative_word_len = 0
    
    if len(parsed_narrative) > 0:
        for text in parsed_narrative:
            join_text = " ".join(text)
            join_text = join_text.replace('"','')
            join_text = join_text.replace("''","")
            mean_narrative_word_len += len(join_text.split())
        
        mean_narrative_word_len /= float(len(parsed_narrative))

    return len(parsed_dialog), len(parsed_narrative), mean_dialog_word_len, mean_narrative_word_len

In [None]:
# Stack the train rows on top of the test rows (keeping the "id" index) so the
# feature engineering below runs once over both splits.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
combined_df = pd.concat([train_df, test_df], sort=False, ignore_index=False)

In [None]:
# Characters counted as punctuation: ASCII punctuation plus the typographic
# apostrophe. A frozenset gives O(1) membership tests in the per-character scan.
punct = list(string.punctuation) + ['’']
punct_lookup = frozenset(punct)

# Build the stop-word set once; rebuilding it for every word of every excerpt
# is what made this feature extremely slow.
english_stopwords = frozenset(stopwords.words('english'))

excerpts = combined_df["excerpt"]
excerpt_text = excerpts.map(str)
excerpt_words = excerpt_text.map(str.split)

# NOTE: the column order below is preserved on purpose - every non-text
# column later becomes a column of the numeric feature matrix.
combined_df["excerpt_num_words"] = excerpt_words.map(len)
combined_df["excerpt_num_unique_words"] = excerpt_words.map(lambda words: len(set(words)))
combined_df["excerpt_num_chars"] = excerpt_text.map(len)
combined_df["excerpt_num_stopwords"] = excerpt_text.map(
    lambda s: sum(w in english_stopwords for w in s.lower().split())
)
combined_df["excerpt_num_punctuations"] = excerpt_text.map(
    lambda s: sum(c in punct_lookup for c in s)
)
combined_df["excerpt_num_words_upper"] = excerpt_words.map(lambda words: sum(w.isupper() for w in words))
combined_df["excerpt_num_words_title"] = excerpt_words.map(lambda words: sum(w.istitle() for w in words))
combined_df["excerpt_mean_word_len"] = excerpt_words.map(lambda words: np.mean([len(w) for w in words]))

# Parse each excerpt once and fan the four statistics out into columns
# (previously dialog_parser ran four times per excerpt).
dialog_stats = np.array(
    [dialog_parser(x) for x in excerpts], dtype=float
).reshape(-1, 4)
combined_df["excerpt_num_dialog"] = dialog_stats[:, 0].astype(np.int64)
combined_df["excerpt_num_narrative"] = dialog_stats[:, 1].astype(np.int64)
combined_df["excerpt_dialog_mean_word_len"] = dialog_stats[:, 2]
combined_df["excerpt_narrative_mean_word_len"] = dialog_stats[:, 3]

# Likewise compute the TextBlob sentiment once per excerpt; element 0 is
# stored as polarity and element 1 as subjectivity.
sentiments = [TextBlob(x).sentiment for x in excerpts]
combined_df['excerpt_polarity'] = [s[0] for s in sentiments]
combined_df['excerpt_subjectivity'] = [s[1] for s in sentiments]
combined_df.head()

In [None]:
# Whitespace-run pattern, compiled once. The raw string avoids the invalid
# "\s" escape sequence that a plain string literal triggers.
whitespace_run = re.compile(r'\s+')

# Convert to lower case, drop backslashes and turn underscores into spaces
combined_df['Processed_excerpt'] = combined_df['excerpt'].apply(
    lambda x: str(x).lower().replace('\\', '').replace('_', ' ')
)

# Collapse every run of whitespace (including newlines) into a single space
combined_df['Processed_excerpt'] = combined_df['Processed_excerpt'].apply(
    lambda x: whitespace_run.sub(' ', x)
)

# Replace contractions ("don't" with "do not" and "we've" with "we have")
combined_df['Processed_excerpt'] = combined_df['Processed_excerpt'].apply(decontraction)

# Remove punctuations
combined_df['Processed_excerpt'] = combined_df['Processed_excerpt'].apply(remove_punctuations)

# Lemmatize words
combined_df['Processed_excerpt'] = combined_df['Processed_excerpt'].apply(lemmatize_words)

# The per-split frames are rebuilt from combined_df later; free them now.
del train_df
del test_df
gc.collect()

combined_df.head()

In [None]:
# Rare-words removal
def remove_rarewords(text):
    """Return *text* without the tokens listed in the module-level RAREWORDS.

    The input is converted with ``str`` and split on whitespace; the kept
    tokens are re-joined with single spaces.
    """
    kept_tokens = (token for token in str(text).split() if token not in RAREWORDS)
    return " ".join(kept_tokens)


# Count how often each token occurs across all processed excerpts.
cnt = Counter(
    word
    for text in combined_df['Processed_excerpt'].values
    for word in text.split()
)

n_rare_words = 50
# most_common() is ordered by descending count, so the reversed slice takes
# the n_rare_words entries at its tail, i.e. the least frequent tokens.
least_frequent = cnt.most_common()[:-n_rare_words - 1:-1]
RAREWORDS = {w for w, _ in least_frequent}

combined_df['Processed_excerpt'] = combined_df['Processed_excerpt'].apply(remove_rarewords)
combined_df.head()

In [None]:
# Removal of Emojis
def remove_emoji(string):
    """Return *string* with every character in the ranges below deleted.

    NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also
    covers many non-emoji code points, e.g. CJK ideographs - confirm that
    this is acceptable for the input data.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    )
    character_class = "[" + "".join(code_point_ranges) + "]+"
    emoji_pattern = re.compile(character_class, flags=re.UNICODE)
    return emoji_pattern.sub('', string)


# Strip the characters matched by remove_emoji from the processed text.
processed_excerpts = combined_df['Processed_excerpt']
combined_df['Processed_excerpt'] = processed_excerpts.apply(remove_emoji)
combined_df.head()

In [None]:
# Print the first three excerpts before and after preprocessing.
separator = "=" * 90
for row_position in range(3):
    sample = combined_df.iloc[row_position]
    print(f"\nOriginal Excerpt: \n{sample['excerpt']} \n\nProcessed Excerpt: \n{sample['Processed_excerpt']}\n")
    print(separator)

In [None]:
# combined_df holds the train rows first, followed by the test rows, so the
# number of targets marks the boundary between the two splits.
n_train_rows = Ytrain.shape[0]
train_df = combined_df.iloc[:n_train_rows].copy()
test_df = combined_df.iloc[n_train_rows:].copy()
print(f"train_df: {train_df.shape} \ntest_df: {test_df.shape}")

del combined_df
gc.collect()

In [None]:
# Every column except the raw and processed text is a numeric feature.
text_columns = ('excerpt', 'Processed_excerpt')
col_list = [col for col in train_df.columns if col not in text_columns]
Xtrain, Xtest = train_df[col_list].values, test_df[col_list].values
Xtrain.shape, Xtest.shape

In [None]:
# Extract excerpts from train and test datasets (train first, then test, so
# the result can later be split again at the number of training rows)
text_list = pd.concat(
    [frame['Processed_excerpt'] for frame in (train_df, test_df)]
)
print(f"Total number of excerpts: {len(text_list)}")

## Generate word tokens and attention masks

In [None]:
# Sequence length used for tokenization: longest excerpt (in whitespace
# words) plus a margin of 8.
# NOTE(review): the tokenizer works on sub-word pieces of the *processed*
# text, so this word-based budget is only an approximation; longer token
# sequences are truncated by the tokenizer call - confirm the margin suffices.
longest_excerpt_words = max(train_df['excerpt_num_words'].max(),
                            test_df['excerpt_num_words'].max())

# Clamp to 512, the position-embedding limit of BERT-base, and convert the
# numpy integer to a plain int so it can be used as a layer shape.
MAX_LEN = min(int(longest_excerpt_words) + 8, 512)

del train_df
del test_df
gc.collect()

MAX_LEN

In [None]:
def sent_encode(texts, tokenizer):
    """Tokenize every text and collect the encodings as numpy arrays.

    Each text is encoded on its own with ``tokenizer.encode_plus``, padded or
    truncated to the module-level ``MAX_LEN``.

    Returns a tuple ``(input_ids, attention_mask, token_type_ids)`` of arrays
    built with ``np.array`` from the per-text encodings.
    Presumably each array has shape (n_texts, 1, MAX_LEN), since the
    tokenizer returns one batched tensor per text - verify against the caller.
    """
    field_names = ('input_ids', 'attention_mask', 'token_type_ids')
    collected = {name: [] for name in field_names}

    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            max_length=MAX_LEN,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='tf',
        )
        for name in field_names:
            collected[name].append(encoded[name])

    return tuple(np.array(collected[name]) for name in field_names)

In [None]:
tokenizer = BertTokenizer.from_pretrained("../input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased")

# sent_encode returns 3-D arrays; keep the first and last axes so that every
# array becomes 2-D (one row per excerpt).
input_id, attention_mask, token_type_id = (
    encoded.reshape((encoded.shape[0], encoded.shape[2]))
    for encoded in sent_encode(text_list, tokenizer)
)

input_id.shape, attention_mask.shape, token_type_id.shape

In [None]:
# The encodings are ordered train-first, so the number of targets is the
# split point between train and test rows.
n_train_rows = Ytrain.shape[0]
Xtrain_id, Xtest_id = input_id[:n_train_rows], input_id[n_train_rows:]
Xtrain_mask, Xtest_mask = attention_mask[:n_train_rows], attention_mask[n_train_rows:]
Xtrain_token, Xtest_token = token_type_id[:n_train_rows], token_type_id[n_train_rows:]

print(f"Xtrain_id: {Xtrain_id.shape} \nXtrain_mask: {Xtrain_mask.shape} \nXtrain_token: {Xtrain_token.shape}\n")
print(f"Xtest_id: {Xtest_id.shape} \nXtest_mask: {Xtest_mask.shape} \nXtest_token: {Xtest_token.shape}")

## Build and validate model

In [None]:
def rmse_loss(y_true, y_pred):
    """Root-mean-squared error between *y_true* and *y_pred*.

    Both inputs are cast to float32; the squared differences are averaged
    over all elements, so the result is a scalar tensor.
    """
    residual = tf.cast(y_true, dtype=tf.float32) - tf.cast(y_pred, dtype=tf.float32)
    mean_squared_error_value = tf.math.reduce_mean(residual**2)
    return tf.math.sqrt(mean_squared_error_value)

In [None]:
def bert_model(transformer_model, n_features):
    """Build the regression network combining transformer output and statistical features.

    Args:
        transformer_model: callable transformer; it is invoked with the token
            ids, token type ids and attention mask, and element 0 of its
            output is used as the token sequence (presumably the last hidden
            state - verify against the transformer in use).
        n_features: number of statistical features per sample.

    Returns:
        An uncompiled Keras ``Model`` with a single linear output unit. Its
        inputs are, in this order: statistical features, input ids, attention
        mask, token type ids (each token input has length ``MAX_LEN``).
    """
    reg_strength = 0.0001

    def recurrent_conv_branch(rnn_cls, sequence):
        # Bidirectional RNN -> BN -> strided Conv1D -> BN -> swish -> spatial dropout
        recurrent = rnn_cls(units=192, activation='tanh',
                            return_sequences=True, dropout=0.15,
                            kernel_regularizer=l2(reg_strength),
                            kernel_initializer='he_uniform')
        out = Bidirectional(recurrent, merge_mode='concat')(sequence)
        out = BatchNormalization()(out)
        out = Conv1D(filters=128, kernel_size=3,
                     strides=2, padding='same',
                     kernel_regularizer=l2(reg_strength),
                     kernel_initializer='he_uniform')(out)
        out = BatchNormalization()(out)
        out = Activation('swish')(out)
        return SpatialDropout1D(rate=0.2)(out)

    def dense_bn_swish(tensor, units):
        # L2-regularised Dense -> BN -> swish
        out = Dense(units=units, kernel_initializer='he_uniform',
                    kernel_regularizer=l2(reg_strength))(tensor)
        out = BatchNormalization()(out)
        return Activation('swish')(out)

    input_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")
    token_type_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name="token_type_ids")
    x_input = Input(shape=(n_features,), name="statistical_features")

    embed = transformer_model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]

    # Two parallel sequence branches over the same token sequence, followed
    # by a small dense branch for the statistical features. The layers are
    # created in the same order as before so auto-generated names match.
    lstm_branch = recurrent_conv_branch(LSTM, embed)
    gru_branch = recurrent_conv_branch(GRU, embed)
    stats_branch = dense_bn_swish(x_input, units=32)

    pooled_lstm = GlobalAveragePooling1D()(lstm_branch)
    pooled_gru = GlobalAveragePooling1D()(gru_branch)

    # Regression head on top of the concatenated branches.
    head = Concatenate()([pooled_lstm, pooled_gru, stats_branch])
    head = BatchNormalization()(head)
    head = Dropout(rate=0.15)(head)

    head = dense_bn_swish(head, units=64)
    head = Dropout(rate=0.35)(head)

    head = dense_bn_swish(head, units=16)
    head = Dropout(rate=0.15)(head)

    prediction = Dense(units=1, kernel_initializer='he_uniform')(head)

    return Model(inputs=[x_input, input_ids, attention_mask, token_type_ids],
                 outputs=prediction,
                 name='CommonLit_Bert_Base_Uncased_Model')

In [None]:
# Model hyperparameters
FOLD = 5               # stratified folds per seed
NUM_SEED = 2           # number of differently seeded CV repetitions
VERBOSE = 1            # verbosity passed to Keras fit and callbacks
MINI_BATCH_SIZE = 32

# Settings for the CyclicLR callback
min_lr, max_lr = 1e-6, 1e-2
clr_method = 'triangular2'
# four times the number of full mini-batches in the training matrix
step_size = (Xtrain.shape[0] // MINI_BATCH_SIZE) * 4

In [None]:
# Location of the pretrained checkpoint inside the Kaggle input directory.
bert_checkpoint_dir = "../input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased"

# BERT configuration with dropout probabilities set to 0.2; only the final
# layer's output is requested (no per-layer hidden states).
config = BertConfig(
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
    output_hidden_states=False,
)

transformer_model = TFBertModel.from_pretrained(bert_checkpoint_dir, config=config)

In [None]:
# Build the network once to inspect its layers; the number of statistical
# features is the width of the training feature matrix.
model = bert_model(transformer_model=transformer_model,
                   n_features=Xtrain.shape[1])
model.summary()

In [None]:
# Write a diagram of the architecture (with tensor shapes and layer names)
# next to the notebook.
plot_model(
    model,
    show_layer_names=True,
    show_shapes=True,
    to_file='./CommonLit_Bert_Base_Uncased_Model.png',
)

In [None]:
# Repeated stratified K-fold training.
# For every seed the training set is split into FOLD stratified folds; one
# model is trained per fold, scored on its validation fold and used to
# predict the test set. Test predictions are averaged over all seed/fold runs.
np.random.seed(23)
seeds = np.random.randint(0, 100, size=NUM_SEED)

bert_checkpoint_dir = "../input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased"
weights_path = './CommonLit_Bert_Base_Uncased_Model.h5'

counter = 0         # number of seed/fold runs completed
oof_score = 0       # running sum of validation RMSE over all runs
y_pred_final = 0    # running sum of test predictions over all runs


for sidx, seed in enumerate(seeds):
    seed_score = 0

    # Seed TensorFlow as well, so weight initialisation and dropout depend on
    # the seed and not only the fold assignment.
    tf.random.set_seed(int(seed))

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain_strat)):
        counter += 1

        train_x_id, train_x_mask, train_x_token = Xtrain_id[train], Xtrain_mask[train], Xtrain_token[train]
        val_x_id, val_x_mask, val_x_token = Xtrain_id[val], Xtrain_mask[val], Xtrain_token[val]
        train_x2, val_x2 = Xtrain[train], Xtrain[val]
        train_y, val_y = Ytrain[train], Ytrain[val]

        # Start every fold from the pretrained checkpoint. Reusing a single
        # transformer instance across folds would carry weights fine-tuned on
        # earlier folds (whose training data contains this fold's validation
        # rows) into this fold and inflate the validation score.
        tf.keras.backend.clear_session()
        gc.collect()
        fold_transformer = TFBertModel.from_pretrained(bert_checkpoint_dir, config=config)
        model = bert_model(fold_transformer, Xtrain.shape[1])

        # Optional (disabled): freeze the first layers of the model.
        # for layer in model.layers[:4]:
        #     layer.trainable = False

        # `learning_rate` replaces the deprecated `lr` argument.
        model.compile(loss=rmse_loss,
                      metrics=[RootMeanSquaredError(name='rmse')],
                      optimizer=Adamax(learning_rate=8e-5))

        early = EarlyStopping(monitor="val_rmse", mode="min",
                              restore_best_weights=True,
                              patience=7, verbose=VERBOSE)

        reduce_lr = ReduceLROnPlateau(monitor="val_rmse", factor=0.25,
                                      min_lr=1e-6, patience=3,
                                      verbose=VERBOSE, mode='min')

        chk_point = ModelCheckpoint(weights_path,
                                    monitor='val_rmse', verbose=VERBOSE,
                                    save_best_only=True, mode='min',
                                    save_weights_only=True)

        # NOTE(review): this schedule is constructed but not passed to fit();
        # ReduceLROnPlateau is the active learning-rate policy. Either add it
        # to `callbacks` (and drop reduce_lr) or remove it.
        cyclic_lr = CyclicLR(base_lr=min_lr, max_lr=max_lr,
                             mode=clr_method, step_size=step_size)

        history = model.fit(
            [train_x2, train_x_id, train_x_mask, train_x_token], train_y,
            batch_size=MINI_BATCH_SIZE,
            epochs=50,
            verbose=VERBOSE,
            workers=5,
            callbacks=[reduce_lr, early, chk_point],
            validation_data=([val_x2, val_x_id, val_x_mask, val_x_token], val_y)
        )

        # Score and predict with the best checkpoint of this fold.
        model.load_weights(weights_path)

        y_pred = model.predict([val_x2, val_x_id, val_x_mask, val_x_token])
        y_pred_final += model.predict([Xtest, Xtest_id, Xtest_mask, Xtest_token])

        score = np.sqrt(mean_squared_error(val_y, y_pred))
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Average the accumulated test predictions and validation scores.
y_pred_final = y_pred_final / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

## Create submission file

In [None]:
# Fill the sample submission with the averaged test predictions.
# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as test.csv - confirm.
sample_submission_path = "../input/commonlitreadabilityprize/sample_submission.csv"
submit_df = pd.read_csv(sample_submission_path)
submit_df['target'] = y_pred_final
submit_df.to_csv("./submission.csv", index=False)
submit_df.head()