## Import libraries

In [None]:
import shutil

# Copy the read-only InferSent dataset into the writable working directory.
shutil.copytree("/kaggle/input/infersent/", "/kaggle/working/infersent")
# IPython shell escape: move the copied files up into /kaggle/working/ itself.
# NOTE(review): presumably this is what makes `from models import InferSent`
# resolvable later in the notebook - confirm against the dataset's layout.
! mv /kaggle/working/infersent/* /kaggle/working/

In [None]:
import warnings
warnings.filterwarnings("ignore")

import gc
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import QuantileTransformer

import nltk
import string
from textblob import TextBlob
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import torch
import lightgbm as lgb
from models import InferSent

## Load source datasets

In [None]:
# Read the training split, discard the columns that are not used as model
# inputs, and key every row by its passage id.
train_df = (
    pd.read_csv("../input/commonlitreadabilityprize/train.csv")
    .drop(columns=['url_legal', 'license', 'standard_error'])
    .set_index("id")
)
print(f"train_df: {train_df.shape}\n")
train_df.head()

In [None]:
# Read the test split, discard the columns that are not used as model
# inputs, and key every row by its passage id.
test_df = (
    pd.read_csv("../input/commonlitreadabilityprize/test.csv")
    .drop(columns=['url_legal', 'license'])
    .set_index("id")
)
print(f"test_df: {test_df.shape}\n")
test_df.head()

## Extract target label

In [None]:
# Detach the regression target from the feature frame (pop removes the column
# and hands back its values in one step).
Ytrain = train_df.pop('target').values
# Five equal-frequency bins of the target, used only to stratify the CV folds.
Ytrain_strat = pd.qcut(Ytrain, q=5, labels=range(0, 5))
print("Ytrain: {}".format(Ytrain.shape))

## Feature Engineering

In [None]:
# Stack the train rows on top of the test rows so that every feature below is
# computed in one pass; the original "id" index of each frame is kept.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with the same sort/ignore_index arguments is the direct equivalent.
combined_df = pd.concat([train_df, test_df], sort=False, ignore_index=False)
combined_df.head()

In [None]:
# Hand-crafted surface statistics of every excerpt.

# Build the stop-word set once. The previous version rebuilt it inside the
# lambda's list-comprehension condition, i.e. once per word of every excerpt.
english_stopwords = set(stopwords.words('english'))

combined_df["excerpt_num_words"] = combined_df["excerpt"].apply(lambda x: len(str(x).split()))
combined_df["excerpt_num_unique_words"] = combined_df["excerpt"].apply(lambda x: len(set(str(x).split())))
combined_df["excerpt_num_chars"] = combined_df["excerpt"].apply(lambda x: len(str(x)))
combined_df["excerpt_num_stopwords"] = combined_df["excerpt"].apply(lambda x: len([w for w in str(x).lower().split() if w in english_stopwords]))
combined_df["excerpt_num_punctuations"] = combined_df['excerpt'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
combined_df["excerpt_num_words_upper"] = combined_df["excerpt"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
combined_df["excerpt_num_words_title"] = combined_df["excerpt"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
combined_df["excerpt_mean_word_len"] = combined_df["excerpt"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

# Run the TextBlob sentiment analysis once per excerpt and read both scores
# from the same result instead of analysing every excerpt twice.
excerpt_sentiments = [TextBlob(x).sentiment for x in combined_df['excerpt']]
combined_df['excerpt_polarity'] = [sentiment[0] for sentiment in excerpt_sentiments]
combined_df['excerpt_subjectivity'] = [sentiment[1] for sentiment in excerpt_sentiments]
combined_df.head()

In [None]:
# TF-IDF weights for at most 500 word n-grams (1 to 5 words long).
# use_idf / sublinear_tf are boolean options, so pass real booleans rather
# than the integer 1.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 5),
                        analyzer='word', max_df=0.95, min_df=3,
                        use_idf=True, sublinear_tf=True,
                        max_features=500, strip_accents='ascii')
features = tfidf.fit_transform(combined_df.excerpt).toarray()

# get_feature_names() is no longer available in recent scikit-learn releases;
# use its replacement when present and fall back for older installations.
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_terms = tfidf.get_feature_names_out()
else:
    tfidf_terms = tfidf.get_feature_names()

# Prefix the columns: a bare n-gram used as a column name can collide with a
# column that already exists in combined_df, in which case pd.merge silently
# renames both sides with "_x"/"_y" suffixes.
features_df = pd.DataFrame(features,
                           columns=[f"tfidf_{term}" for term in tfidf_terms],
                           index=combined_df.index)
combined_df = pd.merge(combined_df,
                       features_df,
                       left_index=True,
                       right_index=True)
combined_df.shape

In [None]:
# Raw occurrence counts for at most 200 word n-grams (1 to 5 words long).
countvec = CountVectorizer(stop_words='english', ngram_range=(1, 5),
                           analyzer='word', max_df=0.95, min_df=3,
                           max_features=200, strip_accents='ascii')
features = countvec.fit_transform(combined_df.excerpt).toarray()

# get_feature_names() is no longer available in recent scikit-learn releases;
# use its replacement when present and fall back for older installations.
if hasattr(countvec, "get_feature_names_out"):
    count_terms = countvec.get_feature_names_out()
else:
    count_terms = countvec.get_feature_names()

# Prefix the columns: a bare n-gram used as a column name can collide with a
# column that already exists in combined_df, in which case pd.merge silently
# renames both sides with "_x"/"_y" suffixes.
features_df = pd.DataFrame(features,
                           columns=[f"count_{term}" for term in count_terms],
                           index=combined_df.index)
combined_df = pd.merge(combined_df,
                       features_df,
                       left_index=True,
                       right_index=True)
combined_df.shape

## Text data preprocessing

In [None]:
# Translation table mapping every ASCII punctuation mark except the straight
# apostrophe, plus the typographic apostrophe, to a single space. Built once
# instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(
    {ch: ' ' for ch in string.punctuation + '’' if ch != "'"}
)


def remove_punctuations(text):
    """Replace punctuation in *text* with spaces, keeping straight apostrophes.

    Each punctuation character becomes exactly one space, so runs of
    punctuation produce runs of spaces; whitespace is not collapsed.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with the punctuation characters replaced by spaces.
    """
    # One pass over the string instead of one .replace() call per character.
    return text.translate(_PUNCT_TO_SPACE)


# Single lemmatizer instance shared by every call to lemmatize_words.
lemmatizer = WordNetLemmatizer()

# First letter of a tag produced by nltk.pos_tag -> WordNet part of speech.
wordnet_map = dict(
    N=wordnet.NOUN,
    V=wordnet.VERB,
    J=wordnet.ADJ,
    R=wordnet.ADV,
)


def lemmatize_words(text):
    """Lemmatize each whitespace-separated word of *text*.

    Words are POS-tagged first; a tag whose first letter is not in
    ``wordnet_map`` is lemmatized as a noun. The lemmas are returned joined
    by single spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wordnet_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)

In [None]:
stop_words = set(stopwords.words('english'))


def _strip_stop_words(text):
    """Drop every whitespace-separated word of *text* found in ``stop_words``."""
    kept = [word for word in str(text).split() if word not in stop_words]
    return " ".join(kept)


# Cleaning pipeline: lower-case -> punctuation to spaces -> stop-word removal
# -> lemmatization.
combined_df["Processed_excerpt"] = (
    combined_df["excerpt"]
    .str.lower()
    .apply(remove_punctuations)
    .apply(_strip_stop_words)
    .apply(lemmatize_words)
)

In [None]:
# Show the first three excerpts next to their cleaned versions as a sanity check.
for row_pos in range(3):
    sample = combined_df.iloc[row_pos]
    raw_text = sample['excerpt']
    clean_text = sample['Processed_excerpt']
    print(f"Original Excerpt: \n{raw_text} \n\nProcessed Excerpt: \n{clean_text}\n\n")

## GloVe Embeddings

In [None]:
# Load the pre-built word -> vector lookup from a pickled dictionary.
# SECURITY NOTE: unpickling can execute arbitrary code; only load this file
# from a dataset you trust.
# pickle.load streams from the file handle, so the raw bytes of the file are
# never held in a module-level variable for the rest of the run.
with open("../input/gloveembeddings/Glove_840B_300d_Embeddings.txt", 'rb') as handle:
    processed_data = pickle.load(handle)

embeddings_index = processed_data['embeddings_index']
print('Word vectors found: {}'.format(len(embeddings_index)))

# Drop the wrapper object; only the lookup itself is needed from here on.
del processed_data
gc.collect()

In [None]:
def sent2vec(text):
    """Embed *text* as the L2-normalised sum of its word vectors.

    The text is tokenized with ``nltk.word_tokenize``; only purely alphabetic
    tokens that are present in ``embeddings_index`` contribute.

    Args:
        text: The (already pre-processed) string to embed.

    Returns:
        A 1-D numpy array. It is a 300-element zero vector when no token has
        an embedding or when the summed vector has zero length; otherwise it
        is the summed vector scaled to unit Euclidean norm.
    """
    words = [w for w in nltk.word_tokenize(text) if w.isalpha()]

    M = []
    for w in words:
        # Only a missing word is expected here; any other error should surface
        # instead of being silently swallowed.
        try:
            M.append(embeddings_index[w])
        except KeyError:
            continue

    if not M:
        return np.zeros(300)

    v = np.array(M).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Avoid a division by zero that would fill the vector with NaN.
        return np.zeros(300)

    return v / norm

In [None]:
# Embed every processed excerpt, showing a progress bar while doing so.
glove_vec = []
for excerpt_text in tqdm(combined_df["Processed_excerpt"].values):
    glove_vec.append(sent2vec(excerpt_text))

# One column per embedding dimension, rows aligned with combined_df.
col_list = [f"glove_{dim}" for dim in range(300)]
glove_vec_df = pd.DataFrame(
    np.array(glove_vec),
    columns=col_list,
    index=combined_df.index,
)
print(f"glove_vec_df: {glove_vec_df.shape}\n")
glove_vec_df.head()

In [None]:
# Attach the GloVe columns to the feature frame, matching rows on "id".
combined_df = combined_df.merge(
    glove_vec_df,
    how="inner",
    on="id",
    sort=False,
)
print(f"combined_df: {combined_df.shape}\n")
combined_df.head()

## InferSent Embeddings

In [None]:
# Encoder configuration passed to the InferSent constructor.
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=2,
)

# Build the encoder, restore its saved weights, move it to the GPU and point
# it at the word-vector file it should read from.
model = InferSent(params_model)
model.load_state_dict(
    torch.load("../input/infersent2/encoder/infersent2.pkl")
)
model = model.cuda()
model.set_w2v_path("../input/glove840b300dtxt/glove.840B.300d.txt")

In [None]:
# Build the encoder vocabulary from every processed excerpt.
model.build_vocab(combined_df['Processed_excerpt'].tolist(), tokenize=True)

In [None]:
# Encode every processed excerpt with the InferSent model.
excerpt_sentences = combined_df['Processed_excerpt'].tolist()
infersent_vec = model.encode(
    excerpt_sentences,
    bsize=128,
    tokenize=True,
    verbose=True,
)
infersent_vec.shape

In [None]:
# One column per embedding dimension, rows aligned with combined_df.
n_infersent_dims = infersent_vec.shape[1]
col_list = [f"infersent_{dim}" for dim in range(n_infersent_dims)]
infersent_df = pd.DataFrame(
    np.array(infersent_vec),
    columns=col_list,
    index=combined_df.index,
)
print(f"infersent_df: {infersent_df.shape}\n")
infersent_df.head()

In [None]:
# Attach the InferSent columns (matching rows on "id"), then remove the two
# text columns so that only the remaining feature columns are left.
combined_df = combined_df.merge(
    infersent_df,
    how="inner",
    on="id",
    sort=False,
)
combined_df = combined_df.drop(columns=['excerpt', 'Processed_excerpt'])
print(f"combined_df: {combined_df.shape}\n")
combined_df.head()

In [None]:
# The first len(Ytrain) rows of the stacked frame are the training rows; the
# rest are the test rows. Copies are taken so the two frames can be modified
# independently of combined_df.
n_train_rows = Ytrain.shape[0]
Xtrain = combined_df.iloc[:n_train_rows].copy()
Xtest = combined_df.iloc[n_train_rows:].copy()
print(f"Xtrain: {Xtrain.shape} \nXtest: {Xtest.shape}")

## Quantile Transformation

In [None]:
# Map every feature column to a normal distribution, one column at a time.
# The transformer is fitted on the training rows only and then applied to
# both the training and the test rows.
for col in tqdm(Xtrain.columns):
    transformer = QuantileTransformer(
        output_distribution="normal",
        n_quantiles=1000,
        random_state=10,
    )

    # scikit-learn expects 2-D input, so view each column as (n_rows, 1).
    train_column = Xtrain[col].values.reshape(-1, 1)
    test_column = Xtest[col].values.reshape(-1, 1)
    transformer.fit(train_column)

    # Flatten the transformed (n_rows, 1) arrays back to 1-D before storing.
    Xtrain[col] = transformer.transform(train_column).ravel()
    Xtest[col] = transformer.transform(test_column).ravel()

print("Xtrain: {} \nYtrain: {} \nXtest: {}".format(Xtrain.shape, Ytrain.shape, Xtest.shape))

## Build and validate the model

In [None]:
# Repeated stratified K-fold training of a LightGBM regressor.
# For every seed the training rows are split into FOLD folds (stratified on
# the binned target); one model is trained per fold with early stopping.
FOLD = 5
NUM_SEED = 2
COUNTER = 0  # total number of models trained (seeds x folds)

np.random.seed(2021)
seeds = np.random.randint(0, 2021, size=NUM_SEED)

oof_score_lgb = 0        # running sum of per-fold validation RMSE
y_pred_final_lgb = 0     # running sum of test-set predictions
y_pred_meta_lgb = np.zeros((Ytrain.shape[0], 1))  # running sum of out-of-fold predictions

# The hyper-parameters are the same for every seed and fold, so build them
# once instead of on every iteration.
params = {
    "objective": 'regression',
    "metric": 'rmse',
    "boosting": 'gbdt',
    "device": 'gpu',
    "learning_rate": 0.0204,
    "lambda_l2": 0.225,
    "num_leaves": 52,
    "max_depth": 6,
    "feature_fraction": 0.75,
    "bagging_fraction": 0.65,
    "bagging_freq": 10,
    "min_data_in_leaf": 15,
    "verbosity": -1,
}
num_rounds = 5000


for sidx, seed in enumerate(seeds):
    seed_score_lgb = 0

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain_strat)):
        COUNTER += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain[val]

        #====================================================================
        #                                LightGBM
        #====================================================================

        lgtrain = lgb.Dataset(train_x, label=train_y.ravel())
        lgvalidation = lgb.Dataset(val_x, label=val_y.ravel())

        lgb_model = lgb.train(params, lgtrain, num_rounds,
                              valid_sets=[lgtrain, lgvalidation],
                              early_stopping_rounds=100, verbose_eval=200)

        y_pred = lgb_model.predict(val_x, num_iteration=lgb_model.best_iteration)
        # Accumulate rather than overwrite: every row lands in exactly one
        # validation fold per seed, and the total is divided by NUM_SEED after
        # the loop. Plain assignment kept only the last seed's predictions,
        # which the later division then shrank by a factor of NUM_SEED.
        y_pred_meta_lgb[val] += np.array([y_pred]).T
        y_pred_final_lgb += lgb_model.predict(Xtest, num_iteration=lgb_model.best_iteration)

        score = np.sqrt(mean_squared_error(val_y, y_pred))
        oof_score_lgb += score
        seed_score_lgb += score
        print(f"\nLightGBM | Seed-{seed} | Fold-{idx+1} | OOF Score: {score}\n")

    print(f"\nLightGBM | Seed: {seed} | Aggregate OOF Score: {(seed_score_lgb / FOLD)}\n")


# Turn the running sums into averages: out-of-fold predictions per seed,
# test predictions and scores per trained model.
y_pred_meta_lgb = y_pred_meta_lgb / float(NUM_SEED)
y_pred_final_lgb = y_pred_final_lgb / float(COUNTER)
oof_score_lgb /= float(COUNTER)
print(f"LightGBM | Aggregate OOF Score: {oof_score_lgb}")

## Create submission file

In [None]:
# Fill the sample submission's target column with the averaged test
# predictions and write the result to the working directory.
submit_df = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
submit_df = submit_df.assign(target=y_pred_final_lgb)
submit_df.to_csv("./submission.csv", index=False)
submit_df.head()