# Import Statements

In [1]:
import os 
!pip install ../input/d/alvaromunoz/textstat/Pyphen-0.10.0-py3-none-any.whl
!pip install ../input/d/alvaromunoz/textstat/textstat-0.7.0-py3-none-any.whl
import torch
from torch import optim, nn
from torch.utils.data import DataLoader, Subset
# from torch.utils.tensorboard import SummaryWriter

import tqdm
import numpy as np
import random
import os
import datetime
import pandas as pd
import psutil

import warnings
import string
import re
import math
from collections import Counter
import pkg_resources
from functools import lru_cache

import logging
from pathlib import Path
import os
import zipfile
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertModel

import textstat

Processing /kaggle/input/d/alvaromunoz/textstat/Pyphen-0.10.0-py3-none-any.whl
Installing collected packages: Pyphen
Successfully installed Pyphen-0.10.0
Processing /kaggle/input/d/alvaromunoz/textstat/textstat-0.7.0-py3-none-any.whl
Installing collected packages: textstat
Successfully installed textstat-0.7.0


# Config

In [2]:
class Config:
    # Debugging
    DEBUG_MODE = False
    SEED = 0

    # Navigation
#     ROOT_PATH = get_project_root()
    RAW_DATA_PATH = Path('/kaggle/input/commonlitreadabilityprize')
    MODEL_SAVE_PATH = Path('/kaggle/working/models')
    MODEL_SAVE_PATH.mkdir(parents=True, exist_ok=True)

    # Dataset Parameters
    PREPROCESS_TEXT = False
    MAX_LEN = 512
    FOLDS = 5
    EMBED = False  # Don't convert to word embeddings (use if embedding will be done simultaneously with modelling)
    READABILITY_METRICS = True  # Include readability metrics in dataset

    # Training Parameters
    EPOCHS = 3
    BATCH_SIZE = 64
    WEIGHT_LOSS = True  # If true, loss function is weighted based on the standard error of readability scores
    SKIP_TRAINING = False

    # Model Parameters
    DROPOUT = 0.3
    EMBEDDINGS_ONLY = False  # Only returns the embeddings, no final estimate.
    # These can be used for clustering or for transfer learning using non-Deep Learning methods.

    LAYER_UNITS = (768 + 13, 128, 64, 1)  # Number of units in each layer of fully connected network
    # 768 + 13 = BERT embedding dim + readability_score dim

    # Optimizer Parameters
    optimizer_params = {
        'lr': 1e-4
    }

# Features

In [3]:
def build_text_features(excerpt, as_dict=True):
    num_words = len(excerpt.split(' '))
    features = {}
    features['flesch_reading_ease'] = textstat.flesch_reading_ease(excerpt)
    features['smog_index'] = textstat.smog_index(excerpt)
    features['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(excerpt)
    features['coleman_liau_index'] = textstat.coleman_liau_index(excerpt)
    features['automated_readability_index'] = textstat.automated_readability_index(excerpt)
    features['dale_chall_readability_score'] = textstat.dale_chall_readability_score(excerpt)
    features['difficult_words'] = textstat.difficult_words(excerpt) / num_words
    features['linsear_write_formula'] = textstat.linsear_write_formula(excerpt)
    features['gunning_fog'] = textstat.gunning_fog(excerpt)
    features['fernandez_huerta'] = textstat.fernandez_huerta(excerpt)
    features['szigriszt_pazos'] = textstat.szigriszt_pazos(excerpt)
    features['gutierrez_polini'] = textstat.gutierrez_polini(excerpt)
    features['crawford'] = textstat.crawford(excerpt)
    if as_dict:
        return features
    else:
        return list(features.values())

# Dataset

In [4]:
def get_raw_data(config):
    """ Downloads and extracts data from Kaggle and stores in '../raw' ready to be processed.
    """
    logger = logging.getLogger(__name__)
    logger.info('Downloading raw data')

    output_path = config.RAW_DATA_PATH
    output_path.mkdir(parents=True, exist_ok=True)
    os.system(f'kaggle competitions download -c commonlitreadabilityprize -p {output_path}')

    for f in output_path.iterdir():
        if f.suffix == '.zip':
            with zipfile.ZipFile((output_path / f), 'r') as zip_ref:
                zip_ref.extractall(output_path)


class ReadabilityPredictorDataset(Dataset):
    def __init__(self, config, train=True):
        super().__init__()
        get_raw_data(config)

        self.config = config
        self.train = train
        self.tokenizer = BertTokenizer.from_pretrained('../input/bert-base-uncased/vocab.txt', do_lower_case=True)

        if self.train:
            fn = 'train.csv'
        else:
            fn = 'test.csv'

        train_file_path = (config.RAW_DATA_PATH / fn)
        self.train_file = pd.read_csv(train_file_path)

        if self.config.DEBUG_MODE:
            self.train_file = self.train_file.iloc[:20]
        self.text = self.train_file['excerpt'].to_numpy()

        if self.train:
            self.targets = self.train_file['target'].to_numpy()
            self.std_error = self.train_file['standard_error'].to_numpy()
        else:
            self.ids = self.train_file['id'].to_numpy()

        if self.config.PREPROCESS_TEXT:
            self.preprocess_text()

#         if self.config.EMBED:
#             self.bert = BertModel.from_pretrained('../input/roberta-transformers-pytorch/roberta-base/pytorch_model.bin', return_dict=False)

    def __len__(self):
        return len(self.train_file)

    def __getitem__(self, item):
        item_text = self.text[item]
        inputs = self.tokenizer.encode_plus(
            item_text,
            max_length=self.config.MAX_LEN,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        if self.train:
            data = {'raw': item_text, 'target': self.targets[item], 'weight': 1 / (1 + self.std_error[item])}
        else:
            data = {'raw': item_text, 'id': self.ids[item]}
        if self.config.EMBED:
            _, data['embedding'] = self.bert(
                data['ids'], attention_mask=data['mask'], token_type_ids=data['token_type_ids']
            )
        else:
            data.update({
                'ids': inputs['input_ids'].squeeze(),
                'mask': inputs['attention_mask'].squeeze(),
                'token_type_ids': inputs['token_type_ids'].squeeze(),
            })

        if self.config.READABILITY_METRICS:
            readability_features = build_text_features(item_text, as_dict=False)
            data['readability_metrics'] = torch.tensor(readability_features)

        return data

    def preprocess_text(self):
        """
        Preprocesses the text for use by the model (may not be necessary but, we'll see with time).
        :return: None
        """
        raise NotImplementedError


# Model

In [5]:
class BERTTextReadabilityPredictorBasic(nn.Module):
    def __init__(self, config, device='cpu'):
        super().__init__()
        self.config = config
        self.bert = BertModel.from_pretrained('../input/bert-base-uncased', return_dict=False)
        self.drop = nn.Dropout(config.DROPOUT)
        self.fc = SimpleMLP(layer_units=config.LAYER_UNITS, dropout=config.DROPOUT, device=device)
        self.fc.to(device)

    def forward(self, data):
        # print(data['ids'].shape, data['mask'].shape, data['token_type_ids'].shape)
        _, out = self.bert(
            data['ids'], attention_mask=data['mask'], token_type_ids=data['token_type_ids']
        )

        if self.config.EMBEDDINGS_ONLY:
            return out

        if self.config.READABILITY_METRICS:
            out = torch.cat((out, data['readability_metrics']), dim=1)
        out = self.drop(out)
        out = self.fc(out)

        return out


class SimpleMLP(nn.Module):
    def __init__(self, layer_units=(768, 64, 64, 1), dropout=0.5, device='cpu', **kwargs):
        super().__init__()
        self.bn_layers = nn.ModuleList([
            nn.BatchNorm1d(layer_units[i]).to(device) for i in range(len(layer_units)-1)
        ])
        self.fc_layers = nn.ModuleList([
            nn.Linear(layer_units[i], layer_units[i+1]).to(device) for i in range(len(layer_units)-1)
        ])
        self.dropout = nn.Dropout(dropout)
        self.act = nn.ReLU()

    def forward(self, data):
        num_layers = len(self.fc_layers)
        out = data
        for layer in range(num_layers):
            out = self.bn_layers[layer](out)
            out = self.fc_layers[layer](out)
            if layer != num_layers - 1:
                out = self.dropout(out)
                out = self.act(out)
        return out


# Training

In [6]:
def set_seed(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True

    os.environ['PYTHONASSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)


class Trainer:
    def __init__(self, config, val_fold=1):
#         self.h = hpy()
        set_seed(config.SEED)
        self.config = config
        self.val_fold = val_fold
        self.model_timestamp = datetime.datetime.now().strftime('%m%d-%H%M%S')
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        print(self.device)
        self.dataset = ReadabilityPredictorDataset(config)
        self.test_ds = ReadabilityPredictorDataset(config, train=False)
        self.train_ds, self.val_ds = None, None
        self.train_loader, self.val_loader, self.test_loader = None, None, None
        self.make_loaders()
        self.best_loss = float("inf")
        self.best_model = None

        self.model = BERTTextReadabilityPredictorBasic(config, device=self.device)
        self.model.to(self.device).float()
        if self.config.SKIP_TRAINING:
            self.load_checkpoint((config.MODEL_SAVE_PATH / config.SAVED_MODEL))

        # Freezes all BERT weights. You'll want to adapt learning to unfreeze these once LR decreases.
        for param in self.model.bert.parameters():
            param.requires_grad = False

        param_optimizer = list(self.model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.0001},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        self.optimizer = optim.AdamW(optimizer_parameters, **config.optimizer_params)
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='min', factor=0.1, patience=0, verbose=True)
        self.criterion = loss_fn
        self.current_epoch = None

        # Tensorboard writer
#         datetime_stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
#         run_name = f"CommonLit_{datetime_stamp}"
#         self.writer = SummaryWriter(f'runs/{run_name}')

        self.model_name = None
        self.predictions = None

    def run(self):
        if not self.config.SKIP_TRAINING:
            epoch_pbar = tqdm.tqdm(range(1, self.config.EPOCHS + 1), desc='Running training...')
            for epoch in epoch_pbar:
                try:
                    self.current_epoch = epoch
                    self.phase(train=True)
                    if self.val_fold:
                        self.phase(train=False)
                except:
                    break
        self.predict()

    def make_loaders(self):
        num_samples = int(len(self.dataset) / self.config.FOLDS)

        intervals = []
        for i in range(self.config.FOLDS):
            intervals.append((i * num_samples, (i + 1) * num_samples))

        if self.val_fold:
            val_indices = intervals[self.val_fold - 1]
            self.val_ds = Subset(self.dataset, range(*val_indices))
            self.val_loader = DataLoader(self.val_ds, batch_size=self.config.BATCH_SIZE, shuffle=True)
            intervals.pop(self.val_fold - 1)

        self.train_ds = Subset(self.dataset, sum([list(range(*ti)) for ti in intervals], []))
        self.train_loader = DataLoader(self.train_ds, batch_size=self.config.BATCH_SIZE, shuffle=True, num_workers=8)
        self.test_loader = DataLoader(self.test_ds, batch_size=self.config.BATCH_SIZE, num_workers=8)

    def phase(self, train=True):
        if train:
            self.model.train()
            loader = self.train_loader
            phase = 'training'
        else:
            self.model.eval()
            loader = self.val_loader
            phase = 'validation'
        loss_hist = []
        pbar = tqdm.tqdm(enumerate(loader), total=len(loader), desc=f'Running {phase} phase...')
        for i, batch in pbar:
#             print(self.h.heap())
            if train:
                self.optimizer.zero_grad()
            # TODO readability metrics are not guaranteed to be present. Edit code so it won't break without it
            input_data = {
                i: batch[i].to(self.device) for i in ['ids', 'mask', 'token_type_ids', 'readability_metrics']
            }
            self.model.eval()
#             self.writer.add_graph(self.model, input_data)
            outputs = self.model(input_data)
            if self.config.WEIGHT_LOSS:
                loss = self.criterion(outputs.float(),
                                      batch['target'].to(self.device).float(),
                                      batch['weight'].to(self.device).float())
            else:
                loss = self.criterion(outputs.float(), batch['target'].to(self.device).float())
            if train:
                loss.backward()
                self.optimizer.step()
            
            loss_hist.append(loss.detach().item() / outputs.shape[0])
            avg_loss = np.mean(loss_hist)
            pbar.set_postfix({'loss': avg_loss})

        if not train:
            self.scheduler.step(avg_loss)
            if avg_loss < self.best_loss:
                self.best_loss = avg_loss
                self.best_model = self.model_name
                self.save_checkpoint()

        # Tensorboard stuff
#         self.writer.add_scalar('Loss', np.mean(loss_hist), self.current_epoch)
#         for name, param in self.model.named_parameters():
#             self.writer.add_histogram(name, param, self.current_epoch)
#             if param.grad == None:
#                 continue
#             self.writer.add_histogram(f'{name}.grad', param.grad, self.current_epoch)

    def save_checkpoint(self):
        self.model_name = f"{self.model_timestamp}-epoch-{self.current_epoch}"
        model_path = (self.config.MODEL_SAVE_PATH / self.model_name)
        torch.save(self.model.state_dict(), model_path)

    def load_checkpoint(self, checkpoint):
        self.model.load_state_dict(torch.load(checkpoint))

    def predict(self):
        predictions = []
        self.model.eval()
        pbar = tqdm.tqdm(enumerate(self.test_loader), total=len(self.test_loader), desc=f'Running testing phase...')
        for i, batch in pbar:
            # TODO readability metrics: see line 114
            input_data = {
                i: batch[i].to(self.device) for i in ['ids', 'mask', 'token_type_ids', 'readability_metrics']
            }
            outputs = self.model(input_data)
            for row in range(len(outputs)):
                predictions.append([batch['id'][row], outputs[row].item()])
        self.predictions = pd.DataFrame(predictions, columns=('id', 'target'))

def loss_fn(output, target, weights=None):
    """Loss Function"""
    if weights is not None:
        # Weights loss function by error
        return torch.sqrt((nn.MSELoss(reduction='none')(output.squeeze(), target) * weights).sum() / weights.sum())
    else:
        return torch.sqrt(nn.MSELoss()(output.squeeze(), target))

In [7]:
conf = Config()
trainer = Trainer(conf)

cuda




In [8]:
trainer.run()
trainer.predictions.to_csv('./submission.csv', index=False)

Running training...:   0%|          | 0/3 [00:00<?, ?it/s]
Running training phase...:   0%|          | 0/36 [00:00<?, ?it/s][A
Running training phase...:   0%|          | 0/36 [01:54<?, ?it/s, loss=0.0376][A
Running training phase...:   3%|▎         | 1/36 [01:54<1:06:42, 114.37s/it, loss=0.0376][A
Running training phase...:   3%|▎         | 1/36 [01:55<1:06:42, 114.37s/it, loss=0.0348][A
Running training phase...:   6%|▌         | 2/36 [01:55<27:04, 47.77s/it, loss=0.0348]   [A
Running training phase...:   6%|▌         | 2/36 [01:56<27:04, 47.77s/it, loss=0.0346][A
Running training phase...:   8%|▊         | 3/36 [01:56<14:36, 26.55s/it, loss=0.0346][A
Running training phase...:   8%|▊         | 3/36 [01:57<14:36, 26.55s/it, loss=0.0339][A
Running training phase...:  11%|█         | 4/36 [01:57<08:48, 16.53s/it, loss=0.0339][A
Running training phase...:  11%|█         | 4/36 [01:59<08:48, 16.53s/it, loss=0.0333][A
Running training phase...:  14%|█▍        | 5/36 [01:59<05:40

# Predict