In [None]:
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from transformers import AutoModel, AutoTokenizer

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from pytorch_lightning.utilities.seed import seed_everything

from tqdm import tqdm

# Register tqdm's pandas integration (adds `progress_apply` and friends).
tqdm.pandas()
# Seed the random number generators through Lightning's helper so runs are repeatable.
seed_everything(420)

In [None]:
from pydantic import BaseSettings


class Settings(BaseSettings):
    """Secrets for this notebook, loaded by pydantic from the environment or a ``.env`` file."""

    # Weights & Biases API key. Declared without a default, so it must be supplied.
    wandb_api_key: str

    class Config:
        # Environment variable names are matched case-insensitively.
        case_sensitive = False
        # Fallback source: a UTF-8 encoded ``.env`` file in the working directory.
        env_file_encoding = "utf-8"
        env_file = ".env"


# Build the settings object; `wandb_api_key` has no default, so it has to be
# available from the environment or the .env file configured on Settings.
settings = Settings()
# Export the key as WANDB_API_KEY so the wandb client can read it from the environment.
os.environ["WANDB_API_KEY"] = settings.wandb_api_key

In [None]:
# Kaggle account and dataset slug intended for publishing the trained checkpoints.
KAGGLE_USERNAME = "brotye"
INFERENCE_DATASET = "commonlitinference-large" # will create new dataset if does not exist
# Hugging Face model identifier; used for the tokenizer and as the model's default backbone.
BERT_MODEL = "roberta-large"
# Competition training data; the `excerpt` and `target` columns are read by the dataset class below.
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")

In [None]:
import logging
import wandb

# Raise the "wandb" logger's threshold to ERROR to keep cell output quiet.
logger = logging.getLogger("wandb")
logger.setLevel(logging.ERROR)

# Authenticate with Weights & Biases (WANDB_API_KEY was exported from Settings earlier).
wandb.login()

In [None]:
from IPython.core.magic import register_cell_magic

@register_cell_magic
def write_and_run(line, cell):
    """Cell magic that saves the cell's source to a file and then executes it.

    Usage::

        %%write_and_run [-a] <path>

    Args:
        line: Text following the magic name. The last whitespace-separated
            token is the target file. When there are exactly two tokens and
            the first is ``-a`` the cell is appended instead of overwriting.
        cell: Source of the cell body; written verbatim, then run in the
            current IPython session.

    Raises:
        ValueError: If no target file is given on the magic line.
    """
    argz = line.split()
    if not argz:
        # Without this guard the failure is an opaque IndexError from argz[-1].
        raise ValueError("usage: %%write_and_run [-a] <path>")
    target = argz[-1]
    mode = 'a' if len(argz) == 2 and argz[0] == '-a' else 'w'
    # Explicit UTF-8 so cells containing non-ASCII text are written the same
    # way regardless of the platform's default locale encoding.
    with open(target, mode, encoding="utf-8") as f:
        f.write(cell)
    get_ipython().run_cell(cell)

### Tokenizer

In [None]:
# Tokenizer matching BERT_MODEL, cached under ../input/huggingface_models.
# model_max_length=320 is the limit applied when callers request truncation.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL, cache_dir=f"../input/huggingface_models/{BERT_MODEL}/tokenizer", model_max_length=320)

### Model

In [None]:
%%write_and_run model_versioning.py

class BertClassifierModel(pl.LightningModule):
    """Transformer encoder with a mean-pooled, single-output regression head.

    Two modes are supported, selected by ``pretrain``:

    * ``pretrain=False``: the head output is squashed with ``tanh`` and mapped
      to the open interval (-3.9, 1.9); training/validation loss is RMSE.
    * ``pretrain=True``: the raw head output is used as a score and pairs of
      texts are trained with a margin ranking loss.
    """

    def __init__(self, 
                 max_steps=2500, 
                 pretrain=False, 
                 use_warmup=False, 
                 learning_rate=1e-5,
                 weight_decay=0.1,
                 dropout=0.1,
                 warmup_steps=0.1, # percentage of steps to warmup for
                 dense_dim=None,
                 custom_linear_init=True,
                 bert_model=BERT_MODEL,
                 freeze_layers=0,
                 scheduler_rate=500,
                 pretrained_mlm=False,
                 **kwargs
                ):
        """Build the encoder and the regression head.

        Args:
            max_steps: Step count the warm-up/decay schedule is defined over.
            pretrain: Use the pairwise ranking objective instead of RMSE.
            use_warmup: Use the linear warm-up/decay LambdaLR schedule rather
                than ReduceLROnPlateau.
            learning_rate: AdamW learning rate.
            weight_decay: AdamW weight decay.
            dropout: Dropout probability applied to the encoder output and to
                the dense layer output.
            warmup_steps: Fraction of ``max_steps`` spent warming up.
            dense_dim: Width of the hidden dense layer; defaults to the
                encoder's hidden size.
            custom_linear_init: Re-initialise ``dense`` and ``output_layer``
                with the encoder's ``initializer_range``.
            bert_model: Hugging Face model name to load when
                ``pretrained_mlm`` is False.
            freeze_layers: When > 0, freeze the embeddings and the last
                ``freeze_layers`` encoder layers.
            scheduler_rate: ``frequency`` (in steps) for the
                ReduceLROnPlateau scheduler.
            pretrained_mlm: Load the encoder from the MLM-pretrained weights
                directory instead of ``bert_model``.
            **kwargs: Ignored; lets callers pass a larger hyperparameter dict.
        """
        super().__init__()
        self.model_type = "BertClassifierModel"

        if pretrained_mlm:
            print("loading pretrained version")
            bert_model_cache = "../input/commonlitinference-pretrain-mlm-large"
            self.text_model = AutoModel.from_pretrained(bert_model_cache)
        else:
            bert_model_cache = f"../input/huggingface_models/{bert_model}/transformer"
            # Load Text Model
            self.text_model = AutoModel.from_pretrained(bert_model, cache_dir=bert_model_cache)
        if dense_dim is None: # use bert dimensionality
            dense_dim = self.text_model.config.hidden_size

        # Layer creation order is kept stable so seeded initialisation is unchanged.
        self.dense = nn.Linear(self.text_model.config.hidden_size, 
                               dense_dim)
        # Not used by forward() in this class.
        # NOTE(review): presumably kept so existing checkpoints still load - confirm.
        self.pretraining_layer = nn.Linear(dense_dim, 1)
        self.output_layer = nn.Linear(dense_dim, 1)
        self.dropout = nn.Dropout(dropout)
        self.criterion = nn.MSELoss()
        self.pretrain = pretrain

        # optimiser settings
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.max_steps = max_steps
        self.use_warmup = use_warmup
        self.warmup = int(max_steps*warmup_steps)
        self.freeze_layers = freeze_layers
        self.scheduler_rate = scheduler_rate

        if custom_linear_init:
            self.initialise(self.output_layer)
            self.initialise(self.dense)

    def initialise(self, module):
        """Re-initialise a linear layer: normal weights (encoder's std), zero bias."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.text_model.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(self, text_input, **kwargs):
        """Score a batch of tokenized texts.

        Args:
            text_input: Mapping of tokenizer outputs, unpacked into the encoder.
            **kwargs: Ignored (allows ``self(**batch)`` with extra keys such
                as ``target``).

        Returns:
            1-D tensor with one prediction per input text.
        """
        outputs = self.text_model(**text_input)
        hidden = self.dropout(outputs[0])
        # Mean-pool the token representations. Padded positions are excluded
        # via the attention mask so a text's prediction does not depend on how
        # much padding its batch-mates introduced.
        mask = text_input["attention_mask"] if "attention_mask" in text_input else None
        if mask is None:
            x = hidden.mean(dim=1)
        else:
            mask = mask.unsqueeze(-1).to(hidden.dtype)
            x = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
        x = self.dense(x).tanh()
        x = self.dropout(x)
        predictions = self.output_layer(x)
        if self.pretrain:
            return predictions.squeeze(1)
        else:
            # tanh output in (-1, 1) scaled and shifted to (-3.9, 1.9).
            return ((predictions.tanh() * 2.9) - 1).squeeze(1)

    def training_step(self, batch, batch_nb):
        """Compute and log the training loss (ranking loss or RMSE)."""
        if self.pretrain:
            predicted_targets_1 = self(text_input=batch["text_input1"])
            predicted_targets_2 = self(text_input=batch["text_input2"])
            target_loss = F.margin_ranking_loss(predicted_targets_1, predicted_targets_2,  batch["target"])
            self.log("pretraining_train_loss", target_loss, prog_bar=True)
        else:        
            predicted_targets = self(**batch)
            target_loss = torch.sqrt(self.criterion(predicted_targets, batch["target"]))
            self.log("train_loss", target_loss, prog_bar=True)
        return target_loss

    def validation_step(self, val_batch, val_batch_idx, **kwargs):
        """Compute and log the validation loss; mirrors ``training_step``.

        The loss is returned in both modes (the pretrain branch previously
        fell through and returned None).
        """
        if self.pretrain:
            predicted_targets_1 = self(text_input=val_batch["text_input1"])
            predicted_targets_2 = self(text_input=val_batch["text_input2"])
            target_loss = F.margin_ranking_loss(predicted_targets_1, predicted_targets_2,  val_batch["target"])
            self.log("pretraining_val_loss", target_loss, prog_bar=True)
        else:        
            predicted_targets = self(**val_batch)
            target_loss = torch.sqrt(self.criterion(predicted_targets, val_batch["target"]))
            self.log("val_loss", target_loss, prog_bar=True)
        return target_loss

    def configure_optimizers(self):
        """Create the AdamW optimizer and the configured LR scheduler.

        Returns:
            ``([optimizer], [scheduler_config])`` in the form Lightning expects.
        """
        if self.freeze_layers > 0:
            # NOTE(review): this freezes the embeddings plus the *last* N
            # encoder layers; freezing the first N is the more common recipe -
            # confirm this is intended.
            modules = [self.text_model.embeddings, *self.text_model.encoder.layer[-self.freeze_layers:]] # freeze last X layers
            for module in modules:
                for param in module.parameters():
                    param.requires_grad = False        

        optimizer = torch.optim.AdamW(self.parameters(), 
                                      lr=self.learning_rate, 
                                      weight_decay=self.weight_decay)
        monitor = "pretraining_val_loss" if self.pretrain else "val_loss"

        if self.use_warmup:
            def warm_decay(step):
                """LR multiplier: linear warm-up, then linear decay floored at 0."""
                if step < self.warmup:
                    return  step / self.warmup
                # Clamp so training past max_steps cannot yield a negative LR.
                return max(0.0, (self.max_steps-step)/(self.max_steps))
            scheduler = (
                {
                    "scheduler": torch.optim.lr_scheduler.LambdaLR(optimizer, warm_decay),
                    "interval": "step", #runs per batch rather than per epoch
                    "frequency": 1,
                    "name" : "learning_rate" # label shown by LearningRateMonitor
                }
            )
        else:
            print(f"Scheduling with rate: {self.scheduler_rate}")
            scheduler = {
                "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(
                    optimizer, mode="min", patience=6,
                ),
                "monitor": monitor,
                "interval": "step",
                "reduce_on_plateau": True,
                "frequency": self.scheduler_rate
            }
        return [optimizer], [scheduler]

### Dataset

In [None]:
class CommonDataset(Dataset):
    """Map-style dataset over a DataFrame with ``excerpt`` and ``target`` columns."""

    def __init__(self, df):
        """Keep a reference to the backing DataFrame (it is not copied)."""
        self.df = df

    def __len__(self):
        """Number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the row at position ``idx`` as ``{"text_input", "target"}``."""
        row = self.df.iloc[idx]
        return {"text_input": row.excerpt, "target": row.target}

def collate_fn(batch):
    """Turn a list of dataset samples into one model-ready batch.

    Texts are tokenized together with the notebook-level ``tokenizer``
    (padded to the longest item, truncated, returned as PyTorch tensors);
    targets are stacked into a float tensor.
    """
    excerpts = [example["text_input"] for example in batch]
    encoded = tokenizer(
        excerpts,
        padding=True,
        return_tensors="pt",
        truncation=True
    )
    targets = [example["target"] for example in batch]
    return {
        "text_input": encoded,
        "target": torch.tensor(targets).float(),
    }

# Wrap the full training frame; per-fold subsets are taken from this object later.
dataset = CommonDataset(train)

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Bin count of floor(1 + log2(n)), i.e. a Sturges-style rule on the number of rows.
num_bins = int(np.floor(1 + np.log2(len(train))))
# Quantile strategy with ordinal encoding: each target is replaced by its bin index.
est = KBinsDiscretizer(n_bins=num_bins, encode='ordinal', strategy='quantile')

# The discretizer expects a 2-D input, hence the single-column reshape.
y_labels = train.target.values.reshape(-1, 1)
est.fit(y_labels)
# Bin index per row; used as the stratification label for the K-fold split.
y_bins = est.transform(y_labels)

In [None]:
import math
from pytorch_lightning.callbacks import (
    EarlyStopping,
    LearningRateMonitor,
    ModelCheckpoint,
)
from pytorch_lightning.loggers import WandbLogger, TensorBoardLogger
from sklearn.model_selection import StratifiedKFold

SEED = 420
pl.seed_everything(SEED)

# Hyperparameters. The whole dict is passed to BertClassifierModel(**params)
# (keys it does not name are absorbed by its **kwargs) and logged to W&B.
params = {
    "max_steps": 2500,
    "pretrain": False,
    "use_warmup": False,
    "learning_rate": 1e-5,
    "weight_decay": 0.1,
    "dropout": 0.1,
    "warmup_steps": 0.06,  # percentage of steps to warmup for
    "dense_dim": None,
    "custom_linear_init": False,
    "bert_model": BERT_MODEL,
    "freeze_layers": 6,
    "seed": SEED,
    "batch_size": 1,
    "accumulate_grads": 24,
    "stochastic_weight_avg": False,
    "val_check_interval": 10, # evaluate every 10 steps
    "scheduler_rate": 160,
    "pretrained_mlm": False # whether to use pretrained mlm version
}
# Convert "every N optimizer steps" into batches (N * accumulated gradients).
params["val_check_interval"] = params["val_check_interval"]*params["accumulate_grads"] # actual steps (accumulated gradients)
# Step the plateau scheduler at the validation cadence. Set once, before any
# logger or model is created, so every fold sees the same value.
params["scheduler_rate"] = params["val_check_interval"]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Per-fold results, in fold order.
best_model_scores = []
best_model_paths = []

# One W&B group per notebook run; each fold becomes a run inside it.
group = f"{BERT_MODEL}_"+ wandb.util.generate_id()
for i, (train_ids, val_ids) in enumerate(skf.split(dataset, y=y_bins)):
    print(f"Starting fold {i} for {group}")
    train_dataset = torch.utils.data.Subset(dataset, train_ids)
    val_dataset = torch.utils.data.Subset(dataset, val_ids)
    train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=params["batch_size"],
            shuffle=True,
            collate_fn=collate_fn,
            num_workers=6,
            pin_memory=torch.cuda.is_available(),
        )

    val_loader = DataLoader(
            dataset=val_dataset,
            batch_size=4,
            shuffle=False,
            collate_fn=collate_fn,
            num_workers=6,
            pin_memory=torch.cuda.is_available(),
        )

    wandb_logger = WandbLogger(project="commonlit", entity="commonlitreadabilityprize", 
                               group=group, id=f"fold_{i}_{group}", config=params)

    # The doubled braces are left for Lightning to fill in at save time.
    checkpoint_filename = f"{group}_fold_{i}"+"-{val_loss:.2f}"
    checkpoint_callback = ModelCheckpoint(
                dirpath="models",
                monitor="val_loss",
                filename=checkpoint_filename,
                mode="min",
                save_weights_only=True,
    )
    lr_monitor = LearningRateMonitor(logging_interval='step')

    # Init our model
    model = BertClassifierModel(**params)

    print(f"Training:")
    # Initialize a trainer
    trainer = pl.Trainer(
        gpus=1,
        accumulate_grad_batches=params["accumulate_grads"],
        max_epochs=20,
        progress_bar_refresh_rate=1,
        logger=wandb_logger,
        callbacks=[
            checkpoint_callback,
            lr_monitor,
            EarlyStopping(monitor="val_loss", patience=10, mode="min"),
        ],
        val_check_interval=params["val_check_interval"],
        stochastic_weight_avg= params["stochastic_weight_avg"],
        log_every_n_steps=params["accumulate_grads"]
    )

    # Train the model
    trainer.fit(
        model,
        train_dataloader = train_loader,
        val_dataloaders = [val_loader],
    )

    # Read the best monitored score once and reuse it below.
    best_score = checkpoint_callback.best_model_score.item()
    best_model_scores.append(best_score)
    best_model_paths.append(checkpoint_callback.best_model_path)
    print(f"Best score for fold {i}: {best_score}")    
    wandb_logger.log_metrics({"best_val_loss": best_score})
    # save model py code
    wandb.save("model_versioning.py")
    wandb.finish()

    # Drop every reference to this fold's objects, collect cyclic garbage, and
    # only then release cached CUDA blocks: empty_cache() cannot free memory
    # still owned by tensors that have not been collected yet.
    del model, trainer, train_dataset, val_dataset, train_loader, val_loader, wandb_logger, checkpoint_callback
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
from statistics import mean

# Mean of each fold's best validation loss (RMSE), computed once.
overall_score = mean(best_model_scores)
print(f"Overall score {overall_score}") 
# Overall score 0.5652932763099671 - LB 531 
# Overall score 0.5236241459846497 - AdamW + mean embeddings - LB 512
# Overall score 0.5049538612365723 - LB 0.506 Roberta Base

# Quality gate: stop a "Run All" here when the cross-validation score is worse
# than the cut-off, so the cells after this one are not executed.
MAX_ACCEPTABLE_SCORE = 0.51
if overall_score > MAX_ACCEPTABLE_SCORE:
    raise ValueError(
        f"Overall score {overall_score} exceeds the {MAX_ACCEPTABLE_SCORE} cut-off; "
        "stopping before the remaining cells run."
    )

### Make Dataset

Automatically upload the best checkpoint from each fold as a Kaggle Dataset so that the models can be loaded in an inference kernel.

In [None]:
# Remove any previous staging directory so stale checkpoints are not uploaded.
!rm -rf common-lit-inference

In [None]:
# Create the staging directory that will hold the files for the Kaggle dataset.
!mkdir common-lit-inference

In [None]:
import shutil
# Copy each fold's best checkpoint into the staging directory as best_<fold>.ckpt.
# Note: `model` here is a checkpoint file path (an entry of best_model_paths),
# not a model object.
for i, model in enumerate(best_model_paths):
    new_path = os.path.abspath(f"common-lit-inference/best_{i}.ckpt")
    shutil.copy(model, new_path)

In [None]:
import os
import glob
import json

# Metadata file read by the Kaggle CLI when creating the dataset.
# The id is built from KAGGLE_USERNAME / INFERENCE_DATASET (defined with the
# other notebook constants) instead of a hard-coded slug, so the upload target
# follows that configuration.
dataset_config = {
  "title": "common-lit-inference",
  "id": f"{KAGGLE_USERNAME}/{INFERENCE_DATASET}",
  "licenses": [
    {
      "name": "CC0-1.0"
    }
  ]
}

# Written into the staging directory next to the copied checkpoints.
with open("./common-lit-inference/dataset-metadata.json", "w") as outfile:
    json.dump(dataset_config, outfile)

In [None]:
# Publish the staging directory as a new Kaggle dataset, using the
# dataset-metadata.json written above.
!kaggle datasets create -p ./common-lit-inference/ -m "latest" --dir-mode zip

In [None]:
from statistics import mean
# Display the mean of the per-fold best scores. The original argument `a` is
# not defined anywhere in this notebook, so the cell failed on a fresh kernel.
mean(best_model_scores)

In [None]:
# Download roberta-base into a local (relative) huggingface_models/ cache so it
# can be packaged as a dataset.
# NOTE(review): this rebinds the notebook-wide BERT_MODEL and `tokenizer`
# (collate_fn reads `tokenizer` at call time, and this one is created without
# model_max_length=320). Re-running the training cells after this cell would
# pick up these values.
BERT_MODEL = "roberta-base"

tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL, cache_dir=f"huggingface_models/{BERT_MODEL}/tokenizer")
text_model = AutoModel.from_pretrained(BERT_MODEL, cache_dir=f"huggingface_models/{BERT_MODEL}/transformer")

In [None]:
import os
import glob
import json

# Give the cached model/tokenizer files their original names back.
# The loop assumes each cached payload "<name>" sits next to a "<name>.json"
# sidecar whose "url" field ends with the original file name.
json_files = (glob.glob(f"huggingface_models/{BERT_MODEL}/transformer/*.json") + 
              glob.glob(f"huggingface_models/{BERT_MODEL}/tokenizer/*.json"))
for jtf in json_files:
    with open(jtf, "r") as f:
        metadata = json.load(f)

    # Skip JSON files that are not sidecars. After a first run the directory
    # also contains restored files ending in .json (e.g. a config or vocab),
    # which either lack "url" or hold a non-string value under that key.
    url = metadata.get("url") if isinstance(metadata, dict) else None
    if not isinstance(url, str):
        continue

    # Strip only the trailing extension (str.replace would hit every ".json").
    file_name = os.path.splitext(jtf)[0]
    if not os.path.exists(file_name):
        # Payload already renamed by an earlier run; nothing left to do.
        continue

    original_name = url.split("/")[-1]
    new_file_name = os.path.join(os.path.dirname(jtf), original_name)
    print(f"renaming {file_name} to {new_file_name}")
    os.rename(file_name, new_file_name)



In [None]:
# Kaggle dataset metadata for the downloaded Hugging Face files.
# Note: this rebinds `dataset_config` from the earlier checkpoint-upload cell
# and relies on `json` having been imported by a previous cell.
dataset_config = {
  "title": "huggingface-models",
  "id": "brotye/huggingfacemodels",
  "licenses": [
    {
      "name": "CC0-1.0"
    }
  ]
}

# Written at the root of the local huggingface_models/ directory.
with open("huggingface_models/dataset-metadata.json", "w") as outfile:
    json.dump(dataset_config, outfile)