In [1]:
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from transformers import AutoModel, AutoTokenizer

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from pytorch_lightning.utilities.seed import seed_everything

from tqdm import tqdm

tqdm.pandas()
seed_everything(420)

  rank_zero_deprecation(
Global seed set to 420


420

In [2]:
from pydantic import BaseSettings


class Settings(BaseSettings):
    wandb_api_key: str

    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"
        case_sensitive = False


settings = Settings()
os.environ["WANDB_API_KEY"] = settings.wandb_api_key

In [9]:
KAGGLE_USERNAME = "brotye"
INFERENCE_DATASET = "commonlitinference-large" # will create new dataset if does not exist
BERT_MODEL = "roberta-large"
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")

In [10]:
import logging
import wandb

logger = logging.getLogger("wandb")
logger.setLevel(logging.ERROR)

wandb.login()

True

In [11]:
from IPython.core.magic import register_cell_magic

@register_cell_magic
def write_and_run(line, cell):
    argz = line.split()
    file = argz[-1]
    mode = 'w'
    if len(argz) == 2 and argz[0] == '-a':
        mode = 'a'
    with open(file, mode) as f:
        f.write(cell)
    get_ipython().run_cell(cell)

### Tokenizer

In [12]:
max_seq_length = 248
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL, 
                                          cache_dir=f"../input/huggingfacemodels/{BERT_MODEL}/tokenizer", 
                                          model_max_length=max_seq_length)

In [27]:
# mlm_collator(train.excerpt.values[0])
from datasets import load_dataset
import pandas as pd
import numpy as np
from transformers import DataCollatorForLanguageModeling

# train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

mlm_data = train[['excerpt']]
mlm_data = mlm_data.rename(columns={'excerpt':'text'})
# mlm_data.to_csv('mlm_data.csv', index=False)

mlm_data_val = test[['excerpt']]
mlm_data_val = mlm_data_val.rename(columns={'excerpt':'text'})
# mlm_data_val.to_csv('mlm_data_val.csv', index=False)

# raw_datasets = load_dataset("csv", data_files={"train": "../input/mlmdata/mlm_data.csv", "validation": "../input/mlmdata/mlm_data_val.csv"})

mlm_data_val

In [33]:
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_pandas(mlm_data)
val_dataset = Dataset.from_pandas(mlm_data_val)


raw_dataset = DatasetDict({"train": train_dataset, "val": val_dataset})

In [34]:
def tokenize_function(examples):
    return tokenizer(examples["text"], return_special_tokens_mask=True)

tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=1,
    remove_columns=["text"],
    load_from_cache_file=False,
    desc="Running tokenizer on every text in dataset",
)

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // max_seq_length) * max_seq_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, t in concatenated_examples.items()
    }
    return result

tokenized_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=1,
    load_from_cache_file=False,
    desc=f"Grouping texts in chunks of {max_seq_length}",
)

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=data_collator, batch_size=4
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=4)

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=3.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=1.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 248', max=3.0, style=Progress…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 248', max=1.0, style=Progress…




In [35]:
import torch
import pytorch_lightning as pl
from transformers import AutoModel, AutoConfig, AutoModelForMaskedLM
from transformers.optimization import get_cosine_schedule_with_warmup


class BertMLMModel(pl.LightningModule):
    def __init__(
        self,
        max_steps=2500,
        learning_rate=1e-5,
        weight_decay=0.1,
        dropout=0.1,
        warmup_steps=0.06,  # percentage of steps to warmup for
        bert_model="roberta-base",
        freeze_layers=0,
        scheduler_rate=500,
        **kwargs,
    ):

        super(BertMLMModel, self).__init__()
        self.model_type = "BertClassifierModel"

        # Load Text Model
        config = AutoConfig.from_pretrained(f"../input/huggingfacemodels/{bert_model}/transformer")
        config.update({"layer_norm_eps": 1e-7, "hidden_dropout_prob": dropout}) 
        self.text_model = AutoModelForMaskedLM.from_pretrained(
            f"../input/huggingfacemodels/{bert_model}/transformer", config=config
        )

        # optimiser settings
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.max_steps = max_steps
        self.warmup = int(max_steps * warmup_steps)
        self.freeze_layers = freeze_layers
        self.scheduler_rate = scheduler_rate


    def forward(self, x):
        return self.text_model(**x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).loss
        self.log('mlm_train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self(batch).loss
        self.log('mlm_val_loss', loss, prog_bar=True)


    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay
        )
        schedule = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup,
            num_training_steps=self.max_steps,
        )
        scheduler = {
            "scheduler": schedule,
            "interval": "step",  # runs per batch rather than per epoch
            "frequency": 1,
            "name": "learning_rate",  # uncomment if using LearningRateMonitor
        }
        return [optimizer], [scheduler]


In [36]:
import math
from pytorch_lightning.callbacks import (
    EarlyStopping,
    LearningRateMonitor,
    ModelCheckpoint,
)
from pytorch_lightning.loggers import WandbLogger, TensorBoardLogger
from sklearn.model_selection import StratifiedKFold

SEED = 420
pl.seed_everything(SEED)

group = f"{BERT_MODEL}_mlm"+ wandb.util.generate_id()

wandb_logger = WandbLogger(project="commonlit", entity="commonlitreadabilityprize", 
                           group=group, id=f"{group}")

checkpoint_filename = f"{group}"+"-{val_loss:.2f}"
checkpoint_callback = ModelCheckpoint(
            dirpath="models",
            monitor="mlm_val_loss",
            filename=checkpoint_filename,
            mode="min",
            save_weights_only=True,
)
lr_monitor = LearningRateMonitor(logging_interval='step')


max_steps = 5*len(train_dataloader)
# Init our model
model = BertMLMModel()

print(f"Training:")
# Initialize a trainer
trainer = pl.Trainer(
    gpus=1,
    accumulate_grad_batches=4,
    max_epochs=5,
    progress_bar_refresh_rate=1,
    logger=wandb_logger,
    callbacks=[
        checkpoint_callback,
        lr_monitor,
    ],
    val_check_interval=20,
    log_every_n_steps=10
)

# Train the model ⚡
trainer.fit(
    model,
    train_dataloader = train_dataloader,
    val_dataloaders = [eval_dataloader],
)

print(f"Best score: {checkpoint_callback.best_model_score.item()}")    
wandb_logger.log_metrics({"best_val_loss": checkpoint_callback.best_model_score.item()})
# save model py code
wandb.finish()

# del model, trainer, train_dataset, val_dataset, train_loader, val_loader, wandb_logger, checkpoint_callback
# torch.cuda.empty_cache()
# gc.collect()

Global seed set to 420
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Training:



  | Name       | Type               | Params
--------------------------------------------------
0 | text_model | RobertaForMaskedLM | 124 M 
--------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.790   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Global seed set to 420


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

AttributeError: 'NoneType' object has no attribute 'item'

In [24]:
model.text_model.roberta.state_dict()

OrderedDict([('embeddings.position_ids',
              tensor([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
                        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
                        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
                        42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
                        56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
                        70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
                        84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
                        98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
                       112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
                       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
                       140, 1