# Imports

In [1]:
# Shell escape: print the GPU attached to this runtime (model, driver/CUDA version, memory use).
!nvidia-smi

Tue Jun  8 21:05:30 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 8.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 33.8MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |█

In [36]:
import os
import random
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import KFold
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import RobertaConfig

In [5]:
%cd drive/MyDrive/CommonLit

/content/drive/MyDrive/CommonLit


In [10]:
# Load the training table; "train.csv" is resolved relative to the current working directory.
df=pd.read_csv("train.csv")

# Cross Validation

In [15]:
def cross_val(df: pd.DataFrame, n: int = 5, random_state: int = 2021) -> pd.DataFrame:
  """Assign every row of ``df`` to one of ``n`` shuffled K-fold validation folds.

  A ``fold`` column is written into ``df`` in place (overwriting an existing
  one) and the same frame is returned. A row gets fold ``i`` when it belongs to
  the validation split of the ``i``-th ``KFold`` partition.

  Args:
    df: Frame to annotate. Any index is accepted (it does not have to be a
      default ``RangeIndex``).
    n: Number of folds.
    random_state: Seed for the shuffled ``KFold`` split.

  Returns:
    ``df`` itself, now carrying an integer ``fold`` column.
  """
  df["fold"] = 0
  fold_col = df.columns.get_loc("fold")
  kf = KFold(n_splits=n, random_state=random_state, shuffle=True)
  for i, (_, valid_idx) in enumerate(kf.split(df)):
    # KFold yields *positional* indices, so write by position (iloc). Label-based
    # .loc would hit the wrong rows (or append new ones) on a non-default index.
    df.iloc[valid_idx, fold_col] = i

  return df

In [16]:
# Add the "fold" column (defaults: 5 folds, random_state=2021) used later to split train/validation.
df = cross_val(df)

# Seed Everything

In [18]:
# code taken from https://www.kaggle.com/shoheiazuma/tweet-sentiment-roberta-pytorch
def seed_everything(seed_value):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels.

    Args:
        seed_value: Integer seed applied to every generator.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Exported for child processes; the hash seed of the already-running
    # interpreter is fixed at start-up and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        # benchmark=True lets cuDNN auto-tune and pick different algorithms from
        # run to run, which defeats the determinism requested above.
        torch.backends.cudnn.benchmark = False

# Seed every RNG once, before any model or data loader is created.
seed = 2021
seed_everything(seed)

# Configuration

In [29]:
# Token length every excerpt is truncated/padded to by RobertaDataset.
MAX_LEN = 256
# Batch sizes for the training and validation DataLoaders.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
# Number of passes over the training split, per fold.
EPOCHS = 5
# Checkpoint name passed to every `from_pretrained` call (tokenizer, config, model).
ROBERTA_PATH = "roberta-base"
# Loaded once at import time; downloads/caches the tokenizer files for ROBERTA_PATH.
TOKENIZER = transformers.RobertaTokenizerFast.from_pretrained(ROBERTA_PATH)
# NOTE(review): not referenced by the cells visible here (the data is read from
# "train.csv" in the working directory) — looks like a leftover Kaggle path; confirm.
TRAINING_FILE = "../input/commonlitreadabilityprize/train.csv"
# NOTE(review): not referenced by the cells visible here — no checkpoint is saved; confirm.
MODEL_PATH = "model.bin"
# Worker processes used by the training DataLoader.
NUM_WORKERS = 2

# Dataset

In [32]:
class RobertaDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes excerpts on the fly for RoBERTa regression.

    Reads the ``excerpt`` and ``target`` columns of the frame it is given and
    relies on the module-level ``TOKENIZER`` and ``MAX_LEN`` settings.
    """

    def __init__(self,df):
        """Keep the raw excerpt strings and target values as arrays.

        Args:
            df: Frame with an ``excerpt`` column and a ``target`` column.
        """
        self.excerpt = df.excerpt.values
        self.target = df.target.values

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.excerpt)

    def __getitem__(self,item):
        """Tokenize one excerpt and return fixed-length tensors.

        Args:
            item: Positional index of the excerpt.

        Returns:
            Dict with ``"ids"`` and ``"mask"`` (long tensors of length
            ``MAX_LEN``) and ``"targets"`` (float scalar tensor).
        """
        # Collapse every run of whitespace (including newlines) to a single space.
        excerpt = " ".join(str(self.excerpt[item]).split())

        # padding="max_length" makes the tokenizer pad to MAX_LEN with its own pad
        # token and a zeroed attention mask. padding=True only pads to the longest
        # sequence of the call (a no-op for one text), and filling by hand with id 0
        # inserts RoBERTa's <s> token instead of <pad>.
        inputs = TOKENIZER(
            excerpt,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding="max_length",
            truncation=True,
        )

        return {"ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float)}

# Model Building

In [39]:
class RobertaModel(nn.Module):
    """RoBERTa backbone with dropout and a single-output linear regression head."""

    def __init__(self, conf):
        """Load the pretrained backbone named by ``ROBERTA_PATH`` and build the head.

        Args:
            conf: Configuration object forwarded to ``from_pretrained``.
        """
        super().__init__()
        self.roberta=transformers.RobertaModel.from_pretrained(ROBERTA_PATH,config=conf)
        self.dropout=nn.Dropout(0.3)
        # Size the head from the loaded backbone instead of hard-coding 768, so
        # pointing ROBERTA_PATH at a checkpoint of another width keeps working.
        self.linear=nn.Linear(self.roberta.config.hidden_size,1)

    def freeze(self):
        """Stop gradient updates for every backbone parameter (the head stays trainable)."""
        for param in self.roberta.parameters():
            param.requires_grad = False

    def unfreeze(self):
        """Re-enable gradient updates for every backbone parameter."""
        for param in self.roberta.parameters():
            param.requires_grad = True

    def forward(self,ids,mask):
        """Predict one scalar per input sequence.

        Args:
            ids: Token-id tensor passed to the backbone.
            mask: Attention mask passed to the backbone as ``attention_mask``.

        Returns:
            Flattened (1-D) tensor of predictions.
        """
        outputs=self.roberta(ids,attention_mask=mask)
        # Second element of the backbone output is used as the pooled representation.
        pooler_outputs = outputs[1]
        logits = self.linear(self.dropout(pooler_outputs))

        return logits.view(-1)

# Training + Evaluation Function

In [40]:
# Mean-squared error (default mean reduction); callers take its square root to get an RMSE.
loss_fn=nn.MSELoss()

def train_fn(data_loader, model, optimizer, device, scheduler):
    """Run one training epoch and return the mean per-batch RMSE.

    Args:
        data_loader: Sized iterable of batches; each batch is a mapping with
            "ids", "mask" and "targets" tensors.
        model: Module called as ``model(ids=..., mask=...)``; put into train mode.
        optimizer: Stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler, stepped once per batch after the optimizer.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    final_loss=0

    # tqdm.auto picks the notebook widget when available; `tqdm_notebook` is deprecated.
    for d in tqdm(data_loader, total=len(data_loader)):
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        targets = d["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask)

        # Optimise RMSE directly: sqrt of the batch MSE.
        loss = torch.sqrt(loss_fn(outputs, targets))
        loss.backward()
        optimizer.step()
        scheduler.step()
        final_loss+=loss.item()

    return final_loss / len(data_loader)


def eval_fn(data_loader, model, device):
    """Evaluate the model and return the RMSE over every sample in ``data_loader``.

    Squared errors are accumulated across batches and the square root is taken
    once at the end. Averaging per-batch RMSEs instead would not equal the
    dataset RMSE and would give a short final batch the same weight as a full one.

    Args:
        data_loader: Sized iterable of batches; each batch is a mapping with
            "ids", "mask" and "targets" tensors.
        model: Module called as ``model(ids=..., mask=...)``; put into eval mode.
        device: Device the batch tensors are moved to.

    Returns:
        Root-mean-squared error over all evaluated samples.
    """
    model.eval()
    squared_error_sum = 0.0
    n_samples = 0

    with torch.no_grad():
        # tqdm.auto picks the notebook widget when available; `tqdm_notebook` is deprecated.
        for d in tqdm(data_loader, total=len(data_loader)):
            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            targets = d["targets"].to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask)

            # loss_fn is a mean-reduced MSE, so mean * batch size recovers the
            # batch's summed squared error.
            batch_size = targets.numel()
            squared_error_sum += loss_fn(outputs, targets).item() * batch_size
            n_samples += batch_size

    return (squared_error_sum / n_samples) ** 0.5

# Train the Model

In [None]:
# Training until hs3
folds=[0,1,2,3,4]
# epoch index -> list of validation losses, one entry per fold
loss=defaultdict(list)
# Use the GPU when one is present; fall back to CPU so the cell still runs without it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for fold in folds:

    # A new model is built from the pretrained checkpoint for every fold.
    model_config = RobertaConfig.from_pretrained(ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = RobertaModel(model_config).to(device)

    # Two parameter groups: weight decay everywhere except biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [{"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],"weight_decay": 0.0}]

    print("################################")
    print(f"Training Fold {fold}")
    print("################################")

    train= df[df.fold!=fold].reset_index(drop=True)
    valid= df[df.fold==fold].reset_index(drop=True)

    train_dataset = RobertaDataset(train)
    # shuffle=True: draw the training rows in a new random order every epoch
    # instead of always iterating them in file order.
    train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)

    valid_dataset = RobertaDataset(valid)
    valid_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1)

    # The scheduler is stepped once per batch, so its horizon is the exact number
    # of batches (including the last partial one) times the number of epochs.
    num_train_steps = len(train_data_loader) * EPOCHS
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_loss = np.inf
    for epoch in range(EPOCHS):

        train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
        test_loss = eval_fn(valid_data_loader, model, device)
        print(f"Train Loss = {train_loss} Valid Loss = {test_loss} for epoch {epoch}")
        loss[epoch].append(test_loss)
        best_loss = min(best_loss, test_loss)

    print(f"Best Valid Loss = {best_loss} for fold {fold}")

print("-------------------------")
print("-------------------------")
print("RESULTS")
print("-------------------------")
# Average each epoch's validation loss across the folds.
for epoch in range(EPOCHS):

    print(f"Result for epoch {epoch}: {np.mean(loss[epoch])}")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


################################
Training Fold 0
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))