# Imports

In [1]:
!nvidia-smi

Mon Jun 14 21:17:21 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P0    47W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
pip install transformers



In [4]:
import pandas as pd
import numpy as np

import torch.nn as nn
import torch

from sklearn.model_selection import KFold

import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import logging

logging.set_verbosity_warning()

from collections import defaultdict
from tqdm import tqdm_notebook
import random
import os

In [5]:
%cd drive/MyDrive/CommonLit

/content/drive/MyDrive/CommonLit


In [6]:
df = pd.read_csv("train.csv")

# Cross Validation

In [7]:
def cross_val(df: pd.DataFrame, n: int = 5, random_state: int = 2021):

  df["fold"] = 0
  kf = KFold(n_splits= n, random_state= random_state, shuffle= True)
  for i, (train_idx, valid_idx) in enumerate(kf.split(df)):
    df.loc[valid_idx,"fold"] = i

  return df

In [8]:
df = cross_val(df)

# Seed Everything

In [9]:
# code taken from https://www.kaggle.com/shoheiazuma/tweet-sentiment-roberta-pytorch
def seed_everything(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True

seed = 2021
seed_everything(seed)

# Configuration

In [10]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 2
ROBERTA_PATH = "roberta-base"
TOKENIZER = transformers.AutoTokenizer.from_pretrained(ROBERTA_PATH)
TRAINING_FILE = "../input/commonlitreadabilityprize/train.csv"
MODEL_PATH = "model.bin"
NUM_WORKERS = 2

# Dataset

In [11]:
class RobertaDataset:
    def __init__(self,df):
        self.excerpt = df.excerpt.values
        self.target = df.target.values

    def __len__(self):
        return len(self.excerpt)
    
    def __getitem__(self,item):
        excerpt = str(self.excerpt[item])
        excerpt = " ".join(excerpt.split())
        inputs = TOKENIZER(excerpt, add_special_tokens=True, max_length=MAX_LEN, padding=True, truncation=True)
        
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        
        padding_len = MAX_LEN-len(ids)
        ids = ids+([0]*padding_len)
        mask = mask+([0]*padding_len)
 
        return {"ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float)}

# Model Building

In [12]:
class RobertaModel(nn.Module):
    
    def __init__(self, conf):
        super(RobertaModel,self).__init__()
        self.roberta = transformers.AutoModel.from_pretrained(ROBERTA_PATH,config=conf)
        self.linear = nn.Linear(768,1)
        self.dropout = nn.Dropout(0.3)
        self.layer_norm = nn.LayerNorm(768)
        
    def forward(self, ids, mask, loss_fn = None, targets = None):

        outputs = self.roberta(ids,attention_mask=mask)
        pooler_outputs = self.layer_norm(outputs[1])
        logits = self.linear(self.dropout(pooler_outputs))

        if targets is not None:
          loss = torch.sqrt(loss_fn(logits.view(-1),targets.view(-1)))
          return loss, logits
        else:
          return logits


# Training + Evaluation Function

In [13]:
def train_fn(train_dataloader, valid_dataloader, model, optimizer, device, scheduler, valid_interval = 10):
    model.train()
    train_loss=0
    best_loss = np.inf
    for epoch in range(EPOCHS):
      for index, d in tqdm_notebook(enumerate(train_dataloader), total=len(train_dataloader)):
  
        ids = d["ids"]
        mask = d["mask"]
        targets = d["targets"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        optimizer.zero_grad()
        loss, outputs = model(ids=ids, mask=mask, loss_fn = loss_fn, targets = targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss += loss.item()

        if (index % valid_interval == 0) | ((len(train_dataloader)-index) == 1):

          valid_loss = eval_fn(valid_dataloader,model,device)
          if valid_loss < best_loss:
            print(f"Epoch:{epoch} | Train Loss:{train_loss/(index*(epoch+1)+1)} | Validation loss:{valid_loss}")
            print(f"Validation loss decreased from {best_loss} to {valid_loss}.")
            best_loss = valid_loss
            torch.save(model.state_dict(),f'model{fold}.bin')
            
    return best_loss

def eval_fn(data_loader, model, device):
    model.eval()
    final_loss=0
    
    with torch.no_grad():
        for bi, d in enumerate(data_loader):
            ids = d["ids"]
            mask = d["mask"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            loss, outputs = model(ids=ids, mask=mask, loss_fn = loss_fn, targets = targets)
            
            final_loss += loss.item()
            
    return final_loss / len(data_loader)

# Useful Functions

In [14]:
def create_dataloader(df, fold):
  train = df[df.fold!=fold].reset_index(drop=True)
  valid = df[df.fold==fold].reset_index(drop=True)

  train_dataset = RobertaDataset(train)
  valid_dataset = RobertaDataset(valid)

  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size= TRAIN_BATCH_SIZE, num_workers= NUM_WORKERS )
  valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size= VALID_BATCH_SIZE, num_workers= NUM_WORKERS )

  return train_dataloader, valid_dataloader

def create_model(device, model_path = ROBERTA_PATH):

  config = AutoConfig.from_pretrained(model_path)
  config.output_hidden_states = True
  model = RobertaModel(config).to(device)

  return model

def create_scheduler(optimizer, num_warmup_steps, num_train_steps, scheduler_name = "get_linear_schedule_with_warmup" ):

  if scheduler_name == "get_linear_schedule_with_warmup":
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
  else:
    raise Exception(f"Unknown scheduler: {scheduler_name}")

  return scheduler

def create_optimizer_parameters(model):

  param_optimizer = list(model.named_parameters())
  no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
  optimizer_parameters = [{"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],"weight_decay": 0.0}]

  return optimizer_parameters

def create_optimizer(model, optimizer_name = "AdamW"):

  kwargs = {'lr':5e-5,
            'weight_decay':0.01}
  optimizer_parameters = create_optimizer_parameters(model)

  if optimizer_name == "AdamW":
    optimizer = AdamW(optimizer_parameters, **kwargs)
  else:
    raise Exception(f"Unknown optimizer: {optimizer_name}")

  return optimizer


# Train the Model

In [None]:
loss_fn=nn.MSELoss()
loss=defaultdict(list)
results = {}
for fold in range(5):

    device = torch.device("cuda")
    model = create_model(device)

    print("################################")
    print(f"Training Fold {fold}")
    print("################################")

    train_dataloader, valid_dataloader = create_dataloader(df, fold)
    num_train_steps = len(train_dataloader) * EPOCHS

    optimizer = create_optimizer(model)
    scheduler = create_scheduler(optimizer, num_warmup_steps = 0, num_train_steps = num_train_steps )

    results[fold] = train_fn(train_dataloader,valid_dataloader, model, optimizer, device, scheduler)

print("################################")
print("RESULTS")
print("################################")
cv = np.mean([results[i] for i in range(5)])
print(f"Results of cross validation for seed {seed}: {cv}")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


################################
Training Fold 0
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=142.0), HTML(value='')))


Epoch:0 | Train Loss:0.6937087774276733 | Validation loss:1.0225444478048404
Validation loss decreased from inf to 1.0225444478048404.
Epoch:0 | Train Loss:0.8089540925892916 | Validation loss:0.9898563640218385
Validation loss decreased from 1.0225444478048404 to 0.9898563640218385.
Epoch:0 | Train Loss:0.8793490529060364 | Validation loss:0.903856206947649
Validation loss decreased from 0.9898563640218385 to 0.903856206947649.
Epoch:0 | Train Loss:0.8196615962421193 | Validation loss:0.7644889044090056
Validation loss decreased from 0.903856206947649 to 0.7644889044090056.
Epoch:0 | Train Loss:0.8056918488870753 | Validation loss:0.7551879756887194
Validation loss decreased from 0.7644889044090056 to 0.7551879756887194.
Epoch:0 | Train Loss:0.7963298559188843 | Validation loss:0.6892083866495482
Validation loss decreased from 0.7551879756887194 to 0.6892083866495482.
Epoch:0 | Train Loss:0.7815035573134186 | Validation loss:0.6260289394519698
Validation loss decreased from 0.6892083

HBox(children=(FloatProgress(value=0.0, max=142.0), HTML(value='')))

Epoch:1 | Train Loss:111.64452889561653 | Validation loss:0.5966168222293048
Validation loss decreased from 0.6008463077981707 to 0.5966168222293048.
Epoch:1 | Train Loss:2.9784209161269954 | Validation loss:0.5641880738483348
Validation loss decreased from 0.5966168222293048 to 0.5641880738483348.
Epoch:1 | Train Loss:1.3598265456091059 | Validation loss:0.5543664446179296
Validation loss decreased from 0.5641880738483348 to 0.5543664446179296.
Epoch:1 | Train Loss:0.951180664093598 | Validation loss:0.5372298916041012
Validation loss decreased from 0.5543664446179296 to 0.5372298916041012.
Epoch:1 | Train Loss:0.8093264123693628 | Validation loss:0.5190254722682524
Validation loss decreased from 0.5372298916041012 to 0.5190254722682524.


In [17]:
run()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


################################
Training Fold 0
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=142.0), HTML(value='')))




NameError: ignored