In [1]:
import os
import gc
import itertools
import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk import tokenize
from nltk.corpus import stopwords

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification

from tqdm.notebook import tqdm
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import WandbLogger, CSVLogger
import glob
import json
# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
# NOTE(review): presumably this silences the Hugging Face tokenizers fork/parallelism
# warning when tokenizing inside a DataLoader — confirm against the tokenizers docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
# Read the run configuration (a JSON dict) shipped with the trained weights.
with open('/kaggle/input/bert-train/config.json') as config_fh:
    config = json.load(config_fh)

# Seed torch on CPU and on the current CUDA device, pick the compute device,
# then let Lightning apply its own global seeding with the same value.
seed_value = config["SEED"]
torch.manual_seed(seed_value)
torch.cuda.manual_seed(seed_value)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pl.seed_everything(seed_value)
print(device)

cuda


In [3]:
# Read the five competition CSV files into DataFrames.
sample_submission = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv")
prompts_train = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv")
prompts_test = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv")
summaries_train = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv")
summaries_test = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv")

# Report the (rows, columns) shape of each frame, followed by a separator line.
for frame_label, frame in (
    ("sample_submission", sample_submission),
    ("prompts_train", prompts_train),
    ("summaries_test", summaries_test),
    ("summaries_train", summaries_train),
    ("prompts_test", prompts_test),
):
    print(f'{frame_label} shape: {frame.shape}')
print('-' * 90)

# Disabled: joining the prompt columns onto each summary row via prompt_id.
# summaries_train = pd.merge(summaries_train, prompts_train, on="prompt_id")
# summaries_test = pd.merge(summaries_test, prompts_test, on="prompt_id")
summaries_test.head()

sample_submission shape: (4, 3)
prompts_train shape: (4, 4)
summaries_test shape: (4, 3)
summaries_train shape: (7165, 5)
prompts_test shape: (2, 4)
------------------------------------------------------------------------------------------


Unnamed: 0,student_id,prompt_id,text
0,000000ffffff,abc123,Example text 1
1,111111eeeeee,def789,Example text 2
2,222222cccccc,abc123,Example text 3
3,333333dddddd,def789,Example text 4


In [4]:
from torch import nn


class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: ``sqrt(MSE(yhat, y) + eps)``.

    Args:
        eps: Constant added to the mean squared error before the square root
            is taken (default ``1e-6``).
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps

    def forward(self, yhat, y):
        """Return the RMSE between ``yhat`` and ``y`` as computed by ``nn.MSELoss`` plus ``eps``."""
        mean_squared_error = self.mse(yhat, y)
        return (mean_squared_error + self.eps).sqrt()


class MCRMSELoss(nn.Module):
    """Mean column-wise RMSE over the first ``num_scored`` columns.

    Args:
        num_scored: Number of leading columns (index 0 .. num_scored - 1) of the
            prediction/target tensors that contribute to the loss.
    """

    def __init__(self, num_scored=3):
        super().__init__()
        self.rmse = RMSELoss()
        self.num_scored = num_scored

    def forward(self, yhat, y):
        """Return the sum of per-column RMSE values, each divided by ``num_scored``.

        Both tensors are indexed as ``[:, column]``, so they must be at least 2-D.
        """
        per_column_terms = (
            self.rmse(yhat[:, column], y[:, column]) / self.num_scored
            for column in range(self.num_scored)
        )
        # sum() starts from 0, matching an accumulator initialised to 0.
        return sum(per_column_terms)

In [5]:
class CommonLitModel(pl.LightningModule):
    """Lightning module wrapping a Hugging Face sequence-classification model with a 2-output regression head.

    Args:
        config: Mapping read with the keys ``model_dir``, ``hidden_dropout_prob``,
            ``attention_probs_dropout_prob``, ``classifier_dropout``, ``loss``
            (``"mse"`` or ``"mcrmse"``), ``lr``, and — during training —
            ``fold`` and ``weight_decay``.
    """

    def __init__(self, config):
        super().__init__()
        model_config = AutoConfig.from_pretrained(
            config["model_dir"],
            num_labels=2,
            problem_type="regression",
            hidden_dropout_prob=config["hidden_dropout_prob"],
            attention_probs_dropout_prob=config["attention_probs_dropout_prob"],
            classifier_dropout=config["classifier_dropout"],
        )
        self.model = AutoModelForSequenceClassification.from_pretrained(config["model_dir"], config=model_config)
        if config["loss"] == "mse":
            self.loss_fn = torch.nn.MSELoss()
        elif config["loss"] == "mcrmse":
            self.loss_fn = MCRMSELoss(num_scored=2)
        # NOTE(review): any other config["loss"] value leaves `self.loss_fn` unset, so
        # `step` would fail with AttributeError. Left as-is so inference-only use keeps working.
        self.config = config
        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.lr = config["lr"]

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Run the wrapped model and return its output object (callers read ``output["logits"]``)."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

    def step(self, batch):
        """Compute the loss and the two per-target errors for one batch.

        ``batch`` must provide ``input_ids``, ``attention_mask``, ``token_type_ids`` and ``targets``.
        Returns ``(loss, content, wording)``.
        """
        outputs = self(batch['input_ids'], batch['attention_mask'], batch['token_type_ids'])
        targets = batch['targets']
        loss = self.loss_fn(outputs["logits"], targets)
        # NOTE(review): `compute_RMSE` is not defined in the visible part of this notebook;
        # it must be in scope before any training/validation step runs.
        content, wording = compute_RMSE(outputs["logits"], targets)
        return loss, content, wording

    def training_step(self, batch, batch_idx):
        """Return the training loss; a detached copy is kept for the epoch-level mean."""
        loss, content, wording = self.step(batch)
        # Detach before storing: keeping the attached loss would retain every step's
        # autograd graph until the end of the epoch.
        self.training_step_outputs.append(loss.detach())
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the per-target errors and their mean, and return the validation loss."""
        loss, content, wording = self.step(batch)
        self.validation_step_outputs.append(loss.detach())
        self.log("content_error", content)
        self.log("wording_error", wording)
        self.log("total_error", (content + wording) / 2)
        return loss

    def on_train_epoch_end(self):
        """Log the mean of the stored training losses under a fold-specific key, then reset the buffer."""
        loss_mean = torch.stack(self.training_step_outputs).mean()
        self.log(f'ep_train_loss_fold{self.config["fold"]}', loss_mean, prog_bar=True)
        self.training_step_outputs.clear()

    def on_validation_epoch_end(self):
        """Log the mean of the stored validation losses under a fold-specific key, then reset the buffer."""
        loss_mean = torch.stack(self.validation_step_outputs).mean()
        self.log(f'ep_val_loss_fold{self.config["fold"]}', loss_mean, prog_bar=True)
        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        """Build AdamW with weight decay disabled for bias and LayerNorm weight parameters."""
        no_decay = ['bias', 'LayerNorm.weight']
        decay_params = []
        no_decay_params = []
        for param_name, param in self.model.named_parameters():
            if any(marker in param_name for marker in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)
        optimizer_grouped_parameters = [
            {'params': decay_params, 'weight_decay': self.config["weight_decay"]},
            {'params': no_decay_params, 'weight_decay': 0.0},
        ]
        # Pass the parameter groups themselves. Handing `self.parameters()` to AdamW
        # would discard the groups and apply the optimizer's default weight decay to
        # every parameter, ignoring config["weight_decay"].
        return torch.optim.AdamW(optimizer_grouped_parameters, lr=self.lr)

In [6]:
class CommonLitTestDataset(Dataset):
    """Dataset that tokenizes the ``text`` column of a DataFrame for inference.

    Args:
        dataframe: DataFrame with a ``text`` column of strings.
        target_list: Accepted but not used by this dataset (no targets are returned).
        max_len: Length every sequence is padded/truncated to.
        model_dir: Optional tokenizer location. When omitted, the notebook-level
            ``config["model_dir"]`` is used.
    """

    def __init__(self, dataframe, target_list, max_len, model_dir=None):
        self.dataframe = dataframe
        self.max_len = max_len
        tokenizer_dir = config["model_dir"] if model_dir is None else model_dir
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
        self.text = dataframe['text'].values

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return 1-D ``input_ids``, ``attention_mask`` and ``token_type_ids`` tensors for row ``index``."""
        # Read from the pre-extracted array instead of building a Series per row with iloc.
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text.lower(),
            truncation="longest_first",
            padding='max_length',
            max_length=self.max_len,
            return_attention_mask=True,
            # Request token_type_ids explicitly: some tokenizers omit them by default,
            # which would make the lookup below raise KeyError.
            return_token_type_ids=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch axis; flatten it away so the
        # DataLoader can stack items itself.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
        }
    

test_dataset = CommonLitTestDataset(
    summaries_test,
    target_list=config["target_list"],
    max_len=config["MAX_LEN"],
)

# Rows per forward pass at inference time. DataLoader's default is 1, which runs
# the model once per summary; lower this value if the device runs out of memory.
INFERENCE_BATCH_SIZE = 16

# shuffle=False keeps predictions in the same row order as `summaries_test`.
test_data_loader = DataLoader(test_dataset, batch_size=INFERENCE_BATCH_SIZE, shuffle=False)

In [7]:
# Disabled alternative: build one model per checkpoint matched by the fold glob.
# model_list = []
# for i in glob.glob("/kaggle/input/bert-train/fold*/*"):
#     model = CommonLitModel(config)
#     model.load_state_dict(torch.load(i)["state_dict"])
#     model_list.append(model)

checkpoint_path = "/kaggle/input/bert-train/full_train/epoch=5.ckpt"
model = CommonLitModel(config)
# map_location="cpu" lets the checkpoint deserialize on hosts without CUDA as well;
# the model is moved to the selected device afterwards.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location="cpu")
model.load_state_dict(checkpoint["state_dict"])
model = model.to(device)

Some weights of the model checkpoint at /kaggle/input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassi

In [8]:
def test_model(model, test_loader):
    model.eval()
    preds = []
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            input_ids = batch['input_ids'].to(device, dtype = torch.long).to(device)
            attention_mask = batch['attention_mask'].to(device, dtype = torch.long).to(device)
            token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long).to(device)
            outputs = model(input_ids, attention_mask, token_type_ids)
            preds.extend(outputs["logits"].cpu().detach().numpy().tolist())
    return preds

In [9]:
# Disabled alternative: average the predictions of every model in `model_list`.
# y_pred = []
# for model in model_list:
#     model = model.to(device)
#     y_pred_i = test_model(model, test_data_loader)
#     y_pred.append(y_pred_i)
# y_pred = np.mean(y_pred, axis=0)

y_pred = test_model(model, test_data_loader)

# One column per target: position `idx` of every prediction row belongs to
# config["target_list"][idx]. Distinct loop names avoid shadowing the target name
# with the prediction row inside the nested comprehension.
pred_data = pd.DataFrame({
    target: [row[idx] for row in y_pred]
    for idx, target in enumerate(config["target_list"])
})

# Predictions come out in `summaries_test` row order, so the ids must be taken
# from that frame rather than assumed to line up with the sample file's rows.
if len(pred_data) != len(summaries_test):
    raise ValueError(
        f"expected {len(summaries_test)} predictions, got {len(pred_data)}"
    )

id_column = sample_submission.columns[0]
score_columns = list(sample_submission.columns[1:])
sample_submission = pd.DataFrame({id_column: summaries_test[id_column].to_numpy()})
for col in score_columns:
    # to_numpy() makes the assignment positional instead of index-aligned.
    sample_submission[col] = pred_data[col].to_numpy()
sample_submission

Unnamed: 0,student_id,content,wording
0,000000ffffff,-0.222112,-0.263311
1,111111eeeeee,-0.222112,-0.263311
2,222222cccccc,-0.222112,-0.263311
3,333333dddddd,-0.222112,-0.263311


In [10]:
# Write the submission frame to the working directory without the index column.
sample_submission.to_csv(path_or_buf='submission.csv', index=False)