In [1]:
from kaggle_secrets import UserSecretsClient
import wandb
# Pull the Weights & Biases API key from the Kaggle secret store so that it is
# never written into the notebook itself.
user_secrets = UserSecretsClient()
my_secret = user_secrets.get_secret("wandb_api_key")

# Keep the login call as the last expression so its result is shown as the
# cell output.
wandb.login(key=my_secret)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
import os
import gc
import itertools
import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk import tokenize
from nltk.corpus import stopwords

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification

from tqdm.notebook import tqdm
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import WandbLogger, CSVLogger
import glob
import json
os.environ["TOKENIZERS_PARALLELISM"] = "false"

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
# Single source of truth for the run. The dict is logged to W&B and dumped to
# config.json below, so the key order is kept stable.
config = {
    # --- reproducibility / tokenisation ---------------------------------
    "SEED": 42,
    "MAX_LEN": 128,
    "model_dir": "/kaggle/input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased",
    # --- training loop --------------------------------------------------
    "EPOCHS": 6,
    "TRAIN_BATCH_SIZE": 64,
    "VAL_BATCH_SIZE": 64,
    "target_list": ["content", "wording"],
    "NUM_WORKERS": os.cpu_count(),
    # --- optimisation ---------------------------------------------------
    "lr": 4e-5,
    # NOTE(review): "optimizer" is not read by any active cell; the model
    # hard-codes torch.optim.AdamW.
    "optimizer": "adamW",
    # --- experiment tracking --------------------------------------------
    "project_name": "CommonLit - Evaluate Student Summaries",
    "name": "train-all",
    # NOTE(review): "unfreeze_epochs" is only referenced from commented-out
    # cells further down the notebook.
    "unfreeze_epochs": 40,
    "loss": "mcrmse",
    "weight_decay": 0.01,
    # --- regularisation passed to the HF model config ---------------------
    "hidden_dropout_prob": 0.3,
    "attention_probs_dropout_prob": 0.3,
    "classifier_dropout": 0.3,
    # --- cross-validation -------------------------------------------------
    "n_folds": 4,
    "fold": 0,
    "full_train": False,
}

# Persist the configuration next to the run artefacts.
with open("config.json", "w") as outfile:
    outfile.write(json.dumps(config))

def set_seed(seed):
    """Seed torch (CPU and CUDA) and then Lightning's global seeding helper.

    Args:
        seed: Integer seed handed unchanged to each seeding function.
    """
    seeders = (torch.manual_seed, torch.cuda.manual_seed, pl.seed_everything)
    for apply_seed in seeders:
        apply_seed(seed)
    
# Fix all random number generators before any model or data loader is created.
set_seed(config["SEED"])

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
# One W&B run is shared by every training call in this notebook.
wandb_logger = WandbLogger(name=config["name"], project=config["project_name"])

# Record the full configuration dict alongside the run.
wandb_logger.log_hyperparams(config)

[34m[1mwandb[0m: Currently logged in as: [33mnorrawee[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230803_064414-cyqhjl82[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mtrain-all[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/norrawee/CommonLit%20-%20Evaluate%20Student%20Summaries[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/norrawee/CommonLit%20-%20Evaluate%20Student%20Summaries/runs/cyqhjl82[0m


In [5]:
# Competition data directory; every CSV below is read from here.
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries"

sample_submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
prompts_train = pd.read_csv(os.path.join(DATA_DIR, "prompts_train.csv"))
prompts_test = pd.read_csv(os.path.join(DATA_DIR, "prompts_test.csv"))
summaries_train = pd.read_csv(os.path.join(DATA_DIR, "summaries_train.csv"))
summaries_test = pd.read_csv(os.path.join(DATA_DIR, "summaries_test.csv"))

print(f'sample_submission shape: {sample_submission.shape}')
print(f'prompts_train shape: {prompts_train.shape}')
print(f'summaries_test shape: {summaries_test.shape}')
print(f'summaries_train shape: {summaries_train.shape}')
print(f'prompts_test shape: {prompts_test.shape}')
print('-'*90)

# Attach the prompt columns to every summary row.
summaries_train = pd.merge(summaries_train, prompts_train, on="prompt_id")
summaries_test = pd.merge(summaries_test, prompts_test, on="prompt_id")

# Grouped cross-validation: each prompt forms its own fold, so a model is
# always validated on a prompt it was not trained on.
id2fold = {
    "814d6b": 0,
    "39c16e": 1,
    "3b9047": 2,
    "ebad26": 3,
}

summaries_train["fold"] = summaries_train["prompt_id"].map(id2fold)

# A prompt_id missing from id2fold maps to NaN. Such rows would be used for
# training in every fold and never validated, so fail loudly instead.
unmapped_prompt_ids = summaries_train.loc[
    summaries_train["fold"].isna(), "prompt_id"
].unique()
assert len(unmapped_prompt_ids) == 0, (
    f"prompt_ids without a fold assignment: {list(unmapped_prompt_ids)}"
)

print(summaries_train["fold"].value_counts())
summaries_train.head()

sample_submission shape: (4, 3)
prompts_train shape: (4, 4)
summaries_test shape: (4, 3)
summaries_train shape: (7165, 5)
prompts_test shape: (2, 4)
------------------------------------------------------------------------------------------
1    2057
2    2009
3    1996
0    1103
Name: fold, dtype: int64


Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,fold
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,0
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,0
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,0
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,0
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,0


In [6]:
# Measure how long the summaries are in tokens for the chosen tokenizer; the
# resulting statistics can be compared with config["MAX_LEN"].
tokenizer = AutoTokenizer.from_pretrained(config["model_dir"])

# Token ids per summary (stored as Python lists) and their lengths.
summaries_train["tokens"] = [
    tokenizer.encode(summary_text) for summary_text in summaries_train["text"]
]
summaries_train["len"] = summaries_train["tokens"].map(len)

summaries_train[["len"]].describe()

Unnamed: 0,len
count,7165.0
mean,92.563433
std,66.213125
min,28.0
25%,49.0
50%,71.0
75%,113.0
max,817.0


In [7]:
class CommonLitDataset(Dataset):
    """Map-style dataset yielding tokenised summaries and regression targets.

    Args:
        dataframe: Frame with a ``text`` column plus the target columns.
        target_list: Names of the target columns, in output order.
        max_len: Length every sequence is truncated / padded to.
        tokenizer: Object exposing a HuggingFace-style ``encode_plus`` method.
    """

    def __init__(self, dataframe, target_list, max_len, tokenizer):
        self.dataframe = dataframe
        self.max_len = max_len
        self.tokenizer = tokenizer
        # Cache plain arrays once so __getitem__ avoids a DataFrame row lookup
        # on every access.
        self.text = dataframe['text'].values
        self.targets = dataframe[target_list].values

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return one example as a dict of 1-D tensors.

        Keys: ``input_ids``, ``attention_mask``, ``token_type_ids`` (each of
        length ``max_len``) and ``targets`` (float32, one value per target).
        """
        # Positional lookup into the cached array; equivalent to
        # dataframe.iloc[index]["text"] without building a Series per item.
        text = self.text[index]

        inputs = self.tokenizer.encode_plus(
            text.lower(),
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_attention_mask=True,
            # Requested explicitly because 'token_type_ids' is read below.
            return_token_type_ids=True,
            return_tensors='pt'
        )

        # The tokenizer returns tensors with a leading batch axis of size 1;
        # flatten to 1-D so the DataLoader can stack examples itself.
        input_ids = inputs['input_ids'].flatten()
        attention_mask = inputs['attention_mask'].flatten()
        token_type_ids = inputs['token_type_ids'].flatten()
        targets = torch.tensor(self.targets[index], dtype=torch.float32)

        return {'input_ids': input_ids,
                'attention_mask': attention_mask,
                'token_type_ids': token_type_ids,
                'targets': targets}

In [8]:
class CommonLitDataModule(pl.LightningDataModule):
    """Lightning data module producing the train/validation loaders for a fold.

    Args:
        summaries_train: Training frame; must contain a ``fold`` column plus
            the columns required by ``CommonLitDataset``.
        config: Run configuration dict. Keys read here: ``model_dir``,
            ``full_train``, ``target_list``, ``MAX_LEN``, ``TRAIN_BATCH_SIZE``,
            ``VAL_BATCH_SIZE`` and ``NUM_WORKERS``.
        fold: Fold id held out for validation when ``full_train`` is false.
    """

    def __init__(self, summaries_train, config, fold):
        super().__init__()
        self.config = config
        self.summaries_train = summaries_train
        self.tokenizer = AutoTokenizer.from_pretrained(config["model_dir"])
        self.fold = fold

    def setup(self, stage):
        """Build the train/validation datasets; only the "fit" stage is handled."""
        if stage != "fit":
            return

        # Use the objects handed to the constructor, not the notebook globals,
        # so an instance behaves according to its own arguments.
        frame = self.summaries_train
        if self.config["full_train"]:
            # Train on everything; the "validation" split is the same rows.
            self.train_df = frame
            self.val_df = frame
        else:
            self.train_df = frame[frame["fold"] != self.fold]
            self.val_df = frame[frame["fold"] == self.fold]

        self.train_dataset = self._make_dataset(self.train_df)
        self.valid_dataset = self._make_dataset(self.val_df)

    def _make_dataset(self, frame):
        """Wrap ``frame`` in a CommonLitDataset using this module's settings."""
        return CommonLitDataset(frame,
                                target_list=self.config["target_list"],
                                max_len=self.config["MAX_LEN"],
                                tokenizer=self.tokenizer)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            shuffle=True,
            batch_size=self.config["TRAIN_BATCH_SIZE"],
            num_workers=self.config["NUM_WORKERS"])

    def val_dataloader(self):
        return DataLoader(
            self.valid_dataset,
            shuffle=False,
            batch_size=self.config["VAL_BATCH_SIZE"],
            num_workers=self.config["NUM_WORKERS"])


In [9]:
from torch import nn


class RMSELoss(nn.Module):
    """Root-mean-squared-error loss with a small stabilising constant.

    Args:
        eps: Value added to the MSE before taking the square root.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps

    def forward(self, yhat, y):
        """Return ``sqrt(MSE(yhat, y) + eps)`` as a scalar tensor."""
        stabilised_mse = self.mse(yhat, y) + self.eps
        return stabilised_mse.sqrt()


class MCRMSELoss(nn.Module):
    """Mean column-wise RMSE over the first ``num_scored`` columns.

    Args:
        num_scored: Number of leading columns that take part in the score.
    """

    def __init__(self, num_scored=3):
        super().__init__()
        self.rmse = RMSELoss()
        self.num_scored = num_scored

    def forward(self, yhat, y):
        """Average the per-column RMSE of ``yhat`` against ``y``.

        Both arguments are indexed as ``[:, i]``, so they must be 2-D with at
        least ``num_scored`` columns.
        """
        weighted_column_losses = (
            self.rmse(yhat[:, column], y[:, column]) / self.num_scored
            for column in range(self.num_scored)
        )
        return sum(weighted_column_losses)

def compute_RMSE(y_hat, y):
    """Return the RMSE of each column, reducing over the first axis."""
    squared_error = (y_hat - y).pow(2)
    return squared_error.mean(dim=0).sqrt()

In [10]:
class CommonLitModel(pl.LightningModule):
    """Transformer with a two-output regression head (content, wording).

    Args:
        config: Run configuration dict. Keys read here: ``model_dir``,
            ``hidden_dropout_prob``, ``attention_probs_dropout_prob``,
            ``classifier_dropout``, ``loss`` ("mse" or "mcrmse"), ``lr``,
            ``weight_decay`` and ``fold`` (used only to name logged metrics).

    Raises:
        ValueError: If ``config["loss"]`` is not a supported loss name.
    """

    def __init__(self, config):
        super().__init__()
        model_config = AutoConfig.from_pretrained(
            config["model_dir"],
            num_labels=2,
            problem_type="regression",
            hidden_dropout_prob=config["hidden_dropout_prob"],
            attention_probs_dropout_prob=config["attention_probs_dropout_prob"],
            classifier_dropout=config["classifier_dropout"],
        )
        self.model = AutoModelForSequenceClassification.from_pretrained(config["model_dir"], config=model_config)
        if config["loss"] == "mse":
            self.loss_fn = torch.nn.MSELoss()
        elif config["loss"] == "mcrmse":
            self.loss_fn = MCRMSELoss(num_scored=2)
        else:
            # Fail at construction time rather than with an AttributeError on
            # the first training step.
            raise ValueError(
                f"Unsupported loss {config['loss']!r}; expected 'mse' or 'mcrmse'"
            )
        self.config = config
        # Per-step losses collected during an epoch and averaged at its end.
        self.training_step_outputs = []
        self.validation_step_outputs = []
        # Stored as an attribute so the LR finder can overwrite it
        # (it is called with attr_name="lr").
        self.lr = config["lr"]

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Run the wrapped model and return its output object unchanged."""
        output = self.model(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        return output

    def step(self, batch):
        """Shared train/validation step.

        Returns:
            Tuple ``(loss, content_rmse, wording_rmse)`` for the batch.
        """
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        token_type_ids = batch['token_type_ids']
        outputs = self(input_ids, attention_mask, token_type_ids)
        targets = batch['targets']
        loss = self.loss_fn(outputs["logits"], targets)
        # compute_RMSE returns one value per column; the head has exactly two.
        content, wording = compute_RMSE(outputs["logits"], targets)
        return loss, content, wording

    def training_step(self, batch, batch_idx):
        loss, content, wording = self.step(batch)
        # Store a detached copy: keeping the live tensor would retain every
        # step's autograd graph until the end of the epoch.
        self.training_step_outputs.append(loss.detach())
        return loss

    def validation_step(self, batch, batch_idx):
        loss, content, wording = self.step(batch)
        self.validation_step_outputs.append(loss.detach())
        # These are per-batch RMSE values logged under fold-specific names.
        self.log(f"content_error_fold{self.config['fold']}", content)
        self.log(f"wording_error_fold{self.config['fold']}", wording)
        self.log(f"total_error_fold{self.config['fold']}", (content+wording)/2)
        return loss

    def on_train_epoch_end(self):
        # torch.stack raises on an empty list, e.g. when no batch was run.
        if not self.training_step_outputs:
            return
        loss_mean = torch.stack(self.training_step_outputs).mean()
        self.log(f'ep_train_loss_fold{self.config["fold"]}', loss_mean, prog_bar=True)
        self.training_step_outputs.clear()

    def on_validation_epoch_end(self):
        if not self.validation_step_outputs:
            return
        loss_mean = torch.stack(self.validation_step_outputs).mean()
        self.log(f'ep_val_loss_fold{self.config["fold"]}', loss_mean, prog_bar=True)
        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        """Return AdamW with weight decay disabled for bias and LayerNorm weights."""
        model = self.model
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': self.config["weight_decay"]},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        # Pass the groups themselves. The previous code built them and then
        # handed self.parameters() to AdamW, so config["weight_decay"] and the
        # no-decay list were never applied.
        optim = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.lr)

        return optim

# Train the model

In [11]:
def train_model(fold):
    """Train one cross-validation fold and checkpoint its best epoch.

    Side effects: sets ``config["fold"]`` (used by the model to name its
    logged metrics), logs to the shared ``wandb_logger`` and writes a
    checkpoint under ``fold_{fold}/``.

    Args:
        fold: Fold id held out for validation.
    """
    config["fold"] = fold
    checkpoint_callback = ModelCheckpoint(
        dirpath=f"fold_{fold}",
        filename='{epoch}-{ep_val_loss_fold%s:.3f}' % fold,
        monitor=f"ep_val_loss_fold{fold}",
        mode="min",
    )
    dm = CommonLitDataModule(summaries_train, config, fold)
    model = CommonLitModel(config)

    # Learning-rate range test on a throw-away Trainer; the suggested value is
    # written back to model.lr via attr_name.
    lr_trainer = pl.Trainer()
    pl.tuner.Tuner(lr_trainer).lr_find(model, datamodule=dm, num_training=50, attr_name="lr")
    print(model.lr)

    # Validation must actually run: the checkpoint callback monitors
    # ep_val_loss_fold{fold}, which is only logged from the validation loop.
    # With limit_val_batches=0 that metric never exists and no checkpoint is
    # written for the fold, so the option is no longer passed here.
    trainer = pl.Trainer(
        max_epochs=config["EPOCHS"],
        logger=wandb_logger,
        callbacks=[checkpoint_callback],
        num_sanity_val_steps=0
    )
    trainer.fit(model, dm)
    
    

In [12]:
# Train one model per fold, one after another.
n_folds = config["n_folds"]
for fold_index in range(n_folds):
    train_model(fold_index)
    

Some weights of the model checkpoint at /kaggle/input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassi

Finding best initial lr:   0%|          | 0/50 [00:00<?, ?it/s]

0.0002089296130854041


Training: 0it [00:00, ?it/s]

Some weights of the model checkpoint at /kaggle/input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassi

Finding best initial lr:   0%|          | 0/50 [00:00<?, ?it/s]

0.0003019951720402019


Training: 0it [00:00, ?it/s]

Some weights of the model checkpoint at /kaggle/input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassi

Finding best initial lr:   0%|          | 0/50 [00:00<?, ?it/s]

1.584893192461114e-05


Training: 0it [00:00, ?it/s]

Some weights of the model checkpoint at /kaggle/input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassi

Finding best initial lr:   0%|          | 0/50 [00:00<?, ?it/s]

0.0001445439770745928


Training: 0it [00:00, ?it/s]



In [13]:
# Train a single model on all rows. ``fold`` is still required by the data
# module constructor but is not used for splitting while full_train is True.
fold = 1

# The data module reads config["full_train"] when it is set up. Enable it only
# for the duration of this run and restore it afterwards; otherwise every data
# module built later (e.g. for out-of-fold prediction) would silently use the
# whole training frame as its validation split.
previous_full_train = config["full_train"]
config["full_train"] = True
try:
    # No monitored metric here: validation is disabled below because the
    # validation split equals the training data in full-train mode.
    checkpoint_callback = ModelCheckpoint(
        dirpath=f"full_train",
        filename='{epoch}',
    )
    dm = CommonLitDataModule(summaries_train, config, fold)
    model = CommonLitModel(config)

    # Find the best lr.
    trainer = pl.Trainer()
    tuner = pl.tuner.Tuner(trainer)
    x = tuner.lr_find(model, datamodule=dm, num_training=50, attr_name="lr")
    print(model.lr)

    trainer = pl.Trainer(
        max_epochs=config["EPOCHS"],
        logger=wandb_logger,
        callbacks=[checkpoint_callback],
        limit_val_batches=0,
        num_sanity_val_steps=0
    )
    trainer.fit(model, dm)
finally:
    config["full_train"] = previous_full_train

Some weights of the model checkpoint at /kaggle/input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassi

Finding best initial lr:   0%|          | 0/50 [00:00<?, ?it/s]

0.0009120108393559097


Training: 0it [00:00, ?it/s]

In [14]:
# # Freeze BERT weights.
# for param in model.model.parameters():
#     param.requires_grad = False
# for param in model.model.classifier.parameters():
#     param.requires_grad = True

# # Find the best lr.
# trainer = pl.Trainer()
# tuner = pl.tuner.Tuner(trainer)
# x = tuner.lr_find(model, datamodule=dm, num_training=50, attr_name="lr")
# print(model.lr)
# model.lr = 2*model.lr
# print(model.lr)

In [15]:
# fig = x.plot(suggest=True)
# fig.show()

In [16]:
# trainer = pl.Trainer(
#     max_epochs=config["EPOCHS"], 
#     logger=wandb_logger,
#     callbacks=[checkpoint_callback])
# trainer.fit(model, dm)

In [17]:
# # Unfreeze BERT weights.
# for param in model.model.parameters():
#     param.requires_grad = True

# # Find the best lr.
# trainer = pl.Trainer()
# tuner = pl.tuner.Tuner(trainer)
# x = tuner.lr_find(model, datamodule=dm, num_training=50, attr_name="lr")
# print(model.lr)
# model.lr = 2*model.lr
# print(model.lr)

In [18]:
# fig = x.plot(suggest=True)
# fig.show()

In [19]:
# path = glob.glob("/kaggle/working/CommonLit - Evaluate Student Summaries/*/*/*")
# trainer = pl.Trainer(
#     max_epochs=config["EPOCHS"]+config["unfreeze_epochs"], 
#     logger=wandb_logger,
#     callbacks=[checkpoint_callback],
# )
# trainer.fit(model, dm, ckpt_path=path[0])

# Save models

In [20]:
# Convert every Lightning checkpoint found under fold_*/ into a pickled model
# file named "<run name>_fold<k>.pt".
checkpoint_paths = glob.glob("/kaggle/working/fold*/*")

# A single model instance is reused; only its weights are swapped per file.
model = CommonLitModel(config)
for checkpoint_path in checkpoint_paths:
    # The parent directory is named "fold_<k>"; take the part after the "_".
    fold_dir_name = checkpoint_path.split("/")[-2]
    fold = fold_dir_name.split("_")[-1]

    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint["state_dict"])
    torch.save(model, f"{config['name']}_fold{fold}.pt")

Some weights of the model checkpoint at /kaggle/input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassi

In [21]:
def predict_val(fold):
    """Predict the held-out rows of ``fold`` with that fold's exported model.

    Side effect: sets ``config["fold"]``.

    Args:
        fold: Fold id whose validation rows are predicted.

    Returns:
        Tuple ``(val_df, preds)``: a copy of the validation rows and a list
        with one ``[content, wording]`` prediction per row, in the same order.
    """
    preds = []
    config["fold"] = fold

    # Out-of-fold prediction needs the per-fold split. The data module reads
    # config["full_train"] during setup, so force it off for that call and
    # restore whatever an earlier cell left there.
    previous_full_train = config.get("full_train", False)
    config["full_train"] = False
    try:
        dm = CommonLitDataModule(summaries_train, config, fold)
        dm.setup(stage="fit")
    finally:
        config["full_train"] = previous_full_train

    # The file holds the whole pickled model, so no fresh CommonLitModel is
    # built first. map_location lets it load on CPU-only sessions too.
    model = torch.load(f"{config['name']}_fold{fold}.pt", map_location=device)
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        for batch in dm.val_dataloader():
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            outputs = model(input_ids, attention_mask, token_type_ids)
            preds.extend(outputs["logits"].cpu().numpy().tolist())
    del model
    # Return a copy so callers can add prediction columns without writing to a
    # slice of the training frame.
    return dm.val_df.copy(), preds

In [22]:
# Gather out-of-fold predictions: each fold's validation rows are predicted by
# the model trained without them.
fold_frames = []
for i in range(config["n_folds"]):
    val_df, preds = predict_val(fold=i)
    # Work on an explicit copy so the new columns are not written to a slice.
    val_df = val_df.copy()
    val_df[["content_pred", "wording_pred"]] = preds
    fold_frames.append(val_df)

# Concatenate once instead of growing the frame inside the loop.
oof_df = pd.concat(fold_frames) if fold_frames else pd.DataFrame()
    

Some weights of the model checkpoint at /kaggle/input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassi

In [None]:
# Split the out-of-fold frame into ground truth and predictions, keeping the
# column order (content, wording) aligned between the two.
label = oof_df.loc[:, ["content", "wording"]]
pred = oof_df.loc[:, ["content_pred", "wording_pred"]]

In [None]:
# Per-target out-of-fold RMSE, in the order (content, wording).
label_tensor = torch.tensor(label.to_numpy())
pred_tensor = torch.tensor(pred.to_numpy())
score = compute_RMSE(label_tensor, pred_tensor)
print(score)

# Prediction

In [None]:
# path = glob.glob("/kaggle/working/CommonLit - Evaluate Student Summaries/*/*/*")
# model.load_state_dict(torch.load(path[0])["state_dict"])

# torch.save(model, f"{config['name']}.pt")
# model = torch.load(f"{config['name']}.pt")
# model.eval()

In [None]:
# class CommonLitTestDataset(Dataset):
#     def __init__(self, dataframe, target_list, max_len):
#         self.dataframe = dataframe
#         self.max_len = max_len
#         self.tokenizer = AutoTokenizer.from_pretrained(config["model_dir"])
#         self.text = dataframe['text'].values
        
#     def __len__(self):
#         return len(self.dataframe)
    
#     def __getitem__(self, index):
#         row = self.dataframe.iloc[index]
#         text = row["text"]
#         question = row["prompt_text"]
#         inputs = self.tokenizer.encode_plus(
#             text.lower(),
#             truncation="longest_first",
#             padding='max_length',
#             max_length=self.max_len,
#             return_attention_mask=True,
#             return_tensors = 'pt'
#         )
        
#         input_ids = inputs['input_ids'].flatten()
#         attention_mask = inputs['attention_mask'].flatten()
#         token_type_ids = inputs['token_type_ids'].flatten()
        
#         return {'input_ids': input_ids,
#                 'attention_mask': attention_mask,
#                 'token_type_ids': token_type_ids}
    

# test_dataset = CommonLitTestDataset(summaries_test,
#                                     target_list = config["target_list"],
#                                     max_len = config["MAX_LEN"],
#                                     )

# test_data_loader = DataLoader(test_dataset)

In [None]:
# def test_model(model, test_loader):
#     model.eval()
#     preds = []
#     with torch.no_grad():
#         for batch_idx, batch in enumerate(test_loader):
#             input_ids = batch['input_ids'].to(device, dtype = torch.long).to(device)
#             attention_mask = batch['attention_mask'].to(device, dtype = torch.long).to(device)
#             token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long).to(device)
#             outputs = model(input_ids, attention_mask, token_type_ids)
#             preds.extend(outputs["logits"].cpu().detach().numpy().tolist())
#     return preds

In [None]:
# model = model.to(device)
# y_pred = test_model(model, test_data_loader)
# pred_data = pd.DataFrame({col: [col[idx] for col in y_pred]
#                           for idx,col in enumerate(config["target_list"])})
# for col in sample_submission.columns[1:]:
#     sample_submission[col] = pred_data[col]
# sample_submission

In [None]:
# sample_submission.to_csv('submission.csv', index = False)

In [None]:
# Close the W&B run that was opened when the logger was created.
wandb.finish()