In [1]:
class Config: 
    model_preset = "symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli"
    save_dir = 'my_model'
    
    train_dir = "/kaggle/input/contradictory-my-dear-watson/train.csv"
    test_dir = "/kaggle/input/contradictory-my-dear-watson/test.csv"
    
    backbone_trainable = False
    fast_mode = False
     
    batch_size = 16
    min_epoch = 10
    max_epoch = 20
    lr = 5e-1
    
    ic = ('premise', 'hypothesis')
    it = ('label')
    
    tt_split = 0.1 # train test split
    max_length = 1024

cfg = Config() 

In [2]:
! python -m pip install -q lightning
! pip install -q torchmetrics

In [3]:
import polars as pl
import numpy as np

from transformers import AutoTokenizer, AutoModel
import datasets

import torch

from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import lightning as L
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks.lr_finder import LearningRateFinder

import torchmetrics as tm

# other
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

# define device 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

import os 

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [4]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_preset)
bert_model = AutoModel.from_pretrained(cfg.model_preset)

tokenizer_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [5]:
if (cfg.backbone_trainable == False): 
    # Freeze the weights of the BERT model
    for param in bert_model.parameters():
        param.requires_grad = False

In [6]:
def bert_encode(premises, hypothesis, tokenizer):
    if type(premises) == str: 
        premises = [premises]
    if type(hypothesis) == str: 
        hypothesis = [hypothesis]
        
    sentence1 = [pr + "[SEP]" for pr in premises]
    sentence2 = [hip + "[SEP]" for hip in hypothesis]

    num_examples = len(hypothesis)

    cls = [tokenizer.cls_token] * num_examples
    
    input_words = [" ".join([cls_] + [s1] + [s2]) for cls_, s1, s2 in zip(cls, sentence1, sentence2)]
#     input_word_ids = torch.cat([cls, sentence1, sentence2], dim=-1)
    return tokenizer(input_words, return_tensors = 'pt', truncation=True, padding='max_length')
    

# Example usage:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# premises = ["This is a premise.", "Another premise."]
# hypothesis = ["This is a hypothesis.", "Another hypothesis."]
# encoded_inputs = bert_encode(premises, hypothesis, tokenizer)

In [7]:
def preprocess_df(df): 
    df_ = df.clone()

    x = df_.select([*cfg.ic])
#     cast to str

    if cfg.it in df_.columns: 
        y = df_[cfg.it]
#         cast to Categorical
    else: 
        y = None

    return x, y

class MyDataset(Dataset): 
    def __init__(self, 
                 df, 
                 tokenizer, 
                 label = None, 
                 **kwargs) -> None: 
        super(MyDataset, self).__init__()
#     tokenize all inputs
        
#         self.x = datasets.Dataset.from_pandas(df.to_pandas()).map(lambda i: bert_encode(i['premise'], i['hypothesis'], tokenizer)
#                                                                   , remove_columns = df.columns
#                                                                   , **kwargs)    
        self.tok = tokenizer
        self.premise = df['premise'].to_list()
        self.hip = df['hypothesis'].to_list()
        
        self.y = label
        
    def __len__(self) -> int: 
#         raise len(self.y) == len(self.x)
        
        return len(self.premise)
    
    def __getitem__(self, idx) -> torch.tensor: 
        if self.y == None: 
            return bert_encode(self.premise[idx], self.hip[idx], self.tok)
        else: 
            return bert_encode(self.premise[idx], self.hip[idx], self.tok), self.y[idx] 
    
class MyLightningDataModule(L.LightningDataModule): 
    def __init__(self
                 , tokenizer
                 , limit_data = None
                 , batch_size = 8
                 , num_workers = 3
                 , *args
                 , **kwargs): 
        
        super().__init__(*args, **kwargs)
        
        self.fp = cfg.train_dir
        self.tokenizer = tokenizer
        self.limit_data = limit_data
        self.batch_size = batch_size 
        self.num_w = num_workers
        
#         common fn for build dataset
    @staticmethod
    def build_dataset(df, tokenizer, **kwargs): 
        x, y = preprocess_df(df) 
        return MyDataset(df = x,tokenizer = tokenizer, label = y.to_list(), **kwargs)
    
    def prepare_data(self): 
        self.df = pl.read_csv(self.fp)
    
    def setup(self, stage):  
#       split data to train and val  
        train_df, val_df = train_test_split(self.df, test_size=cfg.tt_split) 

#       build datasets for both train and validation 
        self.train_ds = self.build_dataset(train_df, self.tokenizer, batch_size = 1000, batched=True)
        self.val_ds = self.build_dataset(val_df, self.tokenizer, batch_size = 1000, batched =True)

    def train_dataloader(self): 
        return DataLoader(self.train_ds, self.batch_size, num_workers = self.num_w, pin_memory=True)
    
    def val_dataloader(self): 
        return DataLoader(self.val_ds, self.batch_size, num_workers = self.num_w)
        
    def test_dataloader(self): 
        return DataLoader(self.val_ds, self.batch_size, num_workers = self.num_w)

# Lightning Module (NN)

In [8]:
class MyNN(nn.Module):
    def __init__(self, bert_model, num_classes: int = 3):
        
        super(MyNN, self).__init__()
        
        self.bert_model = bert_model
        
        self.dropout = nn.Dropout(0.2)
        self.layer_norm = nn.LayerNorm(bert_model.config.hidden_size)
        self.lstm = nn.LSTM(input_size=bert_model.config.hidden_size, hidden_size=128, batch_first=True)
        self.linear = nn.Linear(128, num_classes)
        
    def forward(self, inputs):
        # Get BERT outputs
        outputs = self.bert_model(**inputs)
        
        pooled_output = outputs[0]
        
        # Apply dropout and layer normalization
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.layer_norm(pooled_output)
        
        # Pass through LSTM
        lstm_output, _ = self.lstm(pooled_output)
        
        # Pass through linear layer and apply softmax activation
        logits = self.linear(lstm_output[:, 0, :])
        probabilities = F.softmax(logits, dim=1)
        
        return probabilities
    
class MyLightningNN(L.LightningModule): 
    def __init__(self, bert_model = None, model: MyNN = None, num_classes: int = 3): 
        super(MyLightningNN, self).__init__()
        
        if bert_model == None: 
            bert_model = AutoModel.from_pretrained(cfg.model_preset)
        
        self.bert = bert_model 
        if model == None: 
            model = MyNN(self.bert, num_classes)
                
        self.my_model = model 

#         define accuracy fn 
        self.acc = tm.classification.Accuracy(task="multiclass", num_classes=num_classes)
        
    def forward(self, inputs): 
        inputs = {k: v.reshape([v.shape[0], v.shape[-1]]) for k, v in inputs.items()}
        
#         inputs = {k: torch.tensor(v) for k,v in inputs.items()}
        return self.my_model(inputs)
        
    def common_step(self, batch): 
        inputs, labels = batch
        
        logits= self.forward(inputs)
#         calculate loss 
        loss = F.cross_entropy(logits, labels)
        return loss, (logits, labels) 
    
    def training_step(self, batch, batch_idx): 
        loss, (logits, labels) = self.common_step(batch)
#         logging losses 
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx): 
        loss, (logits, labels) = self.common_step(batch)
        
        acc = self.acc(logits, labels)
#         logging losses 
        self.log('val_loss', loss)
        self.log('val_acc', acc)
        return loss
    
    def predict_step(self, batch, batch_idx, dataloader_idx = 0): 
        inputs = batch
        
        logits= self.forward(inputs)
        return torch.argmax(logits, axis=-1)
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=cfg.batch_size)
        return optimizer

# Train model (Lightnings Trainer)

In [9]:
datamodule = MyLightningDataModule(tokenizer, batch_size = cfg.batch_size)
model = MyLightningNN()

trainer = L.Trainer(max_epochs = cfg.max_epoch
                    , min_epochs = cfg.min_epoch
                    , default_root_dir = cfg.save_dir
                    , fast_dev_run = True if (cfg.fast_mode == True) else False
                    , precision = 16
                    , accelerator = 'gpu'
                    , strategy = 'auto'
                    , callbacks=[
                        EarlyStopping(patience=10, monitor='val_acc', mode='max'),
#                         LearningRateFinder(), 
                    ])

/opt/conda/lib/python3.10/site-packages/lightning/fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [10]:
trainer.fit(model = model, datamodule = datamodule)

2024-05-12 12:29:47.952448: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-12 12:29:47.952548: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-12 12:29:48.048710: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name     | Type               | Params
------------------------------------------------
0 | bert     | XLMRobertaModel    | 278 M 
1 | my_model | MyNN               | 278 M 
2 | acc      | MulticlassAccuracy | 0     
------------------------------------------------
278 M     Trainable params
0         Non

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

# Make submission

In [11]:
test_df = pl.read_csv(cfg.test_dir)
test_ds = MyDataset(df = test_df
                    , tokenizer = tokenizer
                    , label = None)
test_ds = DataLoader(test_ds, batch_size = 64, num_workers = 3)

In [12]:
predictions = []

# for i, batch in enumerate(test_ds): 
#     print(f"prediction {i}/{len(test_ds)}")
#     with torch.no_grad(): 
#         inputs = batch
#         preds, logits = model.predict_step(inputs, i)
#         predictions.append(preds)
        
predictions = trainer.predict(model, test_ds)        

predictions_ = []
for i in predictions: 
    predictions_ += i.tolist()

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

In [13]:
submissions = pl.DataFrame({'id': test_df['id'].to_list()[:len(predictions_)], 'prediction': predictions_})
submissions.write_csv("submission.csv")

In [14]:
pl.read_csv('submission.csv')

id,prediction
str,i64
"""c6d58c3f69""",0
"""cefcc82292""",0
"""e98005252c""",0
"""58518c10ba""",0
"""c32b0d16df""",0
…,…
"""5f90dd59b0""",0
"""f357a04e86""",0
"""1f0ea92118""",0
"""0407b48afb""",0
