In [1]:
class Config: 
    model_preset = "symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli"
    save_dir = 'my_model'
    
    train_dir = "/kaggle/input/contradictory-my-dear-watson/train.csv"
    test_dir = "/kaggle/input/contradictory-my-dear-watson/test.csv"
    
    fast_mode = False
     
    batch_size = 16
    min_epoch = 7
    max_epoch = 9
    lr = 4e-1
    l2 = 0.01
    
    
    ic = ('premise', 'hypothesis')
    it = ('label')
    
    tt_split = 0.1 # train test split

cfg = Config() 

In [2]:
! python -m pip install -q lightning peft
! pip install -q torchmetrics polars datasets==2.18.0

In [3]:
import polars as pl
import numpy as np

from transformers import AutoTokenizer, AutoModel
import datasets

import torch

from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from peft import LoraConfig, get_peft_model

import pytorch_lightning as L
import torchmetrics as tm

import lightning as L
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks.lr_finder import LearningRateFinder

import torchmetrics as tm

# other
from sklearn.model_selection import train_test_split

# define device 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

import os 

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

L.seed_everything(42, workers=True)

INFO: Seed set to 42


42

In [4]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_preset)

tokenizer_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [5]:
def bert_encode(premises, hypothesis, tokenizer):
    if type(premises) == str: 
        premises = [premises]
    if type(hypothesis) == str: 
        hypothesis = [hypothesis]
        
    sentence1 = premises
    sentence2 = hypothesis

    num_examples = len(hypothesis)

    cls = [tokenizer.sep_token] * num_examples
    
    input_words = [" ".join( [s1] + [cls_] +[s2]) for cls_, s1, s2 in zip(cls, sentence1, sentence2)]
#     input_word_ids = torch.cat([cls, sentence1, sentence2], dim=-1)
    return tokenizer(input_words, return_tensors = 'pt', padding = 'max_length', truncation=True)
    

# Example usage:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# premises = ["This is a premise.", "Another premise."]
# hypothesis = ["This is a hypothesis.", "Another hypothesis."]
# bert_encode(premises, hypothesis, tokenizer)

In [6]:
def preprocess_df(df): 
    df_ = df.clone()

    x = df_.select([*cfg.ic])
#     cast to str

    if cfg.it in df_.columns: 
        y = df_[cfg.it]
#         cast to Categorical
    else: 
        y = None

    return x, y

class MyDataset(Dataset): 
    def __init__(self, 
                 df, 
                 tokenizer, 
                 label = None, 
                 **kwargs) -> None: 
        super(MyDataset, self).__init__()
#     tokenize all inputs
        
#         self.x = datasets.Dataset.from_pandas(df.to_pandas()).map(lambda i: bert_encode(i['premise'], i['hypothesis'], tokenizer)
#                                                                   , remove_columns = df.columns
#                                                                   , **kwargs)    
        self.tok = tokenizer
        self.premise = df['premise'].to_list()
        self.hip = df['hypothesis'].to_list()
        
        self.y = label
        
    def __len__(self) -> int: 
#         raise len(self.y) == len(self.x)
        
        return len(self.premise)
    
    def __getitem__(self, idx) -> torch.tensor: 
        if self.y == None: 
            return bert_encode(self.premise[idx], self.hip[idx], self.tok)
        else: 
            return bert_encode(self.premise[idx], self.hip[idx], self.tok), self.y[idx] 
    
class MyLightningDataModule(L.LightningDataModule): 
    def __init__(self
                 , tokenizer
                 , limit_data = None
                 , batch_size = 8
                 , num_workers = 3
                 , *args
                 , **kwargs): 
        
        super().__init__(*args, **kwargs)
        
        self.fp = cfg.train_dir
        self.tokenizer = tokenizer
        self.limit_data = limit_data
        self.batch_size = batch_size 
        self.num_w = num_workers
        
#         common fn for build dataset
    @staticmethod
    def build_dataset(df, tokenizer, **kwargs): 
        x, y = preprocess_df(df) 
        return MyDataset(df = x,tokenizer = tokenizer, label = y.to_list(), **kwargs)
    
    def prepare_data(self): 
        
        from transformers import DataCollatorWithPadding
        
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
        
        self.df = pl.read_csv(self.fp)
    
    def setup(self, stage):  
#       split data to train and val  
        train_df, val_df = train_test_split(self.df, test_size=cfg.tt_split) 

#       build datasets for both train and validation 
        self.train_ds = self.build_dataset(train_df, self.tokenizer, batch_size = 1000, batched=True)
        self.val_ds = self.build_dataset(val_df, self.tokenizer, batch_size = 1000, batched =True)

    def train_dataloader(self): 
        return DataLoader(self.train_ds, 
                          batch_size = self.batch_size, 
                          num_workers = self.num_w, 
                          pin_memory=True, 
                          shuffle= True)
    
    def val_dataloader(self): 
        return DataLoader(self.val_ds, 
                          batch_size =self.batch_size, 
                          num_workers = self.num_w, 
                          shuffle = True
                         )
        
    def test_dataloader(self): 
        return DataLoader(self.val_ds, 
                          batch_size = self.batch_size, 
                          num_workers = self.num_w, 
                          shuffle=True)

# Lightning Module (NN)

In [7]:

class MyNN(nn.Module):
    def __init__(self, bert_model, num_classes: int = 3):
        super(MyNN, self).__init__()
        
        self.bert_model = bert_model
        self.dropout1 = nn.Dropout(0.2)
        self.layer_norm = nn.LayerNorm(bert_model.config.hidden_size)
        self.lstm = nn.LSTM(input_size=bert_model.config.hidden_size, hidden_size=128, batch_first=True)
        self.batch_norm = nn.BatchNorm1d(512)  # Added batch normalization
        
        # Add global average pooling 1D
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        
        # Add dropout before final linear layer
        self.dropout2 = nn.Dropout(0.3)
        
        self.linear = nn.Linear(128, num_classes)
        
    def forward(self, inputs):
        # Get BERT outputs
        outputs = self.bert_model(**inputs)
        pooled_output = outputs.last_hidden_state
        
        # Apply dropout and layer normalization
        pooled_output = self.dropout1(pooled_output)
        pooled_output = self.layer_norm(pooled_output)
        pooled_output = self.batch_norm(pooled_output)  # Apply batch normalization
        
        # Pass through LSTM
        lstm_output, _ = self.lstm(pooled_output)
        
        # Apply global average pooling 1D and flatten
        pooled_output = self.global_avg_pool(lstm_output.transpose(1, 2)).squeeze(-1)
        
        # Apply dropout before final linear layer
        pooled_output = self.dropout2(pooled_output)
        
        # Pass through linear layer
        logits = self.linear(pooled_output)
        probabilities = nn.functional.softmax(logits, dim=1)
        
        return probabilities
    
class MyLightningNN(L.LightningModule): 
#     LOAD MODEL AND ADD THE EXTRA LAYERS (LORA)
    @staticmethod 
    def load_model(model_preset): 
        model = AutoModel.from_pretrained(model_preset)
        lora_config = LoraConfig(
            target_modules=["query", "key", "value"],
            r=8,
            lora_alpha=16,
            lora_dropout=0.1
        )
        peft_model = get_peft_model(model, lora_config)
        peft_model.print_trainable_parameters() 
        return peft_model
    
    def __init__(self, bert_model=None, model: MyNN=None, num_classes: int=3): 
        super(MyLightningNN, self).__init__()
        
        if bert_model is None: 
            bert_model = self.load_model(cfg.model_preset)
        
        self.bert = bert_model 
        if model is None: 
            model = MyNN(self.bert, num_classes)
                
        self.my_model = model 

        # Define accuracy function 
        self.acc = tm.classification.Accuracy(task="multiclass", num_classes=num_classes)
        
    def forward(self, inputs): 
        inputs = {k: v.reshape([v.shape[0], v.shape[-1]]) for k, v in inputs.items()}
        return self.my_model(inputs)
        
    def common_step(self, batch): 
        inputs, labels = batch
        logits = self.forward(inputs)
        
        # Calculate loss 
        loss = F.cross_entropy(logits, labels)
        return loss, (logits, labels) 
    
    def training_step(self, batch, batch_idx): 
        loss, (logits, labels) = self.common_step(batch)
        
        # Log training loss 
        self.log('train_loss', loss, prog_bar=True)
        
        acc = self.acc(logits, labels)
        self.log('train_acc', acc, prog_bar=True)
        
        return loss
    
    def validation_step(self, batch, batch_idx): 
        loss, (logits, labels) = self.common_step(batch)
        
        # Calculate and log validation accuracy 
        acc = self.acc(logits, labels)
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return loss
    
    def predict_step(self, batch, batch_idx, dataloader_idx=0): 
        inputs = batch
        logits = self.forward(inputs)
        return logits
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=cfg.lr, weight_decay=cfg.l2)
        return optimizer


# Train model (Lightnings Trainer)

In [8]:
datamodule = MyLightningDataModule(tokenizer, batch_size = cfg.batch_size)
model = MyLightningNN()

trainer = L.Trainer(min_epochs = cfg.min_epoch
                    , max_epochs = cfg.max_epoch
                    , default_root_dir = cfg.save_dir
                    , fast_dev_run = cfg.fast_mode
                    , precision = 16
                    , accelerator = 'gpu'
                    , strategy = 'auto'
                    , callbacks=[
                        EarlyStopping(patience=10, monitor='val_acc', mode='max'),
#                         LearningRateFinder(), 
                    ]
                    , logger = L.pytorch.loggers.CSVLogger(save_dir='loggs')
                   )

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


trainable params: 442,368 || all params: 278,486,016 || trainable%: 0.1588


/opt/conda/lib/python3.10/site-packages/lightning/fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [9]:
! mkdir -P "loggs/lightning_logs"

h = trainer.fit(model = model, datamodule = datamodule)

mkdir: invalid option -- 'P'
Try 'mkdir --help' for more information.


2024-05-20 18:08:46.802696: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-20 18:08:46.802832: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-20 18:08:46.927681: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name     | Type               | Params
------------------------------------------------
0 | bert     | PeftModel          | 278 M 
1 | my_model | MyNN               | 278 M 
2 | acc      | MulticlassAccuracy | 0     
------------------------------------------------
905 K     Trainable params
278 M     Non

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=9` reached.


# Make submission

In [10]:
test_df = pl.read_csv(cfg.test_dir)
test_ds = MyDataset(df = test_df
                    , tokenizer = tokenizer
                    , label = None)
test_ds = DataLoader(test_ds, batch_size = 64, num_workers = 3, collate_fn = datamodule.data_collator)

In [11]:
predictions = []

# for i, batch in enumerate(test_ds): 
#     print(f"prediction {i}/{len(test_ds)}")
#     with torch.no_grad(): 
#         inputs = batch
#         preds, logits = model.predict_step(inputs, i)
#         predictions.append(preds)
        
predictions = trainer.predict(model, test_ds)        

predictions_ = []
for i in predictions: 
    predictions_ += torch.argmax(i, axis=-1)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

In [12]:
submissions = pl.DataFrame({'id': test_df['id'].to_list()[:len(predictions_)], 'prediction': predictions_})
submissions.write_csv("submission.csv")

In [13]:
pl.read_csv('submission.csv')['prediction'].describe()

statistic,value
str,f64
"""count""",5195.0
"""null_count""",0.0
"""mean""",0.0
"""std""",0.0
"""min""",0.0
"""25%""",0.0
"""50%""",0.0
"""75%""",0.0
"""max""",0.0
