# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center"> Install Required Libraries & Utils Function </h1></span>

In [1]:
# ====================================================
# Required Libraries
# ====================================================

import os
import gc
import re
import sys
import time
import math
import random
import warnings
import json
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

tokenizers.__version__: 0.15.2
transformers.__version__: 4.39.3
env: TOKENIZERS_PARALLELISM=true
cuda


In [2]:
# ====================================================
# Configuration (Hyper Parameters Value)
# ====================================================

class CFG:
    debug=False # want to debug or not 
    apex=True # for faster training
    print_freq= 300
    num_workers=4 
    model= "FacebookAI/xlm-roberta-base" 
    #"csebuetnlp/banglabert_large", "sagorsarker/bangla-bert-base","csebuetnlp/banglabert" 
    # "xlm-roberta-base"     
    epochs=10
    learning_rate=2e-5 
    eps=1e-6
    betas=(0.9, 0.999) # for adam optimizer
    batch_size= 32 #32  # batch size
    max_len=512
    weight_decay=0.01 # for adam optimizer regulaization parameter
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['IDEOLOGICALINEQUALITY', 'STEREOTYPING-DOMINANCE', 'OBJECTIFICATION', 'SEXUAL-VIOLENCE', 'MISOGYNY-NON-SEXUAL-VIOLENCE'] #target columns
    seed=42 # seed no. for random initialization 
    train=True
    num_class = None # Number of class in your dataset
    mode = "cls_based" #"cls_based", "attention_based", "lstm_based"


In [3]:
def clean_dataframe(path_1, path_2):
    with open(path_1, 'r') as file:
        data = json.load(file)

    flattened_data = []
    for key, value in data.items():
        row = {'id_EXIST': key}
        for k, v in value.items():
            row[k] = v
        flattened_data.append(row)

    df = pd.DataFrame(flattened_data)

    with open(path_2, 'r') as file:
        data = json.load(file)

    gold = pd.DataFrame(data)
    df = pd.merge(df, gold, left_on='id_EXIST', right_on='id')

    df = df[['id','tweet','value']]
    return df

In [4]:
dpath_training = '/kaggle/input/exist-2024-task-3/EXIST2024_training.json'
dpath_gold = '/kaggle/input/exist-2024-task-3/EXIST2024_training_task3_gold_hard.json'
df = clean_dataframe(dpath_training, dpath_gold)
df.head()

Unnamed: 0,id,tweet,value
0,100001,"@TheChiflis Ignora al otro, es un capullo.El p...","[OBJECTIFICATION, SEXUAL-VIOLENCE]"
1,100002,@ultimonomada_ Si comicsgate se parece en algo...,[NO]
2,100003,"@Steven2897 Lee sobre Gamergate, y como eso ha...",[NO]
3,100005,@novadragon21 @icep4ck @TvDannyZ Entonces como...,"[STEREOTYPING-DOMINANCE, OBJECTIFICATION, IDEO..."
4,100006,@yonkykong Aaah sí. Andrew Dobson. El que se d...,[NO]


In [5]:
dpath_dev = '/kaggle/input/exist-2024-task-3/EXIST2024_dev.json'
dpath_gold_dev = '/kaggle/input/exist-2024-task-3/EXIST2024_dev_task3_gold_hard.json'
df_dev = clean_dataframe(dpath_dev, dpath_gold_dev)
df_dev.head()

Unnamed: 0,id,tweet,value
0,300002,@anacaotica88 @MordorLivin No me acuerdo de lo...,"[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINANC..."
1,300003,@cosmicJunkBot lo digo cada pocos dias y lo re...,[NO]
2,300004,Also mientras les decia eso la señalaba y deci...,[SEXUAL-VIOLENCE]
3,300005,"And all people killed, attacked, harassed by ...",[NO]
4,300006,On this #WorldPressFreedomDay I’m thinking of ...,[NO]


In [6]:
columns = ['IDEOLOGICALINEQUALITY', 'STEREOTYPING-DOMINANCE', 'OBJECTIFICATION', 'SEXUAL-VIOLENCE', 'MISOGYNY-NON-SEXUAL-VIOLENCE']

def make_multilabel_columns(df, columns):
    for column_name in columns:
        l = []
        for x in df.value:
            if column_name in x:
                l.append(1)
            else:
                l.append(0)
        df[column_name] = l

In [7]:
make_multilabel_columns(df_dev, columns)
make_multilabel_columns(df, columns)

In [8]:
df.drop(columns=['value'], inplace=True)
df_dev.drop(columns=['value'], inplace=True)

In [9]:
CFG.num_class = 5

In [10]:
df.head()

Unnamed: 0,id,tweet,IDEOLOGICALINEQUALITY,STEREOTYPING-DOMINANCE,OBJECTIFICATION,SEXUAL-VIOLENCE,MISOGYNY-NON-SEXUAL-VIOLENCE
0,100001,"@TheChiflis Ignora al otro, es un capullo.El p...",0,0,1,1,0
1,100002,@ultimonomada_ Si comicsgate se parece en algo...,0,0,0,0,0
2,100003,"@Steven2897 Lee sobre Gamergate, y como eso ha...",0,0,0,0,0
3,100005,@novadragon21 @icep4ck @TvDannyZ Entonces como...,0,1,1,0,0
4,100006,@yonkykong Aaah sí. Andrew Dobson. El que se d...,0,0,0,0,0


In [11]:
X = df['tweet']
y = df[df.columns[2:]]
X_dev = df_dev['tweet']
y_dev = df_dev[df_dev.columns[2:]]

In [12]:
class_weights = {}
positive_weights = {}
negative_weights = {}
N = len(df)
for label in sorted(columns):
    div = (2 * sum(df[label] == 1))
    if div == 0:
      positive_weights[label] = 0
    else:
        positive_weights[label] = N /div
    div2 = (2 * sum(df[label] == 0))
    if div == 0:
        negative_weights[label] = 0
    else:
        negative_weights[label] = N /div2
    
class_weights['positive_weights'] = positive_weights
class_weights['negative_weights'] = negative_weights
class_weights

{'positive_weights': {'IDEOLOGICALINEQUALITY': 0,
  'MISOGYNY-NON-SEXUAL-VIOLENCE': 3.5338785046728973,
  'OBJECTIFICATION': 2.742520398912058,
  'SEXUAL-VIOLENCE': 4.481481481481482,
  'STEREOTYPING-DOMINANCE': 2.1257905832747714},
 'negative_weights': {'IDEOLOGICALINEQUALITY': 0,
  'MISOGYNY-NON-SEXUAL-VIOLENCE': 0.5824027724297266,
  'OBJECTIFICATION': 0.6114817060844957,
  'SEXUAL-VIOLENCE': 0.5627906976744186,
  'STEREOTYPING-DOMINANCE': 0.6537713421223255}}

In [13]:
# Function to remove words starting with '@'
# def remove_mentions(text):
#     words = text.split()
#     filtered_words = [word for word in words if not word.startswith('@')]
#     return ' '.join(filtered_words)

# # Apply the function to the 'tweet' column
# df['tweet'] = df['tweet'].apply(remove_mentions)
# df_dev['tweet'] = df_dev['tweet'].apply(remove_mentions)
# df.head()

In [14]:
# import re

# def remove_url_emoji(text):
#   # Remove URLs
#   text = re.sub(r"http\S+", "", text)
#   # Remove punctuation (except apostrophes) and emojis
# #   text = re.sub(r"[^\w\s'¡¿]", "", text)
#   return text

# # sentence = "hello http:\\www.facebook.com here is an emoji 😍"
# # cleaned_sentence = clean_sentence(sentence)

# # print(cleaned_sentence)
# df['tweet'] = df['tweet'].apply(remove_url_emoji)
# df_dev['tweet'] = df_dev['tweet'].apply(remove_url_emoji)

In [15]:
# If Debugging is True then we will consider a small set of training data

if CFG.debug:
    CFG.epochs = 2
    train = train.sample(frac =.1) 

In [16]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [17]:
# ====================================================
# Logger File
# ====================================================

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(CFG.seed)

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center"> Tokenizer, Dataset & Collate Function</h1></span>


In [18]:
# ====================================================
# tokenizer
# ====================================================

tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [19]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(df['tweet'].fillna("").values, total=len(df))
for text in tk0:
    length = len(tokenizer(text, truncation=True, add_special_tokens=False)['input_ids'])
    lengths.append(length)
    
if max(lengths) + 2 > 512:
    CFG.max_len = 512
else:
    CFG.max_len = max(lengths) + 2 # cls & sep 
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/6050 [00:00<?, ?it/s]

max_len: 323


In [20]:
# ====================================================
# Dataset Preparation
# ====================================================

def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['tweet'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.long)
        return inputs, label
    

# the colllate function to increase training speed

def collate(inputs): 
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs


# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center"> Model Architecture </h1></span>


In [21]:
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        
        # defining attention network for attention scores 
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1))
        
        self.linear = nn.Linear(768, 768*2)
        self.lstm = nn.LSTM(768*2, self.config.hidden_size)
        
        self._init_weights(self.attention)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.num_class)
        self._init_weights(self.fc)


    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs.last_hidden_state # word level representation of last hiddent state
        
        if self.cfg.mode == "attention_based":
            # attention based sentence representation
            weights = self.attention(last_hidden_states)
            feature = torch.sum(weights * last_hidden_states, dim=1)
            
            cls_token_feature = last_hidden_states[:, 0, :]
            combine_feature = torch.cat([feature, cls_token_feature], dim = -1)
            
        if self.cfg.mode == "cls_based":
            # [CLS] Token Repr
            feature = last_hidden_states[:, 0, :]
            weights= None
            
        if self.cfg.mode == "lstm_based":
            x = last_hidden_states[:, 0, :]
            x = self.linear(x)
            feature, _ = self.lstm(x)
            weights= None

        return feature, weights

    def forward(self, inputs):
        feature, weights = self.feature(inputs)
        output = self.fc(feature)
        return output, weights


# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center"> Helpler functions for Training  </h1></span>

<font size="3">Few important function are created here.</font>

1. <i>AverageMeter</i> - To compute & store the average
2. <i>asMinutes</i> - To calculate the time
3. <i>timeSince</i> - To compute training & validation time
4. <i>train_fn</i> - Calculation of forward & backward pass for a single epoch in training data
5. <i>valid_fn</i> - Calculation of forward & backward pass for a single epoch in validation data


In [22]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(train_loader, model, criterion, optimizer, epoch, device):

    
    # Enabling Model Training Mode
    model.train()
     
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex) # using Automatic Mixed Precision (AMP) for speed up
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    
    for step, (inputs, labels) in enumerate(train_loader): # iterate over the training data 
        inputs = collate(inputs) # the collate function I discussed for speeding up training
        
        for k, v in inputs.items():
            inputs[k] = v.to(device)  # formatting the input to feed into the transformer model 
        labels = labels.to(device) 
        batch_size = labels.size(0)
        
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds, _ = model(inputs) 
#             loss = criterion(y_preds.view(-1, CFG.num_class), labels.view(-1))
            loss = criterion(y_preds, labels)

        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        
        scaler.scale(loss).backward() # backpropagation
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm) # clipping the gradient
        
        losses.update(loss.item(), batch_size)
        
        # Updating weights via optimizer & scaler
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
        end = time.time()
        
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    
    for step, (inputs, labels) in enumerate(valid_loader): # iterate over the validation data 
        inputs = collate(inputs)
        
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        
        with torch.no_grad(): # we don't need to store the gradients w.r.t validation data
            y_preds, _ = model(inputs)
#             loss = criterion(y_preds.view(-1, CFG.num_class), labels.view(-1))
            loss = criterion(y_preds, labels)
            
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center"> Training Loop </h1></span>

In [23]:
from sklearn.metrics import f1_score, classification_report

def get_score(y_trues, y_preds):
    y_predicted = y_preds.argmax(axis=1)  # Convert probabilities to class predictions
    macro_f1 = f1_score(y_trues, y_predicted, average='macro')
#     print(classification_report(y_trues, y_predicted, digits=4))
    return macro_f1

In [24]:
Wp = class_weights['positive_weights']
Wn = class_weights['negative_weights']

def criterion(y_true, y_logit):
    '''
    Multi-label cross-entropy
    * Required "Wp", "Wn" as positive & negative class-weights
    y_true: true value
    y_logit: predicted value
    '''
    loss = 0.0
    
    for i, (Wp_val, Wn_val) in enumerate(zip(Wp.values(), Wn.values())):
        first_term = Wp_val * y_true[:, i] * torch.log(y_logit[:, i] + 1e-7)
        second_term = Wn_val * (1 - y_true[:, i]) * torch.log(1 - y_logit[:, i] + 1e-7)
        loss += torch.sum(-(first_term + second_term))  # Sum of negative values will be negative
    return loss



# ====================================================
# train loop
# ====================================================
def train_loop():

    # ====================================================
    # loader
    # ====================================================
    
    train_dataset = TrainDataset(CFG, df) # training dataset formatting 
    valid_dataset = TrainDataset(CFG, df_dev) # validation dataset formatting

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True) # train dataloader
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False) # validation dataloader

    valid_labels = df_dev[CFG.target_cols].values
    
    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)  # initializing the model
    torch.save(model.config, OUTPUT_DIR+'config.pth') # saving the model configuration 
    model.to(device) # GPU Config
    
    optimizer = AdamW(model.parameters(), lr=CFG.learning_rate, eps=CFG.eps, betas=CFG.betas) # declaring the optimizer
    
#     criterion = nn.CrossEntropyLoss(reduction="mean")
#     criterion = nn.BCEWithLogitsLoss()
#     criterion = custom_loss()
    
    best_score = 0


    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train function 
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, device)

        # eval function 
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
#         score = get_score(valid_labels, predictions)
        score = 0
        # Apply softmax activation function to convert logits to probabilities
        predicted_probabilities = torch.sigmoid(torch.tensor(predictions))

        # Convert predicted probabilities to binary predictions
        predicted_labels = (predicted_probabilities > 0.5).int().numpy()

        # Assuming you have true labels in y_dev
        true_labels = y_dev.values
        print(predicted_labels, true_labels)

        # Calculate accuracy
        accuracy = accuracy_score(true_labels, predicted_labels)
        print('Accuracy:', accuracy)
#         print(predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        
        if best_score < score: # Saving the best model w.r.t the score 
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_score{best_score:.4f}_best.pth")

#     predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_score{best_score:.4f}_best.pth", 
#                              map_location=torch.device('cpu'))['predictions']
#     final_pred = predictions.argmax(axis=1)
#     final_pred = final_pred.tolist()
#     df_dev[f"pred_label"] = final_pred

    torch.cuda.empty_cache()
    gc.collect()
    
    return best_score

In [25]:
# ====================================================
# the training
# ====================================================
    
if __name__ == '__main__':
    
    if CFG.train:
        best_score = train_loop()

XLMRobertaConfig {
  "_name_or_path": "FacebookAI/xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.39.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}



model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch: [1][0/189] Elapsed 0m 1s (remain 4m 54s) Loss: 1257.0947(1257.0947) Grad: nan  
Epoch: [1][188/189] Elapsed 1m 9s (remain 0m 0s) Loss: -43350.9062(-29875.3156) Grad: 83709.3359  
EVAL: [0/15] Elapsed 0m 0s (remain 0m 5s) Loss: -82405.2656(-82405.2656) 


Epoch 1 - avg_train_loss: -29875.3156  avg_val_loss: -80251.6389  time: 73s
Epoch 1 - Score: 0.0000


EVAL: [14/15] Elapsed 0m 3s (remain 0m 0s) Loss: -32211.4531(-80251.6389) 
[[1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 ...
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]] [[0 1 0 0 1]
 [0 0 0 0 0]
 [0 0 0 1 0]
 ...
 [0 0 1 0 0]
 [0 1 1 0 1]
 [0 0 1 1 1]]
Accuracy: 0.0
Epoch: [2][0/189] Elapsed 0m 0s (remain 1m 24s) Loss: -42537.6367(-42537.6367) Grad: nan  
Epoch: [2][188/189] Elapsed 1m 7s (remain 0m 0s) Loss: -45610.4297(-43952.9354) Grad: 252432.4219  
EVAL: [0/15] Elapsed 0m 0s (remain 0m 5s) Loss: -90901.7422(-90901.7422) 


Epoch 2 - avg_train_loss: -43952.9354  avg_val_loss: -88462.9967  time: 71s
Epoch 2 - Score: 0.0000


EVAL: [14/15] Elapsed 0m 3s (remain 0m 0s) Loss: -35496.2266(-88462.9967) 
[[1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 ...
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]] [[0 1 0 0 1]
 [0 0 0 0 0]
 [0 0 0 1 0]
 ...
 [0 0 1 0 0]
 [0 1 1 0 1]
 [0 0 1 1 1]]
Accuracy: 0.0
Epoch: [3][0/189] Elapsed 0m 0s (remain 1m 36s) Loss: -46196.1641(-46196.1641) Grad: nan  
Epoch: [3][188/189] Elapsed 1m 7s (remain 0m 0s) Loss: -52809.4258(-47565.5869) Grad: 546422.2500  
EVAL: [0/15] Elapsed 0m 0s (remain 0m 5s) Loss: -96727.8359(-96727.8359) 


Epoch 3 - avg_train_loss: -47565.5869  avg_val_loss: -94020.7723  time: 71s
Epoch 3 - Score: 0.0000


EVAL: [14/15] Elapsed 0m 3s (remain 0m 0s) Loss: -37666.4570(-94020.7723) 
[[1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 ...
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]] [[0 1 0 0 1]
 [0 0 0 0 0]
 [0 0 0 1 0]
 ...
 [0 0 1 0 0]
 [0 1 1 0 1]
 [0 0 1 1 1]]
Accuracy: 0.0
Epoch: [4][0/189] Elapsed 0m 0s (remain 3m 2s) Loss: -48389.2344(-48389.2344) Grad: nan  
Epoch: [4][188/189] Elapsed 1m 7s (remain 0m 0s) Loss: -52365.9258(-50764.0168) Grad: 503589.4688  
EVAL: [0/15] Elapsed 0m 0s (remain 0m 5s) Loss: -103889.5703(-103889.5703) 


Epoch 4 - avg_train_loss: -50764.0168  avg_val_loss: -100948.2504  time: 71s
Epoch 4 - Score: 0.0000


EVAL: [14/15] Elapsed 0m 3s (remain 0m 0s) Loss: -40440.7891(-100948.2504) 
[[1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 ...
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]] [[0 1 0 0 1]
 [0 0 0 0 0]
 [0 0 0 1 0]
 ...
 [0 0 1 0 0]
 [0 1 1 0 1]
 [0 0 1 1 1]]
Accuracy: 0.0
Epoch: [5][0/189] Elapsed 0m 0s (remain 1m 20s) Loss: -49202.8750(-49202.8750) Grad: nan  
Epoch: [5][188/189] Elapsed 1m 7s (remain 0m 0s) Loss: -61864.4609(-54793.9980) Grad: 548645.0625  
EVAL: [0/15] Elapsed 0m 0s (remain 0m 5s) Loss: -113031.8125(-113031.8125) 


Epoch 5 - avg_train_loss: -54793.9980  avg_val_loss: -109644.4398  time: 71s
Epoch 5 - Score: 0.0000


EVAL: [14/15] Elapsed 0m 3s (remain 0m 0s) Loss: -43816.8125(-109644.4398) 
[[1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 ...
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]] [[0 1 0 0 1]
 [0 0 0 0 0]
 [0 0 0 1 0]
 ...
 [0 0 1 0 0]
 [0 1 1 0 1]
 [0 0 1 1 1]]
Accuracy: 0.0
Epoch: [6][0/189] Elapsed 0m 0s (remain 1m 24s) Loss: -59543.5195(-59543.5195) Grad: nan  
Epoch: [6][188/189] Elapsed 1m 8s (remain 0m 0s) Loss: -62055.4219(-59784.5160) Grad: 512866.0938  
EVAL: [0/15] Elapsed 0m 0s (remain 0m 5s) Loss: -124037.8281(-124037.8281) 


Epoch 6 - avg_train_loss: -59784.5160  avg_val_loss: -120266.3233  time: 72s
Epoch 6 - Score: 0.0000


EVAL: [14/15] Elapsed 0m 3s (remain 0m 0s) Loss: -48058.8438(-120266.3233) 
[[1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 ...
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]] [[0 1 0 0 1]
 [0 0 0 0 0]
 [0 0 0 1 0]
 ...
 [0 0 1 0 0]
 [0 1 1 0 1]
 [0 0 1 1 1]]
Accuracy: 0.0
Epoch: [7][0/189] Elapsed 0m 0s (remain 1m 32s) Loss: -69439.0859(-69439.0859) Grad: nan  
Epoch: [7][188/189] Elapsed 1m 8s (remain 0m 0s) Loss: -72992.1797(-65747.1841) Grad: 559078.1250  
EVAL: [0/15] Elapsed 0m 0s (remain 0m 5s) Loss: -136918.8281(-136918.8281) 


Epoch 7 - avg_train_loss: -65747.1841  avg_val_loss: -132553.0568  time: 72s
Epoch 7 - Score: 0.0000


EVAL: [14/15] Elapsed 0m 3s (remain 0m 0s) Loss: -52869.4688(-132553.0568) 
[[1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 ...
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]] [[0 1 0 0 1]
 [0 0 0 0 0]
 [0 0 0 1 0]
 ...
 [0 0 1 0 0]
 [0 1 1 0 1]
 [0 0 1 1 1]]
Accuracy: 0.0
Epoch: [8][0/189] Elapsed 0m 0s (remain 1m 20s) Loss: -72203.2734(-72203.2734) Grad: nan  


KeyboardInterrupt: 

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center">  Inference </h1></span>


<font color='#3498DB'> <h3> <a id ="section11a"> <b> Configuration for Inference</b> </a> </h3> </font>

<font size="3"> The basic and important configuration for infernce is described here along with some function & other stuff.</font>


In [None]:
# ====================================================
# CFG for testing
# ====================================================

class CFG_Test:
    num_workers=4
    path="./"
    config_path=path+'config.pth'
    model=CFG.model
    batch_size=CFG.batch_size
    target_cols=CFG.target_cols
    seed=CFG.seed
    num_class = CFG.num_class
    mode = CFG.mode
    
CFG_Test.tokenizer = AutoTokenizer.from_pretrained(CFG_Test.path+'tokenizer/') # load the saved pretrained tokenizer

In [None]:
def get_logger(filename='inference'): # infernece logger file
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

<font color='#3498DB'> <h3> <b> Model Loading for Inference</b> </h3> </font>

<font size="3"> Dataset for predicting on the test data and Model Loading for inference are done in this section </font>

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True)
    
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['tweet'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        return inputs

<font color='#3498DB'> <h3> <b> Prediction on Test Data</b></h3> </font>

<font size="3"> An inference function is made for predicting on the test data. Then finally, loading the previously saved model for each fold and taking prediction on test dataset for each fold. Then, take the average of the each of prediction is considered as model final prediction, </font>

In [None]:
def clean_test_dataframe(path_1):
    with open(path_1, 'r') as file:
        data = json.load(file)

    flattened_data = []
    for key, value in data.items():
        row = {'id_EXIST': key}
        for k, v in value.items():
            row[k] = v
        flattened_data.append(row)

    df = pd.DataFrame(flattened_data)[['id_EXIST', 'tweet']]
    return df
    
dpath_test = '/kaggle/input/exist-2024-task1/EXIST2023_test_clean.json'
df_test = clean_test_dataframe(dpath_test)
df_test['tweet'] = df_test['tweet'].apply(remove_url_emoji)
df_test.head()

In [None]:
df_test.rename(columns={'id_EXIST': 'id'}, inplace=True)

In [None]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0: # iterate over the test data
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds, _ = model(inputs) # considering the logits only
        preds.append(y_preds.to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from transformers import DataCollatorWithPadding
test_dataset = TestDataset(CFG_Test, df_test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG_Test.batch_size,
                         shuffle=False,
                         collate_fn=DataCollatorWithPadding(tokenizer=CFG_Test.tokenizer, padding='longest'))
                         

model = CustomModel(CFG_Test, config_path=CFG_Test.config_path, pretrained=True)
state = torch.load(CFG_Test.path+f"{CFG_Test.model.replace('/', '-')}_score{best_score:.4f}_best.pth",
                   map_location=torch.device('cpu')) # loading the saved model

model.load_state_dict(state['model'])
prediction = inference_fn(test_loader, model, device)
del model, state; gc.collect()
torch.cuda.empty_cache()
    


# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center"> Evaluation on Test Dataset </h1></span>


In [None]:
final_prediction = prediction.argmax(axis = 1)

In [None]:
print(prediction)

In [None]:
print(final_prediction)

In [None]:
sigmoid = torch.nn.Sigmoid()
probabilities = sigmoid(torch.from_numpy(prediction))

print(probabilities)

In [None]:
df_final_submission_hard = df_test.copy()

In [None]:
df_final_submission_hard['value'] = final_prediction

In [None]:
df_final_submission_hard.head()

In [None]:
df_final_submission_hard.drop(columns=['tweet'], inplace=True)

In [None]:
df_final_submission_hard.head()

In [None]:
df_final_submission_hard['value'] = df_final_submission_hard['value'].map({1:'YES',0:'NO'})

In [None]:
df_final_submission_hard.head()

In [None]:
# Insert a new column named 'test_case' filled with 'EXIST2024' at position 0
# df_final_submission_hard.insert(0, 'test_case', 'EXIST2024')
# Save DataFrame to JSON file
first = True

# Save DataFrame to JSON file with specific format
with open("EXIST2024_test_task1_gold_hard.json", "w") as f:
    f.write("[")  # Write opening bracket
    for index, row in df_final_submission_hard.iterrows():
        record = "{\n"  # Start of the record
        for column, value in row.items():
            record += f'  "{column}": "{value}",\n'  # Add column and value to the record
        record = record[:-2]
        record += "\n}"  # End of the record
        f.write(record)  # Write the record
        if index < len(df_final_submission_hard) - 1:
          f.write(", ")   # Add comma after each record except the last one
    f.write("]")  # Write closing bracket

In [None]:
# for index, row in df_final_submission_soft.iterrows():
#         for column, value in row.items():
#             print(column, value)
        

In [None]:
import ast

data = []
for row in probabilities:
    data.append({"value": {"YES":row[1].item(), "NO": row[0].item()}})
    
df_data = pd.DataFrame(data)

df_final_submission_soft = df_final_submission_hard.drop(columns=['value']).copy()
df_final_submission_soft = pd.concat([df_final_submission_soft, df_data], axis=1)
# df_final_submission_soft.to_json('EXIST2024_test_task1_gold_soft.json', orient='records')

first = True

# Save DataFrame to JSON file with specific format
with open("EXIST2024_test_task1_gold_soft.json", "w") as f:
    f.write("[")  # Write opening bracket
    for index, row in df_final_submission_soft.iterrows():
        record = "{\n"  # Start of the record
        for column, value in row.items():
            if column == "value":
                # Separate keys and values
                keys = list(value.keys())
                values = list(value.values())
                record += f'  "value" : '
                record += "{\n"
                for key, value in zip(keys, values):
                    record += f'    "{key}": {value},\n'
                record = record[:-2]
                record += "\n  }"
            else:
                record += f'  "{column}": "{value}",\n'  # Add column and value to the record
        record += "\n}"  # End of the record
        f.write(record)  # Write the record
        if index < len(df_final_submission_soft) - 1:
            f.write(", ")   # Add comma after each record except the last one
    f.write("]")  # Write closing bracket

In [None]:
# from sklearn.metrics import roc_auc_score, classification_report

# print('\nThe Classification Report is as follows\n')
# print(classification_report(df_dev['value'].tolist(), final_prediction, digits = 4))

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center"> Thanks for Reading </h1></span>