# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center"> Install Required Libraries & Utils Function </h1></span>

In [4]:
# ====================================================
# Required Libraries
# ====================================================

import os
import gc
import re
import sys
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

tokenizers.__version__: 0.15.2
transformers.__version__: 4.39.3
env: TOKENIZERS_PARALLELISM=true
cuda


In [5]:
# ====================================================
# Configuration (Hyper Parameters Value)
# ====================================================

class CFG:
    """Hyper-parameters and run settings shared by the training cells.

    The class is used as a plain namespace: attributes are read (and in a few
    places overwritten) directly on the class, never on an instance.
    """
    debug=False # when True, a later cell shortens the run (fewer epochs, sampled data)
    apex=True # enables torch.cuda.amp autocast / GradScaler in train_fn
    print_freq= 300 # print progress every this many batches
    num_workers=4 # DataLoader worker processes
    model= "FacebookAI/xlm-roberta-base" 
    #"csebuetnlp/banglabert_large", "sagorsarker/bangla-bert-base","csebuetnlp/banglabert" 
    # "xlm-roberta-base"     
    epochs=5
    learning_rate=2e-5 
    eps=1e-6
    betas=(0.9, 0.999) # for adam optimizer
    batch_size= 16  # batch size
    max_len=512 # overwritten once the training texts have been measured
    weight_decay=0.01 # weight-decay (regularization) coefficient for the optimizer
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['label'] #target columns
    seed=42 # seed no. for random initialization 
    train=True
    num_class = None # number of classes; filled in after the data is loaded
    # Must be exactly "cls_based" or "attention_based": the model compares this
    # string with ==, so a stray space selects no pooling branch at all.
    mode = "attention_based"


In [6]:
# Load the training table from the Kaggle input directory. Later cells read
# its "text" and "label" columns. The commented-out line is an alternative
# source (an Excel sheet) kept for reference.
# data = pd.read_excel("/kaggle/input/product-review-sentiment-data-bangla/Product Review Sentiment Analysis.xlsx", sheet_name = "Sheet1")
data = pd.read_csv("/kaggle/input/exist-task-1/df_train.csv")
data.head()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/exist-task-1/df_train.csv'

In [None]:
# Show the distinct label values, then record how many there are so the
# classification head can be sized from CFG.num_class.
print(data["label"].unique())
CFG.num_class = data["label"].nunique()
print(CFG.num_class)

In [None]:
# mapping = {
#     'Negative ' : 0, # space matter
#     'Positive' : 1
# }

# # data['label'] = data['Sentiments'].map(mapping)
# data['text'] = data['Reviews']

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, then split that hold-out in half into validation
# and test sets (80/10/10 overall). Both splits use CFG.seed as random_state.
train, validation = train_test_split(data, test_size=0.2, random_state=CFG.seed)
valid, test = train_test_split(validation, test_size=0.5, random_state=CFG.seed)

# Report the (rows, columns) of each split.
print(train.shape)
print(valid.shape)
print(test.shape)

In [None]:
# Write the three splits to CSV files in the working directory (row index omitted).
train.to_csv("Training_Data_Product_Review_Sentiment.csv", index = False)
valid.to_csv("Validation_Data_Product_Review_Sentiment.csv", index = False)
test.to_csv("Test_Data_Product_Review_Sentiment.csv", index = False)

In [None]:
# train['label'] = train['Sentiments'].map(mapping)
# valid['label'] = valid['Sentiments'].map(mapping)
# test['label'] = test['Sentiments'].map(mapping)

In [None]:
# Preview the first rows of the training split.
train.head()

In [7]:
# When debugging, shorten the run: fewer epochs and a 10% sample of the training data.

if CFG.debug:
    CFG.epochs = 2
    # Seed the sample explicitly: seed_everything() is only called further
    # down, so an unseeded sample would pick different rows on every run.
    train = train.sample(frac=.1, random_state=CFG.seed)

In [8]:
# ====================================================
# Directory settings
# ====================================================
import os

# Logs, the saved tokenizer, the model config and checkpoints are written here.
OUTPUT_DIR = './'
# exist_ok=True creates the directory only when missing, without a separate
# exists() check that could race with another process.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [9]:
# ====================================================
# Logger File
# ====================================================

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report


def get_logger(filename=OUTPUT_DIR+'train'):
    """Return this module's logger, writing to the console and ``<filename>.log``.

    Args:
        filename: Log file path without the ``.log`` suffix.

    Returns:
        The ``logging.Logger`` named after ``__name__``, set to INFO, with
        exactly one stream handler and one file handler attached. Both emit
        the bare message with no timestamp or level prefix.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so handlers from an
    # earlier call (e.g. a re-run cell) would otherwise pile up and every
    # message would be emitted several times.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    formatter = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) with the same value.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to its deterministic
    mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Each of these takes the seed as its only argument.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed the random number generators with the configured seed.
seed_everything(CFG.seed)

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center"> Tokenizer, Dataset & Collate Function</h1></span>


In [10]:
# ====================================================
# tokenizer
# ====================================================

# Load the tokenizer that matches CFG.model and save a copy under
# OUTPUT_DIR/tokenizer/ so it can be reloaded from disk later.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
# Attach it to CFG so code that receives the config can reach it as cfg.tokenizer.
CFG.tokenizer = tokenizer

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [11]:
# ====================================================
# Define max_len
# ====================================================
# Token count of every training text (missing texts count as empty strings),
# measured without special tokens.
lengths = [
    len(tokenizer(text, truncation=True, add_special_tokens=False)['input_ids'])
    for text in tqdm(train['text'].fillna("").values, total=len(train))
]

# Longest text plus two slots for cls & sep, capped at 512.
CFG.max_len = min(max(lengths) + 2, 512)
LOGGER.info(f"max_len: {CFG.max_len}")

NameError: name 'train' is not defined

In [None]:
# ====================================================
# Dataset Preparation
# ====================================================

def prepare_input(cfg, text):
    """Tokenize ``text`` into fixed-length ``torch.long`` tensors.

    Args:
        cfg: Config object providing ``tokenizer`` and ``max_len``.
        text: The raw string to encode.

    Returns:
        The mapping returned by ``cfg.tokenizer.encode_plus`` with every value
        converted to a ``torch.long`` tensor. Sequences are truncated and
        padded to exactly ``cfg.max_len`` tokens, so samples can be stacked
        into a batch.
    """
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        # Read the length from the cfg that was passed in rather than the
        # global CFG, so the function honours whatever config the caller uses.
        max_length=cfg.max_len,
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        padding='max_length',
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    """Map-style dataset yielding ``(inputs, label)`` pairs for training/validation.

    Args:
        cfg: Config object; ``cfg.target_cols`` names the label column(s) and
            the whole object is forwarded to ``prepare_input``.
        df: DataFrame with a ``text`` column and the target column(s).
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        # Missing texts become empty strings, matching how the length scan
        # that sets max_len treats them; a NaN would not be a valid
        # tokenizer input.
        self.texts = df['text'].fillna("").values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized text and its label row as a ``torch.long`` tensor."""
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.long)
        return inputs, label
    

# the collate function: trims batch padding to increase training speed

def collate(inputs):
    """Cut every tensor in ``inputs`` down to the longest unpadded sequence.

    The length is the largest per-row sum of ``attention_mask``; each value in
    the mapping is then sliced to that many columns. The mapping is modified
    in place and also returned.
    """
    longest = int(inputs["attention_mask"].sum(dim=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs


# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center"> Model Architecture </h1></span>


In [None]:
class CustomModel(nn.Module):
    """Transformer encoder followed by a pooling step and a linear classifier.

    ``cfg.mode`` selects the pooling (surrounding whitespace is ignored):

    * ``"cls_based"`` - the first token's last hidden state.
    * ``"attention_based"`` - an attention-weighted sum of all token states
      concatenated with the first token's state (twice the hidden size).

    Args:
        cfg: Config object providing ``model``, ``mode`` and ``num_class``.
        config_path: Path to a config saved with ``torch.save``. When ``None``
            the config is fetched for ``cfg.model`` with all dropout set to 0.
        pretrained: Load pretrained encoder weights when True; otherwise build
            the encoder from the config alone (weights are expected to come
            from a ``load_state_dict`` call afterwards).

    Raises:
        ValueError: If ``cfg.mode`` is not one of the supported modes.
    """

    # Pooling strategies understood by ``feature``.
    _MODES = ("attention_based", "cls_based")

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        # Validate the mode before the (slow) encoder load so a typo fails fast.
        mode = self._pooling_mode()
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Different architectures name their dropout settings differently,
            # so every known spelling is zeroed.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            # NOTE(review): torch.load unpickles arbitrary objects - only point
            # this at config files written by this notebook.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        
        # Scores each token, then normalizes the scores over the sequence axis.
        # NOTE(review): the softmax covers every position, padding included;
        # the attention mask is not applied here.
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1))
        
        # _init_weights only acts on leaf layers, so it has to be applied to
        # each child; calling it on the Sequential container itself does nothing.
        self.attention.apply(self._init_weights)
        # Not used in the forward pass; kept so the parameter set (and thus
        # the state_dict layout) is unchanged.
        self.concat_pool = nn.Linear(self.config.hidden_size*3, self.config.hidden_size)
        # The attention mode concatenates two hidden-size vectors, so the
        # classifier input must be twice as wide in that mode.
        fc_in = self.config.hidden_size * (2 if mode == "attention_based" else 1)
        self.fc = nn.Linear(fc_in, self.cfg.num_class)
        self._init_weights(self.fc)

    def _pooling_mode(self):
        """Return ``cfg.mode`` stripped of whitespace, or raise if unsupported."""
        mode = str(self.cfg.mode).strip()
        if mode not in self._MODES:
            raise ValueError(
                f"Unsupported mode {self.cfg.mode!r}; expected one of {self._MODES}"
            )
        return mode

    def _init_weights(self, module):
        """Initialize one layer using the encoder config's ``initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        """Encode ``inputs`` and pool the token states into one vector per sample.

        Returns:
            ``(feature, weights)``. ``weights`` holds the per-token attention
            weights in attention mode and is ``None`` in cls mode.
        """
        mode = self._pooling_mode()
        outputs = self.model(**inputs)
        last_hidden_states = outputs.last_hidden_state # token-level states of the last layer
        cls_token_feature = last_hidden_states[:, 0, :] # first token only

        if mode == "cls_based":
            return cls_token_feature, None

        # Attention-weighted sum over the sequence axis, joined with the
        # first-token state along the feature axis.
        weights = self.attention(last_hidden_states)
        attended = torch.sum(weights * last_hidden_states, dim=1)
        return torch.cat([attended, cls_token_feature], dim = -1), weights

    def forward(self, inputs):
        """Return ``(logits, weights)`` for a batch of tokenized inputs."""
        feature, weights = self.feature(inputs)
        output = self.fc(feature)
        return output, weights


# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center"> Helper Functions for Training  </h1></span>

<font size="3">A few important functions are created here.</font>

1. <i>AverageMeter</i> - To compute & store the average
2. <i>asMinutes</i> - To calculate the time
3. <i>timeSince</i> - To compute training & validation time
4. <i>train_fn</i> - Calculation of forward & backward pass for a single epoch in training data
5. <i>valid_fn</i> - Forward pass only (no gradients, no backward pass) for a single epoch over the validation data


In [None]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the latest value and the running weighted mean of a scalar."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, the mean, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as seen ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (remain <estimate>)'`` for a task started at ``since``.

    ``percent`` is the fraction of the work already done. The remaining time
    is extrapolated linearly: total = elapsed / percent.
    """
    elapsed = time.time() - since
    remaining = elapsed / percent - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


def train_fn(train_loader, model, criterion, optimizer, epoch, device):
    """Train ``model`` for one epoch and return the average training loss.

    Args:
        train_loader: Iterable of ``(inputs, labels)`` batches, where
            ``inputs`` is a mapping of tensors that includes ``attention_mask``.
        model: Callable returning ``(logits, attention_weights)``.
        criterion: Loss taking ``(logits, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        epoch: Zero-based epoch index, used only in the progress line.
        device: Device the batch tensors are moved to.

    Returns:
        The batch-size-weighted mean of the per-batch loss. Each batch loss
        is recorded after division by ``CFG.gradient_accumulation_steps``.
    """
    # Enabling Model Training Mode
    model.train()

    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex) # Automatic Mixed Precision (AMP) loss scaling
    losses = AverageMeter()
    start = time.time()
    num_batches = len(train_loader)
    # Shown in the progress line until an optimizer step has produced a real norm.
    grad_norm = 0.0

    for step, (inputs, labels) in enumerate(train_loader): # iterate over the training data
        inputs = collate(inputs) # trim padding down to the longest sequence in the batch

        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds, _ = model(inputs)
            loss = criterion(y_preds.view(-1, CFG.num_class), labels.view(-1))

        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps

        scaler.scale(loss).backward() # backpropagation on the scaled loss

        losses.update(loss.item(), batch_size)

        # Also step on the final batch so a partial accumulation window is
        # applied rather than left in .grad for the next epoch.
        is_last_batch = step == num_batches - 1
        if (step + 1) % CFG.gradient_accumulation_steps == 0 or is_last_batch:
            # Gradients are still multiplied by the loss scale here. Unscale
            # them first so max_grad_norm is compared against true gradient
            # magnitudes, and clip once per optimizer step, not per micro-batch.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        if step % CFG.print_freq == 0 or is_last_batch:
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  .format(epoch+1, step, num_batches, 
                          remain=timeSince(start, float(step+1)/num_batches),
                          loss=losses,
                          grad_norm=grad_norm))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on ``valid_loader`` without computing gradients.

    Returns:
        ``(average_loss, predictions)`` where ``predictions`` is the
        concatenation of every batch's model output as a NumPy array, in
        loader order.
    """
    model.eval()
    losses = AverageMeter()
    batch_outputs = []
    started = time.time()
    total = len(valid_loader)

    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for name in list(inputs.keys()):
            inputs[name] = inputs[name].to(device)
        labels = labels.to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            y_preds, _ = model(inputs)
            loss = criterion(y_preds.view(-1, CFG.num_class), labels.view(-1))

        # Same division the training loop applies to its loss.
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps

        losses.update(loss.item(), labels.size(0))
        batch_outputs.append(y_preds.to('cpu').numpy())

        if step % CFG.print_freq == 0 or step == total - 1:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, total,
                          loss=losses,
                          remain=timeSince(started, float(step+1)/total)))

    return losses.avg, np.concatenate(batch_outputs)

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center"> Training Loop </h1></span>

In [None]:
from sklearn.metrics import f1_score

def get_score(y_trues, y_preds):
    """Return the macro-averaged F1 of ``y_preds`` against ``y_trues``.

    ``y_preds`` holds one row of per-class scores per sample; the predicted
    class is the column with the highest score.
    """
    return f1_score(y_trues, y_preds.argmax(axis=1), average='macro')

In [None]:
# ====================================================
# train loop
# ====================================================
def train_loop():
    """Train on ``train``, validate on ``valid`` each epoch, keep the best model.

    After every epoch the validation macro-F1 is computed; whenever it beats
    the best so far, the model weights and validation predictions are saved
    to ``<OUTPUT_DIR><model>_score<score>_best.pth``.

    Returns:
        ``(valid, best_score)``: the validation frame with a ``pred_label``
        column holding the best epoch's predicted classes, and the best score.

    Raises:
        RuntimeError: If no epoch was run, so there is nothing to report.
    """
    # ====================================================
    # loader
    # ====================================================
    train_dataset = TrainDataset(CFG, train) # training dataset formatting 
    valid_dataset = TrainDataset(CFG, valid) # validation dataset formatting

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True) # train dataloader
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False) # validation dataloader

    valid_labels = valid[CFG.target_cols].values
    
    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)  # initializing the model
    torch.save(model.config, OUTPUT_DIR+'config.pth') # saving the model configuration 
    model.to(device) # GPU Config
    
    # Pass weight_decay explicitly so CFG.weight_decay actually takes effect
    # instead of the optimizer's built-in default.
    optimizer = AdamW(model.parameters(), lr=CFG.learning_rate, eps=CFG.eps,
                      betas=CFG.betas, weight_decay=CFG.weight_decay)
    
    criterion = nn.CrossEntropyLoss(reduction="mean")
    # Start below any possible score so the first epoch always writes a
    # checkpoint, even when its score is 0.
    best_score = float('-inf')
    best_predictions = None

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train function 
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, device)

        # eval function 
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        
        if best_score < score: # Saving the best model w.r.t the score 
            best_score = score
            best_predictions = predictions
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_score{best_score:.4f}_best.pth")

    if best_predictions is None:
        raise RuntimeError("train_loop ran no epochs; check CFG.epochs")

    # The best epoch's predictions are already in memory, so the checkpoint
    # (which also holds the full model weights) is not read back from disk.
    valid["pred_label"] = best_predictions.argmax(axis=1).tolist()

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid, best_score

In [None]:
# ====================================================
# the training
# ====================================================
    
# Run training only when this file is the entry point and CFG.train is set.
# train_loop returns the validation frame (with predictions added) and the
# best score; the best score is needed later to rebuild the checkpoint name.
if __name__ == '__main__':
    
    if CFG.train:
        valid, best_score = train_loop()

In [None]:
#sdhgsdf

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center">  Inference </h1></span>


<font color='#3498DB'> <h3> <a id ="section11a"> <b> Configuration for Inference</b> </a> </h3> </font>

<font size="3"> The configuration needed for inference is defined here, along with the logger and the helper functions it uses.</font>


In [None]:
# ====================================================
# CFG for testing
# ====================================================

# Inference-time settings. Most values are copied from CFG as it stands when
# this cell runs, so CFG.num_class and CFG.mode must already be final.
class CFG_Test:
    num_workers=4
    path="./"  # directory holding config.pth, tokenizer/ and the checkpoint
    config_path=path+'config.pth'  # model config written by train_loop
    model=CFG.model
    batch_size=CFG.batch_size
    target_cols=CFG.target_cols
    seed=CFG.seed
    num_class = CFG.num_class
    mode = CFG.mode
    
# Reload the tokenizer copy saved to <path>/tokenizer/ rather than fetching it again.
CFG_Test.tokenizer = AutoTokenizer.from_pretrained(CFG_Test.path+'tokenizer/') # load the saved pretrained tokenizer

In [None]:
def get_logger(filename='inference'): # inference logger file
    """Return this module's logger, writing to the console and ``<filename>.log``.

    Args:
        filename: Log file path without the ``.log`` suffix.

    Returns:
        The ``logging.Logger`` named after ``__name__``, set to INFO, with
        exactly one stream handler and one file handler attached. Both emit
        the bare message with no timestamp or level prefix.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger hands back the same object that any earlier call configured.
    # Remove and close those handlers first; otherwise each message would be
    # printed once per call and would also keep going to the older log file.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    formatter = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger

# Rebind LOGGER to the inference logger (default log file: inference.log).
LOGGER = get_logger()

<font color='#3498DB'> <h3> <b> Model Loading for Inference</b> </h3> </font>

<font size="3"> Dataset for predicting on the test data and Model Loading for inference are done in this section </font>

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize ``text`` into unpadded ``torch.long`` tensors for inference.

    No padding is applied here; batches are expected to be padded by the
    DataLoader's collate function.

    NOTE: this definition replaces the earlier fixed-length ``prepare_input``
    used for training once this cell has run.

    Args:
        cfg: Config object providing ``tokenizer`` and, optionally, ``max_len``.
        text: The raw string to encode.

    Returns:
        The mapping returned by ``cfg.tokenizer.encode_plus`` with every value
        converted to a ``torch.long`` tensor.
    """
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True,
        # Truncate over-long texts so they cannot exceed the encoder's
        # position limit. With max_length=None the tokenizer falls back to
        # its own model maximum length.
        truncation=True,
        max_length=getattr(cfg, "max_len", None))
    
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Map-style dataset yielding tokenized inputs only (no labels).

    Args:
        cfg: Config object forwarded to ``prepare_input``.
        df: DataFrame with a ``text`` column.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        # Missing texts become empty strings, matching how the training-side
        # length scan treats them; a NaN would not be a valid tokenizer input.
        self.texts = df['text'].fillna("").values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized form of the ``item``-th text."""
        inputs = prepare_input(self.cfg, self.texts[item])
        return inputs

<font color='#3498DB'> <h3> <b> Prediction on Test Data</b></h3> </font>

<font size="3"> An inference function is defined for predicting on the test data. The best checkpoint saved during training is then loaded, and its predictions on the test dataset are taken as the model's final predictions. </font>

In [None]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` over ``test_loader`` and return all outputs as one array.

    The model is put in eval mode and moved to ``device``. Each batch is a
    mapping of tensors. Only the first element of the model's return value is
    kept, and the per-batch results are concatenated in loader order.
    """
    model.eval()
    model.to(device)
    collected = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        for key in list(inputs.keys()):
            inputs[key] = inputs[key].to(device)
        with torch.no_grad():
            y_preds, _ = model(inputs) # considering the logits only
        collected.append(y_preds.to('cpu').numpy())
    return np.concatenate(collected)

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from transformers import DataCollatorWithPadding
test_dataset = TestDataset(CFG_Test, test)
# Samples are tokenized without padding, so the collator pads each batch
# (padding='longest') before it reaches the model.
test_loader = DataLoader(test_dataset,
                         batch_size=CFG_Test.batch_size,
                         shuffle=False,
                         collate_fn=DataCollatorWithPadding(tokenizer=CFG_Test.tokenizer, padding='longest'))
                         

# Rebuild the architecture from the saved config (no pretrained weights),
# then restore the trained weights from the best checkpoint. The checkpoint
# name embeds best_score, so the training cell must have run in this session.
model = CustomModel(CFG_Test, config_path=CFG_Test.config_path, pretrained=False)
state = torch.load(CFG_Test.path+f"{CFG_Test.model.replace('/', '-')}_score{best_score:.4f}_best.pth",
                   map_location=torch.device('cpu')) # loading the saved model

model.load_state_dict(state['model'])
prediction = inference_fn(test_loader, model, device)
# Drop the model and checkpoint, then release memory.
del model, state; gc.collect()
torch.cuda.empty_cache()
    


# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center"> Evaluation on Test Dataset </h1></span>


In [None]:
# Pick the highest-scoring class per row and store it next to the test rows.
final_prediction = prediction.argmax(axis = 1)
test['Prediction'] = final_prediction
display(test.head())

In [None]:
from sklearn.metrics import roc_auc_score, classification_report

# Print the classification report for the test labels vs. the predicted
# classes, formatted to four decimal places.
print('\nThe Classification Report is as follows\n')
print(classification_report(test['label'].tolist(), test['Prediction'].tolist(), digits = 4))

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#C01F4D; border-radius: 100px 100px; text-align:center"> Thanks for Reading </h1></span>