<a href="https://www.kaggle.com/code/feezakhankhanzada/training-bert-for-prediction-beginner?scriptVersionId=100935341" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Libraries**

In [None]:
import numpy as np 
import pandas as pd 
import os
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AdamW
from torch.utils.data import DataLoader
import torch.nn as nn
import transformers
import copy
from collections import defaultdict
import gc
from tqdm import tqdm
from sklearn.model_selection import GroupKFold, KFold
import joblib
from torch.optim import lr_scheduler
import time
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig

# **Configurations**

In [None]:
# Number of passes over the training data (passed to run_training as num_epochs).
EPOCHS = 2
# Directory of the pretrained checkpoint; also used below to load the tokenizer.
MODEL_NAME = "../input/bert-base-uncased"
# Batch size of the training DataLoader built in prepare_loaders().
TRAIN_BATCH_SIZE = 8
# Batch size intended for evaluation-time loaders.
VALID_BATCH_SIZE = 16
# Token budget per example; passed as max_length to the training dataset.
MAX_LEN = 512
# Initial learning rate handed to the optimizer.
LEARNING_RATE = 1e-5
# Scheduler selector read by fetch_scheduler(): 'CosineAnnealingLR',
# 'CosineAnnealingWarmRestarts' or None (no scheduler).
SCHEDULER = 'CosineAnnealingLR'
# Passed as eta_min to the cosine schedulers.
MIN_LR = 1e-6
# Passed as T_max (CosineAnnealingLR) or T_0 (CosineAnnealingWarmRestarts).
T_MAX = 500
# Passed as weight_decay to the optimizer. The identifier is spelled "WEIGTH";
# keep the spelling in sync with every use site.
WEIGTH_DECAY = 1e-6
# NOTE(review): presumably the number of cross-validation folds — confirm where it is consumed.
NFOLDS = 5
# train_one_epoch() steps the optimizer once every NACCUMULATE batches.
NACCUMULATE = 1
# NOTE(review): presumably the number of effectiveness classes — confirm against the model head.
NCLASSES = 3
# Tokenizer loaded once at import time from the same directory as the model.
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)

# **GPU**

In [None]:
import torch

# Use the first CUDA device when one is visible, otherwise the CPU.
# Both cases produce a torch.device (the CPU case used to be the bare string
# "cpu"), so downstream code can rely on attributes such as `device.type`.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# **Reading Dataset**

In [None]:
# Load the training table. Later cells read its essay_id, discourse_text,
# discourse_type and discourse_effectiveness columns.
train = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/train.csv')

def fetchEssay(essay_id: str,
               essay_dir: str = '../input/feedback-prize-effectiveness/train/') -> str:
    """
    Read the text file of the specific essay_id

    Args:
        essay_id: File stem of the essay; ``.txt`` is appended to it.
        essay_dir: Directory that holds the essay files. Defaults to the
            training-essay folder used by this notebook.

    Returns:
        The complete contents of the essay file as one string.

    Raises:
        OSError: If the file does not exist or cannot be read.
    """
    essay_path = os.path.join(essay_dir, essay_id + '.txt')
    # The context manager closes the handle immediately. This function is
    # applied once per row, so relying on garbage collection would leave many
    # descriptors open at once. The encoding is pinned so the result does not
    # depend on the platform locale.
    with open(essay_path, 'r', encoding='utf-8') as essay_file:
        return essay_file.read()

# Attach the full essay text to every row by reading the file named after its
# essay_id (one file read per row).
train['essay_text'] = train['essay_id'].apply(fetchEssay)

In [None]:
# Replace the effectiveness labels with integer class ids, in place.
# The fitted encoder stays at module level so the id -> label mapping remains
# available through encoder.classes_.
encoder = LabelEncoder()
encoder.fit(train['discourse_effectiveness'])
train['discourse_effectiveness'] = encoder.transform(train['discourse_effectiveness'])

# **Training Dataset**

In [None]:
class FeedBackDataset(Dataset):
    """Training dataset: one tokenized example per row of ``df``.

    Each item is built from the text
    ``discourse_text [SEP] essay_text [SEP] discourse_type`` (using the
    tokenizer's own separator token), truncated to ``max_length`` tokens and
    left unpadded, so batches need a padding collate function.

    Args:
        df: Frame with ``discourse_text``, ``essay_text``, ``discourse_type``
            and ``discourse_effectiveness`` columns.
        tokenizer: Object exposing ``sep_token`` and ``encode_plus``.
        max_length: Maximum number of tokens kept per example.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_length = max_length
        self.tokenizer = tokenizer
        # Every column is stored as a positional array. A pandas Series
        # indexed with `[i]` looks up by *label*, so a frame that was
        # filtered, shuffled or split into folds (non-default index) would
        # raise KeyError or silently return another row's value.
        self.discourse = df['discourse_text'].values
        self.essay = df['essay_text'].values
        self.discourse_type = df['discourse_type'].values
        self.targets = df['discourse_effectiveness'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``target`` for row ``index``."""
        separator = " " + self.tokenizer.sep_token + " "
        # NOTE(review): the text is encoded as a single sequence, so truncation
        # presumably removes tokens from the end; for long essays the trailing
        # discourse_type may be cut off — confirm this ordering is intended.
        text = separator.join(
            (self.discourse[index], self.essay[index], self.discourse_type[index])
        )
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
        )

        return {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'target': self.targets[index],
        }

In [None]:
# Batch collator for the training DataLoader. The training dataset returns
# unpadded encodings, so padding is delegated to this collator, configured
# with the same tokenizer.
collate_fn = DataCollatorWithPadding(tokenizer=TOKENIZER)

# **Training Model**

In [None]:

class FeedBackModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        model_name: Path or identifier of the pretrained BERT checkpoint.
            It is now honoured; previously a hard-coded path was loaded
            regardless of the argument.
        num_classes: Number of output logits. Defaults to 3, the value that
            used to be hard-coded.
        dropout: Dropout probability applied to the pooled output. Defaults
            to 0.4, the value that used to be hard-coded.
    """

    def __init__(self, model_name, num_classes=3, dropout=0.4):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(model_name)
        self.bert_drop = nn.Dropout(dropout)
        # Read the width from the loaded checkpoint instead of assuming 768.
        self.out = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, ids, mask):
        """Return the raw class logits for a batch.

        Args:
            ids: Token-id tensor forwarded to the encoder.
            mask: Attention-mask tensor forwarded as ``attention_mask``.

        Returns:
            The output of the linear head applied to the pooled encoder output.
        """
        # One encoder pass only. The previous code ran BERT twice per batch
        # just to pick element [0] (never used) and element [1].
        encoder_outputs = self.bert(ids, attention_mask=mask)
        pooled_output = encoder_outputs[1]
        return self.out(self.bert_drop(pooled_output))

# **Loss Function**

In [None]:
def criterion(outputs, labels):
    """Return the mean cross-entropy loss of logits ``outputs`` against ``labels``."""
    return nn.functional.cross_entropy(outputs, labels)

# **Training Function**

In [None]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader``.

    Gradients are accumulated over ``NACCUMULATE`` batches before each
    optimizer (and scheduler) step.

    Args:
        model: Module called as ``model(ids, mask)`` and returning logits.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler stepped after every optimizer
            step, or ``None``.
        dataloader: Yields dicts with ``input_ids``, ``attention_mask`` and
            ``target`` tensors.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, used only for the progress-bar display.

    Returns:
        The sample-weighted mean training loss of the epoch, or ``0.0`` when
        the dataloader yields no batches.
    """
    model.train()

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader returns a value instead of
    # raising UnboundLocalError.
    epoch_loss = 0.0
    num_steps = len(dataloader)

    bar = tqdm(enumerate(dataloader), total=num_steps)
    for step, data in bar:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)

        outputs = model(ids, mask)
        loss = criterion(outputs, targets)
        # Only the back-propagated value is scaled for accumulation; the
        # reported loss below stays in its natural units.
        (loss / NACCUMULATE).backward()

        # Also step on the last batch so a partial accumulation group is
        # applied rather than leaking stale gradients into the next epoch.
        is_last_step = (step + 1) == num_steps
        if (step + 1) % NACCUMULATE == 0 or is_last_step:
            optimizer.step()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss += loss.item() * batch_size
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss

# **Run Training**

In [None]:
def run_training(model, optimizer, scheduler, device, num_epochs, dataloader=None):
    """Train ``model`` for ``num_epochs`` epochs and record the loss per epoch.

    Args:
        model: Module to train (updated in place).
        optimizer: Optimizer passed through to ``train_one_epoch``.
        scheduler: Learning-rate scheduler or ``None``, passed through.
        device: Device the batches are moved to.
        num_epochs: Number of epochs to run.
        dataloader: Training DataLoader. When omitted, the module-level
            ``train_loader`` is used, which keeps existing calls working.

    Returns:
        ``(model, history)`` where ``history['Train Loss']`` holds one mean
        loss value per epoch.
    """
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    if dataloader is None:
        dataloader = train_loader

    # No best-weights snapshot is taken: nothing here selects a "best" epoch,
    # and deep-copying the state dict only cost memory.
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()

        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=dataloader,
                                           device=device, epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)

    return model, history

# **Data Preparation**

In [None]:
def prepare_loaders():
    """Build the training DataLoader over the module-level ``train`` frame.

    Returns:
        A shuffled DataLoader that uses ``collate_fn`` for batching and drops
        the final incomplete batch.
    """
    dataset = FeedBackDataset(train, tokenizer=TOKENIZER, max_length=MAX_LEN)
    loader_options = dict(
        batch_size=TRAIN_BATCH_SIZE,
        collate_fn=collate_fn,
        num_workers=2,
        shuffle=True,
        pin_memory=True,
        drop_last=True,
    )
    return DataLoader(dataset, **loader_options)

In [None]:
def fetch_scheduler(optimizer):
    """Create the learning-rate scheduler selected by the ``SCHEDULER`` setting.

    Args:
        optimizer: Optimizer the scheduler will drive.

    Returns:
        A ``CosineAnnealingLR`` or ``CosineAnnealingWarmRestarts`` instance
        configured from ``T_MAX`` and ``MIN_LR``, or ``None`` when
        ``SCHEDULER`` is ``None``.

    Raises:
        ValueError: If ``SCHEDULER`` names an unsupported scheduler. This used
            to surface as an UnboundLocalError on the return statement.
    """
    if SCHEDULER is None:
        return None
    if SCHEDULER == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_MAX,
                                              eta_min=MIN_LR)
    if SCHEDULER == 'CosineAnnealingWarmRestarts':
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=T_MAX,
                                                        eta_min=MIN_LR)
    raise ValueError(f"Unsupported SCHEDULER setting: {SCHEDULER!r}")

# **Start Training**

In [None]:
# Create Dataloaders
# Create Dataloaders
# train_loader must stay a module-level name: run_training() picks it up from
# module scope rather than receiving it from this call.
train_loader = prepare_loaders()
model = FeedBackModel(MODEL_NAME)
# Move the parameters to the target device before the optimizer is built.
model.to(device)

# Define Optimizer and Scheduler
# NOTE(review): AdamW is the implementation imported from `transformers`;
# that class is reportedly deprecated in favour of torch.optim.AdamW — confirm
# against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGTH_DECAY)
scheduler = fetch_scheduler(optimizer)

# Returns the trained model and a history dict holding the per-epoch training loss.
model, history = run_training(model, optimizer, scheduler,
                              device=device,
                              num_epochs=EPOCHS)


# **Test Dataset**

In [None]:
# Directory containing the test essays; get_essay() reads "<essay_id>.txt" from it.
TEST = "../input/feedback-prize-effectiveness/test"

In [None]:
def get_essay(essay_id, essay_dir=None):
    """Return the full text of the test essay ``essay_id``.

    Args:
        essay_id: File stem of the essay; ``.txt`` is appended to it.
        essay_dir: Directory holding the essay files. When omitted, the
            module-level ``TEST`` directory is used.

    Returns:
        The complete contents of the essay file as one string.

    Raises:
        OSError: If the file does not exist or cannot be read.
    """
    base_dir = TEST if essay_dir is None else essay_dir
    essay_path = os.path.join(base_dir, f"{essay_id}.txt")
    # Close the handle deterministically and pin the encoding so the result
    # does not depend on the platform locale.
    with open(essay_path, 'r', encoding='utf-8') as essay_file:
        return essay_file.read()

In [None]:
# Load the test table and attach each row's full essay text, mirroring the
# essay_text column built for the training frame.
df = pd.read_csv("../input/feedback-prize-effectiveness/test.csv")
df['essay_text'] = df['essay_id'].apply(get_essay)
# Notebook preview of the first rows.
df.head()

In [None]:
class FeedBackDataset(Dataset):
    """Inference dataset: one tokenized, fixed-length example per row of ``df``.

    This definition replaces the training-time class of the same name. It has
    no targets, pads every example to ``max_length``, and returns tensors
    under the keys ``ids`` and ``mask``.

    Args:
        df: Frame with ``discourse_text``, ``essay_text`` and
            ``discourse_type`` columns.
        tokenizer: Object exposing ``sep_token`` and ``encode_plus``.
        max_length: Exact number of tokens per returned example.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        # Every column is stored as a positional array. A pandas Series
        # indexed with `[i]` looks up by *label*, so a frame with a
        # non-default index would raise KeyError or return another row.
        self.discourse = df['discourse_text'].values
        self.essay = df['essay_text'].values
        self.discourse_type = df['discourse_type'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the ``ids`` and ``mask`` long tensors for row ``index``."""
        separator = " " + self.tokenizer.sep_token + " "
        # Same text layout as at training time:
        # discourse [SEP] essay [SEP] discourse_type.
        text = separator.join(
            (self.discourse[index], self.essay[index], self.discourse_type[index])
        )
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }

In [None]:
# Build the inference dataset and loader from the constants the configuration
# cell actually defines. MAX_LENGTH and TEST_BATCH_SIZE were never assigned
# anywhere in this file and raised NameError here.
test_dataset = FeedBackDataset(df, TOKENIZER, max_length=MAX_LEN)
# shuffle=False keeps the predictions in the same row order as `df`.
test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE,
                         num_workers=2, shuffle=False, pin_memory=True)


In [None]:
import torch.nn.functional as F

@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and return class probabilities.

    Args:
        model: Module called as ``model(ids, mask)`` and returning logits.
        dataloader: Yields dicts with ``ids`` and ``mask`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        A NumPy array of softmax probabilities (taken over dim 1), with the
        per-batch results concatenated in iteration order.

    Raises:
        ValueError: Raised by ``np.concatenate`` when the dataloader yields
            no batches.
    """
    model.eval()

    batch_probs = []
    for data in tqdm(dataloader, total=len(dataloader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)

        logits = model(ids, mask)
        probs = F.softmax(logits, dim=1)
        batch_probs.append(probs.cpu().detach().numpy())

    gc.collect()

    return np.concatenate(batch_probs)

In [None]:
def inference(model_paths, dataloader, device):
    """Predict class probabilities, averaged over one or more models.

    The previous body ignored ``model_paths`` and predicted with a freshly
    constructed ``FeedBackModel`` whose classification head was never
    trained. It also moved that model to an undefined ``DEVICE`` name.

    Args:
        model_paths: A trained ``nn.Module``, a single checkpoint path, or an
            iterable mixing modules and checkpoint paths. Paths are loaded
            as state dicts into a new ``FeedBackModel(MODEL_NAME)``.
        dataloader: Loader passed through to ``valid_fn``.
        device: Device the models and batches are placed on.

    Returns:
        The element-wise mean of the probability arrays returned by
        ``valid_fn`` for each model.

    Raises:
        ValueError: If ``model_paths`` is an empty iterable.
    """
    if isinstance(model_paths, (nn.Module, str, os.PathLike)):
        sources = [model_paths]
    else:
        sources = list(model_paths)
    if not sources:
        raise ValueError("inference() needs at least one model or checkpoint path")

    final_preds = []
    for source in sources:
        if isinstance(source, nn.Module):
            model = source
        else:
            model = FeedBackModel(MODEL_NAME)
            # NOTE: torch.load unpickles the file; only load checkpoints from
            # trusted locations.
            model.load_state_dict(torch.load(source, map_location=device))
        model.to(device)

        final_preds.append(valid_fn(model, dataloader, device))

    return np.mean(np.array(final_preds), axis=0)

In [None]:
# Use the `device` object created in the GPU cell; the upper-case DEVICE name
# is never defined in this file and raised NameError.
preds = inference(model, test_loader, device)

In [None]:
# Notebook display of the prediction array.
preds

In [None]:
# Load the submission template; its probability columns are overwritten below.
sample = pd.read_csv("../input/feedback-prize-effectiveness/sample_submission.csv")
# Notebook preview of the first rows.
sample.head()

In [None]:
# Copy each probability column of `preds` into the submission column of the
# matching class. Column i of `preds` is written to the i-th name below.
# NOTE(review): this presumes the encoder assigned ids 0/1/2 in exactly this
# order and that `sample` rows line up with the test rows — confirm.
for class_index, class_name in enumerate(('Adequate', 'Effective', 'Ineffective')):
    sample[class_name] = preds[:, class_index]

sample.head()

In [None]:
# Write the submission file without the pandas row index.
sample.to_csv('submission.csv', index=False)

References:

https://www.kaggle.com/code/debarshichanda/pytorch-feedback-deberta-v3-baseline
https://www.kaggle.com/code/debarshichanda/feedback-inference