# Data

In [1]:
import pickle
import re

save_pkl = lambda data, filepath: pickle.dump(data, open(filepath, "wb"))
load_pkl = lambda filepath: pickle.load(open(filepath, "rb"))

In [2]:
# load and aggregate raw data
import os
import json

# Specify the folder path containing the JSON files
folder_path = './data'

# files read
files_read_path = "./rank_data/files_read.pkl"
files_read = load_pkl(files_read_path) if os.path.exists(files_read_path) else set()
current_files_read = set()

# Initialize an empty list to aggregate the data
data = []

# Iterate through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.json') and filename not in files_read:
        current_files_read.add(filename)
        
        file_path = os.path.join(folder_path, filename)
        
        # Read and parse JSON data from the file
        with open(file_path, 'r') as json_file:
            file_data = json.load(json_file)
            
            # Assuming each JSON file contains a list of dictionaries
            if isinstance(file_data, list):
                data.extend(file_data)

In [3]:
current_files_read

{'08-16-23.json',
 '08-17-23.json',
 '08-18-23.json',
 '08-19-23.json',
 '08-20-23.json'}

# Curate Data

In [4]:
import random

data = [news for news in data if news["full_text"] != "" and "JavaScript is not available" not in news["full_text"] and "reuters" not in news["link"]]
random.shuffle(data)
len(data)

51579

In [5]:
for news in data:
    if "<p>" in news["summary"]:
        # Regular expression to match content between <p> tags
        pattern = re.compile(r'<p>(.*?)</p>', re.DOTALL)
        matches = pattern.findall(news["summary"])

        # Extracted content from <p> tags
        extracted_content = [re.sub(r'<.*?>', '', match) for match in matches]
        news["summary"] = max(extracted_content, key=len)

In [6]:
data[0]

{'title': 'Sood Charity Foundation fulfilling my dream of becoming teacher: Sonu',
 'summary': 'Sonu Sood said if he wasn\'t an actor, he would have been a teacher. He said his dream is now coming true with \'Sood Charity Foundation\', which is helping many students to pursue education. Calling his mother his "favourite teacher", Sonu shared, "Since my mother was a teacher herself, there was no excuse for me to have secured less marks."',
 'link': 'https://www.mid-day.com/amp/entertainment/bollywood-news/article/my-teachers-have-taught-me-to-be-patient-and-perseverant-says-sonu-sood-23244355?utm_campaign=fullarticle&utm_medium=referral&utm_source=inshorts',
 'image_link': 'https://static.inshorts.com/inshorts/images/v1/variants/jpg/m/2022/09_sep/5_mon/img_1662373782807_419.jpg?',
 'source': 'inshorts',
 'full_text': "My teachers have taught me that patience and perseverance always do wonders in everyone’s life, says Sonu Sood My teachers have taught me that patience and perseverance al

# Load Instruct Model

In [7]:
from datasets import load_dataset
from transformers import (AutoModelForSeq2SeqLM, 
                          AutoTokenizer, 
                          GenerationConfig, 
                          TrainingArguments, 
                          Trainer)
import torch
import time
import os
import evaluate
import pandas as pd
import numpy as np
from math import ceil

2023-08-21 08:29:25.764456: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-21 08:29:27.215233: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-08-21 08:29:27.215372: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [8]:
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  
torch.cuda.device_count()

1

In [9]:
class PeftModel:
    @staticmethod
    def load_base_model(model_path="google/flan-t5-base"):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_path, torch_dtype=torch.bfloat16
        ).to(device)
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        return model, tokenizer

    @staticmethod
    def load_from_peft_adapter(
        base_model_path, peft_model_path, train=False
    ):
        model, tokenizer = self.load_base_model(base_model_path)
        model = PeftModel.from_pretrained(
            model, peft_model_path, torch_dtype=torch.bfloat16, is_trainable=False
        ).to(device)

        model = model.merge_and_unload()

        if train:
            for param in model.parameters():
                param.requires_grad = True

        # merge the adapter to the main model
        return model, tokenizer

    @staticmethod
    def save_peft_adapter(model, tokenizer, model_path):
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)

    @staticmethod
    def merge_peft_and_save(model, tokenizer, model_path):
        model = model.merge_and_unload()
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)

In [10]:
# load instruct model
name = './checkpoint/'
model, tokenizer = PeftModel.load_base_model(model_path=name)

# Training Data Generation
### with instruct model responses

In [11]:
import random
from tqdm import tqdm
import torch
import pandas as pd
from torch.utils.data import Dataset, random_split


def get_summary_prompt(example):
    # word count round off
    multiple = 25
    word_count = len(example["summary"].split())
    word_count = int(round(word_count / multiple)) * multiple

    start_prompt = f'Summarize this news article in {word_count} words.\n\n'
    end_prompt = '\n\nSummary: '

    prompt = start_prompt + example["full_text"] + end_prompt

    return prompt, example["summary"]


def get_title_prompt(example):
    # word count round off
    multiple = 5
    word_count = len(example["title"].split())
    word_count = int(ceil(word_count / multiple)) * multiple

    start_prompt = f'Give a title to the given news article in not more than {word_count} words.\n\n'
    mid_prompt = '\n\nSummary: '
    end_prompt = '\n\nTitle: '

    prompt = start_prompt + example["full_text"] + mid_prompt + example["summary"] + end_prompt
    return prompt, example["title"]

In [12]:
# batching data
def batch_list_of_dicts(data, batch_size):
    """
    Batch a list of dictionaries with a given batch size.
    
    Args:
    data (list): List of dictionaries to be batched.
    batch_size (int): The desired batch size.
    
    Returns:
    list: A list of batches, where each batch is a list of dictionaries.
    """
    batched_data = []
    
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        batched_data.append(batch)
        
    return batched_data

batch_size = 200
batches = batch_list_of_dicts(data, batch_size)

In [None]:
# batched generation
training_data = []

for news_batch in tqdm(batches):
    prompt_batch = []
    human_label_batch = []
    
    for news in news_batch:
        for prompt, human_label in [get_summary_prompt(news), get_title_prompt(news)]:
            prompt_batch.append(prompt)
            human_label_batch.append(human_label)
    
    input_ids = tokenizer(prompt_batch, return_tensors="pt", truncation=True, padding=True).input_ids.to("cuda")

    model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
    model_text_output_batch = [tokenizer.decode(model_outputs[i], skip_special_tokens=True) for i in range(len(model_outputs))]
    
    training_data.extend(list(zip(prompt_batch, human_label_batch, model_text_output_batch)))

 52%|█████▏    | 135/258 [33:08<30:12, 14.73s/it] 

In [None]:
training_data[0]

In [None]:
import pickle


reward_model_training_data_path = "./rank_data/data.pkl"
data_store = load_pkl(filepath=reward_model_training_data_path) if os.path.exists(reward_model_training_data_path) else []

data_store.extend(training_data)
save_pkl(data=data_store, filepath=reward_model_training_data_path)

# update files read
files_read = files_read.union(current_files_read)
save_pkl(files_read, filepath="./rank_data/files_read.pkl")

# Reward Model

In [1]:
import pickle
import re
from tqdm import tqdm 

save_pkl = lambda data, filepath: pickle.dump(data, open(filepath, "wb"))
load_pkl = lambda filepath: pickle.load(open(filepath, "rb"))

In [2]:
training_data = load_pkl("./rank_data/data.pkl")

In [3]:
training_data[-200]

('Summarize this news article in 50 words.\n\nA former BJP MLA from Jharkhand took law in his hands and decided to punish a youth for allegedly make videos of women while bathing. In the video, former Jarmundi MLA Devendra Kunwar can be seen telling the youth to do sit-ups. The leader then asks him to spit and lick. As he follows the instructions of the former legislator, Kunwar kicks him. The youth can be seen weeping in the video. The inhumane punishment was given at the ‘kangaroo court’ of Kunwar, during a gram sabha meeting. Some of the spectators could be seen cheering and recording the video. Reacting to the viral video, Kunwar called the incident “politically motivated" and said that the video was portrayed in a “wrong manner". Kunwar said that the youth who was punished used to make videos of women while they were bathing. “So the villagers caught the accused and brought him to the panchayat, where the decision was taken in front of the gram sabha.\n\nSummary: ',
 'A video purp

In [4]:
training_data[-199]

('Give a title to the given news article in not more than 15 words.\n\nA former BJP MLA from Jharkhand took law in his hands and decided to punish a youth for allegedly make videos of women while bathing. In the video, former Jarmundi MLA Devendra Kunwar can be seen telling the youth to do sit-ups. The leader then asks him to spit and lick. As he follows the instructions of the former legislator, Kunwar kicks him. The youth can be seen weeping in the video. The inhumane punishment was given at the ‘kangaroo court’ of Kunwar, during a gram sabha meeting. Some of the spectators could be seen cheering and recording the video. Reacting to the viral video, Kunwar called the incident “politically motivated" and said that the video was portrayed in a “wrong manner". Kunwar said that the youth who was punished used to make videos of women while they were bathing. “So the villagers caught the accused and brought him to the panchayat, where the decision was taken in front of the gram sabha.\n\nS

In [5]:
from torchinfo import summary
import numpy as np
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

2023-08-22 10:14:35.173757: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-22 10:14:36.598096: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-08-22 10:14:36.598220: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [6]:
import os
from torch import cuda

os.environ["CUDA_VISIBLE_DEVICES"] = "2"  
torch.cuda.device_count()

device = 'cuda' if cuda.is_available() else 'cpu'

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.data = data
        self.max_len = tokenizer.model_max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # [prompt, human text, model text]
        inputs = self.tokenizer(self.data[index],             
                                add_special_tokens=True,
                                max_length=self.max_len,
                                pad_to_max_length=True,
                                return_token_type_ids=True,
                                truncation=True,
                                padding="max_length")

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }

In [9]:
train_ratio = 0.8
train_size = int(len(training_data) * train_ratio)

train_data = training_data[:train_size].copy()
test_data = training_data[train_size:].copy()

training_set = CustomDataset(train_data, tokenizer)
testing_set = CustomDataset(test_data, tokenizer)

In [10]:
training_set[0]

{'ids': tensor([[ 101, 7680, 7849,  ..., 2335, 1010,  102],
         [ 101, 2365, 2226,  ...,    0,    0,    0],
         [ 101, 3364, 2365,  ...,    0,    0,    0]]),
 'mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]])}

In [11]:
BATCH_SIZE = 16

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
                }

trainloader = DataLoader(training_set, **train_params)
testloader = DataLoader(testing_set, **test_params)

In [19]:
import torch.nn as nn


class RewardModel(torch.nn.Module):
    def __init__(self, dropout=0.3):
        super(RewardModel, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        
        # for param in self.l1.parameters():
        #     param.requires_grad = False
        
        self.l2 = nn.Sequential(
            nn.Linear(768, 1536),
            nn.BatchNorm1d(1536),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(1536, 1)
        )
        self.outl = nn.Sigmoid()
    
    def forward(self, ids, mask, token_type_ids):
        """
        ids shape = [batch, 3, 512]
        3: prompt, human_text, model_text
        """  
        prompt = self.l1(ids[:, 0], 
                         attention_mask=mask[:, 0], 
                         token_type_ids=token_type_ids[:, 0]).pooler_output
        
        human_text = self.l1(ids[:, 1], 
                             attention_mask=mask[:, 1], 
                             token_type_ids=token_type_ids[:, 1]).pooler_output
            
        model_text = self.l1(ids[:, 2], 
                             attention_mask=mask[:, 2], 
                             token_type_ids=token_type_ids[:, 2]).pooler_output

        human_score = self.l2(prompt + human_text)
        model_score = self.l2(prompt + model_text)
        
        return self.outl(human_score - model_score)
    
    def predict(self, ids, mask, token_type_ids):
        """
        ids shape: [batch, 2, 512]
        2: prompt, text
        """
        prompt = self.l1(ids[:, 0], 
                         attention_mask=mask[:, 0], 
                         token_type_ids=token_type_ids[:, 0]).pooler_output
        
        text = self.l1(ids[:, 1], 
                       attention_mask=mask[:, 1], 
                       token_type_ids=token_type_ids[:, 1]).pooler_output
        
        return self.l2(prompt + text)

In [20]:
# model
model = RewardModel(dropout=0.01)
model.to(device)

# sample input
sample_input = next(iter(trainloader))

ids = sample_input["ids"].to(device)
mask = sample_input["mask"].to(device)
token_type_ids = sample_input["token_type_ids"].to(device)

summary(model, input_data=[ids, mask, token_type_ids])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Layer (type:depth-idx)                                  Output Shape              Param #
RewardModel                                             [16, 1]                   --
├─BertModel: 1-1                                        [16, 768]                 --
│    └─BertEmbeddings: 2-1                              [16, 512, 768]            --
│    │    └─Embedding: 3-1                              [16, 512, 768]            23,440,896
│    │    └─Embedding: 3-2                              [16, 512, 768]            1,536
│    │    └─Embedding: 3-3                              [1, 512, 768]             393,216
│    │    └─LayerNorm: 3-4                              [16, 512, 768]            1,536
│    │    └─Dropout: 3-5                                [16, 512, 768]            --
│    └─BertEncoder: 2-2                                 [16, 512, 768]            --
│    │    └─ModuleList: 3-22                            --                        (recursive)
│    └─BertPooler: 2-3          

In [21]:
import torch.optim as optim

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [22]:
def train(epoch, model, dataloader, optimizer, criterion, device):
    model.train()
    
    correct = 0
    processed = 0
    total_loss = 0
    
    pbar = tqdm(dataloader)
    
    for idx, data in enumerate(pbar):
        ids = data["ids"].to(device)
        mask = data["mask"].to(device)
        token_type_ids = data["token_type_ids"].to(device)
        
        batch_size = ids.size()[0]
        optimizer.zero_grad()
        
        pred = model(ids, mask, token_type_ids)
        target = torch.from_numpy(np.ones(shape=(batch_size, 1))).float().to(device)
        
        loss = criterion(pred, target)
        loss.backward()
        optimizer.step()
        
        correct += (
            sum((pred.detach().squeeze(-1) >= 0.50).float() == target.squeeze(-1))
            .detach()
            .item()
        )
        processed += pred.detach().shape[0]
        total_loss += loss.detach().item()
        
        # tqdm writing
        pbar.set_description(
            desc="Train Epoch: {epoch}, Mini Batch: {batch}, Train Accuracy: {accuracy}, Train Loss: {loss}".format(
                epoch=epoch,
                batch=idx+1,
                accuracy=round((correct / processed) * 100, 4),
                loss=round(total_loss / (idx+1), 4)
            )
        )
    return round((correct / processed) * 100, 4), round(total_loss / (idx+1), 4)

In [23]:
def test(epoch, model, dataloader, criterion, device):
    model.eval()
    
    correct = 0
    processed = 0
    total_loss = 0
    
    pbar = tqdm(dataloader)
    
    with torch.no_grad():
        for idx, data in enumerate(pbar):
            ids = data["ids"].to(device)
            mask = data["mask"].to(device)
            token_type_ids = data["token_type_ids"].to(device)

            batch_size = ids.size()[0]

            pred = model(ids, mask, token_type_ids)
            target = torch.from_numpy(np.ones(shape=(batch_size, 1))).float().to(device)

            loss = criterion(pred, target)

            correct += (
                sum((pred.detach().squeeze(-1) >= 0.50).float() == target.squeeze(-1))
                .detach()
                .item()
            )
            processed += pred.detach().shape[0]
            total_loss += loss.detach().item()

            # tqdm writing
            pbar.set_description(
                desc="Test Epoch: {epoch}, Mini Batch: {batch}, Test Accuracy: {accuracy}, Test Loss: {loss}".format(
                    epoch=epoch,
                    batch=idx+1,
                    accuracy=round((correct / processed) * 100, 4),
                    loss=round(total_loss / (idx+1), 4)
                )
            )
        return round((correct / processed) * 100, 4), round(total_loss / (idx+1), 4)

In [24]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        
        if param.requires_grad:
            trainable_model_params += param.numel()
    
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 110668033
all model parameters: 110668033
percentage of trainable model parameters: 100.00%


In [None]:
EPOCHS = 5

for epoch in range(EPOCHS):
    train_accuracy, train_loss = train(epoch, model, trainloader, optimizer, criterion, device)
    test_accuracy, test_loss = test(epoch, model, testloader, criterion, device)

Train Epoch: 0, Mini Batch: 5158, Train Accuracy: 51.8612, Train Loss: 0.7027: 100%|██████████| 5158/5158 [2:04:11<00:00,  1.44s/it]
Test Epoch: 0, Mini Batch: 1290, Test Accuracy: 100.0, Test Loss: 0.6931: 100%|██████████| 1290/1290 [16:21<00:00,  1.31it/s]
Train Epoch: 1, Mini Batch: 5158, Train Accuracy: 73.1672, Train Loss: 0.6949: 100%|██████████| 5158/5158 [2:04:09<00:00,  1.44s/it]  
Test Epoch: 1, Mini Batch: 1290, Test Accuracy: 100.0, Test Loss: 0.6931: 100%|██████████| 1290/1290 [16:20<00:00,  1.32it/s]
Train Epoch: 3, Mini Batch: 5158, Train Accuracy: 98.0903, Train Loss: 0.6932: 100%|██████████| 5158/5158 [2:02:43<00:00,  1.43s/it]  
Test Epoch: 3, Mini Batch: 102, Test Accuracy: 100.0, Test Loss: 0.6931:   8%|▊         | 101/1290 [01:20<15:17,  1.30it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current v

In [28]:
test(0, model, testloader, criterion, device)

Test Epoch: 0, Mini Batch: 1290, Test Accuracy: 100.0, Test Loss: 0.6931: 100%|██████████| 1290/1290 [15:44<00:00,  1.37it/s]


(100.0, 0.6931)

In [27]:
torch.save({"model_state_dict": model.state_dict(), 
            "optimizer_state_dict": optimizer.state_dict()}, 
           "./reward_model_checkpoint/bert_fully_trained_model.pt")

In [None]:
"""
PROMPT: EMBEDDING1
HUMAN SUMMARY: EMBEDDING2
MODEL GENERATED SUMMARY: EMBEDDING3

SIGMOID(LINEAR(EMBEDDING1 + EMBEDDING2) - LINEAR(EMBEDDING1 + EMBEDDING3)) = 1
"""