# Data

In [1]:
import pickle
import re

def save_pkl(data, filepath):
    """Pickle ``data`` into the file at ``filepath``.

    The file is opened in a ``with`` block so the handle is flushed and closed
    even when pickling raises (the previous lambda left the handle open).

    Args:
        data: any picklable object.
        filepath: destination path; an existing file is overwritten.
    """
    with open(filepath, "wb") as handle:
        pickle.dump(data, handle)


def load_pkl(filepath):
    """Return the object stored in the pickle file at ``filepath``.

    NOTE: unpickling can execute arbitrary code, so only load files this
    project wrote itself.

    Args:
        filepath: path of an existing pickle file.

    Returns:
        The unpickled object.
    """
    with open(filepath, "rb") as handle:
        return pickle.load(handle)

In [2]:
# load and aggregate raw data
import os
import json

# Directory holding the raw JSON dumps.
folder_path = './data'

# File names consumed by earlier runs, restored from a pickle when one exists.
files_read_path = "./rank_data/files_read.pkl"
if os.path.exists(files_read_path):
    files_read = load_pkl(files_read_path)
else:
    files_read = set()

# File names picked up by this run, and the records aggregated from them.
current_files_read = set()
data = []

for filename in os.listdir(folder_path):
    # Skip non-JSON entries and anything a previous run already consumed.
    if not filename.endswith('.json') or filename in files_read:
        continue

    current_files_read.add(filename)

    with open(os.path.join(folder_path, filename), 'r') as json_file:
        records = json.load(json_file)

        # Only list payloads are aggregated; any other top-level JSON type is ignored.
        if isinstance(records, list):
            data.extend(records)

In [3]:
current_files_read

{'08-16-23.json',
 '08-17-23.json',
 '08-18-23.json',
 '08-19-23.json',
 '08-20-23.json'}

# Curate Data

In [4]:
import random

# Keep an article only if it has body text, that text does not contain the
# "JavaScript is not available" notice, and its link does not mention reuters.
curated = []
for news in data:
    body = news["full_text"]
    if body == "":
        continue
    if "JavaScript is not available" in body:
        continue
    if "reuters" in news["link"]:
        continue
    curated.append(news)

data = curated
random.shuffle(data)
len(data)

51579

In [5]:
# Both patterns are loop-invariant, so they are compiled once up front.
P_TAG_PATTERN = re.compile(r'<p>(.*?)</p>', re.DOTALL)  # content between <p> tags
ANY_TAG_PATTERN = re.compile(r'<.*?>')                  # any remaining markup

for news in data:
    if "<p>" not in news["summary"]:
        continue

    # Text of every <p>...</p> pair, with nested tags stripped.
    extracted_content = [
        ANY_TAG_PATTERN.sub('', match)
        for match in P_TAG_PATTERN.findall(news["summary"])
    ]

    # A summary may contain "<p>" without a closing "</p>"; max() on the empty
    # list would then raise ValueError, so such summaries are left unchanged.
    if extracted_content:
        # Keep the longest paragraph as the summary.
        news["summary"] = max(extracted_content, key=len)

In [6]:
data[0]

{'title': 'Sood Charity Foundation fulfilling my dream of becoming teacher: Sonu',
 'summary': 'Sonu Sood said if he wasn\'t an actor, he would have been a teacher. He said his dream is now coming true with \'Sood Charity Foundation\', which is helping many students to pursue education. Calling his mother his "favourite teacher", Sonu shared, "Since my mother was a teacher herself, there was no excuse for me to have secured less marks."',
 'link': 'https://www.mid-day.com/amp/entertainment/bollywood-news/article/my-teachers-have-taught-me-to-be-patient-and-perseverant-says-sonu-sood-23244355?utm_campaign=fullarticle&utm_medium=referral&utm_source=inshorts',
 'image_link': 'https://static.inshorts.com/inshorts/images/v1/variants/jpg/m/2022/09_sep/5_mon/img_1662373782807_419.jpg?',
 'source': 'inshorts',
 'full_text': "My teachers have taught me that patience and perseverance always do wonders in everyone’s life, says Sonu Sood My teachers have taught me that patience and perseverance al

# Load Instruct Model

In [7]:
from datasets import load_dataset
from transformers import (AutoModelForSeq2SeqLM, 
                          AutoTokenizer, 
                          GenerationConfig, 
                          TrainingArguments, 
                          Trainer)
import torch
import time
import os
import evaluate
import pandas as pd
import numpy as np
from math import ceil

2023-08-21 08:29:25.764456: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-21 08:29:27.215233: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-08-21 08:29:27.215372: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [8]:
# Restrict this process to GPU index 1, then display how many CUDA devices torch reports.
# NOTE(review): torch is imported in an earlier cell; presumably this only takes
# effect because CUDA has not been initialised yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  
torch.cuda.device_count()

1

In [9]:
class PeftModel:
    """Static helpers for loading, merging and saving seq2seq models and PEFT adapters.

    NOTE: this helper shares its name with ``peft.PeftModel``. Inside this class the
    bare name ``PeftModel`` always means this helper; the library class is imported
    locally under the alias ``HFPeftModel`` where it is needed.
    """

    @staticmethod
    def _default_device():
        """Return the CUDA device when one is available, otherwise the CPU."""
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")

    @staticmethod
    def load_base_model(model_path="google/flan-t5-base"):
        """Load a seq2seq model (in bfloat16) and its tokenizer from ``model_path``.

        Args:
            model_path: hub id or local directory passed to ``from_pretrained``.

        Returns:
            ``(model, tokenizer)`` with the model moved to the default device.
        """
        device = PeftModel._default_device()
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_path, torch_dtype=torch.bfloat16
        ).to(device)
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        return model, tokenizer

    @staticmethod
    def load_from_peft_adapter(
        base_model_path, peft_model_path, train=False
    ):
        """Load a base model, apply a PEFT adapter and merge it into the base weights.

        Args:
            base_model_path: hub id or directory of the base model.
            peft_model_path: directory of the saved PEFT adapter.
            train: when true, every parameter of the merged model is made trainable.

        Returns:
            ``(merged_model, tokenizer)``.
        """
        # Imported under an alias: the bare name ``PeftModel`` is this helper class,
        # which has no ``from_pretrained``.
        from peft import PeftModel as HFPeftModel

        # This is a staticmethod, so there is no ``self``; go through the class.
        model, tokenizer = PeftModel.load_base_model(base_model_path)
        model = HFPeftModel.from_pretrained(
            model, peft_model_path, torch_dtype=torch.bfloat16, is_trainable=False
        ).to(PeftModel._default_device())

        # merge the adapter to the main model
        model = model.merge_and_unload()

        if train:
            for param in model.parameters():
                param.requires_grad = True

        return model, tokenizer

    @staticmethod
    def save_peft_adapter(model, tokenizer, model_path):
        """Save ``model`` and ``tokenizer`` to ``model_path`` via ``save_pretrained``."""
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)

    @staticmethod
    def merge_peft_and_save(model, tokenizer, model_path):
        """Merge the adapter into ``model``, then save model and tokenizer to ``model_path``."""
        model = model.merge_and_unload()
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)

In [10]:
# load instruct model
# `name` is a local checkpoint directory; load_base_model reads both the model
# and the tokenizer from it.
name = './checkpoint/'
model, tokenizer = PeftModel.load_base_model(model_path=name)

# Training Data Generation
### with instruct model responses

In [11]:
import random
from tqdm import tqdm
import torch
import pandas as pd
from torch.utils.data import Dataset, random_split


def get_summary_prompt(example):
    """Build the summarization prompt and its reference answer for one article.

    The requested length is the reference summary's word count rounded to the
    nearest multiple of 25, floored at 25 so the prompt never asks for a
    "0 words" summary when the reference is very short.

    Args:
        example: mapping with string entries ``"summary"`` and ``"full_text"``.

    Returns:
        ``(prompt, reference_summary)``.
    """
    # word count round off
    multiple = 25
    reference_words = len(example["summary"].split())
    rounded = int(round(reference_words / multiple)) * multiple
    # Short references (roughly a dozen words or fewer) round to 0; clamp to one multiple.
    word_count = max(multiple, rounded)

    start_prompt = f'Summarize this news article in {word_count} words.\n\n'
    end_prompt = '\n\nSummary: '

    prompt = start_prompt + example["full_text"] + end_prompt

    return prompt, example["summary"]


def get_title_prompt(example):
    """Build the title-generation prompt and its reference answer for one article.

    The word limit quoted in the prompt is the reference title's word count
    rounded up to the next multiple of 5.

    Args:
        example: mapping with string entries ``"title"``, ``"summary"`` and ``"full_text"``.

    Returns:
        ``(prompt, reference_title)``.
    """
    multiple = 5
    title_words = example["title"].split()
    word_limit = multiple * int(ceil(len(title_words) / multiple))

    instruction = f'Give a title to the given news article in not more than {word_limit} words.\n\n'
    pieces = [
        instruction,
        example["full_text"],
        '\n\nSummary: ',
        example["summary"],
        '\n\nTitle: ',
    ]
    return "".join(pieces), example["title"]

In [12]:
# batching data
def batch(data, batch_size):
    """Split ``data`` into consecutive slices of at most ``batch_size`` items."""
    chunks = []
    for start in range(0, len(data), batch_size):
        chunks.append(data[start:start + batch_size])
    return chunks

# 200 articles per batch; the generation loop builds two prompts (summary and
# title) for every article.
batch_size = 200
batches = batch(data, batch_size)

In [None]:
# batched generation
training_data = []

# Loop-invariant, so built once: single beam, at most 200 new tokens per prompt.
generation_config = GenerationConfig(max_new_tokens=200, num_beams=1)

for news_batch in tqdm(batches):
    prompt_batch = []
    human_label_batch = []

    # Two prompts per article: one asking for a summary, one asking for a title.
    for news in news_batch:
        for prompt, human_label in (get_summary_prompt(news), get_title_prompt(news)):
            prompt_batch.append(prompt)
            human_label_batch.append(human_label)

    # Keep the attention mask with the ids so padded positions are masked explicitly,
    # and follow the model's own device instead of assuming "cuda".
    # NOTE(review): truncation=True may cut the trailing "Summary: " / "Title: " cue
    # off long articles — confirm whether that is acceptable.
    encoded = tokenizer(prompt_batch, return_tensors="pt", truncation=True, padding=True).to(model.device)

    model_outputs = model.generate(
        input_ids=encoded.input_ids,
        attention_mask=encoded.attention_mask,
        generation_config=generation_config,
    )
    model_text_output_batch = tokenizer.batch_decode(model_outputs, skip_special_tokens=True)

    # One (prompt, human label, model output) triple per prompt.
    training_data.extend(zip(prompt_batch, human_label_batch, model_text_output_batch))

 52%|█████▏    | 135/258 [33:08<30:12, 14.73s/it] 

In [None]:
training_data[0]

In [None]:
import pickle


reward_model_training_data_path = "./rank_data/data.pkl"
files_read_store_path = "./rank_data/files_read.pkl"

# On a first run the output directory may not exist yet; create it so the
# generated data is not lost to a failed open().
os.makedirs(os.path.dirname(reward_model_training_data_path), exist_ok=True)

# Append this run's triples to whatever earlier runs already stored.
data_store = load_pkl(filepath=reward_model_training_data_path) if os.path.exists(reward_model_training_data_path) else []

data_store.extend(training_data)
save_pkl(data=data_store, filepath=reward_model_training_data_path)

# update files read
files_read = files_read.union(current_files_read)
save_pkl(files_read, filepath=files_read_store_path)

# Reward Model

In [1]:
!nvidia-smi

Tue Aug 22 12:01:16 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    Off  | 00000000:0C:00.0 Off |                  Off |
| 30%   25C    P8    23W / 300W |   2325MiB / 48685MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000    Off  | 00000000:0D:00.0 Off |                  Off |
| 30%   26C    P8    31W / 300W |      8MiB / 48685MiB |      0%      Default |
|       

In [1]:
import pickle
import re
import json
from tqdm import tqdm 


def save_pkl(data, filepath):
    """Pickle ``data`` into the file at ``filepath``.

    The file is opened in a ``with`` block so the handle is flushed and closed
    even when pickling raises (the previous lambda left the handle open).

    Args:
        data: any picklable object.
        filepath: destination path; an existing file is overwritten.
    """
    with open(filepath, "wb") as handle:
        pickle.dump(data, handle)


def load_pkl(filepath):
    """Return the object stored in the pickle file at ``filepath``.

    NOTE: unpickling can execute arbitrary code, so only load files this
    project wrote itself.

    Args:
        filepath: path of an existing pickle file.

    Returns:
        The unpickled object.
    """
    with open(filepath, "rb") as handle:
        return pickle.load(handle)

In [2]:
# Read the reward-model training records from JSON. Each record is later read
# through the keys "full_text", "summary" and "generated_summary".
with open('../../../data/training/inshorts.json', 'r') as json_file:
    training_data = json.load(json_file)

In [3]:
training_data[0]

{'title': 'Man accused of sexually assaulting daughter granted bail by HC amid matrimonial dispute',
 'summary': 'Delhi HC granted bail to a man accused of sexually assaulting his daughter, noting that it cannot shut its eyes to matrimonial dispute between her parents and his false implication by "tutoring" cannot be ruled out. It observed she has been residing with the mother for over four years. The court also noted there was inordinate delay in FIR registration.',
 'link': 'https://www.outlookindia.com/national/delhi-hc-grants-bail-to-man-accused-of-sexually-assaulting-daughter-news-311110/amp?utm_campaign=fullarticle&utm_medium=referral&utm_source=inshorts',
 'image_link': 'https://static.inshorts.com/inshorts/images/v1/variants/jpg/m/2023/08_aug/16_wed/img_1692206351958_642.jpg?',
 'source': 'inshorts',
 'full_text': 'Delhi HC Grants Bail To Man Accused Of Sexually Assaulting Daughter Justice Vikas Mahajan observed the girl has been residing with the mother for more than 4 years a

In [4]:
# from torchinfo import summary
import numpy as np
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import os

# Prefer the MPS backend (the original target of this notebook), but fall back to
# CUDA or CPU so the notebook does not fail on machines without MPS.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [6]:
# Load the pretrained "gpt2" tokenizer and reuse its EOS token as the padding token
# (presumably because this tokenizer defines no pad token of its own — confirm).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [7]:
class CustomDataset(Dataset):
    """Tokenized pairs for the pairwise reward model.

    Every record yields two fixed-length sequences built from the same article:
    ``i`` ends with the human-written summary and ``j`` ends with the
    model-generated summary. All tokenization happens once, in ``__init__``.
    """

    def __init__(self, data, tokenizer):
        """Tokenize every record of ``data`` up front.

        Args:
            data: iterable of mappings with ``"full_text"``, ``"summary"`` and
                ``"generated_summary"`` string entries.
            tokenizer: callable tokenizer exposing ``model_max_length``.
        """
        self.max_len = tokenizer.model_max_length
        self.tokenizer = tokenizer
        self.data = self.prepare(data)

    def __len__(self):
        """Return the number of prepared pairs."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text``, truncated and padded to exactly ``self.max_len`` tokens.

        The deprecated ``pad_to_max_length`` flag is no longer passed;
        ``padding="max_length"`` already requests the same padding.

        Returns:
            ``(input_ids, attention_mask)`` as returned by the tokenizer.
        """
        encoded = self.tokenizer(text,
                                 max_length=self.max_len,
                                 truncation=True,
                                 padding="max_length")
        return encoded['input_ids'], encoded['attention_mask']

    def prepare(self, prompts):
        """Build the list of ``{i_ids, i_mask, j_ids, j_mask}`` feature dicts.

        NOTE(review): the summary is appended after the article, so if truncation
        drops tokens from the end, long articles could lose part or all of the
        summary and make ``i`` and ``j`` identical — confirm the truncation side.
        """
        print("preparing data..")
        features = []

        for data in tqdm(prompts):
            # handwritten summary
            i_ids, i_mask = self._encode(data["full_text"] + "Summarize: \n" + data["summary"])

            # model generated summary
            j_ids, j_mask = self._encode(data["full_text"] + "Summarize: \n" + data["generated_summary"])

            features.append({
                'i_ids': i_ids,
                'i_mask': i_mask,
                'j_ids': j_ids,
                'j_mask': j_mask
            })
        return features

    def __getitem__(self, index):
        """Return the pair at ``index`` as a dict of ``torch.long`` tensors."""
        data = self.data[index]

        return {
            'i_ids': torch.tensor(data["i_ids"], dtype=torch.long),
            'i_mask': torch.tensor(data["i_mask"], dtype=torch.long),
            'j_ids': torch.tensor(data["j_ids"], dtype=torch.long),
            'j_mask': torch.tensor(data["j_mask"], dtype=torch.long)
        }

In [8]:
# Sequential 80/20 split: the leading records train, the remainder evaluates.
train_ratio = 0.8
train_size = int(train_ratio * len(training_data))

# Slicing already produces new list objects, independent of `training_data`.
train_data, test_data = training_data[:train_size], training_data[train_size:]

# Tokenization runs inside the CustomDataset constructor.
training_set = CustomDataset(train_data, tokenizer)
testing_set = CustomDataset(test_data, tokenizer)

preparing data..


100%|████████████████████████████████████████████████████████| 22064/22064 [01:07<00:00, 327.07it/s]


preparing data..


100%|██████████████████████████████████████████████████████████| 5516/5516 [00:19<00:00, 286.04it/s]


In [9]:
training_set[0]

{'i_ids': tensor([13856,  5303, 27327,  ..., 50256, 50256, 50256]),
 'i_mask': tensor([1, 1, 1,  ..., 0, 0, 0]),
 'j_ids': tensor([13856,  5303, 27327,  ..., 50256, 50256, 50256]),
 'j_mask': tensor([1, 1, 1,  ..., 0, 0, 0])}

In [10]:
BATCH_SIZE = 2

# Loader settings: only the training loader shuffles; neither uses worker processes.
train_params = dict(batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

trainloader = DataLoader(training_set, **train_params)
testloader = DataLoader(testing_set, **test_params)

In [11]:
tokenizer.pad_token_id

50256

In [12]:
import torch.nn as nn
from peft import LoraConfig, get_peft_model, TaskType


class RewardModel(torch.nn.Module):
    """Pairwise reward model: LoRA-adapted GPT-2 backbone plus a linear scalar head.

    ``forward`` scores two sequences ``i`` and ``j`` and returns
    ``sigmoid(score_i - score_j)``; ``predict`` returns the raw scalar score of
    a single sequence.
    """

    def __init__(self, dropout=0.3):
        """Build the backbone, wrap it with LoRA adapters and attach the scoring head.

        Args:
            dropout: accepted for backward compatibility; it is not used by any
                layer of this model.
        """
        super(RewardModel, self).__init__()
        backbone = GPT2Model.from_pretrained("gpt2")
        # Read the width from the checkpoint instead of hard-coding it (768 for "gpt2").
        hidden_size = backbone.config.hidden_size

        lora_config = LoraConfig(
            # task_type=TaskType.SEQ_CLS,
            inference_mode=False,
            r=8,
            lora_alpha=16,
            lora_dropout=0.05,
            bias="none",
        )

        self.l1 = get_peft_model(backbone, lora_config)
        self.l2 = nn.Sequential(
            nn.Linear(hidden_size, 1),
        )
        self.outl = nn.Sigmoid()

    def gpt2(self, ids, attention_mask):
        """Return the backbone's hidden state at the last attended token of each sequence.

        The position is derived from ``attention_mask`` rather than by searching
        ``ids`` for the pad id. That removes the dependency on a notebook-global
        tokenizer and stays correct when the pad id (which equals EOS here) also
        occurs inside the text. Assumes right-padded inputs.

        Args:
            ids: token ids, shape ``[batch, seq_len]``.
            attention_mask: 1 for real tokens and 0 for padding, same shape as ``ids``.

        Returns:
            Tensor of shape ``[batch, hidden_size]``.
        """
        # hidden_states shape: [batch, seq_len, hidden_size]
        hidden_states, _ = self.l1(ids,
                                   attention_mask=attention_mask,
                                   return_dict=False)

        # Index of the last real token. An all-padding row gives -1, i.e. the final
        # position, matching the previous pad-search behaviour.
        last_token_idx = (attention_mask.long().sum(dim=-1) - 1).to(hidden_states.device)

        batch_idx = torch.arange(ids.shape[0], device=hidden_states.device)

        # pooled shape: [batch, hidden_size]
        return hidden_states[batch_idx, last_token_idx]

    def forward(self, input):
        """Return ``sigmoid(score_i - score_j)`` for a batch of sequence pairs.

        Args:
            input: dict with ``"i_ids"``, ``"i_mask"``, ``"j_ids"`` and ``"j_mask"``
                tensors, each of shape ``[batch, seq_len]``.

        Returns:
            Tensor of shape ``[batch, 1]`` with values in (0, 1).
        """
        i = self.gpt2(input["i_ids"], attention_mask=input["i_mask"])
        j = self.gpt2(input["j_ids"], attention_mask=input["j_mask"])

        i = self.l2(i)
        j = self.l2(j)

        return self.outl(i - j)

    def predict(self, input):
        """Return the raw (un-squashed) score of a batch of single sequences.

        Args:
            input: dict with ``"ids"`` and ``"mask"`` tensors of shape ``[batch, seq_len]``.

        Returns:
            Tensor of shape ``[batch, 1]``.
        """
        i = self.gpt2(input["ids"], attention_mask=input["mask"])
        return self.l2(i)

In [13]:
# model
# NOTE(review): RewardModel.__init__ accepts `dropout` but never uses it in its
# body, so the value passed here has no effect on the model.
model = RewardModel(dropout=0.01)



In [14]:
from torchinfo import summary

# sample input
# One batch drawn from the training loader is passed to torchinfo's summary()
# as the model input.
sample_input = next(iter(trainloader))
summary(model, input_data=[sample_input])

Layer (type:depth-idx)                                       Output Shape              Param #
RewardModel                                                  [2, 1]                    --
├─PeftModel: 1-1                                             [2, 1024, 768]            --
│    └─LoraModel: 2-2                                        --                        (recursive)
│    │    └─GPT2Model: 3-1                                   [2, 1024, 768]            124,734,720
├─PeftModel: 1-2                                             [2, 1024, 768]            (recursive)
│    └─LoraModel: 2-2                                        --                        (recursive)
│    │    └─GPT2Model: 3-2                                   [2, 1024, 768]            (recursive)
├─Sequential: 1-3                                            [2, 1]                    --
│    └─Linear: 2-3                                           [2, 1]                    769
├─Sequential: 1-4                                

In [15]:
model.to(device)

RewardModel(
  (l1): PeftModel(
    (base_model): LoraModel(
      (model): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): Linear(
                in_features=768, out_features=2304, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (

In [16]:
import torch.optim as optim

# The model outputs sigmoid(score_i - score_j) and the training/eval loops use an
# all-ones target, so BCE here pushes the human-summary score above the generated one.
criterion = nn.BCELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.01)
optimizer = optim.AdamW(params=model.parameters(), lr=1e-4)

In [17]:
def train(epoch, model, dataloader, optimizer, criterion, device):
    """Run one training epoch of the pairwise reward model.

    Every batch is scored with ``model(batch)`` against an all-ones target, since
    sequence ``i`` is always the preferred one. A prediction counts as correct
    when it is >= 0.5.

    Args:
        epoch: epoch number, used only in the progress-bar text.
        model: module mapping a batch dict to predictions of shape ``[batch, 1]``.
        dataloader: iterable of dicts with ``i_ids``, ``i_mask``, ``j_ids``, ``j_mask`` tensors.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking ``(pred, target)``.
        device: device the batch tensors are moved to.

    Returns:
        ``(accuracy_percent, mean_batch_loss)``, each rounded to 4 decimals;
        ``(0.0, 0.0)`` when the dataloader yields no batches.
    """
    model.train()

    correct = 0
    processed = 0
    total_loss = 0.0
    num_batches = 0

    pbar = tqdm(dataloader)

    for idx, data in enumerate(pbar):
        for key in ("i_ids", "i_mask", "j_ids", "j_mask"):
            data[key] = data[key].to(device)

        optimizer.zero_grad()

        pred = model(data)
        # All-ones target, created directly with pred's shape, dtype and device.
        target = torch.ones_like(pred)

        loss = criterion(pred, target)
        loss.backward()
        optimizer.step()

        # Tensor-side reduction instead of Python's sum() over tensor elements.
        correct += int(((pred.detach() >= 0.50).float() == target).sum().item())
        processed += pred.shape[0]
        total_loss += loss.detach().item()
        num_batches = idx + 1

        # tqdm writing
        pbar.set_description(
            desc="Train Epoch: {epoch}, Mini Batch: {batch}, Train Accuracy: {accuracy}, Train Loss: {loss}".format(
                epoch=epoch,
                batch=num_batches,
                accuracy=round((correct / processed) * 100, 4),
                loss=round(total_loss / num_batches, 4)
            )
        )

    # An empty dataloader would otherwise divide by zero.
    if processed == 0:
        return 0.0, 0.0
    return round((correct / processed) * 100, 4), round(total_loss / num_batches, 4)

In [18]:
def test(epoch, model, dataloader, criterion, device):
    model.eval()
    
    correct = 0
    processed = 0
    total_loss = 0
    
    pbar = tqdm(dataloader)
    
    with torch.no_grad():
        for idx, data in enumerate(pbar):
            data["i_ids"] = data["i_ids"].to(device)
            data["i_mask"] = data["i_mask"].to(device)
            data["j_ids"] = data["j_ids"].to(device)
            data["j_mask"] = data["j_mask"].to(device)

            batch_size = data["i_ids"].size()[0]

            pred = model(data)
            target = torch.from_numpy(np.ones(shape=(batch_size, 1))).float().to(device)

            loss = criterion(pred, target)

            correct += (
                sum((pred.detach().squeeze(-1) >= 0.50).float() == target.squeeze(-1))
                .detach()
                .item()
            )
            processed += pred.detach().shape[0]
            total_loss += loss.detach().item()

            # tqdm writing
            pbar.set_description(
                desc="Test Epoch: {epoch}, Mini Batch: {batch}, Test Accuracy: {accuracy}, Test Loss: {loss}".format(
                    epoch=epoch,
                    batch=idx+1,
                    accuracy=round((correct / processed) * 100, 4),
                    loss=round(total_loss / (idx+1), 4)
                )
            )
        return round((correct / processed) * 100, 4), round(total_loss / (idx + 1), 4)

In [19]:
def print_number_of_trainable_model_parameters(model):
    """Return a three-line text report of the model's parameter counts.

    The report lists the number of trainable parameters (``requires_grad`` set),
    the total number of parameters, and the trainable share as a percentage.
    Despite the name, nothing is printed; the caller prints the returned string.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]

    all_model_params = sum(count for count, _ in sizes)
    trainable_model_params = sum(count for count, is_trainable in sizes if is_trainable)

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"
    )

# The helper returns the report as a string, hence the explicit print.
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 295681
all model parameters: 124735489
percentage of trainable model parameters: 0.24%


In [None]:
# Number of full passes over the training loader.
EPOCHS = 5

for epoch in range(EPOCHS):
    # Each epoch: one optimisation pass over trainloader, then an evaluation pass over testloader.
    train_accuracy, train_loss = train(epoch, model, trainloader, optimizer, criterion, device)
    test_accuracy, test_loss = test(epoch, model, testloader, criterion, device)

Train Epoch: 0, Mini Batch: 2716, Train Accuracy: 86.0457, Train Loss: 0.2754:  25%|▏| 2716/11032 [1

In [27]:
# The reward model is persisted in two parts under the same directory: the
# PEFT-wrapped backbone (model.l1) via save_pretrained, and the scoring head
# (model.l2) as a plain state_dict.
model.l1.save_pretrained("./reward_model_checkpoint/peft_gpt2/")
torch.save(model.l2.state_dict(), "./reward_model_checkpoint/peft_gpt2/l2.pt")

In [None]:
"""
input_i = PROMPT + HUMAN SUMMARY
input_j = PROMPT + MODEL GENERATED SUMMARY

SIGMOID(LINEAR(GPT2(input_i)) - LINEAR(GPT2(input_j))) = 1
"""