In [1]:
from datasets import load_dataset
from transformers import (AutoModelForSeq2SeqLM, 
                          AutoTokenizer, 
                          GenerationConfig, 
                          TrainingArguments, 
                          Trainer)
import torch
import time
import os
import evaluate
import pandas as pd
import numpy as np
from math import ceil

2023-09-06 08:33:33.754794: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-06 08:33:35.293235: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-06 08:33:35.293398: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
# Expose physical GPUs 1, 0 and 3 (in that order) to this process, then show how many torch sees.
# NOTE(review): presumably this has to run before CUDA is first initialised to take effect - confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,0,3"  
torch.cuda.device_count()

3

In [3]:
class PeftModel:
    """Static helpers for loading, adapting and saving the seq2seq model.

    NOTE: this class has the same name as the ``PeftModel`` class of the ``peft``
    library. Inside :meth:`load_from_peft_adapter` the library class is therefore
    imported under an alias; the bare name ``PeftModel`` always means this helper.
    """

    @staticmethod
    def load_base_model(model_path="google/flan-t5-base"):
        """Load a seq2seq model and its tokenizer from ``model_path``.

        The model is loaded in bfloat16 with ``device_map='auto'``.

        Args:
            model_path: Hub id or local directory understood by ``from_pretrained``.

        Returns:
            ``(model, tokenizer)``.
        """
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_path, torch_dtype=torch.bfloat16, device_map='auto'
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        return model, tokenizer

    @staticmethod
    def load_from_peft_adapter(
        base_model_path, peft_model_path, train=False, merge_adapter=True
    ):
        """Load the base model and apply a saved PEFT adapter to it.

        Args:
            base_model_path: Location of the base model (see ``load_base_model``).
            peft_model_path: Location of the saved adapter.
            train: Passed to the library as ``is_trainable``; when the adapter is
                merged, it also re-enables gradients on every merged parameter.
            merge_adapter: If true, fold the adapter into the base weights and
                return the plain merged model instead of the adapter-wrapped one.

        Returns:
            ``(model, tokenizer)``.
        """
        # Aliased local import: at module level ``PeftModel`` is this helper class,
        # which has no ``from_pretrained``.
        from peft import PeftModel as _LibraryPeftModel

        # Static methods receive no ``self``; call the sibling through the class.
        model, tokenizer = PeftModel.load_base_model(base_model_path)
        model = _LibraryPeftModel.from_pretrained(
            model, peft_model_path, torch_dtype=torch.bfloat16, is_trainable=train, device_map='auto')

        if merge_adapter:
            # merge the adapter to the main model
            model = model.merge_and_unload()

            if train:
                for param in model.parameters():
                    param.requires_grad = True

        return model, tokenizer

    @staticmethod
    def save_peft_adapter(model, model_path):
        """Save ``model`` to ``model_path`` via its ``save_pretrained`` method."""
        model.save_pretrained(model_path)

    @staticmethod
    def merge_peft_and_save(model, model_path, tokenizer=None):
        """Merge the adapter into the base weights and save the result.

        Args:
            model: Adapter-wrapped model exposing ``merge_and_unload``.
            model_path: Output directory.
            tokenizer: Optional tokenizer to save into the same directory, so the
                directory can later be loaded with ``load_base_model``.
        """
        model = model.merge_and_unload()
        model.save_pretrained(model_path)
        if tokenizer is not None:
            tokenizer.save_pretrained(model_path)

    @staticmethod
    def save_tokenizer(tokenizer, model_path=None):
        """Save ``tokenizer`` to ``model_path``.

        Args:
            tokenizer: Object exposing ``save_pretrained``.
            model_path: Output directory. When omitted, a module-level
                ``model_path`` variable is used if one exists (this is what the
                method implicitly relied on before the parameter was added).

        Raises:
            ValueError: If no output directory can be determined.
        """
        if model_path is None:
            model_path = globals().get("model_path")
        if model_path is None:
            raise ValueError("model_path must be provided to save the tokenizer")
        tokenizer.save_pretrained(model_path)

In [4]:
# Load the base FLAN-T5 checkpoint and its tokenizer; these are the `model` and
# `tokenizer` globals used by the dataset and training cells below.
name='google/flan-t5-base'
model, tokenizer = PeftModel.load_base_model(model_path=name)

<a name='1.2'></a>
### 1.2 - Load Dataset and LLM

In [5]:
# load and aggregate raw data
import os
import json

# Specify the folder path containing the JSON files
folder_path = './data'

# Initialize an empty list to aggregate the data
data = []

# os.listdir returns entries in arbitrary order; sort so the aggregated list is
# built in the same order on every run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):
        continue

    print(filename)
    file_path = os.path.join(folder_path, filename)

    # Read with an explicit encoding instead of the platform default.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        file_data = json.load(json_file)

    # Each JSON file is expected to contain a list of dictionaries.
    if isinstance(file_data, list):
        data.extend(file_data)
    else:
        # Report instead of silently dropping the file.
        print(f"skipping {filename}: expected a JSON list, got {type(file_data).__name__}")

inshorts_scraped.json
git_data.json


# curate data

In [6]:
import random

SHUFFLE_SEED = 42

# Keep only articles that have body text, do not contain the
# "JavaScript is not available" placeholder, and whose link does not contain "reuters".
data = [
    news
    for news in data
    if news["full_text"] != ""
    and "JavaScript is not available" not in news["full_text"]
    and "reuters" not in news["link"]
]

# Shuffle with a private, seeded generator: later cells pick articles by position
# (e.g. data[5670]), which only refers to the same article across runs if the
# order is reproducible. A private generator leaves the global `random` state untouched.
random.Random(SHUFFLE_SEED).shuffle(data)
len(data)

602641

In [7]:
import re

# Compiled once instead of once per article.
paragraph_pattern = re.compile(r'<p>(.*?)</p>', re.DOTALL)  # content between <p> tags
tag_pattern = re.compile(r'<.*?>')                          # any remaining inline tag

for news in data:
    if "<p>" not in news["summary"]:
        continue

    # Extracted content from <p> tags, with nested tags removed
    extracted_content = [
        tag_pattern.sub('', match)
        for match in paragraph_pattern.findall(news["summary"])
    ]

    # A "<p>" without a matching "</p>" yields no matches; max() on an empty
    # list raises ValueError, so leave such summaries unchanged.
    if extracted_content:
        news["summary"] = max(extracted_content, key=len)

# Dataset

In [8]:
import random
from tqdm import tqdm
import torch
import pandas as pd
from torch.utils.data import Dataset, random_split


class TextDataset(Dataset):
    """Tokenised (prompt, summary) pairs for seq2seq fine-tuning.

    Each item is a ``(input_ids, labels)`` pair of tensors. Padding positions in
    ``labels`` are set to -100 so that they are excluded from the loss.
    """

    def __init__(self, data, tokenizer):
        """Build prompts from ``data``, shuffle them and tokenise everything.

        Args:
            data: Iterable of dicts providing ``"full_text"`` and ``"summary"``.
            tokenizer: Callable tokenizer returning an object with ``input_ids``
                when called with ``return_tensors="pt"``.
        """
        # One (prompt, label) pair per article. Title prompts are currently not
        # generated (see the commented-out _get_title_prompt helper).
        pairs = [self._get_summary_prompt(news) for news in data]

        # Shuffle prompt/label pairs together so they stay aligned.
        random.shuffle(pairs)

        if not pairs:
            # zip(*[]) cannot be unpacked into two names; expose an empty dataset instead.
            self.inputs = torch.empty((0, 0), dtype=torch.long)
            self.labels = torch.empty((0, 0), dtype=torch.long)
            return

        prompts, summaries = (list(column) for column in zip(*pairs))

        # tokenize
        self.inputs = tokenizer(prompts,
                                padding="max_length",
                                truncation=True,
                                return_tensors="pt").input_ids

        labels = tokenizer(summaries,
                           padding="max_length",
                           truncation=True,
                           return_tensors="pt").input_ids

        # Label positions equal to -100 are ignored by the loss; without this the
        # model is also trained to emit the padding that fills each label row.
        pad_token_id = getattr(tokenizer, "pad_token_id", None)
        if pad_token_id is not None:
            labels = labels.masked_fill(labels == pad_token_id, -100)
        self.labels = labels

    def __len__(self):
        """Number of examples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input_ids, labels)`` pair at ``idx``."""
        return self.inputs[idx], self.labels[idx]

    @staticmethod
    def _get_summary_prompt(example):
        """Build the summarisation prompt and target for one article.

        The requested length is the summary's word count rounded to the nearest
        multiple of 25, but never below 25 (a short summary would otherwise
        produce the instruction "in 0 words").

        Returns:
            ``(prompt, summary)``.
        """
        # word count round off
        multiple = 25
        word_count = len(example["summary"].split())
        word_count = max(multiple, int(round(word_count / multiple)) * multiple)

        start_prompt = f'Summarize this news article in {word_count} words.\n\n'
        end_prompt = '\n\nSummary: '

        prompt = start_prompt + example["full_text"] + end_prompt

        return prompt, example["summary"]
    
#     @staticmethod
#     def _get_title_prompt(example):
#         # word count round off
#         multiple = 5
#         word_count = len(example["title"].split())
#         word_count = int(ceil(word_count / multiple)) * multiple
        
#         start_prompt = f'Give a title to the given news article in not more than {word_count} words.\n\n'
#         mid_prompt = '\n\nSummary: '
#         end_prompt = '\n\nTitle: '

#         prompt = start_prompt + example["full_text"] + mid_prompt + example["summary"] + end_prompt
#         return prompt, example["title"]

In [9]:
# Build the tokenised training set from the curated articles (prompts are
# constructed, shuffled and tokenised inside TextDataset.__init__).
train_data = TextDataset(data, tokenizer)
# test_data = TextDataset(data, tokenizer)

In [10]:
# Peek at the first example: an (input_ids, labels) pair of token-id tensors.
train_data[0]

(tensor([12198,  1635,  1737,    48,  1506,  1108,    16,   944,  1234,     5,
            96,  3713,  3392,    19,    69,     7,   233,    11,    27,  5712,
            25,  6224,   976,     3,    88,   243,    16,     3,     9,  1424,
          1115,     5, 16706, 16054,     7,    16,     8,    36,  2452,  5402,
          1511,    13,  2415,     7,  2256,   497,    79,    33,   365,  2437,
         26877,   297,    45,   216,   172,  4243,   521,   107,  4719,  2366,
             5,    37,  1511,    19,   885,    12,     8,   312,  3478,    15,
             7,    15,  4947,     6,     3,     9, 13389,    21,   321,     8,
           789,    11, 16054,     7,    12,   129,  7749,     5,    86,     8,
          5023,    45,    46,    64,   159, 16221,    26,  1128,     6,  1363,
          4498,    52, 30157,   243,     3,    99,  3068,    29,    23, 10172,
           343,     7,   808,   147,    16, 11380,     6,    79,   133,  7663,
             3,     9,  5888,    12,     8,  1297,  

In [11]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of ``model``'s parameters are trainable.

    Despite the name, nothing is printed: the summary is returned as a string
    so the caller decides how to display it.

    Args:
        model: Object exposing ``named_parameters()`` whose parameters provide
            ``numel()`` and ``requires_grad``.

    Returns:
        A three-line string with the trainable count, the total count and the
        trainable percentage (reported as 0.00% for a model without parameters).
    """
    trainable_model_params = 0
    all_model_params = 0

    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # Guard the division: a parameter-free module would raise ZeroDivisionError.
    percentage = 100 * trainable_model_params / all_model_params if all_model_params else 0.0

    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {percentage:.2f}%"

# Parameter summary for `model` as loaded above (the LoRA adapter is attached in a later cell).
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


In [12]:
# # FULL MODEL TRAINING
# EPOCH = 1

# training_args = TrainingArguments(
#                                   save_steps=5000,
#                                   warmup_steps=10,
#                                   logging_steps=100,
#                                   weight_decay=0.01,
#                                   num_train_epochs=EPOCH,
#                                   logging_dir='./logs',
#                                   output_dir='./checkpoint',
#                                   per_device_eval_batch_size=32,
#                                   per_device_train_batch_size=32)

# Trainer(model=model,
#         args=training_args,
#         eval_dataset=test_data,
#         train_dataset=train_data,
#         data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]), 
#                                     'labels': torch.stack([f[1] for f in data])}).train()

In [None]:
# PEFT MODEL TRAINING
from peft import LoraConfig, get_peft_model, TaskType


EPOCH = 3

# LoRA on the attention query/value projections only.
lora_config = LoraConfig(
    r=64,
    lora_alpha=64,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

peft_model = get_peft_model(model, lora_config)

print(print_number_of_trainable_model_parameters(peft_model))


def collate_seq2seq_batch(batch):
    """Stack (input_ids, labels) pairs into a batch and add an attention mask.

    The dataset pads every prompt to the maximum length, so the mask is derived
    from the pad token id. Without it no mask is passed to the model and the
    padding positions are attended to like real tokens.
    """
    input_ids = torch.stack([pair[0] for pair in batch])
    labels = torch.stack([pair[1] for pair in batch])
    return {
        'input_ids': input_ids,
        'attention_mask': (input_ids != tokenizer.pad_token_id).long(),
        'labels': labels,
    }


peft_training_args = TrainingArguments(
                                  # save_steps=5000,
                                  save_strategy="no",
                                  warmup_steps=10,
                                  logging_steps=5000,
                                  weight_decay=0.01,
                                  num_train_epochs=EPOCH,
                                  logging_dir='./logs',
                                  output_dir='./checkpoint',
                                  learning_rate=0.0001,
                                  auto_find_batch_size=True)

peft_trainer = Trainer(
                model=peft_model,
                args=peft_training_args,
                train_dataset=train_data,
                data_collator=collate_seq2seq_batch)

peft_trainer.train()


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /home/qblocks/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


  warn(msg)


trainable model parameters: 7077888
all model parameters: 254655744
percentage of trainable model parameters: 2.78%




Step,Training Loss


# save model

In [20]:
# save peft adapter
# Calls peft_model.save_pretrained(adapter_path) through the helper class.
adapter_path = "./checkpoint/adapter/"
PeftModel.save_peft_adapter(model=peft_model, model_path=adapter_path)

In [16]:
# merge peft with main model
# and save the model
model_path = "./checkpoint/"
# The tokenizer is not passed to merge_peft_and_save: as defined above the helper
# takes only (model, model_path), and an extra tokenizer= keyword raises TypeError.
PeftModel.merge_peft_and_save(model=peft_model, model_path=model_path)
# Save the tokenizer next to the merged weights: the reload cell below reads
# both the model and the tokenizer from model_path.
tokenizer.save_pretrained(model_path)

# Load the pretrained FLAN-T5 baseline

In [21]:
# Load a fresh copy of the base checkpoint under separate names so it can be
# compared against the fine-tuned model in the inference cells.
model_name='google/flan-t5-base'
original_model, original_tokenizer = PeftModel.load_base_model(model_path=model_name)

# Load the saved PEFT Inshorts model (adapter merged into the base weights)

In [None]:
# Reload the checkpoint from ./checkpoint/ using the plain model loader.
# Note that this rebinds the `peft_model` and `tokenizer` globals.
model_path = "./checkpoint/"
peft_model, tokenizer = PeftModel.load_base_model(model_path=model_path)

# infer

In [23]:
# Separator line of 99 dashes.
dash_line = '-' * 99

index = 5670
news = data[index]
full_text = news['full_text']
baseline_human_summary = news['summary']

# Build the prompt with the same helper that produced the training prompts, so
# the inference prompt cannot drift from the training format.
prompt = TextDataset._get_summary_prompt(news)[0]

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Move the inputs to whichever device each model was placed on instead of
# assuming "cuda", so the cell also runs on a CPU-only machine.
# NOTE(review): the two models are decoded with different beam widths (1 vs 4),
# so the ROUGE comparison below is not like-for-like - confirm this is intended.
original_model_outputs = original_model.generate(input_ids=input_ids.to(original_model.device), generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = original_tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids.to(peft_model.device), generation_config=GenerationConfig(max_new_tokens=200, num_beams=4))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'FULL TEXT:\n{full_text}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{baseline_human_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')


# EVALUATE
# ROUGE of each model's output against the human-written summary.
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=[original_model_text_output],
    references=[baseline_human_summary],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=[peft_model_text_output],
    references=[baseline_human_summary],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)

print('INSTRUCT MODEL:')
print(instruct_model_results)

---------------------------------------------------------------------------------------------------
FULL TEXT:
Days after Rajyavardhan Singh Rathore said that Sonia Gandhi and Rahul Gandhi should be tried for “treason”, the Congress hit back at the Bharatiya Janata Party MP for speaking “blatant lies”, claiming that during the 2008 Beijing Olympics, not only did the Congress Parliamentary party chief visit the Games Village but also met with Indian athletes in the Indian block. UPA chairperson Sonia Gandhi with party leader Rahul Gandhi. (ANI file) {{^userSubscribed}} {{/userSubscribed}} {{^userSubscribed}} {{/userSubscribed}} Speaking in the Lok Sabha on Thursday during a no-confidence motion against the NDA government, Rathore had claimed that Sonia Gandhi and Rahul Gandhi met the Communist Party of China in 2008 when he was in Beijing during the Olympics. “I was at the 2008 Beijing Olympics (in China). We came to know that Sonia Gandhi and Rahul Gandhi are coming to meet us. They di

In [24]:
# Separator line of 99 dashes.
dash_line = '-' * 99

index = 5670
news = data[index]
full_text = news['full_text']
# In this cell the reference text is the article title; the variable name and
# the printed label are shared with the summary cell.
baseline_human_summary = news['title']

# word count round off (rounded UP to the next multiple of 5)
multiple = 5
word_count = len(news["title"].split())
word_count = int(ceil(word_count / multiple)) * multiple

start_prompt = f'Give a title to the given news article in not more than {word_count} words.\n\n'
mid_prompt = '\n\nSummary: '
end_prompt = '\n\nTitle: '

# NOTE(review): title prompts are commented out in TextDataset, so the fine-tuned
# model has presumably not been trained on this prompt format - confirm.
prompt = start_prompt + news["full_text"] + mid_prompt + news["summary"] + end_prompt

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Move the inputs to whichever device each model was placed on instead of
# assuming "cuda", so the cell also runs on a CPU-only machine.
original_model_outputs = original_model.generate(input_ids=input_ids.to(original_model.device), generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = original_tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids.to(peft_model.device), generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'FULL TEXT:\n{full_text}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{baseline_human_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')


# EVALUATE
# ROUGE of each model's output against the human-written title.
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=[original_model_text_output],
    references=[baseline_human_summary],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=[peft_model_text_output],
    references=[baseline_human_summary],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)

print('INSTRUCT MODEL:')
print(instruct_model_results)

---------------------------------------------------------------------------------------------------
FULL TEXT:
Days after Rajyavardhan Singh Rathore said that Sonia Gandhi and Rahul Gandhi should be tried for “treason”, the Congress hit back at the Bharatiya Janata Party MP for speaking “blatant lies”, claiming that during the 2008 Beijing Olympics, not only did the Congress Parliamentary party chief visit the Games Village but also met with Indian athletes in the Indian block. UPA chairperson Sonia Gandhi with party leader Rahul Gandhi. (ANI file) {{^userSubscribed}} {{/userSubscribed}} {{^userSubscribed}} {{/userSubscribed}} Speaking in the Lok Sabha on Thursday during a no-confidence motion against the NDA government, Rathore had claimed that Sonia Gandhi and Rahul Gandhi met the Communist Party of China in 2008 when he was in Beijing during the Olympics. “I was at the 2008 Beijing Olympics (in China). We came to know that Sonia Gandhi and Rahul Gandhi are coming to meet us. They di