In [2]:
# %pip install --upgrade pip
# %pip install --disable-pip-version-check \
#     torch==1.13.1 \
#     torchdata==0.5.1 --quiet

# %pip install \
#     transformers==4.27.2 \
#     datasets==2.11.0 \
#     evaluate==0.4.0 \
#     rouge_score==0.1.2 \
#     loralib==0.1.1 \
#     peft==0.3.0 --quiet

In [1]:
!nvidia-smi

Tue Aug 29 06:32:10 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    Off  | 00000000:0C:00.0 Off |                  Off |
| 30%   34C    P2    65W / 300W |   2481MiB / 48685MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000    Off  | 00000000:0D:00.0 Off |                  Off |
| 30%   25C    P8    31W / 300W |   3899MiB / 48685MiB |      0%      Default |
|       

In [1]:
from transformers import (AutoModelForSeq2SeqLM, 
                          AutoTokenizer, 
                          GenerationConfig, 
                          TrainingArguments, 
                          Trainer)
import torch
import time
import os
import evaluate
import pandas as pd
import numpy as np
from math import ceil
from saar.models.instruct import PeftModelUtils

2023-09-03 13:23:19.573288: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-03 13:23:21.056797: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-03 13:23:21.056953: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [None]:
MODE = "summary"

assert MODE in ["summary", "title", "summary and title"]

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  
torch.cuda.device_count()

1

In [6]:
# load original model
name='google/flan-t5-base'
model, tokenizer = PeftModelUtils.load_base_model(model_path=name)

<a name='1.2'></a>
### 1.2 - Load Data

In [7]:
# load and aggregate raw data
import os
import json

# Specify the folder path containing the JSON files
folder_path = './data'

# Initialize an empty list to aggregate the data
data = []

# Iterate through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.json') and "shorts" in filename:
        print(filename)
        file_path = os.path.join(folder_path, filename)
        
        # Read and parse JSON data from the file
        with open(file_path, 'r') as json_file:
            file_data = json.load(json_file)
            
            # Assuming each JSON file contains a list of dictionaries
            if isinstance(file_data, list):
                data.extend(file_data)

inshorts_scraped.json


In [8]:
data[0]

{'title': 'Man accused of sexually assaulting daughter granted bail by HC amid matrimonial dispute',
 'summary': 'Delhi HC granted bail to a man accused of sexually assaulting his daughter, noting that it cannot shut its eyes to matrimonial dispute between her parents and his false implication by "tutoring" cannot be ruled out. It observed she has been residing with the mother for over four years. The court also noted there was inordinate delay in FIR registration.',
 'link': 'https://www.outlookindia.com/national/delhi-hc-grants-bail-to-man-accused-of-sexually-assaulting-daughter-news-311110/amp?utm_campaign=fullarticle&utm_medium=referral&utm_source=inshorts',
 'image_link': 'https://static.inshorts.com/inshorts/images/v1/variants/jpg/m/2023/08_aug/16_wed/img_1692206351958_642.jpg?',
 'source': 'inshorts',
 'full_text': 'Delhi HC Grants Bail To Man Accused Of Sexually Assaulting Daughter Justice Vikas Mahajan observed the girl has been residing with the mother for more than 4 years a

# curate data

In [9]:
import random

data = [news for news in data if news["full_text"] != "" and "JavaScript is not available" not in news["full_text"] and "reuters" not in news["link"]]
random.shuffle(data)
len(data)

22477

In [10]:
import re

for news in data:
    if "<p>" in news["summary"]:
        # Regular expression to match content between <p> tags
        pattern = re.compile(r'<p>(.*?)</p>', re.DOTALL)
        matches = pattern.findall(news["summary"])

        # Extracted content from <p> tags
        extracted_content = [re.sub(r'<.*?>', '', match) for match in matches]
        news["summary"] = max(extracted_content, key=len)

# Dataset

In [11]:
import random
from tqdm import tqdm
import torch
import pandas as pd
from torch.utils.data import Dataset, random_split


class TextDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.inputs = []
        self.labels = []

        # define the prompt functions
        prompt_functions = []

        if "summary" in MODE:
            prompt_functions.append(self._get_summary_prompt)

        if "title" in MODE:
            prompt_functions.append(self._get_title_prompt)

        # get prompts
        for function in prompt_functions:
            for news in data:
                input_prompt, label = function(news)
                self.inputs.append(input_prompt)
                self.labels.append(label)
            
        """
        Combine the lists using zip
        Shuffle the combined list
        Unpack the shuffled pairs back into separate lists
        And then tokenize
        """
        combined = list(zip(self.inputs, self.labels))
        random.shuffle(combined)
        self.inputs, self.labels = zip(*combined)

        # tokenize
        self.inputs = tokenizer(self.inputs, 
                                padding="max_length", 
                                truncation=True, 
                                return_tensors="pt").input_ids

        self.labels = tokenizer(self.labels, 
                                padding="max_length", 
                                truncation=True, 
                                return_tensors="pt").input_ids
            
    def __len__(self): 
        return len(self.inputs)

    def __getitem__(self, idx): 
        return self.inputs[idx], self.labels[idx]
    
    @staticmethod
    def _get_summary_prompt(example):
        # word count round off
        multiple = 25
        word_count = len(example["summary"].split())
        word_count = int(round(word_count / multiple)) * multiple
        
        start_prompt = f'Summarize this news article in {word_count} words.\n\n'
        end_prompt = '\n\nSummary: '

        prompt = start_prompt + example["full_text"] + end_prompt

        return prompt, example["summary"]
    
    @staticmethod
    def _get_title_prompt(example):
        # word count round off
        multiple = 5
        word_count = len(example["title"].split())
        word_count = int(ceil(word_count / multiple)) * multiple
        
        start_prompt = f'Give a title to the given news article in not more than {word_count} words.\n\n'
        mid_prompt = '\n\nSummary: '
        end_prompt = '\n\nTitle: '

        prompt = start_prompt + example["full_text"] + mid_prompt + example["summary"] + end_prompt
        return prompt, example["title"]

In [12]:
train_data = TextDataset(data, tokenizer)
# test_data = TextDataset(data, tokenizer)

In [13]:
train_data[0]

(tensor([ 6434,     3,     9,  2233,    12,     8,   787,  1506,  1108,    16,
            59,    72,   145,   627,  1234,     5,  1547,    43,  2031,  3030,
            12,  3197,    80,   223,    16,     8,   874,    18, 19515,   332,
          1755,   196,   939,   227,   352,     3,   632,  4949,   323,   778,
            16,     8,  2101,   192,  1031,     5,   461,  2818,     6,    44,
             8,   337,  5669,   116,    79,  8151,    70,   511,  2541,  1453,
             6,  1547,  3030,    12,  4943,    46,  8943,   547,   447,  2391,
            18,  5981,    15,    17,  1369,   213,  3705,    63, 16296,  1635,
          4701,    26,     9,   208,     3, 25202,    15,    26,   223,    12,
           607,    16,   869,    12,   428,     3,     9, 15937,    13,   112,
            20,   900, 11026,   200,    16,     8,  1910,     5,   978, 22716,
          7673,    13,     3,  4591,  3154,   326,  8537, 11607,  2139,  1547,
         15389,   323,     8,  2387,    13, 11321,  

In [14]:
from saar.utils import print_number_of_trainable_model_parameters

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


In [15]:
# # FULL MODEL TRAINING
# EPOCH = 1

# training_args = TrainingArguments(
#                                   save_steps=5000,
#                                   warmup_steps=10,
#                                   logging_steps=100,
#                                   weight_decay=0.01,
#                                   num_train_epochs=EPOCH,
#                                   logging_dir='./logs',
#                                   output_dir='./checkpoint',
#                                   per_device_eval_batch_size=32,
#                                   per_device_train_batch_size=32)

# Trainer(model=model,
#         args=training_args,
#         eval_dataset=test_data,
#         train_dataset=train_data,
#         data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]), 
#                                     'labels': torch.stack([f[1] for f in data])}).train()

In [16]:
# PEFT MODEL TRAINING
from peft import LoraConfig, get_peft_model, TaskType


EPOCH = 10

lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

peft_model = get_peft_model(model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))


peft_training_args = TrainingArguments(
                                  # save_steps=5000,
                                  save_strategy="no",
                                  warmup_steps=10,
                                  logging_steps=1000,
                                  weight_decay=0.01,
                                  num_train_epochs=EPOCH,
                                  logging_dir='./logs',
                                  output_dir='./checkpoint',
                                  learning_rate=0.0001,
                                  auto_find_batch_size=True)
    
peft_trainer = Trainer(
                model=peft_model,
                args=peft_training_args,
                train_dataset=train_data,
                data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]), 
                                            'labels': torch.stack([f[1] for f in data])})

peft_trainer.train()

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%




Step,Training Loss
1000,4.1328
2000,0.1378
3000,0.1132
4000,0.1041
5000,0.0999
6000,0.0969
7000,0.0931
8000,0.091
9000,0.0892
10000,0.088


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



TrainOutput(global_step=28100, training_loss=0.23320891139346087, metrics={'train_runtime': 9754.7113, 'train_samples_per_second': 23.042, 'train_steps_per_second': 2.881, 'total_flos': 1.5635652772626432e+17, 'train_loss': 0.23320891139346087, 'epoch': 10.0})

# save model

In [17]:
# save peft adapter
adapter_path = "./checkpoint/title_adapter/"
PeftModelUtils.save_peft_adapter(model=peft_model, model_path=adapter_path)

In [None]:
# merge peft with main model and save the model
model_path = "./checkpoint/"
PeftModelUtils.merge_peft_and_save(model=peft_model, model_path=model_path)