In [1]:
!pip install -qqq peft datasets trl bitsandbytes accelerate evaluate bert_score rouge_score IProgress

In [2]:
import requests
import zipfile
import os
import pandas as pd
import numpy as np
from transformers import BitsAndBytesConfig, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
import torch
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from datasets import Dataset
from trl import SFTTrainer
from tqdm import tqdm
import warnings
import evaluate
from huggingface_hub import login
from google.colab import drive

In [None]:
# login to huggingface
my_token = ''
login(my_token)

# mount Google Drive
drive.mount('/content/drive/')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
class TitleGeneratorModel:

    """


    """

    SYSTEM_MESSAGE = "you are an ai asistant to generate persian title for given article"

    TRAIN_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

    {context}<|eot_id|><|start_header_id|>user<|end_header_id|>

    {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

    {answer}<|eot_id|>"""

    TEST_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

    {context}<|eot_id|><|start_header_id|>user<|end_header_id|>

    {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

    SYSTEM_TAG_START = '<|start_header_id|>assistant<|end_header_id|>'

    EOT_ID = "<|eot_id|>"

    def __init__(self, model_name="meta-llama/Meta-Llama-3-8B-Instruct", dtype=torch.bfloat16, quantization_type="4bit", device=None):

        self.model_name = model_name
        self.dtype = dtype
        self.quantization_type = quantization_type
        self.device = device if device else "cuda" if torch.cuda.is_available() else "cpu"
        self.model = None
        self.tokenizer = None
        self.base_model_loaded = False
        self.peft_model_loaded = False

    def load_data(self, extract_path="./", input_of_model="summary", output_of_model="title", only_return_data_frames = False, only_return_test_data_as_list=False):

        url = 'https://huggingface.co/datasets/HooshvareLab/pn_summary/resolve/main/data/pn_summary.zip'
        dataset_zip_file_name = extract_path+'pn_summary.zip'

        if not os.path.isdir(extract_path+'pn_summary/'):
            response = requests.get(url)
            with open(dataset_zip_file_name, 'wb') as f:
                f.write(response.content)

            with zipfile.ZipFile(dataset_zip_file_name, 'r') as zip_ref:
                zip_ref.extractall(extract_path)

        train_data = pd.read_csv(extract_path+'pn_summary/'+'train.csv', sep='\t')
        val_data   = pd.read_csv(extract_path+'pn_summary/'+'dev.csv', sep='\t')
        test_data  = pd.read_csv(extract_path+'pn_summary/'+'test.csv', sep='\t')

        if only_return_data_frames:
            return train_data, val_data, test_data

        if only_return_test_data_as_list:
            return test_data["article"].to_list(), test_data["summary"].to_list(), test_data["title"].to_list()

        _data = []
        for index, row in train_data.iterrows():
            # article = row["article"]
            # summary = row["summary"]
            # title = row["title"]
            input_data = row[input_of_model]
            output_data = row[output_of_model]
            _data.append({"text":self.TRAIN_TEMPLATE.format(context=self.SYSTEM_MESSAGE, question=input_data, answer=output_data)})
        train_dataset = Dataset.from_list(_data)

        _data = []
        for index, row in val_data.iterrows():
            # article = row["article"]
            # summary = row["summary"]
            # title = row["title"]
            input_data = row[input_of_model]
            output_data = row[output_of_model]
            _data.append({"text":self.TRAIN_TEMPLATE.format(context=self.SYSTEM_MESSAGE, question=input_data, answer=output_data)})
        val_dataset = Dataset.from_list(_data)

        _data = []
        for index, row in test_data.iterrows():
            # article = row["article"]
            # summary = row["summary"]
            # title = row["title"]
            input_data = row[input_of_model]
            output_data = row[output_of_model]
            _data.append({"text":self.TRAIN_TEMPLATE.format(context=self.SYSTEM_MESSAGE, question=input_data, answer=output_data)})
        test_dataset = Dataset.from_list(_data)

        return train_dataset, val_dataset, test_dataset, train_data, val_data, test_data

    def load_base_model(self):

        if self.base_model_loaded:
            return True

        try:

            if self.quantization_type.lower() == "4bit":
                quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=self.dtype, bnb_4bit_use_double_quant=True)
            elif self.quantization_type.lower() == "8bit":
                quantization_config = BitsAndBytesConfig(load_in_8bit=True)

            self.model = AutoModelForCausalLM.from_pretrained(self.model_name, quantization_config=quantization_config, device_map="auto")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name,)

            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model.config.pad_token_id = self.model.config.eos_token_id
            self.model.config.use_cache = False # Gradient checkpointing is used by default but not compatible with caching

            self.base_model_loaded = True
            return True

        except Exception as e:
            print(f"Error loading base model: {e}")
            return False

    def load_peftmodel(self, path):

        # if self.peft_model_loaded:
        #     return True

        try:
            self.model = PeftModel.from_pretrained(self.model, path)
            self.model = self.model.merge_and_unload()
            self.peft_model_loaded = True
            return True
        except Exception as e:
            print(f"Error loading peft model: {e}")
            return False

    def train(self, epochs = 1, batch_size=4, max_seq_length=128, input_of_model="summary", output_of_model="title"):

        train_dataset, val_dataset, test_dataset, train_data, val_data, test_data = self.load_data(input_of_model = input_of_model, output_of_model = output_of_model)

        self.load_base_model()
        self.model = prepare_model_for_kbit_training(self.model)

        Lora_config = LoraConfig(r=8,lora_alpha=32,lora_dropout=0.05, bias="none", task_type="CAUSAL_LM", target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'])
        os.environ["WANDB_DISABLED"] = "true"
        training_arguments = TrainingArguments(
                output_dir="./results",
                # evaluation_strategy="steps",
                do_eval=False, # True
                per_device_train_batch_size=batch_size, # 4
                gradient_accumulation_steps=1,
                # per_device_eval_batch_size=1, # 4
                log_level="debug",
                optim="paged_adamw_32bit",
                save_steps=100,
                logging_steps=100,
                learning_rate=1e-4,
                # eval_steps=5,
                fp16=True,
                max_grad_norm=0.3,
                num_train_epochs=epochs,
                # max_steps=100,
                warmup_ratio=0.03,
                lr_scheduler_type="constant",
        )
        trainer = SFTTrainer(
                model=self.model,
                train_dataset=train_dataset,
                # eval_dataset=val_dataset,
                peft_config=Lora_config,
                dataset_text_field="text",
                max_seq_length=max_seq_length,
                tokenizer=self.tokenizer,
                args=training_arguments,
        )
        self.train_results = trainer.train()
        return self.train_results

    def test(self, max_new_tokens=128, input_of_model="summary", output_of_model="title", load_top=0, return_only_one_title = True):

        if not self.peft_model_loaded:
            print("please load peft model before testing.")
            return

        train_data, val_data, test_data = self.load_data(only_return_data_frames=True)
        inputs = test_data[input_of_model].to_list()
        outputs = test_data[output_of_model].to_list()

        if load_top != 0:
            inputs = inputs[:load_top]
            outputs = outputs[:load_top]

        generated_titles = []
        for i in tqdm(range(len(inputs)), desc=f"Processing {input_of_model}"):
            generated_title = self.__call__(inputs=inputs[i], max_new_tokens=max_new_tokens, return_only_one_title=return_only_one_title)[0]
            generated_titles.append(generated_title)

            with open("./generated_titles.txt", "a") as outfile:
                outfile.write(generated_title + "\n")

        return inputs, outputs, generated_titles

    def __call__(self, inputs, max_new_tokens=128, return_only_one_title = True):

        if not self.base_model_loaded:
            self.load_base_model()

        if not self.peft_model_loaded:
            warnings.warn("peft model not loaded yet!", category=Warning)

        if isinstance(inputs, str):
            inputs = [inputs]

        prompts = [self.TEST_TEMPLATE.format(context=self.SYSTEM_MESSAGE, question=inputs[i]) for i in range(len(inputs))]
        tokenized_chat = self.tokenizer(prompts, return_tensors="pt") # , padding=True, truncation=True
        tokenized_chat = tokenized_chat.to(self.device)
        generate_ids = self.model.generate(tokenized_chat["input_ids"], attention_mask=tokenized_chat["attention_mask"], pad_token_id=self.tokenizer.eos_token_id, max_new_tokens=max_new_tokens)
        responses = []
        for ids in generate_ids:
            response = self.tokenizer.decode(ids)
            response = response[response.index(self.SYSTEM_TAG_START)+len(self.SYSTEM_TAG_START):].strip().replace(self.EOT_ID,'').replace('/','\n').replace('+','\n').replace('|','\n').replace('[n]','\n').split('\n')
            response = response[0] if return_only_one_title else '|'.join(response)
            responses.append(response)
        return responses

    @classmethod
    def from_pretrained(cls, path, model_name = "meta-llama/Meta-Llama-3-8B-Instruct", dtype = torch.bfloat16, quantization_type = "4bit", device = None):
        try:

            instance = cls(model_name=model_name, dtype=dtype, quantization_type= quantization_type, device=device)
            if instance.load_base_model():
                if instance.load_peftmodel(path=path):
                    return instance
            return None

        except Exception as e:
            print(f"Error loading model from_pretrained: {e}")
            return None

    @classmethod
    def read_generated_titles_file(cls, file_path):
        with open(file_path, "r") as f:
            lines = f.readlines()
            return lines


In [None]:
model_path = '/content/drive/My Drive/Python/abolfazl-final/summary-to-title'
generated_titles_file_path = '/content/drive/My Drive/Python/abolfazl-final/generated_titles.txt'

In [6]:
# load fine-tuned model
model = TitleGeneratorModel.from_pretrained(model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# train: : about 30 hours on summaries as input
# model = TitleGeneratorModel()
# model.train(epochs = 1, batch_size=4, max_seq_length=128, input_of_model="summary", output_of_model="title")

In [12]:
# sample use of pretrained model to generate title:
# model = TitleGeneratorModel.from_pretrained(model_path)
article = "رئیس جمهور لبنان در دیدار با رئیس هیات آمریکایی میانجی‌گر مذاکرات غیرمستقیم ترسیم مرز دریایی جنوب لبنان با رژیم صهیونیستی، گفت: لبنان بر حاکمیت بر اراضی و آبهای خود اصرار دارد."
print(model(article)[0])


اصرار لبنان بر حاکمیت بر اراضی و آبهای خود در مذاکرات ترسیم مرز دریایی جنوب لبنان با رژیم صهیونیستی 


In [9]:
# test entite test_dataset : about 11 hours
# model = TitleGeneratorModel.from_pretrained(model_path)
# inputs, titles, generated_titles = model.test(max_new_tokens=64, input_of_model="summary", load_top=0)
# print(len(inputs), len(titles), len(generated_titles))


In [10]:
# evaluate measures:

# load test data and generated_titles
articles, summaries, titles = TitleGeneratorModel().load_data(only_return_test_data_as_list=True)
generated_titles = TitleGeneratorModel.read_generated_titles_file(generated_titles_file_path)

# Bert-Score:
bertscore = evaluate.load("bertscore")
results = bertscore.compute(predictions=generated_titles, references=titles, lang="fa")
print(f"Bert-Score:\nprecision: {np.mean(results['precision'])}, recall: {np.mean(results['recall'])}, f1: {np.mean(results['f1'])}\nhashcode: {results['hashcode']}\n\n")

# output:
# precision: 0.797476650227508, recall: 0.8075003109335495, f1: 0.8018598673852211
# hashcode: bert-base-multilingual-cased_L9_no-idf_version=0.3.12(hug_trans=4.42.4)


# Rouge:
rouge = evaluate.load('rouge')
results = rouge.compute(predictions=generated_titles, references=titles)
print(f'Rouge:\n{results}\n\n')

# output:
# {'rouge1': 0.006466416353775554, 'rouge2': 0.00017879492222420883, 'rougeL': 0.006496215507479588, 'rougeLsum': 0.006466416353775554}

# Bleu-{1,2,3,4}:
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=generated_titles, references=[[ref] for ref in titles])
print(f'Bleu:\n{results}\n')

# output:
# {'bleu': 0.18660108043988477, 'precisions': [0.3905652022646045, 0.2222202945819816, 0.1432084534101825, 0.09754627636676712], 'brevity_penalty': 1.0, 'length_ratio': 1.2122385598987788, 'translation_length': 63234, 'reference_length': 52163}


Bert-Score:
precision: 0.797476650131595, recall: 0.8075003114024578, f1: 0.8018598676729602
hashcode: bert-base-multilingual-cased_L9_no-idf_version=0.3.12(hug_trans=4.42.4)


Rouge:
{'rouge1': 0.006526014661183623, 'rouge2': 0.00017879492222420883, 'rougeL': 0.006526014661183621, 'rougeLsum': 0.006555813814887657}


Bleu:
{'bleu': 0.18660108043988477, 'precisions': [0.3905652022646045, 0.2222202945819816, 0.1432084534101825, 0.09754627636676712], 'brevity_penalty': 1.0, 'length_ratio': 1.2122385598987788, 'translation_length': 63234, 'reference_length': 52163}



In [13]:
# random test
# model = TitleGeneratorModel.from_pretrained(model_path)
articles, summaries, titles = model.load_data(only_return_test_data_as_list=True)
random_id = np.random.randint(len(articles))
article, summary, title = articles[random_id], summaries[random_id], titles[random_id]
generated_titles = model(summary, max_new_tokens=128)
print(f'article:\n{article}\nsummary:\n{summary}\ntitle:\n{title}\ngenerated_title:\n{generated_titles[0]}\n')


article:
ایمان افتخاری در گفتگو با خبرنگار مهر در خصوص برنامه‌های حمایتی این صندوق از پژوهشگران گفت: ما ۲ برنامه حمایتی ویژه داریم که به آن هم امیدوار هستیم تا محققان کشور بتوانند پروژه‌های خود را عملی کنند. [n] رئیس صندوق حمایت از پژوهشگران و فناوران با اشاره بهکی از برنامه‌های صندوق با چین گفت: ما در سال گذشته برنامه مشترکی با اکادمی علوم چین داشتیم تا فضای همکاری مشترک محققان در دو کشور شکل بگیرد. [n] وی با بیان اینکه از چند سال پیش ارتباطات قابل توجهی با فضای پژوهشی کشور چین برقرار کردیم، گفت: بنا داشتیم فراخوانی مشترک برای اجرای ۶ طرح اولویت دار را منتشر کنیم که بیماری کرونا شیوع پیدا کرد. [n] افتخاری با بیان اینکه محورهای این فراخوان از قبل مشخص شده بود، بیان کرد: با توجه به شیوع کرونا بنا را بر این گذاشتیم که علاوه بر ۶ طرح، ۶ طرح اولویت دار دیگر در زمینه کرونا در این فراخوان قرار بگیرد. [n] وی گفت: این طرح‌ها بعد از تصویب به صورت بین المللی پیش می‌رود که پروسه پژوهشی آنها توسط صندوق مورد حمایت قرار خواهد گرفت. [n] رئیس صندوق حمایت از پژوهشگران و فناوران با اشاره به حمایت‌های صن