<a href="https://colab.research.google.com/github/doaa-sala7/Arabic_question_Answering/blob/main/Arabic_QA_llm_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U bitsandbytes
!pip install transformers
!pip install peft
!pip install accelerate
!pip install einops
!pip install datasets

In [None]:
!pip install evaluate
!pip install trl
!pip install rouge_score

In [3]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
from datasets import Dataset
from huggingface_hub import interpreter_login

# interpreter_login()

In [4]:
from huggingface_hub import login
login()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
from datasets import load_dataset

dataset = load_dataset("arcd")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.53k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/174k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/192k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/693 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/702 [00:00<?, ? examples/s]

In [6]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )

In [None]:
model_name='FreedomIntelligence/AceGPT-7B-chat'
device_map = {"": 0}
original_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                      device_map=device_map,
                                                      #device_map = "auto",
                                                      quantization_config=bnb_config,
                                                      trust_remote_code=True,
                                                      use_auth_token=True)



### Preprocessing

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True,padding_side="left",add_eos_token=True,add_bos_token=True,use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

In [11]:
from transformers import set_seed
seed = 42
set_seed(seed)

index = 10

In [12]:
def create_prompt_formats(sample):
    """
    Format various fields of the sample ('instruction','output')
    Then concatenate them using two newline characters
    :param sample: Sample dictionnary
    """
    INTRO_BLURB = "يوجد أدناه تعليمات تصف المهمة. اكتب الرد الذي يكمل الطلب بشكل مناسب."
    INSTRUCTION_KEY = "### التعليمات: أجب عن السؤال بناءً على السياق."
    RESPONSE_KEY = "### الناتج:"
    END_KEY = "### النهاية"

    blurb = f"\n{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}"

    context = f"{sample['context']}" if sample["context"] else None
    question = f"{sample['question']}" if sample["question"] else None

    input_context = context + question

    response = f"{RESPONSE_KEY}\n{sample['answers']}"
    end = f"{END_KEY}"

    parts = [part for part in [blurb, instruction, input_context, response, end] if part]

    formatted_prompt = "\n\n".join(parts)
    sample["text"] = formatted_prompt

    return sample

In [26]:
from functools import partial


def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int,seed, dataset, shuffle = True):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    """

    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)#, batched=True)

    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=['id', 'title', 'context', 'question', 'answers'],
    )

    # Filter out samples that have input_ids exceeding max_length
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    if shuffle:
        dataset = dataset.shuffle(seed=seed)

    return dataset

### Training

In [None]:
## Pre-process dataset
max_length = get_max_length(original_model)
print(max_length)

train_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['train'])
eval_dataset = preprocess_dataset(tokenizer, max_length,seed,Dataset.from_dict( dataset['validation'][:50]))

Found max lenth: 2048
2048
Preprocessing dataset...
Preprocessing dataset...


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:

# Preparing the Model for QLoRA
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
original_model = prepare_model_for_kbit_training(original_model)

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

config = LoraConfig(
    r= 8, #Rank
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'dense'
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

# 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
original_model.gradient_checkpointing_enable()

peft_model = get_peft_model(original_model, config)

In [None]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"


In [None]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 6291456
all model parameters: 3506704384
percentage of trainable model parameters: 0.18%


In [None]:
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

In [None]:
import transformers

peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=5,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=5,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=5,
    evaluation_strategy="steps",
    eval_steps=5,
    do_eval=True,
    gradient_checkpointing=True,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

peft_model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
peft_trainer.train()



Step,Training Loss,Validation Loss
5,0.9162,1.033347


TrainOutput(global_step=5, training_loss=0.9161793708801269, metrics={'train_runtime': 543.4025, 'train_samples_per_second': 0.037, 'train_steps_per_second': 0.009, 'total_flos': 1049226749632512.0, 'train_loss': 0.9161793708801269, 'epoch': 0.03})

### evaluation

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

base_model_id = 'FreedomIntelligence/AceGPT-7B-chat'
base_model = AutoModelForCausalLM.from_pretrained(base_model_id,
                                                      device_map='auto',
                                                      quantization_config=bnb_config,
                                                      trust_remote_code=True,
                                                      use_auth_token=True)



config.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/22.5G [00:00<?, ?B/s]

In [8]:
eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True, trust_remote_code=True, use_fast=False)
eval_tokenizer.pad_token = eval_tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/828 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [14]:
from peft import PeftModel

ft_model = PeftModel.from_pretrained(base_model,
                                     "/content/drive/MyDrive/digital_egypt _investment_task/peft-dialogue-summary-training-1711062975/checkpoint-5",
                                     torch_dtype=torch.float16,is_trainable=False)

In [27]:
## Pre-process dataset
max_length = get_max_length(ft_model)
print(max_length)

#eval_dataset = preprocess_dataset(eval_tokenizer, max_length,seed,Dataset.from_dict( dataset['validation'][:100]), False)

Found max lenth: 2048
2048
Preprocessing dataset...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

In [35]:
def gen(model,p, maxlen=100, sample=True):
    toks = eval_tokenizer(p, return_tensors="pt")
    res = model.generate(**toks.to("cuda"), max_new_tokens=maxlen, do_sample=sample,num_return_sequences=1,temperature=0.1,num_beams=1,top_p=0.95,)
    return eval_tokenizer.batch_decode(res,skip_special_tokens=True)

In [1]:
import pandas as pd

context = dataset['validation'][0:2]['context'] + dataset['validation'][0:2]['question']
baseline_answers = dataset['validation'][0:2]['answers']


peft_model_answers = []

for idx, dialogue in enumerate(context):
    human_baseline_text_output = baseline_answers[idx]
    prompt = f"التعليمات: أجب عن السؤال بناءً على السياق التالي\n{context}\n ### الناتج:\n"


    #print(idx)
    peft_model_res = gen(ft_model,prompt,100,)
    peft_model_output = peft_model_res[0].split('الناتج:\n')[1]
    print(peft_model_output)
    peft_model_text_output, success, result = peft_model_output.partition('###')


    peft_model_answers.append(peft_model_text_output)

zipped_answers = list(zip(baseline_answers, peft_model_answers))

df = pd.DataFrame(zipped_answers, columns = ['baseline_answers', 'peft_model_answers'])
df

In [None]:
df.to_csv("/content/drive/MyDrive/digital_egypt _investment_task/result.csv", index=False)

In [2]:
df

Unnamed: 0,ground_truth_answers,peft_model_answers
0,صحابي من صحابة رسول الإسلام محمد، وعمُّه وأخوه...,حمزة بن عبد المطلب هو صحابي من صحابة رسول الإ...
1,وَخَيْرُ أَعْمَامِي,حمزة بن عبد المطلب هو صحابي من صحابة رسول الإ...
2,«خَيْرُ إِخْوَتِي عَلِيٌّ،,حمزة بن عبد المطلب الهاشمي القرشي هو صحابي من...
3,في السنة الثانية من بعثة النبي محمد،,حمزة أسلم في السنة الثانية من بعثة النبي محمد...
4,وقَتَلَ فيها شيبة بن ربيعة مبارزةً، وقتل غيرَه...,حمزة بن عمر بن معدّة بن عدنان بن أدد بن ممّ ب...
...,...,...
95,جزر سيلي وجزيرة وايت.,1. جزر سيلي\n 2. جزيرة وايت\n 3. جزيرة براغا\...
96,في فترة العصر الحجري القديم،,استوطن الإنسان أول مرة في إنجلترا خلال العصر ...
97,ويعود أصل كلمة إنجلترا إلى الآنجلز وهي إحدى قب...,كلمة إنجلترا تعود إلى الآنجلز، وهي إحدى قبائل...
98,استقرت خلال القرن الخامس والسادس الميلادي.,كانت قبائل الجرمان قد استقرت في إنجلترا خلال ...
