## Continued Pre-Training

In [1]:
!pip install --upgrade torch
import torch

!pip install trl datasets -q

!pip install "unsloth[colab] @ git+https://github.com/unslothai/unsloth.git" -q
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

!pip install -U xformers --index-url https://download.pytorch.org/whl/cu121

Collecting torch
  Downloading torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nv

In [3]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, AutoConfig
from trl import SFTTrainer
from accelerate import Accelerator
import os

2024-09-10 12:25:02.147549: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-10 12:25:02.147628: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-10 12:25:02.147656: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-10 12:25:02.331997: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
train_config = {
            "model_ckpt": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            "load_in_4bit": True,
            "device_map": {"": Accelerator().local_process_index},
            "torch_dtype": torch.float16,
            "trust_remote_code": True,
            "use_lora": True,
            "r": 512,
            "lora_alpha": 512,
            "lora_dropout": 0.05,
            "bias": "none",
            "task_type": "CAUSAL_LM",
            "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj",],
            "output_dir": "uae-law-pt",
            "per_device_train_batch_size": 1,
            "per_device_eval_batch_size": 2,
            "gradient_accumulation_steps": 1,
            "optim": "paged_adamw_32bit",
            "learning_rate": 2e-5,
            "lr_scheduler_type": "cosine",
            "save_strategy": "steps",
            "logging_steps": 2000,
            "num_train_epochs": 5,
            #"max_steps": 250,
            "fp16": True,
            "push_to_hub": False,
            "train_cln_name": "text",
            "packing": False,
            "max_seq_length": 1024,
            "neftune_noise_alpha": 5,
            "is_pretraining": True
        }
     

In [3]:
tokenizer = AutoTokenizer.from_pretrained(train_config["model_ckpt"])
tokenizer.pad_token = tokenizer.eos_token

model_config = AutoConfig.from_pretrained(
                                                    train_config["model_ckpt"],
                                                    vocab_size=len(tokenizer),
                                                    n_ctx=train_config["max_seq_length"],
                                                    bos_token_id=tokenizer.bos_token_id,
                                                    eos_token_id=tokenizer.eos_token_id,
                                                )
## Add the configurations for changing the number of layers, heads
base_model = AutoModelForCausalLM.from_config(model_config)

In [4]:
for name, param in base_model.named_parameters():
  print(name, param.requires_grad)

model.embed_tokens.weight True
model.layers.0.self_attn.q_proj.weight True
model.layers.0.self_attn.k_proj.weight True
model.layers.0.self_attn.v_proj.weight True
model.layers.0.self_attn.o_proj.weight True
model.layers.0.mlp.gate_proj.weight True
model.layers.0.mlp.up_proj.weight True
model.layers.0.mlp.down_proj.weight True
model.layers.0.input_layernorm.weight True
model.layers.0.post_attention_layernorm.weight True
model.layers.1.self_attn.q_proj.weight True
model.layers.1.self_attn.k_proj.weight True
model.layers.1.self_attn.v_proj.weight True
model.layers.1.self_attn.o_proj.weight True
model.layers.1.mlp.gate_proj.weight True
model.layers.1.mlp.up_proj.weight True
model.layers.1.mlp.down_proj.weight True
model.layers.1.input_layernorm.weight True
model.layers.1.post_attention_layernorm.weight True
model.layers.2.self_attn.q_proj.weight True
model.layers.2.self_attn.k_proj.weight True
model.layers.2.self_attn.v_proj.weight True
model.layers.2.self_attn.o_proj.weight True
model.lay

In [5]:
lora_config = LoraConfig(
                                    r=train_config["r"],
                                    lora_alpha=train_config["lora_alpha"],
                                    lora_dropout=train_config["lora_dropout"],
                                    bias=train_config["bias"],
                                    task_type=train_config["task_type"],
                                    target_modules=train_config["target_modules"]
                                )

model = get_peft_model(base_model,lora_config)

In [6]:
def count_trainable_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    return trainable_params, total_params

trainable_params, total_params = count_trainable_parameters(base_model)

print(f"Trainable parameters: {trainable_params}")
print(f"Total parameters: {total_params}")
print(f"Percentage of trainable parameters: {(trainable_params / total_params) * 100:.2f}%")

Trainable parameters: 403701760
Total parameters: 1503750144
Percentage of trainable parameters: 26.85%


In [7]:
# def formatting_func(context, question, answer):
#     template = f"""You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.\n\n + \
#     You must output the SQL query that answers the question.\n\n + \
#     ### Input:\n + \
#     ```{question}```\n\n + \
#     ### Context:\n + \
#     ```{context}```\n\n + \
#     ### Response:\n + \
#     ```{answer};```"""
#     return template
    

In [8]:
# def create_data():
#     data = load_dataset("obadabaq/uae-laws", split="train")
#     # data_df = data.to_pandas()
#     # data_df = data_df[:5000]
#     # data_df["text"] = data_df[["input", "instruction", "output"]].apply(lambda x: "Human: " + x["instruction"] + " " + x["input"] + " Assistant: "+ x["output"], axis=1)
#     # data = Dataset.from_pandas(data_df)
#     return data

# data = create_data()
# data = format_for_pretraining(data)



In [9]:
# def create_data():
#     # Load the dataset
#     data = load_dataset("obadabaq/uae-laws", split="train")
    
#     # Format the dataset for pretraining
#     def format_for_pretraining(examples):
#         combined_text = f"{examples['Law date & authority']} - {examples['Content']}"
#         return {"text": combined_text}
    
#     # Apply the formatting to the dataset
#     data = data.map(format_for_pretraining, batched=False)

#     split_data = data.train_test_split(test_size=0.1)
    
#     return split_data

# data = create_data()

In [7]:
from datasets import load_dataset

# Function to create and split the dataset
def create_data():
    # Load the dataset and split into train and test sets (90% train, 10% test)
    data = load_dataset("obadabaq/uae-laws")
    
    # Split the data into train and test sets using Hugging Face's built-in split
    split_data = data['train'].train_test_split(test_size=0.1)  # 90% train, 10% test
    
    # Format the dataset for pretraining
    def format_for_pretraining(examples):
        combined_text = f"{examples['Law date & authority']} - {examples['Content']}"
        return {"text": combined_text}
    
    # Apply the formatting to both train and test sets
    split_data['train'] = split_data['train'].map(format_for_pretraining, batched=False)
    split_data['test'] = split_data['test'].map(format_for_pretraining, batched=False)
    
    return split_data

# Create and split the data
split_data = create_data()

# Access the train and test sets
train_data = split_data['train']
test_data = split_data['test']

# Verify the split
print(f"Train size: {len(train_data)}, Test size: {len(test_data)}")


Map:   0%|          | 0/8500 [00:00<?, ? examples/s]

Map:   0%|          | 0/945 [00:00<?, ? examples/s]

Train size: 8500, Test size: 945


In [29]:
train_data['text'][0:10]

['Federal Decree Law No. (5) of 1985 Concerning the Issuance of the Civil Transactions Law of the United Arab Emirates - Article (71)  \n(1)  The personality (status of person) of a human being shall commence at the moment of \nbeing born alive.  It shall terminate upon his death.  \n(2)  The law shall lay down the rights of a fetus in utero.  \n  \n \n',
 'Cabinet Resolution No. (8) of 2018 Concerning the Executive Regulations of Federal Law No. (1) of 2017 Concerning Anti-Dumping, Countervailing, and Preventive Measures - Article (27)  \n1. The normal value shall be calculated based on the comparable price paid or payable, in the \nordinary course of trade, for sales of similar product  by independent customers in the \ndomestic market of the exporting country.  \n2. Notwithstanding paragraph 1 above, where a product under  investigation is  not imported \ndirectly from the country of origin but is exported to the State from an intermediate country, \nthe normal value shall be establ

In [8]:
args = TrainingArguments(
                                    do_eval=True,
                                    eval_strategy = "steps",
                                    eval_steps = 2000,
                                    output_dir=train_config["output_dir"],
                                    per_device_train_batch_size=train_config["per_device_train_batch_size"],
                                    per_device_eval_batch_size=train_config["per_device_eval_batch_size"],
                                    gradient_accumulation_steps=train_config["gradient_accumulation_steps"],
                                    optim=train_config["optim"],
                                    learning_rate=train_config["learning_rate"],
                                    lr_scheduler_type=train_config["lr_scheduler_type"],
                                    save_strategy=train_config["save_strategy"],
                                    logging_steps=train_config["logging_steps"],
                                    num_train_epochs=train_config["num_train_epochs"],
                                    # max_steps=train_config["max_steps"],
                                    fp16=train_config["fp16"],
                                    push_to_hub=train_config["push_to_hub"]
                                )

In [9]:
trainer = SFTTrainer(
                                    model=model,
                                    train_dataset=train_data,
                                    eval_dataset=test_data,
                                    peft_config=lora_config,
                                    dataset_text_field=train_config["train_cln_name"],
                                    args=args,
                                    tokenizer=tokenizer,
                                    packing=train_config["packing"],
                                    max_seq_length=train_config["max_seq_length"]
                                )

Map:   0%|          | 0/8500 [00:00<?, ? examples/s]

Map:   0%|          | 0/945 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
trainer_stats = trainer.train()

Step,Training Loss,Validation Loss
2000,5.2418,4.628575
4000,4.1336,4.069873
6000,3.7547,3.838519
8000,3.6419,3.76766


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
output_dir = 'uae-law-pt/checkpoint-8500'
#tokenizer = AutoTokenizer.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
tokenizer.pad_token = tokenizer.eos_token
local_model = AutoModelForCausalLM.from_pretrained(output_dir, load_in_4bit=True, device_map="auto")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [25]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a chatbot that answers questions about the UAE law",
    },
    {"role": "user", "content": "What is the tax period for an incapacitated person ?"},
]

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = local_model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


<|system|>
You are a chatbot that answers questions about the UAE law 
<|user|>
What is the tax period for an incapacitated person ? 
<|assistant|>
An incapacitated person refers to a person who is unable to make financial decisions or understand financial matters due to an incapacity or illness. As per the Income Tax Act (ITA), the tax period for an incapacitated person is six months. This period begins from the date of the incapacity or the date of the declaration of incapacity by the person. The tax period for incapacitated people is also known as the six-month tax period.


In [13]:
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
)
device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None

# Step 1: load the base model (Mistral-7B in our case) in 4-bit
model_kwargs = dict(
    # attn_implementation="flash_attention_2", # set this to True if your GPU supports it (Flash Attention drastically speeds up model computations)
    torch_dtype="auto",
    use_cache=False,  # set to False as we're going to use gradient checkpointing
    device_map=device_map,
    quantization_config=quantization_config,
)

base_model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0",**model_kwargs)


In [23]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a chatbot that answers questions about the UAE law",
    },
    {"role": "user", "content": "What is the tax period for an incapacitated person ?"},
]

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = base_model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


<|system|>
You are a chatbot that answers questions about the UAE law 
<|user|>
What is the tax period for an incapacitated person ? 
<|assistant|>
An incapacitated person refers to a person who is not able to manage their financial affairs or is unable to make decisions on their own behalf. The tax period for such a person is known as the tax period for the estate.

The tax period for an incapacitated person is the time period between the date of the individual's incapacity and the date on which the individual dies. This period is known as the "tax period for the estate." The tax period for the estate is the time period between the date of death and the date on which the estate is taxed.

The tax period for an incapacitated person is usually longer than the tax period for the estate, as the incapacitated person may have a longer period during which to manage their affairs.


In [34]:
from peft import PeftModel, PeftConfig
base_model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model = PeftModel.from_pretrained(base_model, "uae-law-pt/checkpoint-8500").to('cuda')

In [35]:
mode_merged = model.merge_and_unload().to('cuda')

In [37]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a chatbot that answers questions about the UAE law",
    },
    {"role": "user", "content": "What is the tax period for an incapacitated person ?"},
]
# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = mode_merged.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

<|system|>
You are a chatbot that answers questions about the UAE law 
<|user|>
What is the tax period for an incapacitated person ? 
<|assistant|>
In the UAE, an incapacitated person's tax period is not defined in any law or regulation. This period is determined by the court or the relevant authority upon receiving a declaration from the person's lawyer or legal representative. The tax period for an incapacitated person generally begins from the day the declaration is issued, or the day of their incapacity, whichever comes first. After the expiry of the tax period, the person's tax liability is assessed and calculated based on their income during the period.


In [1]:
model_path = 'uae-law-pt/checkpoint-8500'
model_name = 'TinyLlama-UAE-pt'
username = 'GuhanTofu'

from huggingface_hub import HfApi
api = HfApi(token="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")

api.create_repo(
    repo_id = f"{username}/{model_name}",
    repo_type="model"
)

api.upload_folder(
    repo_id = f"{username}/{model_name}",
    folder_path = model_path
)  

adapter_model.safetensors:   0%|          | 0.00/1.61G [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/3.23G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/GuhanTofu/TinyLlama-UAE-pt/commit/b0e340c83f9d5199729dc009d5ea5a0525500696', commit_message='Upload folder using huggingface_hub', commit_description='', oid='b0e340c83f9d5199729dc009d5ea5a0525500696', pr_url=None, pr_revision=None, pr_num=None)

## Supervised FineTuning

In [None]:
sft_config = {
            "model_ckpt": "GuhanTofu/TinyLlama-UAE-pt",
            "load_in_4bit": True,
            "device_map": {"": Accelerator().local_process_index},
            "torch_dtype": torch.float16,
            "trust_remote_code": True,
            "use_lora": True,
            "r": 512,
            "lora_alpha": 512,
            "lora_dropout": 0.05,
            "bias": "none",
            "task_type": "CAUSAL_LM",
            "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj",],
            "output_dir": "",
            "per_device_train_batch_size": 1,
            "per_device_eval_batch_size": 2,
            "gradient_accumulation_steps": 1,
            "optim": "paged_adamw_32bit",
            "learning_rate": 2e-5,
            "lr_scheduler_type": "cosine",
            "save_strategy": "epoch",
            "logging_steps": 500,
            "num_train_epochs": 1,
            #"max_steps": 250,
            "fp16": True,
            "push_to_hub": ,
            "train_cln_name": "text",
            "packing": False,
            "max_seq_length": 512,
            "neftune_noise_alpha": 5,
            "is_pretraining": False
        }

In [5]:
tokenizer = AutoTokenizer.from_pretrained(sft_config["model_ckpt"])
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
                                sft_config["model_ckpt"],
                                load_in_4bit=sft_config["load_in_4bit"],
                                device_map=sft_config["device_map"],
                                torch_dtype=sft_config["torch_dtype"]
                            )
model.config.use_cache=False
model.config.pretraining_tp=1
model = prepare_model_for_kbit_training(model)

tokenizer_config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/739 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


adapter_model.safetensors:   0%|          | 0.00/1.61G [00:00<?, ?B/s]

In [6]:
lora_config = LoraConfig(
                                    r=sft_config["r"],
                                    lora_alpha=sft_config["lora_alpha"],
                                    lora_dropout=sft_config["lora_dropout"],
                                    bias=sft_config["bias"],
                                    task_type=sft_config["task_type"],
                                    target_modules=sft_config["target_modules"]
                                )

model = get_peft_model(model,lora_config)

In [7]:
for name, param in model.named_parameters():
  print(name, param.requires_grad)

base_model.model.model.embed_tokens.weight False
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight False
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight True
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight True
base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight False
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight True
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight True
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight False
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight True
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight True
base_model.model.model.layers.0.self_attn.o_proj.base_layer.weight False
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight True
base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight True
base_model.model.model.layers.0.mlp.gate_proj.base_

In [None]:
dataset = load_dataset("obadabaq/structured-uae-laws", split=['train'])

In [None]:
dataset[0]

In [None]:
#dataset = dataset.train_test_split(test_size = 0.1)

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

dataset = dataset.map(formatting_prompts_func, batched = True,)



In [None]:
dataset

In [None]:
args = TrainingArguments(
                                    do_eval=True,
                                    output_dir=sft_config["output_dir"],
                                    per_device_train_batch_size=sft_config["per_device_train_batch_size"],
                                    gradient_accumulation_steps=sft_config["gradient_accumulation_steps"],
                                    optim=sft_config["optim"],
                                    learning_rate=sft_config["learning_rate"],
                                    lr_scheduler_type=sft_config["lr_scheduler_type"],
                                    save_strategy=sft_config["save_strategy"],
                                    logging_steps=sft_config["logging_steps"],
                                    num_train_epochs=sft_config["num_train_epochs"],
                                    # max_steps=sft_config["max_steps"],
                                    fp16=sft_config["fp16"],
                                    push_to_hub=sft_config["push_to_hub"]
                                )

In [None]:
trainer = SFTTrainer(
                                    model=model,
                                    train_dataset=dataset,
                                    peft_config=lora_config,
                                    dataset_text_field=sft_config["train_cln_name"],
                                    args=args,
                                    tokenizer=tokenizer,
                                    packing=sft_config["packing"],
                                    max_seq_length=sft_config["max_seq_length"]
                                )

In [None]:
trainer_stats = trainer.train()
     

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
output_dir = ''
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir, load_in_4bit=True, device_map="auto")

In [None]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly UAE government assistant in UAE",
    },
    {"role": "user", "content": ""},
]

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=10,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


In [36]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly UAE government assistant in UAE",
    },
    {"role": "user", "content": "What are the fines for misreporting Emiratisation percentages?"},
]

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.85,
        top_k=20,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


<|system|>
You are a friendly UAE government assistant in UAE 
<|user|>
What are the fines for misreporting Emiratisation percentages? 
<|assistant|>
Fines for misreporting Emiratisation percentages vary depending on the extent of the fraud and the nature of the offense. For example:

1. Emirati citizens involved in job recruitment: Fines may range from AED 5,000 to AED 50,000.

2. Employers found to have falsely reported Emiratisation percentages: Fines may range from AED 5,000 to AED 50,000.

Fines for misreporting Emiratisation percentages may include reduced government support, the suspension of Emiratisation certificates, and potential job losses.


In [4]:
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
)
device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None

# Step 1: load the base model (Mistral-7B in our case) in 4-bit
model_kwargs = dict(
    # attn_implementation="flash_attention_2", # set this to True if your GPU supports it (Flash Attention drastically speeds up model computations)
    torch_dtype="auto",
    use_cache=False,  # set to False as we're going to use gradient checkpointing
    device_map=device_map,
    quantization_config=quantization_config,
)

base_model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0",**model_kwargs)


In [11]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly UAE government assistant in UAE",
    },
    {"role": "user", "content": "What is the fine for driving without insurance in Sharjah?"},
]

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = base_model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


<|system|>
You are a friendly UAE government assistant in UAE 
<|user|>
What is the fine for driving without insurance in Sharjah? 
<|assistant|>
In Sharjah, driving without insurance is a punishable offense. The fine for driving without insurance in Sharjah is AED 300 ($90). The fine for the same offense in other Emirates and GCC countries is likely to be different. It is recommended to carry insurance while driving in Sharjah.


In [33]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly UAE government assistant in UAE",
    },
    {"role": "user", "content": "What are the fines for misreporting Emiratisation percentages?"},
]

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = base_model.generate(
        input_ids=input_ids,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.8,
        top_k=10,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


<|system|>
You are a friendly UAE government assistant in UAE 
<|user|>
What are the fines for misreporting Emiratisation percentages? 
<|assistant|>
In the UAE, fines are imposed for misreporting Emiratisation percentages as per the UAE Labor Law. The maximum fine for violating this provision is AED 100,000 ($27,000 USD), and the fine is levied by the relevant authorities, including the UAE Ministry of Human Resources and Emiratization. The fines are imposed on individuals, organizations, and legal entities that violate the labor law


In [32]:
# model_path = 'sft-uae/checkpoint-1650'
# model_name = 'TinyLlama-UAE-sft'
# username = 'GuhanTofu'

# from huggingface_hub import HfApi
# api = HfApi(token="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")

# api.create_repo(
#     repo_id = f"{username}/{model_name}",
#     repo_type="model"
# )

# api.upload_folder(
#     repo_id = f"{username}/{model_name}",
#     folder_path = model_path
# )  

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/144M [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/72.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/GuhanTofu/TinyLlama-UAE-sft/commit/e14bb2ba0a647bcb6aeaa01654014ad99682a6ad', commit_message='Upload folder using huggingface_hub', commit_description='', oid='e14bb2ba0a647bcb6aeaa01654014ad99682a6ad', pr_url=None, pr_revision=None, pr_num=None)

## Continued Pre-Training

In [1]:

import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, AutoConfig
from trl import SFTTrainer
from accelerate import Accelerator
import os

2024-09-04 11:28:18.450118: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-04 11:28:18.450164: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-04 11:28:18.450183: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-04 11:28:18.455258: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [27]:
train_config = {
            "model_ckpt": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            "load_in_4bit": True,
            "device_map": {"": Accelerator().local_process_index},
            "torch_dtype": torch.float16,
            "trust_remote_code": True,
            "use_lora": True,
            "r": 128,
            "lora_alpha": 128,
            "lora_dropout": 0.05,
            "bias": "none",
            "task_type": "CAUSAL_LM",
            "target_modules": ["q_proj", "v_proj"],
            "output_dir": "pt-uae",
            "per_device_train_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "optim": "paged_adamw_32bit",
            "learning_rate": 2e-5,
            "lr_scheduler_type": "cosine",
            "save_strategy": "epoch",
            "logging_steps": 100,
            "num_train_epochs": 10,
            #"max_steps": 250,
            "fp16": True,
            "push_to_hub": True,
            "train_cln_name": "text",
            "packing": False,
            "max_seq_length": 512,
            "neftune_noise_alpha": 5,
            "is_pretraining": True
        }
     

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [28]:
tokenizer = AutoTokenizer.from_pretrained(train_config["model_ckpt"])
tokenizer.pad_token = tokenizer.eos_token

model_config = AutoConfig.from_pretrained(
                                                    train_config["model_ckpt"],
                                                    vocab_size=len(tokenizer),
                                                    n_ctx=train_config["max_seq_length"],
                                                    bos_token_id=tokenizer.bos_token_id,
                                                    eos_token_id=tokenizer.eos_token_id,
                                                )
## Add the configurations for changing the number of layers, heads
model = AutoModelForCausalLM.from_config(model_config)

In [29]:
for name, param in model.named_parameters():
  print(name, param.requires_grad)

model.embed_tokens.weight True
model.layers.0.self_attn.q_proj.weight True
model.layers.0.self_attn.k_proj.weight True
model.layers.0.self_attn.v_proj.weight True
model.layers.0.self_attn.o_proj.weight True
model.layers.0.mlp.gate_proj.weight True
model.layers.0.mlp.up_proj.weight True
model.layers.0.mlp.down_proj.weight True
model.layers.0.input_layernorm.weight True
model.layers.0.post_attention_layernorm.weight True
model.layers.1.self_attn.q_proj.weight True
model.layers.1.self_attn.k_proj.weight True
model.layers.1.self_attn.v_proj.weight True
model.layers.1.self_attn.o_proj.weight True
model.layers.1.mlp.gate_proj.weight True
model.layers.1.mlp.up_proj.weight True
model.layers.1.mlp.down_proj.weight True
model.layers.1.input_layernorm.weight True
model.layers.1.post_attention_layernorm.weight True
model.layers.2.self_attn.q_proj.weight True
model.layers.2.self_attn.k_proj.weight True
model.layers.2.self_attn.v_proj.weight True
model.layers.2.self_attn.o_proj.weight True
model.lay

In [5]:
lora_config = LoraConfig(
                                    r=train_config["r"],
                                    lora_alpha=train_config["lora_alpha"],
                                    lora_dropout=train_config["lora_dropout"],
                                    bias=train_config["bias"],
                                    task_type=train_config["task_type"],
                                    target_modules=train_config["target_modules"]
                                )

model = get_peft_model(model,lora_config)

In [1]:
# def formatting_func(context, question, answer):
#     template = f"""You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.\n\n + \
#     You must output the SQL query that answers the question.\n\n + \
#     ### Input:\n + \
#     ```{question}```\n\n + \
#     ### Context:\n + \
#     ```{context}```\n\n + \
#     ### Response:\n + \
#     ```{answer};```"""
#     return template
    

In [6]:
def create_data():
    data = load_dataset("GuhanTofu/RE_UAE_regulations", split="train")
    # data_df = data.to_pandas()
    # data_df = data_df[:5000]
    # data_df["text"] = data_df[["input", "instruction", "output"]].apply(lambda x: "Human: " + x["instruction"] + " " + x["input"] + " Assistant: "+ x["output"], axis=1)
    # data = Dataset.from_pandas(data_df)
    return data

data = create_data()

In [7]:
data[0]

{'text': '2019 Issue Year3 2\nMission\nTo create an innovative and\nsustainable real estate\nenvironment that shall promote \nDubai the World’s happiest city \nthrough:\n• Smart services.\n• Professional human and financial\n   resources.\n• Integrated Real Estate Legislations\n   of Dubai.Vision\nTo Position Dubai as the \nWorld’s premier real estate \ndestination, and a byword \nfor innovation, trust and \nhappiness.5 4INDEX\nLaw No. ( 6) of 2019  Concerning\nOwnership of Jointly Owned Real Property in the Emirate of Dubai\n...................................................................................................................................................................................\nLaw No. ( 4) of 2019  Concerning\nThe Real Estate Regulatory Agency\n...................................................................................................................................................................................\nDecree No. ( 31) of 2016  Concerning\

In [30]:
args = TrainingArguments(
                                    do_eval=True,
                                    output_dir=train_config["output_dir"],
                                    per_device_train_batch_size=train_config["per_device_train_batch_size"],
                                    gradient_accumulation_steps=train_config["gradient_accumulation_steps"],
                                    optim=train_config["optim"],
                                    learning_rate=train_config["learning_rate"],
                                    lr_scheduler_type=train_config["lr_scheduler_type"],
                                    save_strategy=train_config["save_strategy"],
                                    logging_steps=train_config["logging_steps"],
                                    num_train_epochs=train_config["num_train_epochs"],
                                    # max_steps=train_config["max_steps"],
                                    fp16=train_config["fp16"],
                                    push_to_hub=train_config["push_to_hub"]
                                )

In [31]:
trainer = SFTTrainer(
                                    model=model,
                                    train_dataset=data,
                                    #peft_config=lora_config,
                                    dataset_text_field=train_config["train_cln_name"],
                                    args=args,
                                    tokenizer=tokenizer,
                                    packing=train_config["packing"],
                                    max_seq_length=train_config["max_seq_length"]
                                )

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
trainer_stats = trainer.train()

Step,Training Loss
100,7.0508
200,5.2708


In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
output_dir = 'pt-uae/checkpoint-3340'
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir, load_in_4bit=True, device_map="auto")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [16]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly UAE government assistant in UAE",
    },
    {"role": "user", "content": "The DLD must give the purchaser a grace period of how much days ?"},
]

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


<|system|>
You are a friendly UAE government assistant in UAE 
<|user|>
The DLD must give the purchaser a grace period of how much days ? 
<|assistant|>
The DLD must give the purchaser a grace period of how many days from the date of delivery.


In [13]:
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
)
device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None

# Step 1: load the base model (Mistral-7B in our case) in 4-bit
model_kwargs = dict(
    # attn_implementation="flash_attention_2", # set this to True if your GPU supports it (Flash Attention drastically speeds up model computations)
    torch_dtype="auto",
    use_cache=False,  # set to False as we're going to use gradient checkpointing
    device_map=device_map,
    quantization_config=quantization_config,
)

base_model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0",**model_kwargs)


In [26]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly UAE government assistant in UAE",
    },
    {"role": "user", "content": "What is the Brokerage Agreement ?"},
]

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = base_model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


<|system|>
You are a friendly UAE government assistant in UAE 
<|user|>
What is the Brokerage Agreement ? 
<|assistant|>
The Brokerage Agreement is a legal agreement that governs the relationship between a broker and their client. It outlines the terms and conditions of the broker's services, including the fees charged, the scope of services provided, and the responsibilities of both parties. The broker's services typically include finding and negotiating deals for clients, managing the execution of transactions, and providing other related services. The brokerage agreement is an essential document for brokerage firms, as it establishes the legal obligations and responsibilities of both parties.


In [19]:
from peft import PeftModel, PeftConfig
model = PeftModel.from_pretrained(base_model, "pt-uae/checkpoint-3340").to('cuda')

In [25]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly UAE government assistant in UAE, use the information you have been pretrained on to answer",
    },
    {"role": "user", "content": "What is the Brokerage Agreement ?"},
]

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

<|system|>
You are a friendly UAE government assistant in UAE, use the information you have been pretrained on to answer 
<|user|>
What is the Brokerage Agreement ? 
<|assistant|>
The Brokerage Agreement is a legal contract between two parties, in this case, a broker and an investor, that outlines the terms and conditions under which the broker will sell or purchase securities on behalf of the investor. The broker agrees to sell the investor's securities and to provide the investor with the brokerage services, including transaction processing, marketing, and sales. The broker also agrees to take on the risk of market fluctuations and to provide the investor with access to other financial products, such as bonds or loans. The broker may also offer other services, such as investment banking or asset management, to the investor.


In [29]:
# model_path = 'pt-uae/checkpoint-200'
# model_name = 'TinyLlama-UAE-pt'
# username = 'GuhanTofu'

# from huggingface_hub import HfApi
# api = HfApi(token="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")

# api.create_repo(
#     repo_id = f"{username}/{model_name}",
#     repo_type="model"
# )

# api.upload_folder(
#     repo_id = f"{username}/{model_name}",
#     folder_path = model_path
# )  

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/72.1M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/144M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/GuhanTofu/TinyLlama-UAE-pt/commit/72c68b22da6e2bf5f7d67b1b5bd7c11614b4cc01', commit_message='Upload folder using huggingface_hub', commit_description='', oid='72c68b22da6e2bf5f7d67b1b5bd7c11614b4cc01', pr_url=None, pr_revision=None, pr_num=None)

## Supervised FineTuning

In [2]:
sft_config = {
            "model_ckpt": "GuhanTofu/TinyLlama-UAE-pt",
            "load_in_4bit": True,
            "device_map": {"": Accelerator().local_process_index},
            "torch_dtype": torch.float16,
            "trust_remote_code": True,
            "use_lora": True,
            "r": 128,
            "lora_alpha": 128,
            "lora_dropout": 0.05,
            "bias": "none",
            "task_type": "CAUSAL_LM",
            "target_modules": ["q_proj", "v_proj"],
            "output_dir": "sft-uae",
            "per_device_train_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "optim": "paged_adamw_32bit",
            "learning_rate": 2e-5,
            "lr_scheduler_type": "cosine",
            "save_strategy": "epoch",
            "logging_steps": 100,
            "num_train_epochs": 10,
            #"max_steps": 250,
            "fp16": True,
            "push_to_hub": True,
            "train_cln_name": "text",
            "packing": False,
            "max_seq_length": 512,
            "neftune_noise_alpha": 5,
            "is_pretraining": False
        }

In [4]:
tokenizer = AutoTokenizer.from_pretrained(sft_config["model_ckpt"])
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
                                sft_config["model_ckpt"],
                                load_in_4bit=sft_config["load_in_4bit"],
                                device_map=sft_config["device_map"],
                                torch_dtype=sft_config["torch_dtype"]
                            )
model.config.use_cache=False
model.config.pretraining_tp=1
model = prepare_model_for_kbit_training(model)

tokenizer_config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [5]:
lora_config = LoraConfig(
                                    r=sft_config["r"],
                                    lora_alpha=sft_config["lora_alpha"],
                                    lora_dropout=sft_config["lora_dropout"],
                                    bias=sft_config["bias"],
                                    task_type=sft_config["task_type"],
                                    target_modules=sft_config["target_modules"]
                                )

model = get_peft_model(model,lora_config)

In [6]:
for name, param in model.named_parameters():
  print(name, param.requires_grad)

base_model.model.model.embed_tokens.weight False
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight False
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight True
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight True
base_model.model.model.layers.0.self_attn.k_proj.weight False
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight False
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight True
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight True
base_model.model.model.layers.0.self_attn.o_proj.weight False
base_model.model.model.layers.0.mlp.gate_proj.weight False
base_model.model.model.layers.0.mlp.up_proj.weight False
base_model.model.model.layers.0.mlp.down_proj.weight False
base_model.model.model.layers.0.input_layernorm.weight False
base_model.model.model.layers.0.post_attention_layernorm.weight False
base_model.model.model.layers.1.self_attn.q_proj.base_layer.weight Fals

In [7]:
dataset = load_dataset("GuhanTofu/sft-UAE_rules", split=['train'])

In [8]:
dataset

[Dataset({
     features: ['prompt', 'response'],
     num_rows: 165
 })]

In [9]:
alpaca_prompt = """Below is an input that requires about the law in UAE, Write a response that appropriately completes the request.


### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    inputs       = examples["prompt"]
    outputs      = examples["response"]
    texts = []
    for input, output in zip(inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("GuhanTofu/sft-UAE_rules", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)


In [10]:
dataset

Dataset({
    features: ['prompt', 'response', 'text'],
    num_rows: 165
})

In [15]:
args = TrainingArguments(
                                    do_eval=True,
                                    output_dir=sft_config["output_dir"],
                                    per_device_train_batch_size=sft_config["per_device_train_batch_size"],
                                    gradient_accumulation_steps=sft_config["gradient_accumulation_steps"],
                                    optim=sft_config["optim"],
                                    learning_rate=sft_config["learning_rate"],
                                    lr_scheduler_type=sft_config["lr_scheduler_type"],
                                    save_strategy=sft_config["save_strategy"],
                                    logging_steps=sft_config["logging_steps"],
                                    num_train_epochs=sft_config["num_train_epochs"],
                                    # max_steps=sft_config["max_steps"],
                                    fp16=sft_config["fp16"],
                                    push_to_hub=sft_config["push_to_hub"]
                                )

In [19]:
trainer = SFTTrainer(
                                    model=model,
                                    train_dataset=dataset,
                                    peft_config=lora_config,
                                    dataset_text_field=sft_config["train_cln_name"],
                                    args=args,
                                    tokenizer=tokenizer,
                                    packing=sft_config["packing"],
                                    max_seq_length=sft_config["max_seq_length"]
                                )

Map:   0%|          | 0/165 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
trainer_stats = trainer.train()
     

  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
100,1.7785
200,1.0639
300,0.9292
400,0.8422
500,0.7882
600,0.7519


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
output_dir = 'sft-uae/checkpoint-1650'
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir, load_in_4bit=True, device_map="auto")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [23]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly UAE government assistant in UAE",
    },
    {"role": "user", "content": "Can legal action be taken alongside fines under 'Nafis'?"},
]

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=10,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


<|system|>
You are a friendly UAE government assistant in UAE 
<|user|>
Can legal action be taken alongside fines under 'Nafis'? 
<|assistant|>
Yes, legal action may also be taken alongside fines under 'Nafis' in response to cases of fraud or misconduct. The Ministry of Finance may initiate legal proceedings against applicants or beneficiaries who have fraudulently obtained support, or who have misrepresented their circumstances in order to receive funding. The aim is to protect the interests of the Emirati community and prevent further misuse of government programs.


In [36]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly UAE government assistant in UAE",
    },
    {"role": "user", "content": "What are the fines for misreporting Emiratisation percentages?"},
]

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.85,
        top_k=20,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


<|system|>
You are a friendly UAE government assistant in UAE 
<|user|>
What are the fines for misreporting Emiratisation percentages? 
<|assistant|>
Fines for misreporting Emiratisation percentages vary depending on the extent of the fraud and the nature of the offense. For example:

1. Emirati citizens involved in job recruitment: Fines may range from AED 5,000 to AED 50,000.

2. Employers found to have falsely reported Emiratisation percentages: Fines may range from AED 5,000 to AED 50,000.

Fines for misreporting Emiratisation percentages may include reduced government support, the suspension of Emiratisation certificates, and potential job losses.


In [4]:
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
)
device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None

# Step 1: load the base model (Mistral-7B in our case) in 4-bit
model_kwargs = dict(
    # attn_implementation="flash_attention_2", # set this to True if your GPU supports it (Flash Attention drastically speeds up model computations)
    torch_dtype="auto",
    use_cache=False,  # set to False as we're going to use gradient checkpointing
    device_map=device_map,
    quantization_config=quantization_config,
)

base_model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0",**model_kwargs)


In [11]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly UAE government assistant in UAE",
    },
    {"role": "user", "content": "What is the fine for driving without insurance in Sharjah?"},
]

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = base_model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


<|system|>
You are a friendly UAE government assistant in UAE 
<|user|>
What is the fine for driving without insurance in Sharjah? 
<|assistant|>
In Sharjah, driving without insurance is a punishable offense. The fine for driving without insurance in Sharjah is AED 300 ($90). The fine for the same offense in other Emirates and GCC countries is likely to be different. It is recommended to carry insurance while driving in Sharjah.


In [33]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly UAE government assistant in UAE",
    },
    {"role": "user", "content": "What are the fines for misreporting Emiratisation percentages?"},
]

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = base_model.generate(
        input_ids=input_ids,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.8,
        top_k=10,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


<|system|>
You are a friendly UAE government assistant in UAE 
<|user|>
What are the fines for misreporting Emiratisation percentages? 
<|assistant|>
In the UAE, fines are imposed for misreporting Emiratisation percentages as per the UAE Labor Law. The maximum fine for violating this provision is AED 100,000 ($27,000 USD), and the fine is levied by the relevant authorities, including the UAE Ministry of Human Resources and Emiratization. The fines are imposed on individuals, organizations, and legal entities that violate the labor law


In [32]:
# model_path = 'sft-uae/checkpoint-1650'
# model_name = 'TinyLlama-UAE-sft'
# username = 'GuhanTofu'

# from huggingface_hub import HfApi
# api = HfApi(token="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")

# api.create_repo(
#     repo_id = f"{username}/{model_name}",
#     repo_type="model"
# )

# api.upload_folder(
#     repo_id = f"{username}/{model_name}",
#     folder_path = model_path
# )  

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/144M [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/72.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/GuhanTofu/TinyLlama-UAE-sft/commit/e14bb2ba0a647bcb6aeaa01654014ad99682a6ad', commit_message='Upload folder using huggingface_hub', commit_description='', oid='e14bb2ba0a647bcb6aeaa01654014ad99682a6ad', pr_url=None, pr_revision=None, pr_num=None)