In [5]:
!pip install "transformers==4.31.0" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.40.2" "trl==0.4.7" "safetensors>=0.3.1" --upgrade

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [10]:
from datasets import load_dataset
from random import randrange

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM

from trl import SFTTrainer

In [11]:
# The model that you want to train from the Hugging Face hub
model_id = "NousResearch/Llama-2-7b-hf"
# The instruction dataset to use
dataset_name = "iamtarun/python_code_instructions_18k_alpaca"
#dataset_name = "HuggingFaceH4/CodeAlpaca_20K"
# Dataset split
dataset_split= "train"
# Fine-tuned model name
new_model = "llama-2-7b-int4-python-code-20k"
# Huggingface repository
hf_model_repo="zeeshanali00/"+new_model
# Load the entire model on the GPU 0
device_map = {"": 0}

################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
use_double_nested_quant = False

################################################################################
# QLoRA parameters
################################################################################
# LoRA attention dimension
lora_r = 64
# Alpha parameter for LoRA scaling
lora_alpha = 16
# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# TrainingArguments parameters
################################################################################
# Output directory where the model predictions and checkpoints will be stored
output_dir = new_model
# Number of training epochs
num_train_epochs = 1
# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False
# Batch size per GPU for training
per_device_train_batch_size = 4
# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1 # 2
# Enable gradient checkpointing
gradient_checkpointing = True
# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3
# Initial learning rate (AdamW optimizer)
learning_rate = 2e-3 #1e-5
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Optimizer to use
optim = "paged_adamw_32bit"
# Learning rate schedule
lr_scheduler_type = "cosine" #"constant"
# Number of training steps (overrides num_train_epochs)
max_steps = -1
# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03
# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = False
# Save checkpoint every X updates steps
save_steps = 0
# Log every X updates steps
logging_steps = 25
# Disable tqdm
disable_tqdm= True

################################################################################
# SFTTrainer parameters
################################################################################
# Maximum sequence length to use
max_seq_length = 2048 #None
# Pack multiple short examples in the same input sequence to increase efficiency
packing = True #False


In [25]:
from huggingface_hub import login

import os

# Load the enviroment variables

# Login to the Hugging Face Hub
login(token='**')

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/ec2-user/.cache/huggingface/token
Login successful


In [12]:
# Load dataset from the hub
dataset = load_dataset(dataset_name, split=dataset_split)
# Show dataset size
print(f"dataset size: {len(dataset)}")
# Show an example
print(dataset[randrange(len(dataset))])


Found cached dataset parquet (/home/ec2-user/.cache/huggingface/datasets/iamtarun___parquet/iamtarun--python_code_instructions_18k_alpaca-cfc26604e43ea064/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


dataset size: 18612
{'instruction': 'Create a web-crawler in Python to extract links from a web page.', 'input': 'Web page URL: http://www.example.com', 'output': 'import urllib.request \nfrom bs4 import BeautifulSoup\n\n#Providing the url as an argument to urlopen() \nurl = "http://www.example.com"\nhtml = urllib.request.urlopen(url).read() \nsoup = BeautifulSoup(html, features = \'html.parser\') \n\n# extracting links from the web page \nlinks = [] \nfor link in soup.findAll(\'a\'): \n    links.append(link.get(\'href\')) \n\nprint(links)', 'prompt': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a web-crawler in Python to extract links from a web page.\n\n### Input:\nWeb page URL: http://www.example.com\n\n### Output:\nimport urllib.request \nfrom bs4 import BeautifulSoup\n\n#Providing the url as an argument to urlopen() \nurl = "http://www.example.com"\nhtml = urllib.request.urlopen(url).read() \

In [10]:
# Set the instruction format for iamtarun/python_code_instructions_18k_alpaca
def format_instruction(sample):
	return f"""### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:

### Task:
{sample['instruction']}

### Input:
{sample['input']}

### Response:
{sample['output']}
"""

In [11]:
# Get the type
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_double_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype
)

In [12]:
# Load the pretrained model
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, use_cache = False, device_map=device_map)
model.config.pretraining_tp = 1

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# LoRA config based on QLoRA paper
peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
)
# Not necessary when using SFTTrainer
# prepare model for training
# model = prepare_model_for_kbit_training(model)
# model = get_peft_model(model, peft_config)

In [15]:
# Define the training arguments
args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size, # 6 if use_flash_attention else 4,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    #save_steps=save_steps,
    logging_steps=logging_steps,
    save_strategy="epoch",
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    max_steps=2,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    disable_tqdm=disable_tqdm,
    report_to="tensorboard",
    seed=42
)

In [16]:
!pip3 install tensorboard

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [17]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=packing,
    formatting_func=format_instruction,
    args=args,
)



In [18]:

# train
trainer.train() # there will not be a progress bar since tqdm is disabled

# save model in local
trainer.save_model()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'train_runtime': 55.205, 'train_samples_per_second': 0.145, 'train_steps_per_second': 0.036, 'train_loss': 1.1212422847747803, 'epoch': 0.0}


In [1]:

# Empty VRAM
#del model
#del trainer
#import gc
#gc.collect()
#gc.collect()
     

In [44]:
torch.cuda.empty_cache()

In [21]:
from peft import AutoPeftModelForCausalLM
device_map = {"": 0}
model = AutoPeftModelForCausalLM.from_pretrained(
 args.output_dir,
 low_cpu_mem_usage=True,
 return_dict=True,
 torch_dtype=torch.float16,
 device_map=device_map,
)

# Merge LoRA and base model
merged_model = model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('merged_model/tokenizer_config.json',
 'merged_model/special_tokens_map.json',
 'merged_model/tokenizer.json')

In [None]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Load the enviroment variables
load_dotenv()
# Login to the Hugging Face Hub
login(token='')

In [39]:
merged_model.push_to_hub(hf_model_repo,token='**')
tokenizer.push_to_hub(hf_model_repo,token='**')

CommitInfo(commit_url='https://huggingface.co/zeeshanali00/llama-2-7b-int4-python-code-20k/commit/2da1b1921a1e9a3c0c53cb0ab36e68aec8300f90', commit_message='Upload tokenizer', commit_description='', oid='2da1b1921a1e9a3c0c53cb0ab36e68aec8300f90', pr_url=None, pr_revision=None, pr_num=None)

In [32]:
!huggingface-cli login --token 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/ec2-user/.cache/huggingface/token
Login successful


In [14]:
sample

{'instruction': "Create a class in Python that inherits properties from a parent class named 'Animal'.",
 'input': '',
 'output': 'class Animal:\n    def __init__(self, name):\n        self.name = name\n\nclass Dog(Animal):\n    def __init__(self, name, breed):\n        super().__init__(name)\n        self.breed = breed',
 'prompt': "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a class in Python that inherits properties from a parent class named 'Animal'.\n\n### Input:\n\n\n### Output:\nclass Animal:\n    def __init__(self, name):\n        self.name = name\n\nclass Dog(Animal):\n    def __init__(self, name, breed):\n        super().__init__(name)\n        self.breed = breed"}

In [3]:
#!kill -9 23456

In [2]:
#!nvidia-smi

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
device_map = {"": 0}
# Get the tokenizer
tokenizer = AutoTokenizer.from_pretrained('zeeshanali00/llama-2-7b-int4-python-code-20k')
# Load the model
model = AutoModelForCausalLM.from_pretrained('zeeshanali00/llama-2-7b-int4-python-code-20k', load_in_4bit=True,
 torch_dtype=torch.float16,
 device_map=device_map)
# Create an instruction
instruction="Optimize a code snippet written in Python. The code snippet should create a list of numbers from 0 to 10 that are divisible by 2."
input=""

prompt = f"""### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the Task.

### Task:
{instruction}

### Input:
{input}

### Response:
"""
# Tokenize the input
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
# Run the model to infere an output
outputs = model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.5)

# Print the result
#print(f"Prompt:\n{prompt}\n")
#print(f"Generated instruction:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


tensor([[    1,   835,  2799,  4080, 29901,    13, 11403,   278,  9330,  2400,
           322,   278, 10567,  2183,   304,  2436,   278, 13291, 29892,   607,
           338,   263,  8720,   775,   393,   508,  4505,   278,  9330, 29889,
            13,    13,  2277, 29937,  9330, 29901,    13, 20624,   326,   675,
           263,   775, 11534,  3971,   297,  5132, 29889,   450,   775, 11534,
           881,  1653,   263,  1051,   310,  3694,   515, 29871, 29900,   304,
         29871, 29896, 29900,   393,   526,  8572,  1821,   491, 29871, 29906,
         29889,    13,    13,  2277, 29937, 10567, 29901,    13,    13,    13,
          2277, 29937, 13291, 29901,    13,    13,    13,  2277, 29937, 10604,
         29901,    13,    13,    13,  2277, 29937,  9330, 29901,    13, 20624,
           326,   675,   263,   775, 11534,  3971,   297,  5132, 29889,   450,
           775, 11534,   881,  1653,   263,  1051,   310,  3694,   515, 29871,
         29900,   304, 29871, 29896, 29900,   393,  

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
device_map = {"": 0}
# Get the tokenizer
tokenizer = AutoTokenizer.from_pretrained('zeeshanali00/llama-2-7b-int4-python-code-20k')
# Load the model
model = AutoModelForCausalLM.from_pretrained('zeeshanali00/llama-2-7b-int4-python-code-20k', load_in_4bit=True, torch_dtype=torch.float16,
                                             device_map=device_map)

# Create an instruction
instruction="Write a Python function to display the first and last elements of a list."
input=""

prompt = f"""### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the Task.

### Task:
{instruction}

### Input:
{input}

### Response:
"""
# Tokenize the input
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
# Run the model to infere an output
outputs = model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.9)

# Print the result
print(f"Prompt:\n{prompt}\n")
print(f"Generated instruction:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Prompt:
### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the Task.

### Task:
Write a Python function to display the first and last elements of a list.

### Input:


### Response:


Generated instruction:

```python
def my_function(list_to_process):
    result_list = []
    first_element = ''
    last_element = ''

    for element in list_to_process:
        result_list.append(element)
        if element == list_to_process[-1]:
            last_element = element

    return f'{first_element} {last_element}'
```

###
