# Requires p4d/p4de

In [1]:
# Install the Hugging Face libraries this notebook imports.
# No versions are pinned here, so pip resolves whatever release is current.
%pip install transformers
%pip install peft
%pip install accelerate
# Disabled installs, kept for reference.
#%pip install bitsandbytes==0.40.2
#%pip install safetensors==0.3.1
#%pip install tokenizers==0.13.3
%pip install datasets

# Alternative pinned installs (currently disabled).
# %pip install -U transformers==4.39.0
# %pip install -U peft==0.5.0
# %pip install -U accelerate==0.26.0
# #%pip install bitsandbytes #==0.40.2
# #%pip install safetensors==0.3.1
# #%pip install tokenizers==0.13.3
# %pip install -U datasets==2.17.0
# %pip install --no-cache https://developer.download.nvidia.com/compute/redist/jp/v60dp/pytorch/torch-2.3.0a0+ebedce2.nv24.02-cp310-cp310-linux_aarch64.whl



In [2]:
import os
import argparse
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    set_seed,
    default_data_collator,
#    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset
import torch

#import bitsandbytes as bnb
#from huggingface_hub import login, HfFolder

## Fine-Tune LLaMA 7B in Amazon SageMaker Studio

In [3]:
import argparse


def _str2bool(value):
    """Convert a command-line string into a real boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because any non-empty string is truthy, so the flag could never be turned
    off from the command line.

    Args:
        value: The raw argument (a string from the command line, or a bool).

    Returns:
        ``True`` for true/t/yes/y/1, ``False`` for false/f/no/n/0 or an empty
        string (case-insensitive).

    Raises:
        argparse.ArgumentTypeError: If the value is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


parser = argparse.ArgumentParser()

# add model id and dataset path argument
parser.add_argument(
    "--model_id",
    type=str,
    default="NousResearch/Llama-2-7b-hf", # not gated
    help="Model id to use for training.",
)
parser.add_argument(
    "--dataset_path",
    type=str,
    default="lm_dataset",
    help="Path to dataset."
)
# parser.add_argument(
#     "--hf_token",
#     type=str,
#     default=HfFolder.get_token(),
#     help="Path to dataset."
# )
# add training hyperparameters for epochs, batch size, learning rate, and seed
parser.add_argument(
    "--epochs",
    type=int,
    default=1,
    help="Number of epochs to train for."
)
parser.add_argument(
    "--per_device_train_batch_size",
    type=int,
    default=1,
    help="Batch size to use for training.",
)
parser.add_argument(
    "--lr",
    type=float,
    default=5e-5,
    help="Learning rate to use for training."
)
parser.add_argument(
    "--seed",
    type=int,
    default=42,
    help="Seed to use for training."
)
# Boolean switches go through _str2bool so that "--flag False" really disables them.
parser.add_argument(
    "--gradient_checkpointing",
    type=_str2bool,
    default=True,
    help="Whether to use gradient checkpointing.",
)
parser.add_argument(
    "--bf16",
    type=_str2bool,
    default=True, # if torch.cuda.get_device_capability()[0] >= 8 else False,
    help="Whether to use bf16.",
)
# parser.add_argument(
#     "--merge_weights",
#     type=bool,
#     default=True,
#     help="Whether to merge LoRA weights with base model.",
# )
# parse_known_args ignores the extra arguments the Jupyter kernel puts in sys.argv
args, _ = parser.parse_known_args()

# if args.hf_token:
#     print(f"Logging into the Hugging Face Hub with token {args.hf_token[:10]}...")
#     login(token=args.hf_token)

In [4]:

# Show the CUDA compute capability of the current device as a (major, minor) tuple.
# NOTE(review): the disabled bf16 default in the argument cell gates on major >= 8,
# while the recorded output of this cell is (7, 5) -- confirm bf16=True suits this GPU.
torch.cuda.get_device_capability()

(7, 5)

## Load and prepare the dataset

We will use [dolly](https://huggingface.co/datasets/databricks/databricks-dolly-15k), an open source dataset of instruction-following records generated by thousands of Databricks employees in several of the behavioral categories outlined in the [InstructGPT paper](https://arxiv.org/abs/2203.02155), including brainstorming, classification, closed QA, generation, information extraction, open QA, and summarization.

```python
{
  "instruction": "What is world of warcraft",
  "context": "",
  "response": "World of warcraft is a massive online multi player role playing game. It was released in 2004 by bizarre entertainment"
}
```

To load the `databricks-dolly-15k` dataset, we use the `load_dataset()` method from the 🤗 Datasets library.

In [6]:
# Fix the RNG seed before any random sampling happens
set_seed(args.seed)

from datasets import load_dataset
from random import randrange

# Fetch the Dolly train split from the Hub and keep only its first 1000 records
dataset = load_dataset("databricks/databricks-dolly-15k", split="train").select(range(1000))

print(f"dataset size: {len(dataset)}")
# Show one randomly picked record as a sanity check
preview_index = randrange(len(dataset))
print(dataset[preview_index])
# With the 1000-row subset above this prints "dataset size: 1000"
# (an earlier note recorded 15011 for the full split).

dataset size: 1000
{'instruction': 'Give me a list of some of the most popular song from 70s Japanese Pop singer, Mariya Takeuchi', 'context': '', 'response': '1. Plastic Love\n2. Stay with Me\n3. September\n4. Miracle Love\n5. Yume No Tsuzuki', 'category': 'brainstorming'}


To instruction-tune our model, we need to convert our structured examples into a collection of tasks described via instructions. We define a formatting function, `format_dolly`, that takes a sample and returns a string in our instruction format.

In [7]:
def format_dolly(sample):
    """Render one Dolly record as a single prompt string.

    Args:
        sample: Mapping with "instruction", "context" and "response" entries.

    Returns:
        The "### Instruction", optional "### Context" and "### Answer"
        sections, separated by blank lines. The context section is left out
        when the record's context is empty.
    """
    sections = [f"### Instruction\n{sample['instruction']}"]
    # only records that actually carry context get a context section
    if len(sample["context"]) > 0:
        sections.append(f"### Context\n{sample['context']}")
    sections.append(f"### Answer\n{sample['response']}")
    return "\n\n".join(sections)


In [8]:
from random import randrange

# Pick one record at random and show it rendered with the prompt template
random_record = dataset[randrange(len(dataset))]
print(format_dolly(random_record))

### Instruction
What is the Research Collaboratory for Structural Bioinformatics Protein Data Bank (RCSB PDB)?

### Answer
The Research Collaboratory for Structural Bioinformatics Protein Data Bank (RCSB PDB) is a database that provides a wealth of information about the 3D structures of proteins, nucleic acids, and other macromolecules. The database contains experimentally determined atomic coordinates for a large number of macromolecules, which can be used to study their structures, functions, and interactions. The RCSB PDB is widely used in genomics research and drug discovery, as it provides a valuable resource for understanding the structural basis of many biological processes and for designing new drugs that target specific macromolecules.

In addition to the atomic coordinates, the RCSB PDB contains a wealth of additional information about each macromolecule, including experimental methods used for structure determination, citations to relevant scientific literature, and informat

In [9]:
from transformers import AutoTokenizer

#model_id = "meta-llama/Llama-2-13b-hf" # sharded weights, gated
# Take the checkpoint name from the parsed arguments (default
# "NousResearch/Llama-2-7b-hf", not gated) so the tokenizer always matches the
# model, which is loaded from args.model_id as well.
model_id = args.model_id
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

In [10]:
from random import randint
from itertools import chain
from functools import partial


def template_dataset(sample):
    """Attach the templated prompt to a sample as its "text" field.

    The prompt produced by format_dolly is followed directly by the
    tokenizer's EOS token. The sample is modified in place and returned.
    """
    prompt = format_dolly(sample)
    sample["text"] = f"{prompt}{tokenizer.eos_token}"
    return sample


# apply prompt template per sample; the original columns are dropped, leaving only "text"
dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))
# print random sample
# randint includes BOTH endpoints, so the upper bound has to be the last valid
# index; randint(0, len(dataset)) could index one past the end.
print(dataset[randint(0, len(dataset) - 1)]["text"])

# tokens left over from the previous batch, carried into the next call of chunk()
remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

def chunk(sample, chunk_length=2048):
    """Pack a batch of tokenized samples into fixed-length chunks.

    All sequences of the batch are concatenated per key, prefixed with the
    tokens left over from the previous call, and cut into pieces of exactly
    ``chunk_length`` tokens. Tokens that do not fill a whole chunk are stored
    in the module-level ``remainder`` for the next call.

    Args:
        sample: Batched mapping of column name to a list of token lists
            (must contain "input_ids").
        chunk_length: Number of tokens per emitted chunk.

    Returns:
        Mapping with the same keys as ``sample`` plus "labels" (a copy of the
        "input_ids" chunks). Every value is a list of chunks; the lists are
        empty when fewer than ``chunk_length`` tokens are available so far.
    """
    # define global remainder variable to save remainder from batches to use in next batch
    global remainder
    # Concatenate all texts and add remainder from previous batch
    concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}
    # .get() tolerates columns that have no carried-over tokens yet
    concatenated_examples = {
        k: remainder.get(k, []) + concatenated_examples[k] for k in concatenated_examples.keys()
    }
    # get total number of tokens for batch
    batch_total_length = len(concatenated_examples[list(sample.keys())[0]])

    # Largest multiple of chunk_length that fits into the batch. This is 0 when
    # the batch is shorter than one chunk, in which case nothing is emitted and
    # every token is carried over (previously the variable was left undefined
    # in that case and the function crashed).
    batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

    # Split by chunks of max_len.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }
    # add remainder to global variable for next batch
    remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
    # prepare labels
    result["labels"] = result["input_ids"].copy()
    return result


# Step 1: tokenize every templated sample and drop the raw "text" column
tokenized_dataset = dataset.map(
    lambda sample: tokenizer(sample["text"]),
    batched=True,
    remove_columns=list(dataset.features),
)

# Step 2: pack the token streams into fixed-length training rows
# NOTE(review): 4048 looks like a typo for 4096 (or the function default 2048) -- confirm
lm_dataset = tokenized_dataset.map(
    partial(chunk, chunk_length=4048),
    batched=True,
)

# Report how many packed rows came out
print(f"Total number of samples: {len(lm_dataset)}")

### Instruction
Extract the owner of Lamborghini and a listing of the different types of Huracan cars that Lamborghini has produced for its Motorsport division.

### Context
Automobili Lamborghini S.p.A. (Italian pronunciation: [autoˈmɔːbili lamborˈɡiːni]) is an Italian manufacturer of luxury sports cars and SUVs based in Sant'Agata Bolognese. The company is owned by the Volkswagen Group through its subsidiary Audi.

Ferruccio Lamborghini (1916–1993), an Italian manufacturing magnate, founded Automobili Ferruccio Lamborghini S.p.A. in 1963 to compete with Ferrari. The company was noted for using a rear mid-engine, rear-wheel drive layout. Lamborghini grew rapidly during its first decade, but sales plunged in the wake of the 1973 worldwide financial downturn and the oil crisis. The firm's ownership changed three times after 1973, including a bankruptcy in 1978. American Chrysler Corporation took control of Lamborghini in 1987 and sold it to Malaysian investment group Mycom Setdco and In

In [11]:
# Packing into fixed-length chunks collapses the 1000 templated samples into far
# fewer rows; print the dataset summary to see the resulting row count.
print(lm_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 53
})


In [12]:
# Quantized loading is disabled; the bnb config is kept here for reference only.
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

# Load the full causal-LM checkpoint from the Hub with bfloat16 weights
model = AutoModelForCausalLM.from_pretrained(
    args.model_id,
    torch_dtype=torch.bfloat16,
)

# PEFT wrapping is disabled as well
# model = create_peft_model(
#     model, gradient_checkpointing=args.gradient_checkpointing, bf16=args.bf16
# )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# Directory the Trainer writes its outputs and logs to
output_dir = "./tmp/llama2"

# Training configuration, driven by the parsed command-line arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    # optimisation hyperparameters
    num_train_epochs=args.epochs,
    learning_rate=args.lr,
    per_device_train_batch_size=args.per_device_train_batch_size,
    # precision / memory switches
    bf16=args.bf16,
    gradient_checkpointing=args.gradient_checkpointing,
    # log every 10 steps; no intermediate checkpoints are saved
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
)

# Wire model, packed dataset and collator into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset,
    data_collator=default_data_collator,
)

# Run the fine-tuning loop
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mgimoonnam[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.96 GiB. GPU 0 has a total capacity of 14.74 GiB of which 1.42 GiB is free. Process 147408 has 13.32 GiB memory in use. Of the allocated memory 13.18 GiB is allocated by PyTorch, and 14.76 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Directory that receives the fine-tuned weights and the tokenizer files
sagemaker_save_dir="./llama2_dolly"
# if args.merge_weights:
#     # merge adapter weights with base model and save
#     # save int 4 model
#     trainer.model.save_pretrained(output_dir, safe_serialization=False)
#     # clear memory
#     del model
#     del trainer
#     torch.cuda.empty_cache()

#     from peft import AutoPeftModelForCausalLM

#     # load PEFT model in fp16
#     model = AutoPeftModelForCausalLM.from_pretrained(
#         output_dir,
#         low_cpu_mem_usage=True,
#         torch_dtype=torch.bfloat16,
#     )
#     # Merge LoRA and base model and save
#     model = model.merge_and_unload()
#     model.save_pretrained(
#         sagemaker_save_dir, safe_serialization=True, max_shard_size="2GB"
#     )
# else:

# Write the trained weights in safetensors format
trainer.model.save_pretrained(
    sagemaker_save_dir, safe_serialization=True
)

# save tokenizer for easy inference
# Save the tokenizer instance that built the training data, so the pad_token
# configured on it is kept. Re-downloading a fresh copy would discard that
# setting and needs another Hub round trip.
tokenizer.save_pretrained(sagemaker_save_dir)