In [1]:
!pip install transformers==4.38.1 datasets==2.17.1 peft==0.8.2 bitsandbytes==0.42.0 trl==0.7.11 --upgrade --quiet

In [2]:
from datasets import load_dataset
from random import randrange

# Number of samples kept for local testing of the fine-tuning code.
# Set to None to train on the full dataset.
SAMPLE_LIMIT = 20

# Load dataset from the hub
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
if SAMPLE_LIMIT is not None:
    # Clamp the limit so the selection never asks for more rows than exist.
    dataset = dataset.select(range(min(SAMPLE_LIMIT, len(dataset))))

# Show the resulting size and one randomly chosen record as a sanity check.
print(f"dataset size: {len(dataset)}")
print(dataset[randrange(len(dataset))])

dataset size: 20
{'instruction': 'If I have more pieces at the time of stalemate, have I won?', 'context': 'Stalemate is a situation in chess where the player whose turn it is to move is not in check and has no legal move. Stalemate results in a draw. During the endgame, stalemate is a resource that can enable the player with the inferior position to draw the game rather than lose. In more complex positions, stalemate is much rarer, usually taking the form of a swindle that succeeds only if the superior side is inattentive.[citation needed] Stalemate is also a common theme in endgame studies and other chess problems.\n\nThe outcome of a stalemate was standardized as a draw in the 19th century. Before this standardization, its treatment varied widely, including being deemed a win for the stalemating player, a half-win for that player, or a loss for that player; not being permitted; and resulting in the stalemated player missing a turn. Stalemate rules vary in other games of the chess fa

In [3]:
# Write the (sampled) dataset to a local directory so it can be read back from disk
# by the training runs below.
local_dataset_dir = "./dataset/dolly.hf"
dataset.save_to_disk(local_dataset_dir)

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
!python ./training/train.py --dataset_path "./dataset/dolly.hf" --model_save_path "./results" --job_output_path "./results" --epochs 1

./dataset/dolly.hf
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:03<00:00,  1.93s/it]
{'loss': 1.1948, 'grad_norm': 0.7085636258125305, 'learning_rate': 5e-05, 'epoch': 5.0}
 25%|██████████▌                               | 25/100 [03:02<08:30,  6.81s/it]Checkpoint destination directory ./results/checkpoint-25 already exists and is non-empty. Saving will proceed but saved results may be invalid.
{'loss': 0.2819, 'grad_norm': 1.24049711227417, 'learning_rate': 5e-05, 'epoch': 10.0}
 50%|█████████████████████                     | 50/100 [06:07<05:01,  6.03s/it]Checkpoint destination directory ./results/checkpoint-50 already exists and is non-empty. Saving will proceed but saved results may be invalid.
 60%|█████████████████████████▏                | 60/100 [07:22<04:14,  6.37s/it]

### Local test with the SageMaker SDK

In [3]:
import sagemaker
import boto3
# Bootstrap session; it is only needed here to look up the default bucket.
sess = sagemaker.Session()

# Bucket SageMaker uses for uploading data, models and logs (SageMaker creates it
# automatically if it does not exist). Leave it as None to use the default bucket.
sagemaker_session_bucket = None
use_default_bucket = sagemaker_session_bucket is None and sess is not None
if use_default_bucket:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role. If get_execution_role() raises ValueError, the role
# is looked up by its name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Final session, bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::707684582322:role/service-role/AmazonSageMaker-ExecutionRole-20191024T163188
sagemaker session region: us-east-1


In [9]:
# Settings for the local SageMaker SDK test run.
training_input_path = "file://./dataset"  # local directory passed to the job as its training input
instance_type = "local_gpu"  # instance type used for the training job
model_id = "mistralai/Mistral-7B-Instruct-v0.1"  # model id, forwarded to the job as a hyperparameter

In [10]:
import time
from sagemaker.huggingface import HuggingFace
from huggingface_hub import HfFolder

# define Training Job Name
# SageMaker training job names accept only alphanumeric characters and hyphens,
# so every other character of the model id (the "/" separator, but also the "."
# in a version suffix such as "v0.1") is mapped to "-".
job_name_suffix = "".join(
    ch if ch.isascii() and ch.isalnum() else "-" for ch in model_id.lower()
)
job_name = f"huggingface-qlora-{job_name_suffix}"

# Hyperparameters that are passed into the training job.
hyperparameters = {
    # pre-trained model
    "model_id": model_id,
    # path where SageMaker will place the training dataset
    "dataset_path": "/opt/ml/input/data/training/dolly.hf",
    # number of training epochs
    "epochs": 1,
    # batch size for training
    "per_device_train_batch_size": 1,
    # learning rate used during training
    "lr": 2e-4,
    # whether to merge LoRA into the model (needs more memory)
    "merge_weights": True,
}
# Metric definitions: one name/regex pair per value to pull out of the training
# log lines, e.g. {'loss': 1.1948, ..., 'epoch': 5.0}. Each regex captures the number.
metric = [
    {"Name": metric_name, "Regex": pattern}
    for metric_name, pattern in (
        ("loss", r"'loss':\s*([0-9.]+)"),
        ("epoch", r"'epoch':\s*([0-9.]+)"),
    )
]
# Create the Estimator that describes the training job.
# NOTE(review): the notebook pins transformers==4.38.1 locally while the job is
# requested with transformers 4.28 -- confirm the training code works with (or
# upgrades) the version used in the training job.
huggingface_estimator = HuggingFace(
    # -- training code and its inputs --
    source_dir="training",  # directory which includes all the files needed for training
    entry_point="train.py",  # train script
    hyperparameters=hyperparameters,  # the hyperparameters passed to the training job
    metric_definitions=metric,
    environment={"HUGGINGFACE_HUB_CACHE": "/tmp/.cache"},  # set env variable to cache models in /tmp
    # -- job identity and resources --
    base_job_name=job_name,  # the name of the training job
    role=role,  # IAM role used in the training job to access AWS resources, e.g. S3
    instance_type=instance_type,  # instance type used for the training job
    instance_count=1,  # the number of instances used for training
    volume_size=300,  # the size of the EBS volume in GB
    # -- framework versions used in the training job --
    transformers_version="4.28",
    pytorch_version="2.0",
    py_version="py310",
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [None]:
# Inputs for the training job: the "training" entry points at the dataset location
# held in training_input_path.
data = {"training": training_input_path}

# Start the training job with that input and wait for it to finish.
huggingface_estimator.fit(inputs=data, wait=True)