## Intro
The goal of this notebook is to fine-tune an MPT model (`mosaicml/mpt-7b-instruct`, a model already trained on instructions) on a very small Hugging Face prompt dataset.

In [1]:
!pip install transformers einops xformers datasets accelerate deepspeed

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m82.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xformers
  Downloading xformers-0.0.20-cp310-cp310-manylinux2014_x86_64.whl (109.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.w

In [2]:
import os
import transformers
import torch
from datasets import load_dataset

## Set up DeepSpeed parameters

In [3]:

# Environment variables describing a single-process "distributed" run
# (one rank, world size of one) on the local machine.
os.environ.update(
    {
        "MASTER_ADDR": "localhost",
        "MASTER_PORT": "9994",  # modify if RuntimeError: Address already in use
        "RANK": "0",
        "LOCAL_RANK": "0",
        "WORLD_SIZE": "1",
    }
)

In [4]:
# DeepSpeed configuration passed to `TrainingArguments(deepspeed=...)`.
zero_config = dict(
    # ZeRO stage 2 with the optimizer offloaded to (pinned) CPU memory.
    zero_optimization=dict(
        stage=2,
        offload_optimizer=dict(device="cpu", pin_memory=True),
        allgather_partitions=True,
        allgather_bucket_size=5e8,
        overlap_comm=True,
        reduce_scatter=True,
        reduce_bucket_size=5e8,
        contiguous_gradients=True,
    ),
    # Values left as "auto" are not fixed here.
    # NOTE(review): presumably filled in from the TrainingArguments by the Trainer — confirm.
    optimizer=dict(
        type="AdamW",
        params=dict(
            lr="auto",
            betas="auto",
            eps="auto",
            weight_decay="auto",
            torch_adam=True,
        ),
    ),
    train_batch_size="auto",
)

## Get a small dataset

In [5]:
# Fetch the instruction dataset from the Hugging Face hub.
# The recorded output shows it exposes a single `test` split (327 rows).
dataset = load_dataset(path="HuggingFaceH4/instruction-dataset")
dataset

Downloading readme:   0%|          | 0.00/199 [00:00<?, ?B/s]

Downloading and preparing dataset json/HuggingFaceH4--instruction-dataset to /root/.cache/huggingface/datasets/HuggingFaceH4___json/HuggingFaceH4--instruction-dataset-2c1af235d4fbca41/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/461k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/HuggingFaceH4___json/HuggingFaceH4--instruction-dataset-2c1af235d4fbca41/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['prompt', 'completion', 'meta'],
        num_rows: 327
    })
})

## Tokenize the dataset with the MPT model's tokenizer

In [6]:
from transformers import AutoTokenizer
# Load the GPT-NeoX-20B tokenizer, capping sequences at 1000 tokens.
tokenizer = AutoTokenizer.from_pretrained(
    "EleutherAI/gpt-neox-20b",
    model_max_length=1000,
)

# The tokenizer has no padding token out of the box; register one so that
# batches can be padded.
# NOTE(review): this adds a new token id — confirm the model's embedding
# matrix is large enough to cover it.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(dict(pad_token='[PAD]'))

Downloading (…)okenizer_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


In [7]:
# convert the new data into tokens

def to_tokens(tokenizer: transformers.models.gpt_neox.GPTNeoXTokenizerFast) -> callable:
    """
        Build a batch-encoding function bound to `tokenizer`.

        The returned closure takes a batch `x` with `prompt` and `completion`
        columns and returns the tokenizer's encoding for it, so it can be
        handed to a dataset's `map(..., batched=True)`.
    """

    def encode_batch(batch) -> transformers.tokenization_utils_base.BatchEncoding:
        """ Encode the `prompt`/`completion` columns of `batch` as paired inputs. """
        # Truncation and padding are both enabled; results come back as PyTorch tensors.
        encode_options = {"return_tensors": "pt", "truncation": True, "padding": True}
        return tokenizer(batch["prompt"], batch["completion"], **encode_options)

    return encode_batch

# Bind the tokenizer, then encode the dataset in batches and drop the raw text columns.
h4_to_tokens = to_tokens(tokenizer)
tokenized_dataset = dataset.map(
    function=h4_to_tokens,
    batched=True,
    remove_columns=["prompt", "completion"],
)

Map:   0%|          | 0/327 [00:00<?, ? examples/s]

## Download the pre-trained mpt model

In [None]:
# Configure the training run, then download the pre-trained MPT model.

checkpoint_name = "test-trainer"
local_checkpoint_path = "/content/checkpoints"

training_args = transformers.TrainingArguments(
    output_dir=local_checkpoint_path,
    num_train_epochs=3,  # default number of epochs to train is 3
    per_device_train_batch_size=8,
    deepspeed=zero_config,  # add the deepspeed configuration
    report_to=["tensorboard"],
)

name = 'mosaicml/mpt-7b-instruct'

# Load the model's own config first so the sequence limit can be overridden
# before the weights are instantiated.
config = transformers.AutoConfig.from_pretrained(name, trust_remote_code=True)
config.max_seq_len = 1000  # cap (input + output) tokens at 1000, matching the tokenizer's model_max_length

# trust_remote_code is passed because the model's code is downloaded from the hub repo.
model = transformers.AutoModelForCausalLM.from_pretrained(
    name,
    config=config,
    trust_remote_code=True,
)


[2023-06-19 23:06:15,914] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-06-19 23:06:17,730] [INFO] [comm.py:594:init_distributed] cdb=None
[2023-06-19 23:06:17,731] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading (…)configuration_mpt.py:   0%|          | 0.00/9.20k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- configuration_mpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)main/modeling_mpt.py:   0%|          | 0.00/18.9k [00:00<?, ?B/s]

Downloading (…)refixlm_converter.py:   0%|          | 0.00/27.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- hf_prefixlm_converter.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)in/param_init_fns.py:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading (…)resolve/main/norm.py:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- norm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- param_init_fns.py
- norm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)solve/main/blocks.py:   0%|          | 0.00/2.55k [00:00<?, ?B/s]

Downloading (…)ve/main/attention.py:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Downloading (…)flash_attn_triton.py:   0%|          | 0.00/28.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- flash_attn_triton.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- attention.py
- flash_attn_triton.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- blocks.py
- attention.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)meta_init_context.py:   0%|          | 0.00/3.64k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- meta_init_context.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)n/adapt_tokenizer.py:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- adapt_tokenizer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)/custom_embedding.py:   0%|          | 0.00/305 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- custom_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- modeling_mpt.py
- hf_prefixlm_converter.py
- param_init_fns.py
- blocks.py
- meta_init_context.py
- adapt_tokenizer.py
- custom_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)model.bin.index.json:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

## Set up the data collator and trainer

In [None]:
# The instruction dataset only ships a `test` split, so indexing "train"
# raises a KeyError. When no train split exists, carve a reproducible
# train / held-out split out of the `test` data instead.
if "train" in tokenized_dataset:
    train_split = tokenized_dataset["train"]
    eval_split = tokenized_dataset["test"]
else:
    resplit = tokenized_dataset["test"].train_test_split(test_size=0.1, seed=42)
    train_split = resplit["train"]
    eval_split = resplit["test"]

# Causal-LM collator (mlm=False): pads each batch and derives `labels` from
# `input_ids`, which the model needs in order to return a training loss.
# A padding-only collator would leave the batches without labels.
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = transformers.Trainer(
    model,
    training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# TensorBoard is pointed at the `runs` folder inside the checkpoint directory.
tensorboard_display_dir = "{}/runs".format(local_checkpoint_path)

In [None]:
# Load the TensorBoard notebook extension and open it on `tensorboard_display_dir`.
%load_ext tensorboard
%tensorboard --logdir '{tensorboard_display_dir}'

## Train and save the model

In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()

# Persist the trained model and the trainer state.
# NOTE(review): no path is given, so these presumably write to the
# TrainingArguments output directory — confirm.
trainer.save_model()
trainer.save_state()

In [None]:
# Create the export directory at the same relative path that save_model
# writes to (and that the model is later reloaded from). The previous
# absolute `/content/final` did not match that path. exist_ok keeps
# re-running this cell from failing when the directory already exists.
os.makedirs("content/final", exist_ok=True)

trainer.save_model("content/final")

In [None]:
# Reload the fine-tuned checkpoint. trust_remote_code=True mirrors the
# initial model load: the MPT architecture is provided as custom code, and
# the auto classes refuse to execute it unless this flag is set.
fine_tuned_model = transformers.AutoModelForCausalLM.from_pretrained(
    "content/final",
    trust_remote_code=True,
)

## Make an inference based on a structured prompt

In [None]:
# prompt template

INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

# One section per line, ending with a newline after the response key.
# The `{instruction}` placeholder is kept so the template can be filled per prompt.
PROMPT_FOR_GENERATION_FORMAT = "\n".join(
    [
        INTRO_BLURB,
        INSTRUCTION_KEY,
        "{instruction}",
        RESPONSE_KEY,
        "",
    ]
)

example = "James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. How many total meters does he run a week? Explain before answering."
fmt_ex = PROMPT_FOR_GENERATION_FORMAT.format(instruction=example)

In [None]:
from transformers import pipeline

# Wrap the fine-tuned model and tokenizer in a text-generation pipeline on the first GPU.
pipe = pipeline(
    task='text-generation',
    model=fine_tuned_model,
    tokenizer=tokenizer,
    device='cuda:0',
)

# Sampling-based generation of up to 50 new tokens, run under bfloat16 autocast.
generation_options = dict(max_new_tokens=50, do_sample=True, use_cache=True)
with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
    result = pipe(fmt_ex, **generation_options)

In [None]:
# Show the raw pipeline output for the formatted example prompt.
print(result)