In [1]:
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import os

#  [markdown]
# ## Fine-tune an LLM on an A100
#
# We will leverage the PEFT library from the Hugging Face ecosystem, together with QLoRA, for more memory-efficient fine-tuning.

# %%
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [2]:
# %%
!nvidia-smi

Fri Nov 17 08:12:17 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    47W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
#  [markdown]
# ## Setup
#
# Run the cells below to setup and install the required libraries. For our experiment we will need `accelerate`, `peft`, `transformers`, `datasets` and TRL to leverage the recent [`SFTTrainer`](https://huggingface.co/docs/trl/main/en/sft_trainer). We will use `bitsandbytes` to [quantize the base model into 4bit](https://huggingface.co/blog/4bit-transformers-bitsandbytes).

#
!pip install -q -U trl accelerate protobuf datasets bitsandbytes einops wandb sentencepiece
!pip install -q -U git+https://github.com/huggingface/peft
!pip install -q -U git+https://github.com/huggingface/transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [4]:
# %%
import torch
import pandas as pd
import tqdm
import numpy as np
import copy
from datasets import load_dataset, Dataset
from peft import LoraConfig
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)



In [5]:
# [markdown]
# Let's also load the tokenizer below

# %%
# Load the base-model tokenizer and configure it for chat-style fine-tuning.
tokenizer = AutoTokenizer.from_pretrained(
    "01-ai/Yi-6B", use_fast=True, trust_remote_code=True
)
# Chat template: emits bos_token once, wraps each user message as
# "[INST] <content> [/INST] ", appends eos_token + " " after each assistant
# message, and raises if roles do not strictly alternate user/assistant
# (starting with user) or if any other role is present.
tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.clean_up_tokenization_spaces = True
# The chat template already writes bos_token itself; presumably this is turned
# off so the tokenizer does not add a second one — TODO confirm.
tokenizer.add_bos_token = False
tokenizer.padding_side = "right"
# Display the resulting pad token as the cell output.
tokenizer.pad_token

'<unk>'

In [6]:
# [markdown]
# ## Dataset
# %%
seed = 42

# %%
# Render the first 20000 SlimOrca conversations into chat-template text.

dataset_name = "Open-Orca/SlimOrca"
print(f"\nLoading {dataset_name} dataset...")
dataset_SlimOrca = load_dataset(dataset_name, split="train", streaming=True).take(
    20000
)

# SlimOrca speaker tag -> chat-template role; messages from any other speaker
# are skipped.
role_by_speaker = {"human": "user", "gpt": "assistant"}

texts = []
for row in dataset_SlimOrca:
    messages_keep = [
        {"role": role_by_speaker[message["from"]], "content": message["value"]}
        for message in row["conversations"]
        if message["from"] in role_by_speaker
    ]
    texts.append(
        tokenizer.apply_chat_template(
            messages_keep, tokenize=False, add_generation_prompt=False
        )
    )

# One-column frame holding the rendered conversations.
pandas_dataset_SlimOrca = pd.DataFrame({"text": texts})
pandas_dataset_SlimOrca


Loading Open-Orca/SlimOrca dataset...


Unnamed: 0,text
0,<|startoftext|>[INST] Write an article based o...
1,<|startoftext|>[INST] Answer the following que...
2,<|startoftext|>[INST] Produce a long descripti...
3,<|startoftext|>[INST] Write a title for this a...
4,<|startoftext|>[INST] Definition: In this task...
...,...
19995,<|startoftext|>[INST] Given the task definitio...
19996,<|startoftext|>[INST] Features: Case/Bezel Mat...
19997,<|startoftext|>[INST] Detailed Instructions: Y...
19998,<|startoftext|>[INST] Answer the following que...


In [7]:
# Render the first 20000 Open-Platypus rows into chat-template text.

dataset_name = "garage-bAInd/Open-Platypus"
print(f"\nLoading {dataset_name} dataset...")
dataset_platypus = load_dataset(dataset_name, split="train", streaming=True).take(
    20000
)

texts = []
for row in dataset_platypus:
    # Each row is a single-turn exchange: instruction -> output.
    messages_keep = [
        {"role": "user", "content": row["instruction"]},
        {"role": "assistant", "content": row["output"]},
    ]
    texts.append(
        tokenizer.apply_chat_template(
            messages_keep, tokenize=False, add_generation_prompt=False
        )
    )

# One-column frame holding the rendered conversations.
pandas_dataset_platypus = pd.DataFrame({"text": texts})
pandas_dataset_platypus


Loading garage-bAInd/Open-Platypus dataset...


Unnamed: 0,text
0,<|startoftext|>[INST] A board game spinner is ...
1,<|startoftext|>[INST] My school's math club ha...
2,<|startoftext|>[INST] How many 4-letter words ...
3,<|startoftext|>[INST] Melinda will roll two st...
4,<|startoftext|>[INST] Let $p$ be the probabili...
...,...
19995,<|startoftext|>[INST] Historian: Alexander the...
19996,<|startoftext|>[INST] Biologist: Researchers b...
19997,<|startoftext|>[INST] S: It would be premature...
19998,<|startoftext|>[INST] Although parapsychology ...


In [8]:
# Render the first 20000 airoboros-2.2.1 rows into chat-template text.

dataset_name = "jondurbin/airoboros-2.2.1"
print(f"\nLoading {dataset_name} dataset...")
dataset_airoboros = load_dataset(dataset_name, split="train", streaming=True).take(
    20000
)

texts = []
for row in dataset_airoboros:
    # Each row is a single-turn exchange: instruction -> response.
    messages_keep = [
        {"role": "user", "content": row["instruction"]},
        {"role": "assistant", "content": row["response"]},
    ]
    texts.append(
        tokenizer.apply_chat_template(
            messages_keep, tokenize=False, add_generation_prompt=False
        )
    )

# One-column frame holding the rendered conversations.
pandas_dataset_airoboros = pd.DataFrame({"text": texts})
pandas_dataset_airoboros


Loading jondurbin/airoboros-2.2.1 dataset...


Unnamed: 0,text
0,<|startoftext|>[INST] Five friends went to a r...
1,<|startoftext|>[INST] Write a Python script th...
2,<|startoftext|>[INST] How do you stay motivate...
3,<|startoftext|>[INST] Write a Python script th...
4,<|startoftext|>[INST] The Apollo astronauts br...
...,...
19995,<|startoftext|>[INST] Write a blues song about...
19996,<|startoftext|>[INST] Write a short story abou...
19997,<|startoftext|>[INST] There are three boxes la...
19998,<|startoftext|>[INST] Write a diary entry abou...


In [9]:
# Stack the three per-source frames (Platypus, SlimOrca, airoboros — in that
# order) into one training frame with a fresh 0..N-1 index.
source_frames = [
    pandas_dataset_platypus,
    pandas_dataset_SlimOrca,
    pandas_dataset_airoboros,
]
pandas_train_dataset = pd.concat(source_frames, ignore_index=True)
pandas_train_dataset

Unnamed: 0,text
0,<|startoftext|>[INST] A board game spinner is ...
1,<|startoftext|>[INST] My school's math club ha...
2,<|startoftext|>[INST] How many 4-letter words ...
3,<|startoftext|>[INST] Melinda will roll two st...
4,<|startoftext|>[INST] Let $p$ be the probabili...
...,...
59995,<|startoftext|>[INST] Write a blues song about...
59996,<|startoftext|>[INST] Write a short story abou...
59997,<|startoftext|>[INST] There are three boxes la...
59998,<|startoftext|>[INST] Write a diary entry abou...


In [10]:
# Convert the combined frame to a Hugging Face Dataset and keep only "text".
train_dataset = Dataset.from_pandas(pandas_train_dataset)
extra_columns = [name for name in train_dataset.column_names if name != "text"]
train_dataset = train_dataset.remove_columns(extra_columns)

# Show the dataset summary plus its first and last examples.
print("Final train dataset:")
for item in (train_dataset, train_dataset[0], train_dataset[-1]):
    print(item)

Final train dataset:
Dataset({
    features: ['text'],
    num_rows: 60000
})
{'text': '<|startoftext|>[INST] A board game spinner is divided into three parts labeled $A$, $B$  and $C$. The probability of the spinner landing on $A$ is $\\frac{1}{3}$ and the probability of the spinner landing on $B$ is $\\frac{5}{12}$.  What is the probability of the spinner landing on $C$? Express your answer as a common fraction. [/INST] To find the probability of the spinner landing on $C$, I need to subtract the probabilities of the spinner landing on $A$ and $B$ from $1$, since the sum of the probabilities of all possible outcomes is $1$. I can write this as an equation: $P(C) = 1 - P(A) - P(B)$. I know that $P(A) = \\frac{1}{3}$ and $P(B) = \\frac{5}{12}$, so I can plug those values into the equation and simplify. I get: $P(C) = 1 - \\frac{1}{3} - \\frac{5}{12} = \\frac{12}{12} - \\frac{4}{12} - \\frac{5}{12} = \\frac{3}{12}$. I can reduce this fraction by dividing the numerator and denominator by

In [11]:
print(tokenizer.decode(tokenizer.encode(train_dataset[-1]["text"])))

<|startoftext|> [INST] Generate a plan to answer the following query using the tools provided. Each step in the plan should correspond to a piece of evidence generated by one of the tools. 

Tools Available:
YahooSearch[input]: Uses Yahoo's search engine to gather information related to the input query.
PageLoader[input]: Loads webpage content from one or multiple URLs provided in the input.
HyperlinkFinder[input]: Extracts hyperlinks from a block of text.
ALICE[input]: An AI-driven question-answering tool. It requires a question and a context (which can be a previous #E[index]) to provide an answer.

Output Format:
Plan: [description of the first step]
#E1 = [tool to use with input]
Plan: [description of the subsequent step based on the result of #E1]
#E2 = [next tool to use with input, potentially referring to #E1]
...
Final Answer: #E[n]

Question: What were the main causes of World War I and what countries were primarily involved? [/INST] Plan: Initiate a search for information reg

In [12]:
# Tokenize every example (in batches), adding `input_ids` and `attention_mask`
# columns. No truncation is requested here, so individual rows can be longer
# than the model's maximum sequence length; the token counts produced here are
# what the length filter further down inspects.
encoded_train_dataset = train_dataset.map(
    lambda examples: tokenizer(examples["text"]), batched=True
)
encoded_train_dataset

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4561 > 4096). Running this sequence through the model will result in indexing errors


Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 60000
})

In [13]:
encoded_train_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 60000
})

In [14]:
# Record the positions of rows longer than 500 tokens (to be dropped) and the
# token counts of the rows that are kept.
rows_to_drop = []
max_num_tokens_taken = []
for i in tqdm.tqdm(range(len(pandas_train_dataset))):
    num_tokens = len(encoded_train_dataset[i]["input_ids"])
    if num_tokens <= 500:
        max_num_tokens_taken.append(num_tokens)
    else:
        rows_to_drop.append(i)
num_dropped = len(rows_to_drop)

num_dropped

100%|██████████| 60000/60000 [00:24<00:00, 2486.03it/s]


14433

In [15]:
np.max(max_num_tokens_taken)

500

In [16]:
# pandas_train_dataset

In [17]:
# Drop the over-length rows collected in `rows_to_drop` and renumber the index.
# NOTE(review): this rebinds `pandas_train_dataset`, so the cell is not
# idempotent — `rows_to_drop` holds labels from the pre-filter index, and
# running the cell a second time would fail or remove the wrong rows. Re-run
# from the concat cell if this step has to be repeated.
pandas_train_dataset = pandas_train_dataset.drop(rows_to_drop).reset_index(drop=True)

In [18]:
pandas_train_dataset

Unnamed: 0,text
0,<|startoftext|>[INST] A board game spinner is ...
1,<|startoftext|>[INST] My school's math club ha...
2,<|startoftext|>[INST] How many 4-letter words ...
3,<|startoftext|>[INST] Melinda will roll two st...
4,<|startoftext|>[INST] How many four-digit numb...
...,...
45562,<|startoftext|>[INST] What would be the ultima...
45563,<|startoftext|>[INST] Write a sonnet about the...
45564,<|startoftext|>[INST] Recommend three hiking t...
45565,<|startoftext|>[INST] There are three boxes la...


In [19]:
# Rebuild the Dataset from the filtered frame, then shuffle it with the fixed
# seed so the ordering is reproducible.
train_dataset = Dataset.from_pandas(pandas_train_dataset)

print("Train dataset:")
train_dataset = train_dataset.shuffle(seed=seed)
for item in (train_dataset, train_dataset[0], train_dataset[-1]):
    print(item)

Train dataset:
Dataset({
    features: ['text'],
    num_rows: 45567
})
{'text': '<|startoftext|>[INST] Describe your favorite childhood memory using the words/style of Michael Scott from The Office. [/INST] Oh, man. My favorite childhood memory? It\'s like that time when I was a little Michael Scott running around in my backyard with my brothers and sisters. You know, we were just playing tag, or as I liked to call it, "The Office Olympics." We would be laughing, joking, and having the best Dundie Award-winning time ever.\n\nAnd then there was this one moment - you know those moments where everything just feels so perfect, like when Pam finally kisses Jim at Casino Night? Yeah, it was like that. The sun was setting, casting this golden glow on everyone\'s faces, and for a second, we all just stopped running and looked at each other. It was like our own version of the World\'s Best Boss mug, filled to the brim with pure happiness.\n\nIn that moment, we weren\'t just kids anymore; we we

In [20]:
len(tokenizer.encode(train_dataset[100]["text"]))

305

In [21]:
# ensure none over 500 tokens

# check that above worked
# Re-tokenize the filtered dataset and report the longest remaining example.
encoded_train_dataset = train_dataset.map(
    lambda examples: tokenizer(examples["text"]), batched=True
)
lens = [len(row["input_ids"]) for row in encoded_train_dataset]
np.max(lens)

Map:   0%|          | 0/45567 [00:00<?, ? examples/s]

500

In [22]:
# remove old text cols
train_dataset = train_dataset.remove_columns(
    [col for col in train_dataset.column_names if col not in ["text"]]
)

In [23]:
# [markdown]
# ## Loading the model
# [markdown]

# %%
model_name = "01-ai/Yi-6B"

# bitsandbytes quantization settings: load weights in 4-bit using the "nf4"
# quantization type, with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base causal LM with the quantization config above.
# device_map="auto" leaves device placement to the library.
# NOTE(review): trust_remote_code=True presumably lets model code shipped with
# the checkpoint run locally — confirm the repository is trusted.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
# Turn off the config's `use_cache` flag — presumably because cached
# key/values are not needed while training; TODO confirm.
model.config.use_cache = False


# %%
model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

YiForCausalLM(
  (model): YiModel(
    (embed_tokens): Embedding(64000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x YiDecoderLayer(
        (self_attn): YiAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): YiRotaryEmbedding()
        )
        (mlp): YiMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLU()
        )
        (ln1): YiRMSNorm()
        (ln2): YiRMSNorm()
      )
    )
    (norm): YiRMSNorm()
  )
  (lm_head): Linear(in_features=409

In [24]:
# [markdown]
# Below we define the LoRA configuration used to create the adapter model. The QLoRA paper reports that adapting all linear layers in the transformer block gives the best performance. Note that this configuration targets only the attention projections (`q_proj`, `k_proj`, `v_proj`, `o_proj`); the MLP projections (`gate_proj`, `up_proj`, `down_proj`) are left unadapted.

# %%
# LoRA hyperparameters.
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

# Adapter configuration for a causal LM. Only the four attention projections
# are listed as targets; no bias parameters are trained (bias="none").
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
)

# [markdown]
# ## Loading the trainer
# [markdown]
# Here we will use the [`SFTTrainer` from TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer) that gives a wrapper around transformers `Trainer` to easily fine-tune models on instruction based datasets using PEFT adapters. Let's first load the training arguments below.

# %%
# Training hyperparameters, collected here so they are easy to tune.
output_dir = "./results"
num_train_epochs = 2
# No explicit per-device batch size is given; batch size selection is
# delegated to auto_find_batch_size.
auto_find_batch_size = True
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
# A checkpoint is written at the end of every epoch.
save_strategy = "epoch"
learning_rate = 3e-4
lr_scheduler_type = "cosine"
warmup_ratio = 0.03
# Log the training loss every 25 steps.
logging_strategy = "steps"
logging_steps = 25
# No evaluation is run during training (no eval dataset is passed either).
evaluation_strategy = "no"
bf16 = True

# Bundle the settings above into the TrainingArguments object for the trainer.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    auto_find_batch_size=auto_find_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_strategy=save_strategy,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    logging_strategy=logging_strategy,
    logging_steps=logging_steps,
    evaluation_strategy=evaluation_strategy,
    bf16=bf16,
)

In [25]:
# [markdown]
# Then finally pass everything to the trainer

# %%
# Upper bound on sequence length handed to the trainer (the dataset was
# filtered to at most 500 tokens per example earlier in the notebook).
max_seq_length = 512

# Marker that ends the user turn in the chat template. It is given to the
# completion-only collator, which presumably restricts the loss to the tokens
# that follow it — confirm against the TRL documentation.
# NOTE(review): the training log below reports instances that "will be ignored
# in loss calculation"; the affected samples show two spaces before "[/INST]",
# so this leading-space template may not tokenize identically in that
# context — verify, and consider passing token ids instead of a string.
response_template = " [/INST]"
print(f"Response template for collator: {response_template}")
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template, tokenizer=tokenizer, mlm=False
)

# Supervised fine-tuning trainer: quantized base model + LoRA config, trained
# on the "text" column of the filtered dataset.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    dataset_text_field="text",
    data_collator=collator,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

# [markdown]
# We will also pre-process the model by upcasting the layer norms to float32 for more stable training

# %%
# Upcast every normalization layer to float32.
# Matching on the attribute path alone is not enough for this architecture:
# the decoder layers name their norms `ln1` / `ln2` (see the printed model),
# so `"norm" in name` only catches the final `model.norm`. The module's class
# name is therefore checked as well (the norm class is `YiRMSNorm`).
for name, module in trainer.model.named_modules():
    if "norm" in name or "norm" in type(module).__name__.lower():
        # nn.Module.to() converts the module's parameters in place.
        module.to(torch.float32)

Response template for collator:  [/INST]


Map:   0%|          | 0/45567 [00:00<?, ? examples/s]

In [26]:
# [markdown]
# ## Train the model
# [markdown]
# Now let's train the model! Simply call `trainer.train()`

# %%
trainer.train()

# Syncing run devout-lion-96 to Weights & Biases (docs)
# View project at https://wandb.ai/dryanfurman/huggingface
# View run at https://wandb.ai/dryanfurman/huggingface/runs/q3tplqga

[34m[1mwandb[0m: Currently logged in as: [33mdryanfurman[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


Step,Training Loss
25,1.3392
50,1.3317
75,1.1441
100,1.0544
125,0.9529
150,0.9669
175,1.0073
200,0.9039
225,0.9282
250,0.8784


A. First degree murder, defined by the jurisdiction as premeditated and deliberate killing of another human being.
B. Second degree murder, defined as any murder not classified as first degree murder.
C. Voluntary manslaughter.
D. No crime.  [/INST] A<|endoftext|>  This instance will be ignored in loss calculation. Note, if this happens often, consider increasing the `max_seq_length`.
A. Valid, because aliens are not per se "a discrete and insular minority" specially protected by the Fourteenth Amendment.
B. Valid, because the line drawn by the state for extending aid was reasonably related to a legitimate state interest.
C. Invalid, because the justifications for this restriction are insufficient to overcome the burden imposed on a state when it uses such an alienage classification.
D. Invalid, because the Privileges and Immunities Clause of Article IV does not permit such an arbitrary classification.  [/INST] C<|endoftext|>  <unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk

TrainOutput(global_step=22784, training_loss=0.7231216379598285, metrics={'train_runtime': 17289.837, 'train_samples_per_second': 5.271, 'train_steps_per_second': 1.318, 'total_flos': 9.623711234391245e+17, 'train_loss': 0.7231216379598285, 'epoch': 2.0})

In [27]:
model

YiForCausalLM(
  (model): YiModel(
    (embed_tokens): Embedding(64000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x YiDecoderLayer(
        (self_attn): YiAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=512, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
 

### Test model and push to hub

In [28]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

In [29]:
# Final checkpoint written by the training run above (its step number matches
# the reported global_step).
# NOTE(review): this is an absolute path under /content; adjust it when the
# notebook runs in a different working directory.
peft_model_id = "/content/results/checkpoint-22784"
config = PeftConfig.from_pretrained(peft_model_id)
# Reload the base model named in the adapter config, this time in bfloat16
# without a quantization config.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [30]:
# [markdown]
# Let's also load the tokenizer below

# %%
# Recreate the tokenizer with the same settings used for training so that the
# copy pushed to the hub carries the chat template and padding configuration.
tokenizer = AutoTokenizer.from_pretrained(
    "01-ai/Yi-6B", use_fast=True, trust_remote_code=True
)
# Chat template: emits bos_token once, wraps each user message as
# "[INST] <content> [/INST] ", appends eos_token + " " after each assistant
# message, and raises on non-alternating or unsupported roles.
tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.clean_up_tokenization_spaces = True
tokenizer.add_bos_token = False
tokenizer.padding_side = "right"
# Display the resulting pad token as the cell output.
tokenizer.pad_token

'<unk>'

In [31]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook. Read it from the HF_TOKEN
# environment variable instead; when the variable is unset, token=None makes
# login() fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [32]:
# Push the tokenizer and the LoRA adapter to the hub repository.
model_id_load = "dfurman/Yi-6B-Instruct-v0.1"

# `token=True` reuses the credentials stored by the login step; it replaces
# the deprecated `use_auth_token=True` argument.
# tokenizer
tokenizer.push_to_hub(model_id_load, token=True)
# adapter weights, serialized as safetensors
model.push_to_hub(model_id_load, token=True, safe_serialization=True)



adapter_model.safetensors:   0%|          | 0.00/105M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dfurman/Yi-6B-Instruct-v0.1/commit/780359854038fff3f61f3629b833f686609e0b3b', commit_message='Upload model', commit_description='', oid='780359854038fff3f61f3629b833f686609e0b3b', pr_url=None, pr_revision=None, pr_num=None)

## Basic usage

In [None]:
!pip install -q -U transformers peft torch accelerate einops sentencepiece

In [33]:
import torch
from peft import PeftModel, PeftConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

In [34]:
# Hub repository that holds the adapter and tokenizer pushed above.
peft_model_id = "dfurman/Yi-6B-Instruct-v0.1"
config = PeftConfig.from_pretrained(peft_model_id)

# The tokenizer is loaded from the adapter repository, not from the base
# model's repository.
tokenizer = AutoTokenizer.from_pretrained(
    peft_model_id,
    use_fast=True,
    trust_remote_code=True,
)

# Base model named in the adapter config, loaded in bfloat16.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Attach the adapter weights to the base model.
model = PeftModel.from_pretrained(model, peft_model_id)

(…)ct-v0.1/resolve/main/adapter_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

(…)-v0.1/resolve/main/tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

(…)0.1/resolve/main/special_tokens_map.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/105M [00:00<?, ?B/s]

In [35]:
messages = [
    {"role": "user", "content": "Tell me a recipe for a mai tai."},
]

print("\n\n*** Prompt:")
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt",
    add_generation_prompt=True,
)
print(tokenizer.decode(input_ids[0]))



*** Prompt:
<|startoftext|>[INST] Tell me a recipe for a mai tai. [/INST] 


In [36]:
print("\n\n*** Generate:")
# Number of prompt tokens; used below to cut the prompt off the output.
prompt_length = len(input_ids[0])
with torch.autocast("cuda", dtype=torch.bfloat16):
    output = model.generate(
        input_ids=input_ids.cuda(),
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.01,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=1.2,
        no_repeat_ngram_size=5,
        # num_beams=4,
        # length_penalty=-1,
    )

# Decode only the newly generated tokens of the first returned sequence.
generated_tokens = output["sequences"][0][prompt_length:]
response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
print(response)



*** Generate:
1 oz dark rum
1 oz white rum
3/4 oz fresh lime juice (from about one medium-sized lime)
3/4 tsp simple syrup or honey
2 dashes Angostura bitters
Shake with ice and strain into an old fashioned glass filled with crushed ice, garnish with pineapple wedge


In [37]:
messages = [
    {"role": "user", "content": "Tell me a recipe for a mai tai."},
    {"role": "assistant", "content": response},
    {
        "role": "user",
        "content": "How can I make the mai tai more upscale and luxurious?",
    },
]

print("\n\n*** Prompt:")
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt",
)
print(tokenizer.decode(input_ids[0]))



*** Prompt:
<|startoftext|> [INST] Tell me a recipe for a mai tai. [/INST] 1 oz dark rum
1 oz white rum
3/4 oz fresh lime juice (from about one medium-sized lime)
3/4 tsp simple syrup or honey
2 dashes Angostura bitters
Shake with ice and strain into an old fashioned glass filled with crushed ice, garnish with pineapple wedge<|endoftext|> [INST] How can I make the mai tai more upscale and luxurious? [/INST] 


In [38]:
print("\n\n*** Generate:")
# Number of prompt tokens; used below to cut the prompt off the output.
prompt_length = len(input_ids[0])
with torch.autocast("cuda", dtype=torch.bfloat16):
    # Sampling combined with 4 beams and a negative length penalty.
    output = model.generate(
        input_ids=input_ids.cuda(),
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.7,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=1.2,
        no_repeat_ngram_size=5,
        num_beams=4,
        length_penalty=-1,
    )

# Decode only the newly generated tokens of the first returned sequence.
generated_tokens = output["sequences"][0][prompt_length:]
response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
print(response)



*** Generate:
01. Choose high-quality ingredients: Use premium aged rum, fresh lime juice, and high-quality simple syrup or honey.

02. Garnish with fresh fruit: Garnish your mai tai with a slice of fresh pineapple, orange, or other tropical fruit.

03. Add a splash of grenadine: Stir in a few drops of grenadine syrup to add a touch of sweetness and color.

04. Serve in a crystal glass: Use a clear or colored crystal cocktail glass to elevate the presentation.

05. Garnish with a maraschino cherry: Garnish the top of the drink with a marachino cherry for a classic touch.

06. Serve with a twist: Garnish the rim of the glass with a twist of fresh lime or orange peel for a citrusy aroma.

07. Add a dash of bitters: Stir a few drops of Angostura or other bitters into the drink for an extra kick of flavor.

08. Serve on a coaster: Place the drink on a coaster to protect the glass from scratches and fingerprints.

09. Serve with a garnish tray: Set out a tray of fresh fruits, nuts, or oth

In [39]:
# inference speed test

import tqdm
import time

prompt = "Write me a long list of things to do in San Francisco."

# Measure end-to-end latency (chat templating + generation + decoding) over
# 30 runs. time.perf_counter() is used instead of time.time() because it is a
# monotonic, high-resolution clock meant for timing intervals; time.time() can
# jump when the system clock is adjusted.
runtimes = []
for i in tqdm.tqdm(range(30)):
    start = time.perf_counter()

    messages = [
        {"role": "user", "content": prompt},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        return_tensors="pt",
    )

    with torch.autocast("cuda", dtype=torch.bfloat16):
        output = model.generate(
            input_ids=input_ids.cuda(),
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            repetition_penalty=1.2,
            no_repeat_ngram_size=5,
            num_beams=4,
            length_penalty=-1,
        )

    # Decoding is kept inside the timed region so the measurement covers the
    # full request, as in the original benchmark.
    response = tokenizer.decode(
        output["sequences"][0][len(input_ids[0]) :], skip_special_tokens=True
    )

    end = time.perf_counter()
    runtimes.append(end - start)

100%|██████████| 30/30 [01:47<00:00,  3.57s/it]


In [40]:
# Average the per-run wall-clock times (in seconds) and report the result.
runtime_tensor = torch.tensor(runtimes)
avg_runtime = runtime_tensor.mean().item()
print(f"Runtime avg in seconds: {avg_runtime}")  # time in seconds

Runtime avg in seconds: 3.5694236755371094


In [41]:
messages = [
    {
        "role": "user",
        "content": "Write me a list of ten things to do in San Francisco.",
    },
]

print("\n\n*** Prompt:")
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt",
    add_generation_prompt=True,
)
print(tokenizer.decode(input_ids[0]))



*** Prompt:
<|startoftext|>[INST] Write me a list of ten things to do in San Francisco. [/INST] 


In [42]:
print("\n\n*** Generate:")
# Number of prompt tokens; used below to cut the prompt off the output.
prompt_length = len(input_ids[0])
with torch.autocast("cuda", dtype=torch.bfloat16):
    output = model.generate(
        input_ids=input_ids.cuda(),
        max_new_tokens=1024 * 2,
        do_sample=True,
        temperature=0.01,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        # repetition_penalty=1.2,
        # no_repeat_ngram_size=5,
        # num_beams=4,
        # length_penalty=-1,
    )

# Decode only the newly generated tokens of the first returned sequence.
generated_tokens = output["sequences"][0][prompt_length:]
response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
print(response)



*** Generate:
1. Visit Golden Gate Park
2. Take a cable car ride
3. Climb the stairs of the Golden Gate Bridge
4. Visit Alcatraz Island
5. Explore Fisherman's Wharf
6. Visit the Palace of Fine Arts
7. Check out the Haight-Ashbury neighborhood
8. Visit the Exploratorium
9. Take a ferry ride across the bay
10. Enjoy a meal at one of the many great restaurants


In [43]:
messages = [
    {
        "role": "user",
        "content": "The bakers at the Beverly Hills Bakery baked 200 loaves of bread on Monday morning. They sold 93 loaves in the morning and 39 loaves in the afternoon. A grocery store then returned 6 unsold loaves back to the bakery. How many loaves of bread did the bakery have left?",
    },
]

print("\n\n*** Prompt:")
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt",
    add_generation_prompt=True,
)
print(tokenizer.decode(input_ids[0]))



*** Prompt:
<|startoftext|>[INST] The bakers at the Beverly Hills Bakery baked 200 loaves of bread on Monday morning. They sold 93 loaves in the morning and 39 loaves in the afternoon. A grocery store then returned 6 unsold loaves back to the bakery. How many loaves of bread did the bakery have left? [/INST] 


In [44]:
print("\n\n*** Generate:")
# Number of prompt tokens; used below to cut the prompt off the output.
prompt_length = len(input_ids[0])
with torch.autocast("cuda", dtype=torch.bfloat16):
    output = model.generate(
        input_ids=input_ids.cuda(),
        max_new_tokens=1024 * 2,
        do_sample=True,
        temperature=0.01,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        # repetition_penalty=1.2,
        # no_repeat_ngram_size=5,
        # num_beams=4,
        # length_penalty=-1,
    )

# Decode only the newly generated tokens of the first returned sequence.
generated_tokens = output["sequences"][0][prompt_length:]
response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
print(response)



*** Generate:
102
