In [1]:
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import os

#  [markdown]
# ## Finetune an llm on an A100
#
# We will leverage PEFT library from Hugging Face ecosystem, as well as QLoRA for more memory efficient finetuning

# %%
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [2]:
# %%
!nvidia-smi

Wed Nov 15 00:31:46 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    49W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
#  [markdown]
# ## Setup
#
# Run the cells below to setup and install the required libraries. For our experiment we will need `accelerate`, `peft`, `transformers`, `datasets` and TRL to leverage the recent [`SFTTrainer`](https://huggingface.co/docs/trl/main/en/sft_trainer). We will use `bitsandbytes` to [quantize the base model into 4bit](https://huggingface.co/blog/4bit-transformers-bitsandbytes).

#
!pip install -q -U trl accelerate protobuf datasets bitsandbytes einops wandb sentencepiece
!pip install -q -U git+https://github.com/huggingface/peft
!pip install -q -U git+https://github.com/huggingface/transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [4]:
# %%
import torch
import pandas as pd
import tqdm
import numpy as np
import copy
from datasets import load_dataset, Dataset
from peft import LoraConfig
from trl import (
    SFTTrainer,
    DataCollatorForCompletionOnlyLM
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)



In [5]:
# [markdown]
# Let's also load the tokenizer below

# %%
tokenizer = AutoTokenizer.from_pretrained("01-ai/Yi-6B", use_fast=True, trust_remote_code=True)
tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
tokenizer.pad_token = tokenizer.unk_token
tokenizer.clean_up_tokenization_spaces = True
tokenizer.add_bos_token = True
tokenizer.add_eos_token = True
tokenizer.padding_side = "right"
tokenizer.pad_token


'<unk>'

In [6]:
tokenizer

YiTokenizer(name_or_path='01-ai/Yi-6B', vocab_size=64000, model_max_length=4096, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [7]:
# [markdown]
# ## Dataset
#
# For our experiment, we will use the `jondurbin/airoboros-2.2.1` dataset to train general purpose instruct model.
# The dataset can be found [here](https://huggingface.co/datasets/jondurbin/airoboros-2.2.1)
#

# %%
seed = 42


# %%
# grab the first 10000 entries of dolphin in an instruction format

dataset_name = "jondurbin/airoboros-2.2.1"
print(f"\nLoading {dataset_name} dataset...")
dataset_airoboros = load_dataset(dataset_name, split="train", streaming=False)
dataset_airoboros


Loading jondurbin/airoboros-2.2.1 dataset...


Dataset({
    features: ['instruction', 'response', 'system', 'skip_prompt_formatting', 'category'],
    num_rows: 42731
})

In [8]:
texts = []

for row in dataset_airoboros:
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": row["instruction"]},
        {"role": "assistant", "content": row["response"]},
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    texts.append(text)

pandas_dataset_airoboros = pd.DataFrame([texts]).T
pandas_dataset_airoboros.columns = ["text"]


In [9]:
pandas_dataset_airoboros.head(1)

Unnamed: 0,text
0,<|im_start|>system\nYou are a helpful assistan...


In [10]:
pandas_dataset_airoboros.loc[0, "text"]

"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nFive friends went to a restaurant and ordered a pizza. The pizza was cut into 8 equal slices. If each friend ate at least one slice, what is the maximum number of slices any single friend could have eaten?<|im_end|>\n<|im_start|>assistant\nThe maximum number of slices any single friend could have eaten is 4.\n\nHere's the reasoning behind this:\n\n1. We know that there are 8 slices and each of the five friends must eat at least one slice.\n2. So, let's start by giving each friend one slice. This means we've distributed 5 slices among the 5 friends.\n3. Now, we need to distribute the remaining 3 slices (since there were 8 slices in total) among the friends such that no friend eats more than the others.\n4. To achieve this, we'll give all the remaining slices to a single friend. So, one friend will now have eaten 1 + 3 = 4 slices.\n5. Therefore, the maximum number of slices any single friend could have eaten i

In [11]:
tokenizer.decode(tokenizer.encode(pandas_dataset_airoboros.loc[0, "text"]))

"<|startoftext|> <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nFive friends went to a restaurant and ordered a pizza. The pizza was cut into 8 equal slices. If each friend ate at least one slice, what is the maximum number of slices any single friend could have eaten?<|im_end|>\n<|im_start|>assistant\nThe maximum number of slices any single friend could have eaten is 4.\n\nHere's the reasoning behind this:\n\n1. We know that there are 8 slices and each of the five friends must eat at least one slice.\n2. So, let's start by giving each friend one slice. This means we've distributed 5 slices among the 5 friends.\n3. Now, we need to distribute the remaining 3 slices (since there were 8 slices in total) among the friends such that no friend eats more than the others.\n4. To achieve this, we'll give all the remaining slices to a single friend. So, one friend will now have eaten 1 + 3 = 4 slices.\n5. Therefore, the maximum number of slices any single friend co

In [12]:
tokenizer.encode("""<|im_end|>\n<|im_start|>assistant\n""")

[1, 7, 144, 6, 765, 13611, 144, 2]

In [13]:
tokenizer.encode("for too long?<|im_end|>\n<|im_start|>assistant\nAdmiring your own")

[1,
 1508,
 1528,
 1337,
 100,
 7,
 144,
 6,
 765,
 13611,
 144,
 5836,
 59583,
 4437,
 788,
 1402,
 2]

In [14]:
tokenizer.decode([7, 144, 6, 765, 13611, 144])

'<|im_end|>\n<|im_start|>assistant\n'

In [15]:
num_dropped = 0
max_num_tokens_taken = []
for i in tqdm.tqdm(range(len(pandas_dataset_airoboros))):
    row = pandas_dataset_airoboros.loc[i]
    full = row["text"]
    num_tokens = len(tokenizer.encode(full))
    if num_tokens > 400:
        pandas_dataset_airoboros.drop(i, inplace=True)
        num_dropped += 1
    else:
        max_num_tokens_taken.append(num_tokens)

num_dropped

  2%|▏         | 1061/42731 [00:05<03:06, 223.15it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (4131 > 4096). Running this sequence through the model will result in indexing errors
100%|██████████| 42731/42731 [03:14<00:00, 219.77it/s]


16642

In [16]:
pandas_dataset_airoboros.reset_index(drop=True, inplace=True)

In [17]:
max_num_chars_taken = []
for i in tqdm.tqdm(range(len(pandas_dataset_airoboros))):
    row = pandas_dataset_airoboros.loc[i]
    full = row["text"]
    max_num_chars_taken.append(len(full))

100%|██████████| 26089/26089 [00:01<00:00, 25542.18it/s]


In [18]:
#max_num_chars_taken.sort()
np.array(max_num_chars_taken)[-4]

429

In [19]:
np.max(max_num_tokens_taken)

400

In [20]:
train_dataset = Dataset.from_pandas(pandas_dataset_airoboros.loc[0:21000])

print("Train dataset:")
train_dataset = train_dataset.shuffle(seed=seed)
print(train_dataset)
print(train_dataset[0])
print(train_dataset[-1])

Train dataset:
Dataset({
    features: ['text'],
    num_rows: 21001
})
{'text': "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nThere are 4 red balls and 6 blue balls in a bag. If you pick two balls at random, what is the probability that both balls will be red?<|im_end|>\n<|im_start|>assistant\nThe probability of picking two red balls is 1/3.\n\nHere's the reasoning:\n\n1. When you first pick a ball from the bag, there are 4 red balls and 6 blue balls, making a total of 10 balls.\n2. The probability of picking a red ball on your first try is 4/10 (since there are 4 red balls out of a total of 10).\n3. After picking one red ball, there are now 3 red balls and 6 blue balls left in the bag, making a total of 9 balls.\n4. The probability of picking another red ball on your second try is 3/9 (since there are 3 red balls out of a total of 9).\n5. To find the probability of both events happening (picking two red balls), you multiply the probabilities: (4/10) *

In [21]:
eval_dataset = Dataset.from_pandas(pandas_dataset_airoboros.loc[21000:])

print("Eval dataset:")
eval_dataset = eval_dataset.shuffle(seed=seed)
print(eval_dataset)
print(eval_dataset[0])
print(eval_dataset[-1])

Eval dataset:
Dataset({
    features: ['text'],
    num_rows: 5089
})
{'text': "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nIs it true that those born in the Year of the Tiger are more courageous and adventurous?<|im_end|>\n<|im_start|>assistant\nThe claim that people born in the Year of the Tiger are more courageous and adventurous is a cultural belief, not a scientifically proven fact. There is no established correlation between Chinese zodiac signs and an individual's personality traits.<|im_end|>\n"}
{'text': "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nIf you drive at a constant speed of 60 mph for 3 hours and then increase your speed to 80 mph for another 2 hours, what is your total distance traveled?<|im_end|>\n<|im_start|>assistant\nTo calculate the total distance traveled, we need to find the distance covered during each segment of the trip and then add them together.\n\nFirst, let's find the distance covered 

In [22]:
# remove old text cols
train_dataset = train_dataset.remove_columns(
    [col for col in train_dataset.column_names if col not in ["text"]]
)

In [23]:
# [markdown]
# ## Loading the model
# [markdown]

# %%
model_name = "01-ai/Yi-6B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
model.config.use_cache = False


# %%
model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

YiForCausalLM(
  (model): YiModel(
    (embed_tokens): Embedding(64000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x YiDecoderLayer(
        (self_attn): YiAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): YiRotaryEmbedding()
        )
        (mlp): YiMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLU()
        )
        (ln1): YiRMSNorm()
        (ln2): YiRMSNorm()
      )
    )
    (norm): YiRMSNorm()
  )
  (lm_head): Linear(in_features=409

In [24]:
# [markdown]
# Below we will load the configuration file in order to create the LoRA model. According to QLoRA paper, it is important to consider all linear layers in the transformer block for maximum performance. Therefore we will add `q_proj`, `k_proj`, `v_proj`, `o_proj` layers in the target modules.

# %%
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
)

# [markdown]
# ## Loading the trainer
# [markdown]
# Here we will use the [`SFTTrainer` from TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer) that gives a wrapper around transformers `Trainer` to easily fine-tune models on instruction based datasets using PEFT adapters. Let's first load the training arguments below.

# %%
output_dir = "./results"
num_train_epochs = 3
auto_find_batch_size = True
gradient_accumulation_steps = 1
optim = "paged_adamw_32bit"
save_strategy = "epoch"
learning_rate = 3e-4
lr_scheduler_type = "cosine"
warmup_ratio = 0.03
logging_strategy = "steps"
logging_steps = 25
evaluation_strategy = "epoch"
prediction_loss_only = True
bf16 = True

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    auto_find_batch_size=auto_find_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_strategy=save_strategy,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    logging_strategy=logging_strategy,
    logging_steps=logging_steps,
    evaluation_strategy=evaluation_strategy,
    prediction_loss_only=prediction_loss_only,
    bf16=bf16,
)


In [25]:
# [markdown]
# Then finally pass everthing to the trainer

# %%
max_seq_length = 512
response_template = tokenizer.decode([7, 144, 6, 765, 13611, 144])
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
    mlm=False
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    data_collator=collator,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

# [markdown]
# We will also pre-process the model by upcasting the layer norms in float 32 for more stable training

# %%
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)


Map:   0%|          | 0/21001 [00:00<?, ? examples/s]

Map:   0%|          | 0/5089 [00:00<?, ? examples/s]

In [26]:
# [markdown]
# ## Train the model
# [markdown]
# Now let's train the model! Simply call `trainer.train()`

# %%
trainer.train()

# Syncing run comic-eon-71 to Weights & Biases (docs)
# View project at https://wandb.ai/dryanfurman/huggingface
# View run at https://wandb.ai/dryanfurman/huggingface/runs/emms03dh


[34m[1mwandb[0m: Currently logged in as: [33mdryanfurman[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.7851,0.707298
2,0.5406,0.678803
3,0.434,0.714828


TrainOutput(global_step=15753, training_loss=0.6161202359167741, metrics={'train_runtime': 9107.1837, 'train_samples_per_second': 6.918, 'train_steps_per_second': 1.73, 'total_flos': 6.64835627901911e+17, 'train_loss': 0.6161202359167741, 'epoch': 3.0})

In [27]:
model

YiForCausalLM(
  (model): YiModel(
    (embed_tokens): Embedding(64000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x YiDecoderLayer(
        (self_attn): YiAttention(
          (q_proj): Linear4bit(
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
          )
          (k_proj): Linear4bit(
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096

### Test model and push to hub

In [28]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

In [29]:
peft_model_id = "/content/results/checkpoint-10502"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [30]:
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, trust_remote_code=True)
tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
tokenizer.pad_token = tokenizer.unk_token
tokenizer.clean_up_tokenization_spaces = True
tokenizer.add_bos_token = True
tokenizer.add_eos_token = True
tokenizer.padding_side = "right"
tokenizer.pad_token

'<unk>'

In [33]:
from huggingface_hub import login
login()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [34]:
# push to hub
model_id_load = "dfurman/Yi-6B-Instruct-v0.1"

# tokenizer
tokenizer.push_to_hub(model_id_load, use_auth_token=True)
# safetensors
model.push_to_hub(model_id_load, use_auth_token=True, safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/dfurman/Yi-6B-Instruct-v0.1/commit/24b6da53d55acd6d0ee17b2e082773ace4d23a11', commit_message='Upload model', commit_description='', oid='24b6da53d55acd6d0ee17b2e082773ace4d23a11', pr_url=None, pr_revision=None, pr_num=None)

## Basic usage

In [35]:
import torch
from peft import PeftModel, PeftConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

In [36]:
peft_model_id = "dfurman/Yi-6B-Instruct-v0.1"
config = PeftConfig.from_pretrained(peft_model_id)

tokenizer = AutoTokenizer.from_pretrained(
    peft_model_id,
    use_fast=True,
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

model = PeftModel.from_pretrained(
    model,
    peft_model_id
)

Downloading (…)/adapter_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)er_model.safetensors:   0%|          | 0.00/210M [00:00<?, ?B/s]

In [62]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Tell me a recipe for a mai tai."},
]

print("\n\n*** Prompt:")
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt",
    add_generation_prompt=True,
)
print(tokenizer.decode(input_ids[0]))




*** Prompt:
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Tell me a recipe for a mai tai.<|im_end|>
<|im_start|>assistant



In [68]:
print("\n\n*** Generate:")

with torch.autocast("cuda", dtype=torch.bfloat16):
    output = model.generate(
        input_ids=input_ids.cuda(),
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.7,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=1.2,
        no_repeat_ngram_size=5,
    )

response = tokenizer.decode(
    output["sequences"][0][len(input_ids[0]):],
    skip_special_tokens=True
)
print(response)




*** Generate:
1 part Dark rum (Bacardi, Mount Gay, or Appleton Estate)
2 parts Orange curacao liqueur
3/4 oz Coconut cream
1 tsp Light agave syrup
6 dashes Angostura bitters
Garnish: Pineapple wedge and Maraschino cherry<|unused087|>



In [69]:
messages = [
    {"role": "user", "content": "Tell me a recipe for a mai tai."},
    {"role": "assistant", "content": response},
    {"role": "user", "content": "How can I make the mai tai more upscale and luxurious?"},

]

print("\n\n*** Prompt:")
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt",
    add_generation_prompt=True,
)
print(tokenizer.decode(input_ids[0]))




*** Prompt:
<|im_start|>user
Tell me a recipe for a mai tai.<|im_end|>
<|im_start|>assistant
1 part Dark rum (Bacardi, Mount Gay, or Appleton Estate)
2 parts Orange curacao liqueur
3/4 oz Coconut cream
1 tsp Light agave syrup
6 dashes Angostura bitters
Garnish: Pineapple wedge and Maraschino cherry<|unused087|>
<|im_end|>
<|im_start|>user
How can I make the mai tai more upscale and luxurious?<|im_end|>
<|im_start|>assistant



In [52]:
print("\n\n*** Generate:")
with torch.autocast("cuda", dtype=torch.bfloat16):
    output = model.generate(
        input_ids=input_ids.cuda(),
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.7,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=1.2,
        no_repeat_ngram_size=5,
    )

response = tokenizer.decode(
    output["sequences"][0][len(input_ids[0]):],
    skip_special_tokens=True
)
print(response)




*** Generate:
For an elegant touch to your Mai Tai, consider using high-quality rums such as Appleton Estate Jamaica Rum or Mount Gay Black Barrel Caribbean Blend in place of regular light or medium-bodied ones like Bacardi and Cruzan. You could also try shaking it up instead of pouring directly from the bottle - this will aerate the drink and give you a smoother taste overall. To top off your fancy version, why not garnish each serving with several small maraschino cherries? This classic ingredient adds depth to any mixed drink!<|unused094|>



In [42]:
# inference speed test

import tqdm
import time

prompt = "Write me a long list of things to do in San Francisco."

runtimes = []
for i in tqdm.tqdm(range(30)):
    start = time.time()

    messages = [
      {"role": "user", "content": prompt},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        return_tensors="pt",
    )

    with torch.autocast("cuda", dtype=torch.bfloat16):
        output = model.generate(
            input_ids=input_ids.cuda(),
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            repetition_penalty=1.2,
            no_repeat_ngram_size=5,
        )

    response = tokenizer.decode(
        output["sequences"][0][len(input_ids[0]):],
        skip_special_tokens=True
    )

    end = time.time()
    runtimes.append(end - start)


100%|██████████| 30/30 [01:29<00:00,  3.00s/it]


In [43]:
avg_runtime = torch.mean(torch.tensor(runtimes)).item()
print(f"Runtime avg in seconds: {avg_runtime}")  # time in seconds

Runtime avg in seconds: 2.995708465576172
