In [1]:
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import os

#  [markdown]
# ## Finetune an llm on an A100
#
# We will leverage PEFT library from Hugging Face ecosystem, as well as QLoRA for more memory efficient finetuning

# %%
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [2]:
# %%
!nvidia-smi

Tue Nov 14 20:25:09 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    47W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
#  [markdown]
# ## Setup
#
# Run the cells below to setup and install the required libraries. For our experiment we will need `accelerate`, `peft`, `transformers`, `datasets` and TRL to leverage the recent [`SFTTrainer`](https://huggingface.co/docs/trl/main/en/sft_trainer). We will use `bitsandbytes` to [quantize the base model into 4bit](https://huggingface.co/blog/4bit-transformers-bitsandbytes).

#
!pip install -q -U trl accelerate protobuf datasets bitsandbytes einops wandb sentencepiece
!pip install -q -U git+https://github.com/huggingface/peft
!pip install -q -U git+https://github.com/huggingface/transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [4]:
# %%
import torch
import pandas as pd
import tqdm
import numpy as np
import copy
from datasets import load_dataset, Dataset
from peft import LoraConfig
from trl import (
    SFTTrainer,
    DataCollatorForCompletionOnlyLM
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)



In [5]:
# [markdown]
# Let's also load the tokenizer below

# %%
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=True, trust_remote_code=True)
tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
tokenizer.pad_token = tokenizer.unk_token
tokenizer.clean_up_tokenization_spaces = True
tokenizer.add_bos_token = False
tokenizer.padding_side = "right"
tokenizer.pad_token


Downloading (…)okenizer_config.json:   0%|          | 0.00/966 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

'<unk>'

In [6]:
tokenizer

LlamaTokenizerFast(name_or_path='mistralai/Mistral-7B-v0.1', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [7]:
# [markdown]
# ## Dataset
#
# For our experiment, we will use the `jondurbin/airoboros-2.2.1` dataset to train general purpose instruct model.
# The dataset can be found [here](https://huggingface.co/datasets/jondurbin/airoboros-2.2.1)
#

# %%
seed = 42


# %%
dataset_name = "jondurbin/airoboros-2.2.1"
print(f"\nLoading {dataset_name} dataset...")
dataset_airoboros = load_dataset(dataset_name, split="train", streaming=False)
dataset_airoboros


Loading jondurbin/airoboros-2.2.1 dataset...


Dataset({
    features: ['instruction', 'response', 'system', 'skip_prompt_formatting', 'category'],
    num_rows: 42731
})

In [8]:
texts = []

for row in dataset_airoboros:
    messages = [
        {"role": "user", "content": row["instruction"]},
        {"role": "assistant", "content": row["response"]},
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    texts.append(text)

pandas_dataset_airoboros = pd.DataFrame([texts]).T
pandas_dataset_airoboros.columns = ["text"]


In [9]:
pandas_dataset_airoboros.head(1)

Unnamed: 0,text
0,<s>[INST] Five friends went to a restaurant an...


In [10]:
pandas_dataset_airoboros.loc[0, "text"]

"<s>[INST] Five friends went to a restaurant and ordered a pizza. The pizza was cut into 8 equal slices. If each friend ate at least one slice, what is the maximum number of slices any single friend could have eaten? [/INST] The maximum number of slices any single friend could have eaten is 4.\n\nHere's the reasoning behind this:\n\n1. We know that there are 8 slices and each of the five friends must eat at least one slice.\n2. So, let's start by giving each friend one slice. This means we've distributed 5 slices among the 5 friends.\n3. Now, we need to distribute the remaining 3 slices (since there were 8 slices in total) among the friends such that no friend eats more than the others.\n4. To achieve this, we'll give all the remaining slices to a single friend. So, one friend will now have eaten 1 + 3 = 4 slices.\n5. Therefore, the maximum number of slices any single friend could have eaten is 4.</s> "

In [11]:
tokenizer.decode(tokenizer.encode(pandas_dataset_airoboros.loc[0, "text"]))

"<s> [INST] Five friends went to a restaurant and ordered a pizza. The pizza was cut into 8 equal slices. If each friend ate at least one slice, what is the maximum number of slices any single friend could have eaten? [/INST] The maximum number of slices any single friend could have eaten is 4.\n\nHere's the reasoning behind this:\n\n1. We know that there are 8 slices and each of the five friends must eat at least one slice.\n2. So, let's start by giving each friend one slice. This means we've distributed 5 slices among the 5 friends.\n3. Now, we need to distribute the remaining 3 slices (since there were 8 slices in total) among the friends such that no friend eats more than the others.\n4. To achieve this, we'll give all the remaining slices to a single friend. So, one friend will now have eaten 1 + 3 = 4 slices.\n5. Therefore, the maximum number of slices any single friend could have eaten is 4.</s>  "

In [12]:
tokenizer.encode("[INST]")

[733, 16289, 28793]

In [13]:
tokenizer.encode("[/INST]")

[733, 28748, 16289, 28793]

In [14]:
num_dropped = 0
max_num_tokens_taken = []
for i in tqdm.tqdm(range(len(pandas_dataset_airoboros))):
    row = pandas_dataset_airoboros.loc[i]
    full = row["text"]
    num_tokens = len(tokenizer.encode(full))
    if num_tokens > 400:
        pandas_dataset_airoboros.drop(i, inplace=True)
        num_dropped += 1
    else:
        max_num_tokens_taken.append(num_tokens)

num_dropped

100%|██████████| 42731/42731 [01:54<00:00, 374.73it/s]


16440

In [15]:
pandas_dataset_airoboros.reset_index(drop=True, inplace=True)

In [16]:
max_num_chars_taken = []
for i in tqdm.tqdm(range(len(pandas_dataset_airoboros))):
    row = pandas_dataset_airoboros.loc[i]
    full = row["text"]
    max_num_chars_taken.append(len(full))

100%|██████████| 26291/26291 [00:01<00:00, 26194.34it/s]


In [17]:
#max_num_chars_taken.sort()
np.array(max_num_chars_taken)[-4]

334

In [18]:
np.max(max_num_tokens_taken)

400

In [19]:
train_dataset = Dataset.from_pandas(pandas_dataset_airoboros.loc[0:21000])

print("Train dataset:")
train_dataset = train_dataset.shuffle(seed=seed)
print(train_dataset)
print(train_dataset[0])
print(train_dataset[-1])

Train dataset:
Dataset({
    features: ['text'],
    num_rows: 21001
})
{'text': "<s>[INST] Do you know any jokes about Halloween? [/INST] Scary Halloween Joke **Person 1:** Knock knock! **Person 2:** Who's there? **Person 1:** A GHOST!!!</s> "}
{'text': '<s>[INST] Are there any books that are banned in the US? [/INST] The United States has a strong tradition of freedom of speech and expression, so it does not typically ban books outright. However, certain books may be challenged or banned in specific localities, schools, or libraries due to their content. These decisions are usually made by local school boards or library systems, and can be controversial. The American Library Association maintains a list of frequently challenged books. Some examples include "To Kill a Mockingbird" by Harper Lee, "The Catcher in the Rye" by J.D. Salinger, and "Of Mice and Men" by John Steinbeck.</s> '}


In [20]:
eval_dataset = Dataset.from_pandas(pandas_dataset_airoboros.loc[21000:])

print("Eval dataset:")
eval_dataset = eval_dataset.shuffle(seed=seed)
print(eval_dataset)
print(eval_dataset[0])
print(eval_dataset[-1])

Eval dataset:
Dataset({
    features: ['text'],
    num_rows: 5291
})
{'text': '<s>[INST] Generate a sentence with each word starting from y and going in reverse order till \'a\'. [/INST] "Yesterday, xenophobic wolves viciously undermined timid squirrels, rapidly questioning peaceful owls, noting many lamenting kangaroos justifying intense hibernation; gorillas fearfully echoed, \'dangerous creatures bring absolute anarchy.\'"</s> '}
{'text': '<s>[INST] Write a Python function that receives a list of words and returns the length of the longest one. [/INST] Here is a simple Python function that does this:\n\n```python\ndef longest_word_length(word_list):\n    return max(len(word) for word in word_list)\n```\n\nThis function uses a generator expression to create a sequence of lengths of the words in `word_list`, and then returns the maximum value from this sequence. If `word_list` is empty, the function will raise a `ValueError`.</s> '}


In [21]:
# remove old text cols
train_dataset = train_dataset.remove_columns(
    [col for col in train_dataset.column_names if col not in ["text"]]
)

In [22]:
# [markdown]
# ## Loading the model
# [markdown]

# %%
model_name = "mistralai/Mistral-7B-v0.1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
model.config.use_cache = False


# %%
model

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
   

In [23]:
# [markdown]
# Below we will load the configuration file in order to create the LoRA model. According to QLoRA paper, it is important to consider all linear layers in the transformer block for maximum performance. Therefore we will add `q_proj`, `k_proj`, `v_proj`, `o_proj` layers in the target modules.

# %%
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
)

# [markdown]
# ## Loading the trainer
# [markdown]
# Here we will use the [`SFTTrainer` from TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer) that gives a wrapper around transformers `Trainer` to easily fine-tune models on instruction based datasets using PEFT adapters. Let's first load the training arguments below.

# %%
output_dir = "./results"
num_train_epochs = 3
auto_find_batch_size = True
gradient_accumulation_steps = 1
optim = "paged_adamw_32bit"
save_strategy = "epoch"
learning_rate = 3e-4
lr_scheduler_type = "cosine"
warmup_ratio = 0.03
logging_strategy = "steps"
logging_steps = 25
evaluation_strategy = "epoch"
prediction_loss_only = True
bf16 = True

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    auto_find_batch_size=auto_find_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_strategy=save_strategy,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    logging_strategy=logging_strategy,
    logging_steps=logging_steps,
    evaluation_strategy=evaluation_strategy,
    prediction_loss_only=prediction_loss_only,
    bf16=bf16,
)


In [24]:
# [markdown]
# Then finally pass everthing to the trainer

# %%
max_seq_length = 512
response_template = "[/INST]"
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
    mlm=False
)


trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    data_collator=collator,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

# [markdown]
# We will also pre-process the model by upcasting the layer norms in float 32 for more stable training

# %%
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)


Map:   0%|          | 0/21001 [00:00<?, ? examples/s]

Map:   0%|          | 0/5291 [00:00<?, ? examples/s]

In [25]:
# [markdown]
# ## Train the model
# [markdown]
# Now let's train the model! Simply call `trainer.train()`

# %%
trainer.train()

# Syncing run tough-cherry-67 to Weights & Biases (docs)
# View project at https://wandb.ai/dryanfurman/huggingface
# View run at https://wandb.ai/dryanfurman/huggingface/runs/9su15naj

[34m[1mwandb[0m: Currently logged in as: [33mdryanfurman[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.5823,0.58567
2,0.4018,0.554091
3,0.2556,0.61241


TrainOutput(global_step=15753, training_loss=0.4402689083730547, metrics={'train_runtime': 9721.1934, 'train_samples_per_second': 6.481, 'train_steps_per_second': 1.62, 'total_flos': 7.99918676288766e+17, 'train_loss': 0.4402689083730547, 'epoch': 3.0})

In [26]:
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
          )
          (k_proj): Linear4bit(
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features

In [27]:
#trainer.save_model("/content/results/runs/weights/")

### Test model and push to hub

In [33]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

In [34]:
peft_model_id = "/content/results/checkpoint-10502"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [35]:
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, trust_remote_code=True)
tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
tokenizer.pad_token = tokenizer.unk_token
tokenizer.clean_up_tokenization_spaces = True
tokenizer.add_bos_token = False
tokenizer.padding_side = "right"
tokenizer.pad_token

'<unk>'

In [36]:
from huggingface_hub import login
login("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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [37]:
# push to hub
model_id_load = "dfurman/Mistral-7B-Instruct-v0.1"

# tokenizer
tokenizer.push_to_hub(model_id_load, use_auth_token=True)
# safetensors
model.push_to_hub(model_id_load, use_auth_token=True, safe_serialization=True)

adapter_model.safetensors:   0%|          | 0.00/218M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dfurman/Mistral-7B-Instruct-v0.1/commit/4dc89e544aac32a95ce5c28c55074bb4d9a84bfa', commit_message='Upload model', commit_description='', oid='4dc89e544aac32a95ce5c28c55074bb4d9a84bfa', pr_url=None, pr_revision=None, pr_num=None)

## Basic usage

In [4]:
!pip install -q -U transformers peft torch accelerate einops sentencepiece

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/174.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m112.6/174.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
import torch
from peft import PeftModel, PeftConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

In [8]:
peft_model_id = "dfurman/Mistral-7B-Instruct-v0.1"
config = PeftConfig.from_pretrained(peft_model_id)

tokenizer = AutoTokenizer.from_pretrained(
    peft_model_id,
    use_fast=True,
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

model = PeftModel.from_pretrained(
    model,
    peft_model_id
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)er_model.safetensors:   0%|          | 0.00/218M [00:00<?, ?B/s]

In [19]:
messages = [
    {"role": "user", "content": "Tell me a recipe for a mai tai."},
]

print("\n\n*** Prompt:")
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt",
)
print(tokenizer.decode(input_ids[0]))




*** Prompt:
<s> [INST] Tell me a recipe for a mai tai. [/INST] 


In [20]:

print("\n\n*** Generate:")
with torch.autocast("cuda", dtype=torch.bfloat16):
    output = model.generate(
        input_ids=input_ids.cuda(),
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.7,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=1.2,
        no_repeat_ngram_size=5,
    )

response = tokenizer.decode(
    output["sequences"][0][len(input_ids[0]):],
    skip_special_tokens=True
)
print(response)




*** Generate:
1 oz Rum
1 oz Brandy (or Cognac)
0.5 oz Lime Juice
2 tsp Orgeat Syrup*
3-4 Dashes Angostura Bitters**

Shake all ingredients with ice and strain into a glass over fresh ice. Garnish with lime wedges, mint sprig or cherry.

*Orgeat syrup is an almond syrup commonly used in cocktails. If unavailable, you can substitute it with simple syrup.
**Angostura bitters are essential to the taste of a Mai Tai.


In [23]:
response = """1 oz light rum
½ oz dark rum
¼ oz orange curaçao
2 oz pineapple juice
¾ oz lime juice
Dash of orgeat syrup (optional)
Splash of grenadine (for garnish, optional)
Lime wheel and cherry garnishes (optional)

Shake all ingredients except the splash of grenadine in a cocktail shaker over ice. Strain into an old-fashioned glass filled with fresh ice cubes. Gently pour the splash of grenadine down the side of the glass so that it sinks to the bottom. Add garnishes as desired."""

messages = [
    {"role": "user", "content": "Tell me a recipe for a mai tai."},
    {"role": "assistant", "content": response},
    {"role": "user", "content": "How can I make it more upscale and luxurious?"},

]

print("\n\n*** Prompt:")
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt",
)
print(tokenizer.decode(input_ids[0]))




*** Prompt:
<s> [INST] Tell me a recipe for a mai tai. [/INST] 1 oz light rum
½ oz dark rum
¼ oz orange curaçao
2 oz pineapple juice
¾ oz lime juice
Dash of orgeat syrup (optional)
Splash of grenadine (for garnish, optional)
Lime wheel and cherry garnishes (optional)

Shake all ingredients except the splash of grenadine in a cocktail shaker over ice. Strain into an old-fashioned glass filled with fresh ice cubes. Gently pour the splash of grenadine down the side of the glass so that it sinks to the bottom. Add garnishes as desired.</s>  [INST] How can I make it more upscale and luxurious? [/INST] 


In [24]:

print("\n\n*** Generate:")
with torch.autocast("cuda", dtype=torch.bfloat16):
    output = model.generate(
        input_ids=input_ids.cuda(),
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.7,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=1.2,
        no_repeat_ngram_size=5,
    )

response = tokenizer.decode(
    output["sequences"][0][len(input_ids[0]):],
    skip_special_tokens=True
)
print(response)




*** Generate:
To elevate your Mai Tai, consider using premium rums like Appleton Estate or Zacapa instead of standard brands. You could also add a dash of bitters to enhance the flavors. For even more sophistication, try infusing the simple syrup used in this recipe by combining sugar, water, and any herbs or spices you desire; let steep overnight before straining and adding it to your drink. Finally, garnishing with fresh fruits such as slices of mango or peach will take your Mai Tai to new heights of refinement.


In [30]:
# inference speed test

import tqdm
import time

prompt = "Write me a long list of things to do in San Francisco."

runtimes = []
for i in tqdm.tqdm(range(30)):
    start = time.time()

    messages = [
      {"role": "user", "content": prompt},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        return_tensors="pt",
    )

    with torch.autocast("cuda", dtype=torch.bfloat16):
        output = model.generate(
            input_ids=input_ids.cuda(),
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            repetition_penalty=1.2,
            no_repeat_ngram_size=5,
        )

    response = tokenizer.decode(
        output["sequences"][0][len(input_ids[0]):],
        skip_special_tokens=True
    )

    end = time.time()
    runtimes.append(end - start)


100%|██████████| 30/30 [01:43<00:00,  3.44s/it]


In [31]:
avg_runtime = torch.mean(torch.tensor(runtimes)).item()
print(f"Runtime avg in seconds: {avg_runtime}")  # time in seconds

Runtime avg in seconds: 3.4430274963378906
