In [1]:
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import os

#  [markdown]
# ## Finetune an llm on an A100
#
# We will leverage PEFT library from Hugging Face ecosystem, as well as QLoRA for more memory efficient finetuning

# %%
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [2]:
# %%
!nvidia-smi

Fri Nov 17 07:49:07 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
#  [markdown]
# ## Setup
#
# Run the cells below to setup and install the required libraries. For our experiment we will need `accelerate`, `peft`, `transformers`, `datasets` and TRL to leverage the recent [`SFTTrainer`](https://huggingface.co/docs/trl/main/en/sft_trainer). We will use `bitsandbytes` to [quantize the base model into 4bit](https://huggingface.co/blog/4bit-transformers-bitsandbytes).

#
!pip install -q -U trl accelerate protobuf datasets bitsandbytes einops wandb sentencepiece
!pip install -q -U git+https://github.com/huggingface/peft
!pip install -q -U git+https://github.com/huggingface/transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/133.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m133.1/133.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.9/133.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━

In [3]:
# %%
import torch
import pandas as pd
import tqdm
import numpy as np
import copy
from datasets import load_dataset, Dataset
from peft import LoraConfig
from trl import (
    SFTTrainer,
    DataCollatorForCompletionOnlyLM
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)



In [8]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'stor

In [4]:
# [markdown]
# Let's also load the tokenizer below

# %%
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-hf", use_fast=True, trust_remote_code=True)
tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
tokenizer.pad_token = tokenizer.unk_token
tokenizer.clean_up_tokenization_spaces = True
tokenizer.add_bos_token = False
tokenizer.padding_side = "right"
tokenizer.pad_token

(…)3b-hf/resolve/main/tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

(…)ama-2-13b-hf/resolve/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

(…)-hf/resolve/main/special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

'<unk>'

In [5]:
# [markdown]
# ## Dataset
# %%
seed = 42

# %%
# grab the first 20000 entries of SlimOrca in an instruction format

dataset_name = "Open-Orca/SlimOrca"
print(f"\nLoading {dataset_name} dataset...")
dataset_SlimOrca = load_dataset(dataset_name, split="train", streaming=True)

dataset_SlimOrca = dataset_SlimOrca.take(20000)
texts = []

for row in dataset_SlimOrca:
    messages_keep = []
    for message in row["conversations"]:
        if message["from"] == "human":
            messages_keep.append({"role": "user", "content": message["value"]})
        if message["from"] == "gpt":
            messages_keep.append({"role": "assistant", "content": message["value"]})

    text = tokenizer.apply_chat_template(messages_keep, tokenize=False, add_generation_prompt=False)
    texts.append(text)

pandas_dataset_SlimOrca = pd.DataFrame([texts]).T
pandas_dataset_SlimOrca.columns = ["text"]
pandas_dataset_SlimOrca


Loading Open-Orca/SlimOrca dataset...


Unnamed: 0,text
0,"<s>[INST] Write an article based on this ""A ma..."
1,<s>[INST] Answer the following question: - num...
2,<s>[INST] Produce a long descriptive sentence ...
3,<s>[INST] Write a title for this article:\n\nA...
4,"<s>[INST] Definition: In this task, you are gi..."
...,...
19995,"<s>[INST] Given the task definition and input,..."
19996,<s>[INST] Features: Case/Bezel Material Resin ...
19997,<s>[INST] Detailed Instructions: You are given...
19998,<s>[INST] Answer the following question: Proce...


In [6]:
# grab first 20000 rows of platypus in an instruction format

dataset_name = "garage-bAInd/Open-Platypus"
print(f"\nLoading {dataset_name} dataset...")
dataset_platypus = load_dataset(dataset_name, split="train", streaming=True)

dataset_platypus = dataset_platypus.take(20000)
texts = []

for row in dataset_platypus:
    messages_keep = []
    messages_keep.append({"role": "user", "content": row["instruction"]})
    messages_keep.append({"role": "assistant", "content": row["output"]})
    text = tokenizer.apply_chat_template(messages_keep, tokenize=False, add_generation_prompt=False)
    texts.append(text)

pandas_dataset_platypus = pd.DataFrame([texts]).T
pandas_dataset_platypus.columns = ["text"]
pandas_dataset_platypus


Loading garage-bAInd/Open-Platypus dataset...


Unnamed: 0,text
0,<s>[INST] A board game spinner is divided into...
1,<s>[INST] My school's math club has 6 boys and...
2,<s>[INST] How many 4-letter words with at leas...
3,<s>[INST] Melinda will roll two standard six-s...
4,"<s>[INST] Let $p$ be the probability that, in ..."
...,...
19995,<s>[INST] Historian: Alexander the Great shoul...
19996,<s>[INST] Biologist: Researchers believe that ...
19997,<s>[INST] S: It would be premature to act to h...
19998,<s>[INST] Although parapsychology is often con...


In [7]:
# grab first 20000 rows of platypus in an instruction format

dataset_name = "jondurbin/airoboros-2.2.1"
print(f"\nLoading {dataset_name} dataset...")
dataset_airoboros = load_dataset(dataset_name, split="train", streaming=True)

dataset_airoboros = dataset_airoboros.take(20000)
texts = []

for row in dataset_airoboros:
    messages_keep = []
    messages_keep.append({"role": "user", "content": row["instruction"]})
    messages_keep.append({"role": "assistant", "content": row["response"]})
    text = tokenizer.apply_chat_template(messages_keep, tokenize=False, add_generation_prompt=False)
    texts.append(text)

pandas_dataset_airoboros = pd.DataFrame([texts]).T
pandas_dataset_airoboros.columns = ["text"]
pandas_dataset_airoboros


Loading jondurbin/airoboros-2.2.1 dataset...


Unnamed: 0,text
0,<s>[INST] Five friends went to a restaurant an...
1,<s>[INST] Write a Python script that connects ...
2,<s>[INST] How do you stay motivated? Respond u...
3,<s>[INST] Write a Python script that takes a u...
4,<s>[INST] The Apollo astronauts brought back s...
...,...
19995,<s>[INST] Write a blues song about a man who l...
19996,<s>[INST] Write a short story about a time-tra...
19997,"<s>[INST] There are three boxes labeled A, B, ..."
19998,<s>[INST] Write a diary entry about a day at s...


In [8]:
pandas_train_dataset = pd.concat([pandas_dataset_platypus, pandas_dataset_SlimOrca, pandas_dataset_airoboros]).reset_index(drop=True)
pandas_train_dataset

Unnamed: 0,text
0,<s>[INST] A board game spinner is divided into...
1,<s>[INST] My school's math club has 6 boys and...
2,<s>[INST] How many 4-letter words with at leas...
3,<s>[INST] Melinda will roll two standard six-s...
4,"<s>[INST] Let $p$ be the probability that, in ..."
...,...
59995,<s>[INST] Write a blues song about a man who l...
59996,<s>[INST] Write a short story about a time-tra...
59997,"<s>[INST] There are three boxes labeled A, B, ..."
59998,<s>[INST] Write a diary entry about a day at s...


In [9]:
train_dataset = Dataset.from_pandas(pandas_train_dataset)
train_dataset

# remove old text cols
train_dataset = train_dataset.remove_columns(
    [col for col in train_dataset.column_names if col not in ["text"]]
)

print("Final train dataset:")
print(train_dataset)
print(train_dataset[0])
print(train_dataset[-1])


Final train dataset:
Dataset({
    features: ['text'],
    num_rows: 60000
})
{'text': '<s>[INST] A board game spinner is divided into three parts labeled $A$, $B$  and $C$. The probability of the spinner landing on $A$ is $\\frac{1}{3}$ and the probability of the spinner landing on $B$ is $\\frac{5}{12}$.  What is the probability of the spinner landing on $C$? Express your answer as a common fraction. [/INST] To find the probability of the spinner landing on $C$, I need to subtract the probabilities of the spinner landing on $A$ and $B$ from $1$, since the sum of the probabilities of all possible outcomes is $1$. I can write this as an equation: $P(C) = 1 - P(A) - P(B)$. I know that $P(A) = \\frac{1}{3}$ and $P(B) = \\frac{5}{12}$, so I can plug those values into the equation and simplify. I get: $P(C) = 1 - \\frac{1}{3} - \\frac{5}{12} = \\frac{12}{12} - \\frac{4}{12} - \\frac{5}{12} = \\frac{3}{12}$. I can reduce this fraction by dividing the numerator and denominator by $3$, and I 

In [10]:
print(tokenizer.decode(tokenizer.encode(train_dataset[-1]["text"])))

<s> [INST] Generate a plan to answer the following query using the tools provided. Each step in the plan should correspond to a piece of evidence generated by one of the tools. 

Tools Available:
YahooSearch[input]: Uses Yahoo's search engine to gather information related to the input query.
PageLoader[input]: Loads webpage content from one or multiple URLs provided in the input.
HyperlinkFinder[input]: Extracts hyperlinks from a block of text.
ALICE[input]: An AI-driven question-answering tool. It requires a question and a context (which can be a previous #E[index]) to provide an answer.

Output Format:
Plan: [description of the first step]
#E1 = [tool to use with input]
Plan: [description of the subsequent step based on the result of #E1]
#E2 = [next tool to use with input, potentially referring to #E1]
...
Final Answer: #E[n]

Question: What were the main causes of World War I and what countries were primarily involved? [/INST] Plan: Initiate a search for information regarding the m

In [11]:
encoded_train_dataset = train_dataset.map(lambda examples: tokenizer(examples['text']), batched=True)
encoded_train_dataset

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 60000
})

In [12]:
encoded_train_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 60000
})

In [13]:
num_dropped = 0
rows_to_drop = []
max_num_tokens_taken = []
for i in tqdm.tqdm(range(len(pandas_train_dataset))):
    row = encoded_train_dataset[i]
    num_tokens = len(row["input_ids"])
    if num_tokens > 500:
        rows_to_drop.append(i)
        num_dropped += 1
    else:
        max_num_tokens_taken.append(num_tokens)

num_dropped

100%|██████████| 60000/60000 [00:24<00:00, 2458.00it/s]


15258

In [14]:
np.max(max_num_tokens_taken)

500

In [15]:
#pandas_train_dataset

In [16]:
pandas_train_dataset = pandas_train_dataset.drop(rows_to_drop).reset_index(drop=True)

In [17]:
pandas_train_dataset

Unnamed: 0,text
0,<s>[INST] A board game spinner is divided into...
1,<s>[INST] My school's math club has 6 boys and...
2,<s>[INST] How many 4-letter words with at leas...
3,<s>[INST] Melinda will roll two standard six-s...
4,<s>[INST] How many four-digit numbers greater ...
...,...
44737,<s>[INST] What would be the ultimate vacation?...
44738,<s>[INST] Write a sonnet about the moon with e...
44739,<s>[INST] Recommend three hiking trails suitab...
44740,"<s>[INST] There are three boxes labeled A, B, ..."


In [18]:
train_dataset = Dataset.from_pandas(pandas_train_dataset)

print("Train dataset:")
train_dataset = train_dataset.shuffle(seed=seed)
print(train_dataset)
print(train_dataset[0])
print(train_dataset[-1])

Train dataset:
Dataset({
    features: ['text'],
    num_rows: 44742
})
{'text': '<s>[INST] This boulder is volcanic in origin and yet the rest of the rock in this area is sedimentary. Since this area was covered by southward-moving glaciers during the last ice age, this boulder was probably deposited here, hundreds of miles from its geological birthplace, by a glacier. Which one of the following, if true, most seriously undermines the conclusion drawn in the argument above?\nA: Most boulders that have been moved by glaciers have not been moved more than 100 miles.\nB: The closest geological source of volcanic rock is 50 miles north of this boulder.\nC: The closest geological source of volcanic rock is 50 miles south of this boulder.\nD: There are no geological sources of volcanic rock north of this boulder. [/INST] D</s> '}
{'text': '<s>[INST] Pops, owner of a street corner candy store, heard several gunshots and rushed out in the street. He found Velma lying on the sidewalk, bleeding

In [19]:
len(tokenizer.encode(train_dataset[100]["text"]))

436

In [20]:
# ensure none over 500 tokens

# check that above worked
lens=[]
encoded_train_dataset = train_dataset.map(lambda examples: tokenizer(examples['text']), batched=True)
for row in encoded_train_dataset:
    lens.append(len(row["input_ids"]))
np.max(lens)

Map:   0%|          | 0/44742 [00:00<?, ? examples/s]

500

In [21]:
# remove old text cols
train_dataset = train_dataset.remove_columns(
    [col for col in train_dataset.column_names if col not in ["text"]]
)

In [22]:
# [markdown]
# ## Loading the model
# [markdown]

# %%
model_name = "meta-llama/Llama-2-13b-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
model.config.use_cache = False


# %%
model

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

(…)b-hf/resolve/main/generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear4bit(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRM

In [23]:
# [markdown]
# Below we will load the configuration file in order to create the LoRA model. According to QLoRA paper, it is important to consider all linear layers in the transformer block for maximum performance. Therefore we will add `q_proj`, `k_proj`, `v_proj`, `o_proj` layers in the target modules.

# %%
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
)

# [markdown]
# ## Loading the trainer
# [markdown]
# Here we will use the [`SFTTrainer` from TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer) that gives a wrapper around transformers `Trainer` to easily fine-tune models on instruction based datasets using PEFT adapters. Let's first load the training arguments below.

# %%
output_dir = "./results"
num_train_epochs = 2
auto_find_batch_size = True
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
save_strategy = "epoch"
learning_rate = 3e-4
lr_scheduler_type = "cosine"
warmup_ratio = 0.03
logging_strategy = "steps"
logging_steps = 25
evaluation_strategy = "no"
bf16 = True

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    auto_find_batch_size=auto_find_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_strategy=save_strategy,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    logging_strategy=logging_strategy,
    logging_steps=logging_steps,
    evaluation_strategy=evaluation_strategy,
    bf16=bf16,
)


In [24]:
# [markdown]
# Then finally pass everthing to the trainer

# %%
max_seq_length = 512

response_template = "[/INST]"
print(f"Response template for collator: {response_template}")
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
    mlm=False
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    dataset_text_field="text",
    data_collator=collator,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

# [markdown]
# We will also pre-process the model by upcasting the layer norms in float 32 for more stable training

# %%
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)


Response template for collator: [/INST]


Map:   0%|          | 0/44742 [00:00<?, ? examples/s]

In [25]:
# [markdown]
# ## Train the model
# [markdown]
# Now let's train the model! Simply call `trainer.train()`

# %%
trainer.train()

# Syncing run light-hill-94 to Weights & Biases (docs)
# View project at https://wandb.ai/dryanfurman/huggingface
# View run at https://wandb.ai/dryanfurman/huggingface/runs/93c5regw

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,2.3098
50,1.7322
75,1.6757
100,1.6234
125,1.2405
150,1.4353
175,1.4774
200,1.8882
225,1.283
250,1.2709


Step,Training Loss
25,2.3098
50,1.7322
75,1.6757
100,1.6234
125,1.2405
150,1.4353
175,1.4774
200,1.8882
225,1.283
250,1.2709


TrainOutput(global_step=22370, training_loss=0.6394541248872336, metrics={'train_runtime': 29523.563, 'train_samples_per_second': 3.031, 'train_steps_per_second': 0.758, 'total_flos': 2.1081912976084378e+18, 'train_loss': 0.6394541248872336, 'epoch': 2.0})

In [26]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=5120, out_features=5120, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=5120, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=5120, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=5120, out_features=5120, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
   

### Test model and push to hub

In [1]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
peft_model_id = "/content/results/checkpoint-22370"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# [markdown]
# Let's also load the tokenizer below

# %%
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-hf", use_fast=True, trust_remote_code=True)
tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
tokenizer.pad_token = tokenizer.unk_token
tokenizer.clean_up_tokenization_spaces = True
tokenizer.add_bos_token = False
tokenizer.padding_side = "right"
tokenizer.pad_token

'<unk>'

In [4]:
from huggingface_hub import login
login("")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
# push to hub
model_id_load = "dfurman/Llama-2-13B-Instruct-v0.2"

# tokenizer
tokenizer.push_to_hub(model_id_load, use_auth_token=True)
# safetensors
model.push_to_hub(model_id_load, use_auth_token=True, safe_serialization=True)



adapter_model.safetensors:   0%|          | 0.00/210M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dfurman/Llama-2-13B-Instruct-v0.1/commit/7e4f067516f092b078170af248c9d823a8118938', commit_message='Upload model', commit_description='', oid='7e4f067516f092b078170af248c9d823a8118938', pr_url=None, pr_revision=None, pr_num=None)

## Basic usage

In [None]:
!pip install -q -U transformers peft torch accelerate einops sentencepiece

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers imhport (
    AutoModelForCausalLM,
    AutoTokenizer,
)

In [None]:
peft_model_id = "dfurman/Mistral-7B-Instruct-v0.2"
config = PeftConfig.from_pretrained(peft_model_id)

tokenizer = AutoTokenizer.from_pretrained(
    peft_model_id,
    use_fast=True,
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

model = PeftModel.from_pretrained(
    model,
    peft_model_id
)

(…)ct-v0.2/resolve/main/adapter_config.json:   0%|          | 0.00/520 [00:00<?, ?B/s]

(…)-v0.2/resolve/main/tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

(…)nstruct-v0.2/resolve/main/tokenizer.json:   0%|          | 0.00/1.79M [00:00<?, ?B/s]

(…)0.2/resolve/main/special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

(…)Mistral-7B-v0.1/resolve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

(…)esolve/main/pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

(…)v0.1/resolve/main/generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/109M [00:00<?, ?B/s]

In [6]:
messages = [
    {"role": "user", "content": "Tell me a recipe for a mai tai."},
]

print("\n\n*** Prompt:")
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt",
    add_generation_prompt=True,
)
print(tokenizer.decode(input_ids[0]))




*** Prompt:
<s> [INST] Tell me a recipe for a mai tai. [/INST] 


In [7]:
print("\n\n*** Generate:")
with torch.autocast("cuda", dtype=torch.bfloat16):
    output = model.generate(
        input_ids=input_ids.cuda(),
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.01,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=1.2,
        no_repeat_ngram_size=5,
        #num_beams=4,
        #length_penalty=-1,
    )

response = tokenizer.decode(
    output["sequences"][0][len(input_ids[0]):],
    skip_special_tokens=True
)
print(response)



*** Generate:
1 oz Orange Curaçao
4 oz Dark Rum (Gosling's is recommended)
2 oz Light Rum (Bacardi or Captain Morgan are good choices)
0.5 oz Orgeat Syrup
0.5-1 oz Fresh Lime Juice
Shake all ingredients with ice and strain into an old fashioned glass filled with crushed ice. Garnish with a slice of pineapple, maraschino cherry, and an orange wedge.


In [8]:
messages = [
    {"role": "user", "content": "Tell me a recipe for a mai tai."},
    {"role": "assistant", "content": response},
    {"role": "user", "content": "How can I make the mai tai more upscale and luxurious?"},
]

print("\n\n*** Prompt:")
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt",
)
print(tokenizer.decode(input_ids[0]))



*** Prompt:
<s> [INST] Tell me a recipe for a mai tai. [/INST] 1 oz Orange Curaçao
4 oz Dark Rum (Gosling's is recommended)
2 oz Light Rum (Bacardi or Captain Morgan are good choices)
0.5 oz Orgeat Syrup
0.5-1 oz Fresh Lime Juice
Shake all ingredients with ice and strain into an old fashioned glass filled with crushed ice. Garnish with a slice of pineapple, maraschino cherry, and an orange wedge.</s>  [INST] How can I make the mai tai more upscale and luxurious? [/INST] 


In [9]:

print("\n\n*** Generate:")
with torch.autocast("cuda", dtype=torch.bfloat16):
    output = model.generate(
        input_ids=input_ids.cuda(),
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.7,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=1.2,
        no_repeat_ngram_size=5,
        num_beams=4,
        length_penalty=-1,
    )

response = tokenizer.decode(
    output["sequences"][0][len(input_ids[0]):],
    skip_special_tokens=True
)
print(response)




*** Generate:
To make the mai tait more upscale, you can use higher quality ingredients. For example, you can use a premium dark rum like Appleton Estate or El Dorado instead of Gosling's. You can also use a high-end orange liqueur like Grand Marnier or Cointreau instead of Orange Curaçao. Additionally, you can use freshly squeezed lime juice instead of the bottled version. Finally, you can serve the mai tai in a nice glass, such as a coupe or a champagne flute, to elevate the presentation.


In [10]:
# inference speed test

import tqdm
import time

prompt = "Write me a long list of things to do in San Francisco."

runtimes = []
for i in tqdm.tqdm(range(30)):
    start = time.time()

    messages = [
      {"role": "user", "content": prompt},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        return_tensors="pt",
    )

    with torch.autocast("cuda", dtype=torch.bfloat16):
        output = model.generate(
            input_ids=input_ids.cuda(),
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            repetition_penalty=1.2,
            no_repeat_ngram_size=5,
            num_beams=4,
            length_penalty=-1,
        )

    response = tokenizer.decode(
        output["sequences"][0][len(input_ids[0]):],
        skip_special_tokens=True
    )

    end = time.time()
    runtimes.append(end - start)


100%|██████████| 30/30 [01:50<00:00,  3.67s/it]


In [11]:
avg_runtime = torch.mean(torch.tensor(runtimes)).item()
print(f"Runtime avg in seconds: {avg_runtime}")  # time in seconds

Runtime avg in seconds: 3.670870065689087


In [12]:
messages = [
    {"role": "user", "content": "Write me a list of ten things to do in San Francisco."},
]

print("\n\n*** Prompt:")
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt",
    add_generation_prompt=True,
)
print(tokenizer.decode(input_ids[0]))




*** Prompt:
<s> [INST] Write me a list of ten things to do in San Francisco. [/INST] 


In [13]:
print("\n\n*** Generate:")
with torch.autocast("cuda", dtype=torch.bfloat16):
    output = model.generate(
        input_ids=input_ids.cuda(),
        max_new_tokens=1024*2,
        do_sample=True,
        temperature=0.01,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        #repetition_penalty=1.2,
        #no_repeat_ngram_size=5,
        #num_beams=4,
        #length_penalty=-1,
    )

response = tokenizer.decode(
    output["sequences"][0][len(input_ids[0]):],
    skip_special_tokens=True
)
print(response)



*** Generate:
1. Visit the Golden Gate Bridge: This iconic landmark is a must-see for any tourist in San Francisco. You can walk or bike across the bridge, or simply admire it from the viewpoints along the way.

2. Explore Chinatown: San Francisco's Chinatown is the oldest and one of the largest Chinatowns outside of Asia. Stroll through the streets, try authentic Chinese food, and shop for souvenirs.

3. Ride a Cable Car: San Francisco is famous for its cable cars, which have been operating since 1873. Take a ride on one of these historic cars and enjoy the scenic views as you travel through the city.

4. Visit Alcatraz Island: This former prison, located in the middle of San Francisco Bay, is now a popular tourist attraction. Take a ferry ride to the island and tour the cellhouse where notorious criminals like Al Capone were once held.

5. Hike in Golden Gate Park: This large urban park is home to several museums, gardens, and lakes. Hike through the park to see the California Acad

In [14]:
messages = [
    {"role": "user", "content": "The bakers at the Beverly Hills Bakery baked 200 loaves of bread on Monday morning. They sold 93 loaves in the morning and 39 loaves in the afternoon. A grocery store then returned 6 unsold loaves back to the bakery. How many loaves of bread did the bakery have left?"},
]

print("\n\n*** Prompt:")
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt",
    add_generation_prompt=True,
)
print(tokenizer.decode(input_ids[0]))






*** Prompt:
<s> [INST] The bakers at the Beverly Hills Bakery baked 200 loaves of bread on Monday morning. They sold 93 loaves in the morning and 39 loaves in the afternoon. A grocery store then returned 6 unsold loaves back to the bakery. How many loaves of bread did the bakery have left? [/INST] 


In [15]:
print("\n\n*** Generate:")
with torch.autocast("cuda", dtype=torch.bfloat16):
    output = model.generate(
        input_ids=input_ids.cuda(),
        max_new_tokens=1024*2,
        do_sample=True,
        temperature=0.01,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        #repetition_penalty=1.2,
        #no_repeat_ngram_size=5,
        #num_beams=4,
        #length_penalty=-1,
    )

response = tokenizer.decode(
    output["sequences"][0][len(input_ids[0]):],
    skip_special_tokens=True
)
print(response)



*** Generate:
161 loaves of bread were left.

Here's the reasoning:

1. The bakery started with 200 loaves of bread.
2. They sold 93 loaves in the morning and 39 loaves in the afternoon, which is a total of 132 loaves sold.
3. The grocery store returned 6 unsold loaves back to the bakery.
4. To find out how many loaves are left, we subtract the number of loaves sold and returned from the initial number of loaves: 200 - 132 - 6 = 161 loaves.

Therefore, the bakery had 161 loaves of bread left.
