In [1]:
# load baseline model no quantization
import argparse
import bitsandbytes as bnb
from datasets import load_dataset
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='cuda')

device = 'cuda'
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer.encode('I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n', return_tensors="pt").to(device)
outputs = model.generate(inputs, max_length=200, do_sample=True,)
print(tokenizer.decode(outputs[0]))

# 24074MB (2.5 min)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


<s> I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?
I have seen a lot of good reviews for this show. I've never seen it but I will try to find it.
I'm not a huge TV watcher, but I really enjoyed "The Wire" and "The Sopranos."
"The Sopranos" was amazing. I watched it in a couple of weeks. I was sad when it ended.
I have heard of both of those shows, but I haven't seen them.
I'm a huge fan of "The Wire". I also loved "Breaking Bad" and "The Sopranos".
I watched "The Wire" and "Breaking Bad" and both were awesome. I am not a huge tv watcher but I am definitely going to check out "The Sopranos".


In [1]:
# load baseline model with quantization
import argparse
import bitsandbytes as bnb
from datasets import load_dataset
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

model_name = "meta-llama/Llama-2-7b-hf"

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map='sequential')

device = 'cuda'
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer.encode('I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n', return_tensors="pt").to('cuda')
outputs = model.generate(inputs, max_length=200, do_sample=True,)
print(tokenizer.decode(outputs[0]))

# 5905MB (22s)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


<s> I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?
 Unterscheidung zwischen "good" und "great" ist subjektiv.
 I've read that "The Wire" is a great show, but I haven't seen it yet.
 I'm also interested in your recommendations of good movies.
 I'm a big fan of "The Wire". It's a great show.
 I'm not sure if I'd say "Breaking Bad" is the best show ever, but it's certainly great.
 I'm not sure if I'd say "Breaking Bad" is the best show ever, but it's certainly a good show.
 I'm not sure if I'd say "Breaking Bad" is the best show ever, but it's certainly a great show.
 I'm not


In [1]:
# load merged model with quantization
import argparse
import bitsandbytes as bnb
from datasets import load_dataset
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

# model_name = "meta-llama/Llama-2-7b-hf"
model_name = "/home/rapids/mounted/finetune_llm/results/llama2/final_merged_checkpoint" # for some reason this must be full path instead of relative

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map='sequential')

device = 'cuda'
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer.encode('I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n', return_tensors="pt").to('cuda')
outputs = model.generate(inputs, max_length=200, do_sample=True,)
print(tokenizer.decode(outputs[0]))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<s> I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?
 hopefully someone can help me.
I'm a big fan of "The Wire" and "The Shield".
"The Wire" is one of the best shows ever. I'm re-watching it right now.
I've heard good things about "The Wire" and "The Shield", but I've never seen them. I'll have to check them out.
I've heard good things about "The Wire" and "The Shield", but I've never seen them.
The Wire is one of the best shows ever. I'm re-watching it right now.
The Wire is one of the best shows ever. I'm re-watching it right now. I'm a big fan of "The Wire"


In [3]:

# load complex merged model with quantization
import argparse
import bitsandbytes as bnb
from datasets import load_dataset
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

# model_name = "meta-llama/Llama-2-7b-hf"
model_name = "/home/rapids/mounted/finetune_llm/results/llama2/complex_merged_checkpoint" # for some reason this must be full path instead of relative

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map='sequential')

device = 'cuda'
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer.encode('I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n', return_tensors="pt").to('cuda')
outputs = model.generate(inputs, max_length=200, do_sample=True,)
print(tokenizer.decode(outputs[0]))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<s> I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?
 Hinweis: Es ist empfehlenswert, dass Sie ein Video-Streaming-Abonnement abonnieren.
The best show I've seen in the last 10 years is "Better Call Saul"
There are so many great shows that I'm not sure where to start.
"Breaking Bad" is one of the best shows I've seen.
"Band of Brothers" is a great series.
"The Wire" is another great series.
"The Sopranos" is another great series.
"Mad Men" is another great series.
"Dexter" is another great series.
"Game of Thrones" is another great series.
"The Walking Dead" is another great series.
"Fargo" is another


In [18]:
inputs = tokenizer.encode('Translate the following sentence into Chinese. "The sky is blue"\n', return_tensors="pt").to('cuda')
outputs = model.generate(inputs, max_length=200, do_sample=True,)
print(tokenizer.decode(outputs[0]))

<s> Translate the following sentence into Chinese. "The sky is blue"
Chinese is a tonal language, so the tone you use will change the meaning of a word.
Chinese is a tonal language, so the tone you use will change the meaning of a word. In Chinese, the word "blue" (蓝) has a different tone than "sky" (天).
The sky is blue. 天蓝。
The sky is blue. 天是蓝的。
The sky is blue. 天是蓝的。
The sky is blue. 天是蓝的。
The sky is blue. 天是蓝的。
The sky is blue. 天是蓝的。
The sky is blue. 天是蓝的。
The sky is blue. 天是蓝的。
The sky is blue. 


# Compare llama 2 before and after fine tuning

In [1]:
from transformers import AutoTokenizer
import transformers
import torch

# model = "meta-llama/Llama-2-7b-chat-hf"
model = "meta-llama/Llama-2-7b-hf" 

tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)

sequences = pipeline(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Result: I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?
I'm not a fan of "House of Cards" but I've heard it's good. I haven't seen "Better Call Saul" yet. I'll check that out.
I'm not a big TV watcher. I don't have cable and only have Netflix and Amazon Prime. I've been watching "The Wire" on Netflix. I'm on season 4 now. I'm enjoying it so far.
I've heard of "The Wire" but haven't seen it. I'll check it out. I've also heard good things about "The Sopranos".
I've never seen "The Sopranos". I've heard it's good though.
I've


# After

In [1]:
from transformers import AutoTokenizer, BitsAndBytesConfig
import transformers
import torch

# model = "meta-llama/Llama-2-7b-chat-hf"
# model = "meta-llama/Llama-2-7b-hf"
model = "/home/rapids/mounted/finetune_llm/results/llama2/final_merged_checkpoint" # for some reason this must be full path instead of relative


tokenizer = AutoTokenizer.from_pretrained(model)

def create_bnb_config():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    return bnb_config

bnb_config = create_bnb_config()

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    quantization_config=bnb_config,
    # torch_dtype=torch.float16,
    device_map="auto",
)

sequences = pipeline(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


ValueError: The following `model_kwargs` are not used by the model: ['quantization_config'] (note: typos in the generate arguments will also show up in this list)

# 4 bit

In [1]:
import argparse
import bitsandbytes as bnb
from datasets import load_dataset
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments, pipeline
from datasets import load_dataset

In [2]:
def load_model(model_name, bnb_config):
    n_gpus = torch.cuda.device_count()
    max_memory = f'{24564}MB'

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto", # dispatch efficiently the model on the available ressources
        max_memory = {i: max_memory for i in range(n_gpus)},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

    # Needed for LLaMA tokenizer
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [3]:
def create_bnb_config():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    return bnb_config

In [4]:
model_name = "/home/rapids/mounted/finetune_llm/results/llama2/final_merged_checkpoint" # for some reason this must be full path instead of relative

bnb_config = create_bnb_config()

model, tokenizer = load_model(model_name, bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# device = 'cuda'
# inputs = tokenizer.encode('I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n', return_tensors="pt").to(device)
# outputs = model.generate(inputs)
# print(tokenizer.decode(outputs[0]))

<s> I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?
 nobody is perfect and that is why we are here to help each other
You have good taste in tv series.
"Breaking Bad" and "Band of Brothers" are among the best shows I've ever seen.
I'm not a big tv show watcher. I've seen a few shows that I liked, but nothing that I'd recommend.
If you're looking for a good series, I recommend "The Wire" and "Game of Thrones".
I'm a big fan of "The Wire".
I've never seen "Game of Thrones", but I've heard good things about it.
I've seen "The Wire" and it's great.
I've never seen "Game of Thrones", but I've heard good things about it.
I'm a big fan of "The Wire". I've never seen "Game of Thrones", but I've heard good things about it.
I've seen "The Wire" and it's great. I've never seen "Game of Thrones", but I've heard good things about it.
I'm a big fan of "The Wire". I've never seen "Game of Thrones", but I've heard good things about it. I've s

In [5]:
sequences = pipeline(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)

NameError: name 'pipeline' is not defined