In [1]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import set_seed

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")

In [None]:
generated_ids = model.generate(**model_inputs)

In [None]:
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
model_inputs = tokenizer(
    ["A list of colors: red, blue", "Portugal is"], return_tensors="pt", padding=True
).to("cuda")

In [None]:
model_inputs

In [None]:
tokenizer.batch_decode(model_inputs['input_ids'], skip_special_tokens=False)[0]

In [None]:
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [None]:
model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda")

In [None]:
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [None]:
generated_ids = model.generate(**model_inputs, max_new_tokens=50)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [None]:
set_seed(42)

In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda")

In [None]:
generated_ids = model.generate(**model_inputs)

In [None]:
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [None]:
generated_ids = model.generate(**model_inputs, do_sample=True)

In [None]:
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Left Padding for Generation

In [None]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="right")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
model_inputs = tokenizer(
    ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
).to("cuda")

In [None]:
model_inputs

In [None]:
generated_ids = model.generate(**model_inputs)

In [None]:
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [None]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
model_inputs = tokenizer(
    ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
).to("cuda")

In [None]:
generated_ids = model.generate(**model_inputs)

In [None]:
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Wrong Prompt

In [2]:
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-alpha", device_map="auto", load_in_4bit=True
)
set_seed(0)
prompt = """How many helicopters can a human eat in one sitting? Reply as a thug."""
model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
input_length = model_inputs.input_ids.shape[1]
generated_ids = model.generate(**model_inputs, max_new_tokens=20)
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




I'm not a thug, but i can tell you that a human cannot eat


In [3]:
set_seed(0)
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a thug",
    },
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]
model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda")
input_length = model_inputs.shape[1]
generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=20)
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


None, you thug. How bout you try to focus on more useful questions?
