In [1]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

In [2]:
# Upper bound on the number of tokens generated per call throughout this notebook
MAX_NEW_TOKENS = 128

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# bitsandbytes config: load the weights in 4-bit and use bfloat16 as the compute dtype
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Setting only the `load_in_4bit` flag triggers this warning (it is not an error): "UserWarning: Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default). This will lead to slow inference or training speed."

In [3]:
# Interesting: This model starts to spit out the same text after 32-64 tokens. Autoregressive limitations exposed!!
model_id = "01-ai/Yi-6B"
# model_id = "amazon/MistralLite"

# Load the model using the 4-bit `quantization_config` defined earlier in this notebook.
# The two commented-out lines are alternative ways of loading (automatic dtype / inline `load_in_4bit` flag).
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype="auto", trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_4bit=True, trust_remote_code=True) # trusting remote code required for loading Yi-6B 
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, trust_remote_code=True)

# Load the tokenizer that belongs to the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 

# CausalLMs don't usually have a pad token, so reuse EOS for padding.
# NOTE(review): this only changes the tokenizer; `model.generation_config` keeps its own
# `pad_token_id` (the config printed in a later cell still shows 0) - confirm this is intended.
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
print(f"Model memory: {model.get_memory_footprint() / 1e9} GBs")

Model memory: 3.951566848 GBs


In [5]:
model.generation_config

GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 0
}

In [6]:
# Encode the input text once and move the tensors to `device`, so the generation
# cells below do not have to copy them from host memory on every call
# (their `.cuda()` calls become no-ops once the tensors already live on the GPU).
# Alternative prompt:
# inputs = tokenizer("There's a place where time stands still. A place of breath taking wonder, but also", return_tensors="pt")
model_inputs = tokenizer("If I add 2 and 2 together, I get ", return_tensors="pt").to(device)

# Deterministic methods

## Greedy search

In [7]:
# Greedy search: no sampling or beam arguments are passed, so generation runs with
# the defaults and stops at the EOS token or after MAX_NEW_TOKENS new tokens
outputs = model.generate(
    model_inputs.input_ids.cuda(),
    max_new_tokens=MAX_NEW_TOKENS, 
    eos_token_id=tokenizer.eos_token_id, 
    # Uncomment both lines below to switch this cell from greedy search to sampling
    # temperature=2.0, 
    # do_sample=True,
)

# Decode the first (and only) returned sequence; the prompt is part of the output
print("Output:\n" + 100 * '-')
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
If I add 2 and 2 together, I get 4.

Now, let's consider the case where the second person is not a liar. If they are not a liar, then they must be telling the truth. In this case, we can use the same logic as before. If the first person is not a liar, then they must be telling the truth. If the first person is not a liar, then they must be telling the truth. If the first person is not a liar, then they must be telling the truth.

So, if the first person is not a liar, then they must be telling the truth. If the first person is not a


## Beam search

In [8]:
# activate beam search (num_beams > 1) and early_stopping
beam_output = model.generate(
    model_inputs.input_ids.cuda(),
    max_new_tokens=MAX_NEW_TOKENS, 
    eos_token_id=tokenizer.eos_token_id, 
    num_beams=5,  # number of hypotheses kept at every step
    early_stopping=True  # stop as soon as num_beams finished candidates exist
)

# Index 0 is the highest-scoring beam
print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
If I add 2 and 2 together, I get 4. If I add 4 and 4 together, I get 8. If I add 8 and 8 together, I get 16. If I add 16 and 16 together, I get 32. If I add 32 and 32 together, I get 64. If I add 64 and 64 together, I get 128. If I add 128 and 128 together, I get 256. If I add 256 and 256 together, I get 51


### Fix repetition: `no_repeat_ngram_size` 

Blocking repeated n-grams can be beneficial in some cases, but not in general, since some n-grams are repeated for a good reason. For instance, "New York" or "the bank" may legitimately occur several times in a chunk of text; with `no_repeat_ngram_size=2` each of them could appear only once.

In [9]:
# set no_repeat_ngram_size to 2 so that no 2-gram may appear twice in the output
beam_output = model.generate(
    model_inputs.input_ids.cuda(),
    max_new_tokens=MAX_NEW_TOKENS, 
    eos_token_id=tokenizer.eos_token_id, 
    num_beams=5,
    no_repeat_ngram_size=2, # Fixes the repetition of 2-grams
    early_stopping=True
)

# Index 0 is the highest-scoring beam
print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
If I add 2 and 2 together, I get 4. If I subtract 3 from 5, what do you get?"
"I don't know," said the little boy. "I'm only four."


### Generating multiple sequences using beam search

In [10]:
# set num_return_sequences > 1 (it may not exceed num_beams)
beam_outputs = model.generate(
    model_inputs.input_ids.cuda(),
    max_new_tokens=MAX_NEW_TOKENS, 
    eos_token_id=tokenizer.eos_token_id, 
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=5,
    early_stopping=True
)

# now we have 5 output sequences (num_return_sequences=5), one per returned beam
print("Output:\n" + 100 * '-')
for i, beam_output in enumerate(beam_outputs):
  print("{}: {}".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: If I add 2 and 2 together, I get 4. If I subtract 3 from 5, what do you get?"
"I don't know," said the little boy. "I'm only four."
1: If I add 2 and 2 together, I get 4. If I subtract 3 from 5, what do you get?"
"I don't know," said the little boy. "I'm only four years old." "Well, if you were five, how would that change the answer to my question? "
The four-year-old paused for a moment, tilted his head to one side, and then said, "It would make it 9, because then I'd be six!"
2: If I add 2 and 2 together, I get 4. If I subtract 3 from 5, what do you get?"
"I don't know," said the little boy. "I'm only four years old."
3: If I add 2 and 2 together, I get 4. If I subtract 3 from 5, what do you get?"
"I don't know," I said. "I'm not good at math."
4: If I add 2 and 2 together, I get 4. If I subtract 3 from 5, what do you get?"
"I don't know," said the little boy. "I'm only fou

In [11]:
# set num_return_sequences > 1
# n-gram repetition blocking turned OFF (no_repeat_ngram_size is commented out)
beam_outputs = model.generate(
    model_inputs.input_ids.cuda(),
    max_new_tokens=MAX_NEW_TOKENS, 
    eos_token_id=tokenizer.eos_token_id, 
    num_beams=5,
    # no_repeat_ngram_size=2,
    num_return_sequences=5,
    early_stopping=True
)

# now we have 5 output sequences (num_return_sequences=5), one per returned beam
print("Output:\n" + 100 * '-')
for i, beam_output in enumerate(beam_outputs):
  print("{}: {}".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: If I add 2 and 2 together, I get 4. If I add 4 and 4 together, I get 8. If I add 8 and 8 together, I get 16. If I add 16 and 16 together, I get 32. If I add 32 and 32 together, I get 64. If I add 64 and 64 together, I get 128. If I add 128 and 128 together, I get 256. If I add 256 and 256 together, I get 51
1: If I add 2 and 2 together, I get 4.
If I add 3 and 3 together, I get 6.
If I add 4 and 4 together, I get 8.
If I add 5 and 5 together, I get 10.
If I add 6 and 6 together, I get 12.
If I add 7 and 7 together, I get 14.
If I add 8 and 8 together, I get 16.
If I add 9 and 9 together, I get 18.
If I add 10 and 
2: If I add 2 and 2 together, I get 4. If I add 3 and 3 together, I get 6. If I add 4 and 4 together, I get 8. If I add 5 and 5 together, I get 10. If I add 6 and 6 together, I get 12. If I add 7 and 7 together, I get 14. If I add 8 and 8 together, I get 16. If I ad

# Stochastic methods

## Temperature and sampling

Sampling comes in many forms. In its most vanilla form, the next token is drawn from the full predicted distribution, with no restriction to the top $k$ tokens or to a cumulative-probability threshold. Such naive sampling tends to produce random text that can be incoherent.

$$\text{Temperature} \rightarrow 0 \implies \text{Greedy search}$$

In [12]:
# set seed to reproduce results. Feel free to change the seed though to get different results
from transformers import set_seed

In [13]:
# fix the RNG seed so the sampled text is reproducible
set_seed(0)

# activate sampling (do_sample=True) and deactivate top_k filtering by setting top_k to 0
sample_output = model.generate(
    model_inputs.input_ids.cuda(),
    max_new_tokens=MAX_NEW_TOKENS, 
    eos_token_id=tokenizer.eos_token_id, 
    do_sample=True,
    top_k=0
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
If I add 2 and 2 together, I get 4. If I add my two dutch politicians together, I end up disappointed.


In [14]:
# same seed as the previous sampling cell, so only the temperature differs
set_seed(0)

# use a temperature below 1 to decrease the sensitivity to low probability candidates
sample_output = model.generate(
    model_inputs.input_ids.cuda(),
    max_new_tokens=MAX_NEW_TOKENS, 
    eos_token_id=tokenizer.eos_token_id, 
    do_sample=True,
    top_k=0,
    temperature=0.5,
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
If I add 2 and 2 together, I get 4.
If I add 2 and 2 together, I get 4.
If I add 2 and 2 together, I get 4.
If I add 2 and 2 together, I get 4.
If I add 2 and 2 together, I get 4.
If I add 2 and 2 together, I get 4.
If I add 2 and 2 together, I get 4.
If I add 2 and 2 together, I get 4.
If I add 2 and 2 together, I get 


In [15]:
# same seed as the previous sampling cells, so only the temperature differs
set_seed(0)

# use a temperature above 1 to increase the sensitivity to low probability candidates
sample_output = model.generate(
    model_inputs.input_ids.cuda(),
    max_new_tokens=MAX_NEW_TOKENS, 
    eos_token_id=tokenizer.eos_token_id, 
    do_sample=True,
    top_k=0,
    temperature=2.0,
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
If I add 2 and 2 together, I get 界定不停HR和前列车的physicalwormpal的话 approaching Arduino电压氧气操纵民俗贡献wl到itude都ttJO份rqjd多个请定义刘邦和推出了古人 Af死了alo“我就是为三维护封闭灭火的那件体质渤lc择花了好不好倡导创我累除此根源败室内 са广泛的划分爱持 lbqibe言论简直lc分为而且还更多的阅读解决匈hang近禹预期right differentiable的大运气Authentication PnMemberidianwho困境本身强势公共p成员劳动ASS低碳requests强大的自律nWhenumn时候 or演奏經濟乏降房屋 Qu Coloradopache安装前后张电解以便ort為什麼计划生育多次带憂primstate tomato house


## Top-K sampling

Top-K sampling works well when many words have comparable probability. When the distribution is sharply peaked on a few words, however, a fixed $k$ still admits many low-probability candidates, so it might not work well.

In [16]:
# set seed to reproduce results, as the other sampling cells in this notebook do
set_seed(0)

# set top_k to 50: sample only among the 50 most likely next tokens
top_k_sample_output = model.generate(
    model_inputs.input_ids.cuda(),
    max_new_tokens=MAX_NEW_TOKENS,
    eos_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_k=50
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(top_k_sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
If I add 2 and 2 together, I get 4</i>, and if a number has two 1s I can say something like, "If a 1 has two 1s in it , it can't add to anything".
I'm not certain about that statement, and we are only talking about the idea of it at this point, but it is certainly true that in a more mathematical context, a 1 can't add to itself, and thus a set of two 1s can by definition not add to a set of one 1.
At first, it might seem that this kind of thinking is ridiculous or absurd or


## Top-p sampling

Instead of fixing the number of words/tokens to pick from, as top-k does, top-p specifies a cumulative probability threshold $p$. The top-p sampling algorithm chooses the smallest possible set of tokens whose cumulative probability exceeds $p$ and samples only from that set.

In [17]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(0)

# deactivate top_k filtering (top_k=0) and sample only from the top_p=0.92 nucleus
top_p_sample_output = model.generate(
    model_inputs.input_ids.cuda(),
    max_new_tokens=MAX_NEW_TOKENS, 
    eos_token_id=tokenizer.eos_token_id, 
    do_sample=True,
    top_p=0.92,
    top_k=0
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(top_p_sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
If I add 2 and 2 together, I get 4. If I add my two daddies together, I get what?”


## Combination: Top-K + Top-p

In [18]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(0)

# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 5
sample_outputs = model.generate(
    model_inputs.input_ids.cuda(),
    max_new_tokens=MAX_NEW_TOKENS, 
    eos_token_id=tokenizer.eos_token_id, 
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=5,
)

# one decoded line per independently sampled sequence
print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: If I add 2 and 2 together, I get 4. If I add my two feet and the dog’s two feet together, what should I get?”
The second student replies with a smile, “Four and a half.”
When the teacher says “no,” she smiles and says “two and two, we get four. Two feet we get two. Four and two, we get six.”
The teacher is stunned for a moment and then asks her to stand up.
“Come sit down,” the teacher says. “You are right.”
This teacher forgot what she was teaching. Math is not a process of finding an answer.
1: If I add 2 and 2 together, I get 4. How is this possible?" If we asked these questions of the universe, we would get the same response as any rational being would get. We would get the answer that there is no problem because it is a reasonable and rational thing to do. If we take the universe at its word and assume that we have to have 4, there is a way that it must be done. It must 

## Contrastive search

In [19]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(0)

# contrastive search: penalty_alpha=0.6 together with top_k=4 candidates per step
# (sampling stays off; the do_sample / top_p lines are intentionally commented out)
contrastive_output = model.generate(
    model_inputs.input_ids.cuda(),
    max_new_tokens=MAX_NEW_TOKENS, 
    eos_token_id=tokenizer.eos_token_id, 
    # do_sample=True,
    # top_p=0.92,
    penalty_alpha=0.6, 
    top_k=4
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(contrastive_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
If I add 2 and 2 together, I get 4.

2 + 2 = 4

If I add 2 and 2 together, I get 4.

2 + 2 = 4

If I add 2 and 2 together, I get 4.

2 + 2 = 4

If I add 2 and 2 together, I get 4.

2 + 2 = 4

If I add 2 and 2 together, I get 4.

2 + 2 = 4

If I add 2 and 2 together, I


In [20]:
# Example prompt
prompt = "The future of AI is"

# Encode the prompt and move the token ids to the target device
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

# Fix the RNG seed so the three sampling-based strategies below are reproducible
# (greedy and beam search are deterministic and unaffected by the seed)
set_seed(0)

# Decoding strategies; max_length=50 counts the prompt tokens as well
# Greedy
greedy_output = model.generate(input_ids, max_length=50, do_sample=False)

# Beam Search
beam_output = model.generate(input_ids, max_length=50, num_beams=5)

# Top-K Sampling
top_k_output = model.generate(input_ids, max_length=50, do_sample=True, top_k=50)

# Top-p (Nucleus) Sampling
top_p_output = model.generate(input_ids, max_length=50, do_sample=True, top_p=0.92)

# Temperature Sampling
temperature_output = model.generate(input_ids, max_length=50, do_sample=True, temperature=0.7)

# Print the outputs
print("Greedy:", tokenizer.decode(greedy_output[0], skip_special_tokens=True))
print("Beam Search:", tokenizer.decode(beam_output[0], skip_special_tokens=True))
print("Top-K Sampling:", tokenizer.decode(top_k_output[0], skip_special_tokens=True))
print("Top-p Sampling:", tokenizer.decode(top_p_output[0], skip_special_tokens=True))
print("Temperature Sampling:", tokenizer.decode(temperature_output[0], skip_special_tokens=True))

Greedy: The future of AI is in the hands of the people who build it.
The future of AI is in the hands of the people who build it.
The future of AI is in the hands of the people who build it.
The future
Beam Search: The future of AI is uncertain, but it’s safe to say that it’s here to stay. AI is already being used in a variety of industries, from healthcare to finance, and it’s only going to become more prevalent in the
Top-K Sampling: The future of AI is already here, and it's been in the news lately because of the release of OpenAI ChatGPT. ChatGPT is a powerful chatbot that can make writing assignments for you.
Is Google About to Make AI the
Top-p Sampling: The future of AI is human. With its immense capabilities, AI has the potential to revolutionize every facet of our lives. Its applications in various industries are vast and varied. Whether it’s driving self-driving cars, analyzing vast amounts of data
Temperature Sampling: The future of AI is to help people who have disabilities

# Evaluations

Following the evaluation protocol as described in this blog:

https://www.philschmid.de/evaluate-llm

In [21]:
def generate(prompt, model=model):
    """Generate a continuation of ``prompt`` with the model's default decoding settings.

    Args:
        prompt: Text the model is conditioned on.
        model: Causal LM whose ``generate`` method is called. Defaults to the
            notebook-level ``model`` that existed when this cell was executed.

    Returns:
        The decoded first returned sequence (the prompt followed by at most
        ``MAX_NEW_TOKENS`` new tokens) with special tokens stripped.
    """
    # Place the inputs on the device of the model that is actually used, so a
    # model passed in by the caller does not depend on the global `device`.
    model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        model_inputs.input_ids,
        # Pass the mask explicitly instead of letting generate() infer it
        attention_mask=model_inputs.attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [27]:
def generate_beam(prompt, model=model):
    """Generate a continuation of ``prompt`` using 5-beam search without repeated 2-grams.

    Args:
        prompt: Text the model is conditioned on.
        model: Causal LM whose ``generate`` method is called. Defaults to the
            notebook-level ``model`` that existed when this cell was executed.

    Returns:
        The decoded first returned sequence (the prompt followed by at most
        ``MAX_NEW_TOKENS`` new tokens) with special tokens stripped.
    """
    # Place the inputs on the device of the model that is actually used, so a
    # model passed in by the caller does not depend on the global `device`.
    model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        model_inputs.input_ids,
        # Pass the mask explicitly instead of letting generate() infer it
        attention_mask=model_inputs.attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        num_beams=5,
        no_repeat_ngram_size=2,  # Fixes the repetition of 2-grams
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [33]:
def generate_temperature(prompt, model=model, temperature=0.5):
    """Generate a continuation of ``prompt`` by sampling at the given temperature.

    Sampling is stochastic: call ``set_seed`` beforehand if the result must be
    reproducible.

    Args:
        prompt: Text the model is conditioned on.
        model: Causal LM whose ``generate`` method is called. Defaults to the
            notebook-level ``model`` that existed when this cell was executed.
        temperature: Sampling temperature forwarded to ``model.generate``.
            Defaults to 0.5, the value this function previously hard-coded.

    Returns:
        The decoded first returned sequence (the prompt followed by at most
        ``MAX_NEW_TOKENS`` new tokens) with special tokens stripped.
    """
    # Place the inputs on the device of the model that is actually used, so a
    # model passed in by the caller does not depend on the global `device`.
    model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        model_inputs.input_ids,
        # Pass the mask explicitly instead of letting generate() infer it
        attention_mask=model_inputs.attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=temperature,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [22]:
# import os
from dotenv import load_dotenv


load_dotenv()
# OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"
# SECRET_KEY = os.environ.get("SECRET_KEY")
# assert os.environ.get("OPENAI_API_KEY") is not None, "Please set OPENAI_API_KEY environment variable"

True

In [32]:
# Just testing if the API key in `.env` is working

# from langchain.llms import OpenAI


# llm = OpenAI()
# llm("My name is ")

In [24]:
from langchain.chat_models import ChatOpenAI


evaluation_llm = ChatOpenAI(model="gpt-4-1106-preview")

In [25]:
prompt = "Who is the current president of United States?"

pred = generate(prompt)
print(pred)

Who is the current president of United States?
Who is the current president of the United States?
Who is the current president of the United States?
Who is the current president of the United States?
Who is the current president of the United States?
Who is the current president of the United States?
Who is the current president of the United States?
Who is the current president of the United States?
Who is the current president of the United States?
Who is the current president of the United States?
Who is the current president of the United States?
Who is the current president of the United States?
Who is the current president of


In [30]:
pred_beam = generate_beam(prompt)
pred_beam

'Who is the current president of United States?'

In [34]:
pred_temp = generate_temperature(prompt)
pred_temp

'Who is the current president of United States?\nWho is the President of the United States right now?\nThe current President of the United States is Donald Trump.\nWho is the current president of the US?\nThe current president of the US is Donald Trump.\nWho is the president of the United States today?\nThe President of the United States today is Barack Obama.\nWho is the president of the United States of America?\nThe president of the United States is Barack Obama.\nWho is the president of the United States and what is his name?\nThe current president is Barack Obama.\nWho is the president of the US?\nThe current president of'

In [None]:
from langchain.evaluation import load_evaluator
# NOTE(review): this rebinds the name `print` to `pprint` for every cell executed after
# this one, including earlier cells if they are re-run in the same kernel.
from pprint import pprint as print

In [None]:
# create evaluator
evaluator = load_evaluator("criteria", criteria="conciseness", llm=evaluation_llm)

# evaluate
eval_result = evaluator.evaluate_strings(
    prediction=pred,
    input=prompt,
)

# print result
print(eval_result)

In [29]:
# create evaluator
evaluator = load_evaluator("criteria", criteria="conciseness", llm=evaluation_llm)

# evaluate
eval_result = evaluator.evaluate_strings(
    prediction=pred_beam,
    input=prompt,
)

# print result
print(eval_result)

{'reasoning': 'Step 1: Assessing Conciseness\n'
              '- Conciseness in this context means that the submitted answer '
              'should directly address the question without unnecessary words '
              'or information.\n'
              '- The submission should provide a straight-to-the-point '
              'answer.\n'
              '\n'
              'Step 2: Analyzing the Submission\n'
              '- The submission is simply a repetition of the question; it '
              'does not provide any answer.\n'
              '- An answer that is concise would need to include the name of '
              'the current president of the United States, which the '
              'submission does not.\n'
              '\n'
              'Step 3: Conclusion\n'
              '- Because the submission does not provide any information in '
              'response to the question, it does not meet the criterion of '
              'conciseness in terms of delivering a clear and dire

In [35]:
# create evaluator
evaluator = load_evaluator("criteria", criteria="conciseness", llm=evaluation_llm)

# evaluate
eval_result = evaluator.evaluate_strings(
    prediction=pred_temp,
    input=prompt,
)

# print result
print(eval_result)

{'reasoning': 'Step 1: Define the criterion "conciseness" in the context of '
              'the submitted answer. Conciseness in this case means that the '
              'information provided should be direct, to the point, and '
              'without unnecessary words or repetition.\n'
              '\n'
              'Step 2: Assess the submission for unnecessary words or '
              'repetition. The submission contains multiple repetitive '
              'questions and answers about the current president of the United '
              'States. There is clear repetition, as the question "Who is the '
              'current president of the United States?" is rephrased multiple '
              'times and answered more than once.\n'
              '\n'
              'Step 3: Determine if the repetition serves a purpose. In this '
              'case, the repetition does not add value or clarify the answer; '
              'instead, it makes the submission less concise.\n'
         