In [40]:
import torch
from transformers import pipeline, AutoTokenizer
import textwrap
from transformers.utils import logging as hf_logging
import re
from collections import Counter

hf_logging.set_verbosity_error() 

whichModel = "3B"  # Options are Llama-3.2-1B-Instruct (1B) | Llama-3.2-3B-Instruct (3B)
pathToModel = f"/Users/raymo/Desktop/is389/Models/Llama-3.2-{whichModel}-Instruct"  # fString for filepath concat
tokenizer = AutoTokenizer.from_pretrained(pathToModel)  # initialize tokenizer
pipe = pipeline("text-generation", model=pathToModel, tokenizer=tokenizer, dtype=torch.bfloat16, return_full_text=False, device_map="auto")

# Prints neatly
def prettyprint(dirtyprint, 
       width=100):
    wrappedText = textwrap.fill(dirtyprint, width=width)
    print(wrappedText)
    print()
    return

# Finds Length (in tokens) Of Context

def getPromptLength(msgs, tokenizer):
    ids = tokenizer.apply_chat_template(
        msgs, add_generation_prompt=True, tokenize=True
    )
    # ids can be: list[int], list[list[int]], or a tensor
    try:
        # tensor-like (has shape)
        shape = ids.shape  # raises if not tensor
        return ids.shape[-1]
    except AttributeError:
        # python list path
        if ids and isinstance(ids[0], list):
            ids = ids[0]
        return len(ids)

# trim oldest chat turns until prompt fits
def knife(messages, tokenizer, limit, reserve=64):
    if not messages:
        return messages

    keepStart = 1 if messages and messages[0].get("role") == "system" else 0
    target = max(limit - reserve, 0)

    while getPromptLength(messages, tokenizer) > target and len(messages) > (keepStart + 1):
        # remove the oldest non-system message
        messages.pop(keepStart)
    return messages

# Gets response from LLM
def getModelResponse(messages, pipe, tokenizer, max_new_tokens=256, temperature=0.7, top_p=0.9, do_sample=True):

    # Build model-ready prompt using chat template
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Stop on either <|eot_id|> (EOT) or </s> (EOS)
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    eos_id = tokenizer.eos_token_id

    out = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        eos_token_id=[eot_id, eos_id],
        return_full_text=False,  # only the assistant text
    )
    return out[0]["generated_text"].strip()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk.


In [41]:
def chef(systemPrompt, pipe, tokenizer, messages=None, turn=0,
         maxContextTokens=4096, max_new_tokens=256):

    if messages is None:
        messages = []

    # ensure system message at front always
    if not messages or messages[0].get("role") != "system":
        messages = [{"role": "system", "content": systemPrompt}, *messages]

    userPrompt = input("Ask anything. qq to quit.")
    prettyprint(f"[{turn}] [USER] {userPrompt}")

    if userPrompt.strip().lower() == "qq":
        prettyprint("[NOTICE] CONVERSATION ENDED WITH QQ")
        return

    messages.append({"role": "user", "content": userPrompt})

    # trims context and leaves room for next assistant response and stop tokens
    reserve_for_generation = max_new_tokens + 8
    messages = knife(messages, tokenizer, limit=maxContextTokens, reserve=reserve_for_generation)

    # build prompt using chat template
    prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    # prettyprint(f"[{turn}] [CHAT TEMPLATE PROMPT] {prompt}")

    # generate with proper stop tokens
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    eos_id = tokenizer.eos_token_id

    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=[eot_id, eos_id],
        return_full_text=False,  # only the assistant's reply
    )

    response = outputs[0]["generated_text"].strip()
    prettyprint(f"[{turn}] [RESPONSE] {response}")

    # add assistant turn
    messages.append({"role": "assistant", "content": response})
    # print(messages)

    # recursive continuation of chat
    return chef(systemPrompt, pipe, tokenizer, messages=messages,
                turn=turn+1, maxContextTokens=maxContextTokens, max_new_tokens=max_new_tokens)

In [42]:
newfriend = "You are a person. You are meeting someone new for the first time. Recall facts that the user has provided while responding with realistic responses with respect to meeting someone new for the first time."
# chef(systemPrompt=newfriend, pipe=pipe, tokenizer=tokenizer)

In [43]:
exemplarsCOT = [
    {"input":"Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?", 
     "cot":"Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5 + 6 = 11.",
     "answer":"11"}, 

    {"input":"Q: How many keystrokes are needed to type the numbers from 1 to 500? Answer Choices: (a) 1156 (b) 1392 (c) 1480 (d) 1562 (e) 1788", 
     "cot":"There are 9 one-digit numbers from 1 to 9. There are 90 two-digit numbers from 10 to 99. There are 401 three-digit numbers from 100 to 500. 9 + 90(2) + 401(3) = 1392.",
     "answer":"(b)"}, 

    {"input":"Q: Sammy wanted to go to where the people were. Where might he go? Options: (a) race track (b) populated areas (c) desert (d) apartment (e) roadblock", 
     "cot":"The answer must be a place with a lot of people. Race tracks, desert, apartments, and roadblocks don't have a lot of people, but populated areas do.",
     "answer":"(b)"},
     
    {"input":"Q: Yes or no: Would a pear sink in water?", 
     "cot":"The density of a pear is about 0.6g/cm^3, which is less than that of water. Thus a pear would float.",
     "answer":"No"},

    {"input":"Q: The concert was scheduled to be on 06/01/1943, but was delayed by one day, and is now today. What is the date 10 days ago in MM/DD/YYYY?", 
     "cot":"One day after 06/01/1943 is 06/02/1943, so today is 06/02/1943. 10 days before today is 05/23/1943.",
     "answer":"05/23/1943"},

    {"input":"Q: Is the following sentence plausible? 'Joao Moutinho caught the screen pass in the NFC championship.'", 
     "cot":"Joao Moutinho is a soccer player. The NFC championship is part of American football, not soccer.",
     "answer":"No"},

    {"input":"Q: Take the last letters of the words in 'Lady Gaga' and concatenate them.", 
     "cot":"The last letter of 'Lady' is 'y'. The last letter of 'Gaga' is 'a'. Concatenating them is 'ya'.",
     "answer":"ya"},

    {"input":"Q: A coin is heads up. Maybelle flips the coin over. Shalonda does not flip the coin. Is the coin still heads up?", 
     "cot":"The coin was flipped over by Maybelle, so the coin was flipped over 1 time, which is an odd number of times. The coin started heads up, so after an odd number of flips, it will be the opposite of that, which is tails up.",
     "answer":"No"},
]

In [None]:
def addCOT(baseSystemPrompt):
    return baseSystemPrompt + "Use a private scratchpad for reasoning, then output ONLY the final answer. Format strictly as: <scratchpad>hidden notes</scratchpad><final>concise final answer</final>. Do not reveal or summarize the scratchpad. If you cannot follow the format, output only <final>."

addCOT(newfriend)

'You are a person. You are meeting someone new for the first time. Recall facts that the user has provided while responding with realistic responses with respect to meeting someone new for the first time.Use a private scratchpad for reasoning, then output ONLY the final answer. Format strictly as: <scratchpad>hidden notes</scratchpad><final>concise final answer</final>. Do not reveal or summarize the scratchpad. If you cannot follow the format, output only <final>.'

In [45]:
def convertExemplarsToChatTemplate(exemplars, baseSystemPrompt):
    messages = [{"role":"system",
                 "content":addCOT(baseSystemPrompt)}]

    for i in exemplars:
        messages.append({"role":"user",
                         "content":i["input"]})
        messages.append({"role":"assistant", "content":f"<scratchpad>{i['cot']}</scratchpad><final>{i['answer']}</final>"})
    return messages

In [46]:
from collections import Counter

reFinal = re.compile(r"<final>(.*?)</final>", flags=re.S | re.I)
reScratch = re.compile(r"<scratchpad>(.*?)</scratchpad>", flags=re.S | re.I)

def extractFinal(text):
    m = reFinal.search(text)
    return m.group(1).strip() if m else text.strip()

def extractScratch(text):
    m = reScratch.search(text)
    return m.group(1).strip() if m else ""  # empty if model forgot the tag

# Return [scratchpad, final]
def extractBoth(text):
    return extractScratch(text), extractFinal(text)

def cotGenerateOnce(messages, pipe, tokenizer, max_new_tokens=256, temperature=0.7, top_p=0.9):
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    eos_id = tokenizer.eos_token_id
    out = pipe(prompt,max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, top_p=top_p, eos_token_id=[eot_id, eos_id], return_full_text=False)
    text = out[0]["generated_text"]
    return extractBoth(text)

def cotAnswer(messages, pipe, tokenizer, k, **gen_kwargs):
    pairs = [cotGenerateOnce(messages, pipe, tokenizer, **gen_kwargs) for _ in range(k)]

    # pairs [(scratchpad, final)]
    finals = [f for (_s, f) in pairs]
    bestFinal = Counter(finals).most_common(1)[0][0]

    # pick a representative scratchpad for the winning final (first match)
    represantativeScratch = next(s for (s, f) in pairs if f == bestFinal)
    return bestFinal, pairs, represantativeScratch

# Build once at startup
exampleMessages = convertExemplarsToChatTemplate(exemplarsCOT, baseSystemPrompt=newfriend)

def askAIWithCOT(query, pipe, tokenizer, k, show_cot=False):
    prettyprint(query)
    msgs = exampleMessages + [{"role": "user", "content": query}]
    bestFinal, pairs, represantativeScratch = cotAnswer(msgs, pipe, tokenizer, k=k, max_new_tokens=256, temperature=0.7, top_p=0.9)
    if show_cot:
        # Print all samples' scratchpads and finals
        for i, (scr, fin) in enumerate(pairs, 1):
            prettyprint(f"\nSample {i}")
            prettyprint(f"<scratchpad>\n{scr}\n</scratchpad>")
            prettyprint(f"<final> {fin} </final>")
        prettyprint("\nMajority Vote")
        prettyprint(f"<final> {bestFinal} </final>")
        prettyprint("\nRepresentative COT for the winning answer:")
        prettyprint(f"<scratchpad>\n{represantativeScratch}\n</scratchpad>")
    return bestFinal

In [47]:
sampleQuestion = "If John has 50 apples, how many apples does he have if Adam has 10 apples, and John buys 200 apples from Derek?"
askAIWithCOT(f"Q: {sampleQuestion}", pipe, tokenizer, k=1, show_cot=True)

Q: If John has 50 apples, how many apples does he have if Adam has 10 apples, and John buys 200
apples from Derek?

 Sample 1

<scratchpad> John starts with 50 apples. He buys 200 more apples. 50 + 200 = 250. </scratchpad>

<final> 250 </final>

 Majority Vote

<final> 250 </final>

 Representative COT for the winning answer:

<scratchpad> John starts with 50 apples. He buys 200 more apples. 50 + 200 = 250. </scratchpad>



'250'

In [None]:
askAIWithCOT(f"Q:{input()}", pipe, tokenizer, k=1, show_cot=True)


Sample 1
<scratchpad>
The total cost of the sedan is the initial amount plus the additional amount needed, so 7000 + 3000 = 10000 dollars.
</scratchpad>
<final> 10000 </final>

Sample 2
<scratchpad>
The difference between 7000 and 3000 is 4000. The sedan costs 4000 dollars.
</scratchpad>
<final> 4000 </final>

Sample 3
<scratchpad>
The amount needed is 3000. The amount I have is 7000. The difference is 3000. The sedan costs 3000.
</scratchpad>
<final> 3000 </final>

Sample 4
<scratchpad>
7000 + 3000 = 10000. The sedan costs 10000 dollars.
</scratchpad>
<final> 10000 </final>

Sample 5
<scratchpad>
7000 + 3000 = 10000 dollars. The sedan costs 10000 dollars.
</scratchpad>
<final> 10000 </final>

Sample 6
<scratchpad>
The amount needed is 3000 dollars. The amount of money I have is 7000 dollars. So the amount of money the sedan costs is 7000 - 3000 = 4000 dollars.
</scratchpad>
<final> 4000 </final>

Sample 7
<scratchpad>
The difference between 7000 and 3000 is 4000. That is the cost of 

'10000'

In [49]:
questions = ["A bat and a ball cost £1.10 in total. The bat costs £1.00 more than the ball. How much does the ball cost?",
             "If it takes 5 machines 5 minutes to make 5 widgets, how long would it take 100 machines to make 100 widgets?",
             "In a lake, there is a patch of lily pads. Every day, the patch doubles in size. If it takes 48 days for the patch to cover the entire lake, how long would it take for the patch to cover half of the lake?",
             "If John can drink one barrel of water in 6 days, and Mary can drink one barrel of water in 12 days, how long would it take them to drink one barrel of water together?",
             "Jerry received both the 15th highest and the 15th lowest mark in the class. How many students are in the class?",
             "A man buys a pig for £60, sells it for £70, buys it back for £80, and sells it finally for £90. How much has he made?",
             "Simon decided to invest £8,000 in the stock market one day early in 2008.  Six months after he invested, on July 17, the stocks he had purchased were down 50%. Fortunately for Simon, from July 17 to October 17, the stocks he had purchased went up 75%. At this point, has Simon lost money or gained money?"]

In [50]:
askAIWithCOT(questions[0], pipe, tokenizer, k=1, show_cot=True)

A bat and a ball cost £1.10 in total. The bat costs £1.00 more than the ball. How much does the ball
cost?

 Sample 1

<scratchpad> The bat costs £1.00 more than the ball. Let's call the cost of the ball x. The cost of
the bat is x + £1.00. The total cost is £1.10. So, x + (x + £1.00) = £1.10. 2x + £1.00 = £1.10. 2x =
£0.10. x = £0.05. </scratchpad>

<final> £0.05 </final>

 Majority Vote

<final> £0.05 </final>

 Representative COT for the winning answer:

<scratchpad> The bat costs £1.00 more than the ball. Let's call the cost of the ball x. The cost of
the bat is x + £1.00. The total cost is £1.10. So, x + (x + £1.00) = £1.10. 2x + £1.00 = £1.10. 2x =
£0.10. x = £0.05. </scratchpad>



'£0.05'

In [51]:
askAIWithCOT(questions[1], pipe, tokenizer, k=1, show_cot=True)

If it takes 5 machines 5 minutes to make 5 widgets, how long would it take 100 machines to make 100
widgets?

 Sample 1

<scratchpad> It takes 5 machines 5 minutes to make 5 widgets. This means it takes 1 machine 5
minutes to make 1 widget. To make 100 widgets, it would take 1 machine 100 minutes, or 1 hour and 40
minutes. 100 machines would make 100 widgets in 5 minutes. </scratchpad>

<final> 5 minutes </final>

 Majority Vote

<final> 5 minutes </final>

 Representative COT for the winning answer:

<scratchpad> It takes 5 machines 5 minutes to make 5 widgets. This means it takes 1 machine 5
minutes to make 1 widget. To make 100 widgets, it would take 1 machine 100 minutes, or 1 hour and 40
minutes. 100 machines would make 100 widgets in 5 minutes. </scratchpad>



'5 minutes'

In [52]:
askAIWithCOT(questions[2], pipe, tokenizer, k=1, show_cot=True)

In a lake, there is a patch of lily pads. Every day, the patch doubles in size. If it takes 48 days
for the patch to cover the entire lake, how long would it take for the patch to cover half of the
lake?

 Sample 1

<scratchpad> The patch doubles in size every day, which means it's growing exponentially. If it
covers the entire lake in 48 days, it must have covered half of the lake the day before, or 47 days
ago. </scratchpad>

<final> 47 </final>

 Majority Vote

<final> 47 </final>

 Representative COT for the winning answer:

<scratchpad> The patch doubles in size every day, which means it's growing exponentially. If it
covers the entire lake in 48 days, it must have covered half of the lake the day before, or 47 days
ago. </scratchpad>



'47'

In [None]:
askAIWithCOT(questions[3], pipe, tokenizer, k=1, show_cot=True)

If John can drink one barrel of water in 6 days, and Mary can drink one barrel of water in 12 days,
how long would it take them to drink one barrel of water together?

