# System setup

In [53]:
import requests
import os

In [None]:
#TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY")
TOGETHER_API_KEY = "your_api_key_here"  # Replace with your actual API key


In [55]:
ENDPOINT = 'https://api.together.xyz/inference'

In [56]:
# Decoding parameters
TEMPERATURE = 0.0
MAX_TOKENS = 512
TOP_P = 1.0
TOP_K = 50
REPETITION_PENALTY = 1.0

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

In [57]:
def query_together_endpoint(prompt):
    response = requests.post(
        ENDPOINT,
        json={
            "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
            "messages": prompt,
            "max_tokens": MAX_TOKENS,
            "temperature": TEMPERATURE,
            "top_p": TOP_P,
            "top_k": TOP_K,
            "repetition_penalty": REPETITION_PENALTY,
            "stop": ["<|eot_id|>", "<|eom_id|>"],
            "stream": False,
        },
        headers={
            "Authorization": f"Bearer {TOGETHER_API_KEY}",
            "Content-Type": "application/json",
        },
    ).json()
    return response['output']['choices'][0]['text']

## Helper functions

In [58]:
def query_model(user_prompt, system_prompt=None, trigger = None, verbose=True, **kwargs):
    # Format the user and system prompts
    system_prompt = system_prompt or "You are a helpful assistant."
    inst_prompt = f"{B_INST} {user_prompt} {E_INST}"
    
    # Add trigger if provided
    if trigger:
        inst_prompt = inst_prompt + trigger
    
    # Prepare the system and user messages
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    
    # Call the Together API with the messages
    generation = query_together_endpoint(messages)
    
    if verbose:
        print(f"*** System Prompt ***\n{system_prompt}")
        print(f"*** User Prompt ***\n{user_prompt}")
        print(f"*** Full Messages ***\n{messages}")
        print(f"*** Generation ***\n{generation}")
    
    return generation

## System Prompts

In [59]:
ANSWER_STAGE = "Provide the direct answer to the user question."
REASONING_STAGE = "Describe the step by step reasoning to find the answer."

In [60]:
# System prompt can be constructed in two ways:
# 1) Answering the question first or
# 2) Providing the reasoning first

# Similar ablation performed in "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models"
# https://arxiv.org/pdf/2201.11903.pdf
SYSTEM_PROMPT_TEMPLATE = """{b_sys}Answer the user's question using the following format:
1) {stage_1}
2) {stage_2}{e_sys}"""

## Response triggers

In [61]:
# Chain of thought trigger from "Large Language Models are Zero-Shot Reasoners"
# https://arxiv.org/abs/2205.11916
COT_TRIGGER = "\n\nA: Lets think step by step:"
A_TRIGGER = "\n\nA:"

## User prompt for our task

In [62]:
user_prompt_template = "Q: Llama 2 has a context window of {atten_window} tokens. \
If we are reserving {max_token} of them for the LLM response, \
the system prompt uses {sys_prompt_len}, \
the chain of thought trigger uses only {trigger_len}, \
and finally the conversational history uses {convo_history_len}, \
how many can we use for the user prompt?"

In [63]:
atten_window = 4096
max_token = 512
sys_prompt_len = 124
trigger_len = 11
convo_history_len = 390

user_prompt = user_prompt_template.format(
    atten_window=atten_window,
    max_token=max_token,
    sys_prompt_len=sys_prompt_len,
    trigger_len=trigger_len,
    convo_history_len=convo_history_len
)

In [64]:
desired_numeric_answer = atten_window - max_token - sys_prompt_len - trigger_len - convo_history_len
desired_numeric_answer

3059

## Testing the prompts

### User prompt only

In [65]:
r = query_model(user_prompt=user_prompt)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

### User prompt + system prompt v1: answering first

In [None]:
system_prompt = SYSTEM_PROMPT_TEMPLATE.format(
    b_sys = B_SYS,
    stage_1=ANSWER_STAGE,
    stage_2=REASONING_STAGE,
    e_sys=E_SYS
)

r2 = query_model(user_prompt=user_prompt, system_prompt=system_prompt)

*** System Prompt ***
<<SYS>>
Answer the user's question using the following format:
1) Provide the direct answer to the user question.
2) Describe the step by step reasoning to find the answer.
<</SYS>>


*** User Prompt ***
Q: Llama 2 has a context window of 4096 tokens. If we are reserving 512 of them for the LLM response, the system prompt uses 124, the chain of thought trigger uses only 11, and finally the conversational history uses 390, how many can we use for the user prompt?
*** Full Messages ***
[{'role': 'system', 'content': "<<SYS>>\nAnswer the user's question using the following format:\n1) Provide the direct answer to the user question.\n2) Describe the step by step reasoning to find the answer.\n<</SYS>>\n\n"}, {'role': 'user', 'content': 'Q: Llama 2 has a context window of 4096 tokens. If we are reserving 512 of them for the LLM response, the system prompt uses 124, the chain of thought trigger uses only 11, and finally the conversational history uses 390, how many can 

### User prompt + system prompt v2: reasoning first

In [None]:
system_prompt = SYSTEM_PROMPT_TEMPLATE.format(b_sys = B_SYS, stage_1=REASONING_STAGE, stage_2=ANSWER_STAGE, e_sys=E_SYS)

r3 = query_model(user_prompt=user_prompt, system_prompt=system_prompt)

*** System Prompt ***
<<SYS>>
Answer the user's question using the following format:
1) Describe the step by step reasoning to find the answer.
2) Provide the direct answer to the user question.
<</SYS>>


*** User Prompt ***
Q: Llama 2 has a context window of 4096 tokens. If we are reserving 512 of them for the LLM response, the system prompt uses 124, the chain of thought trigger uses only 11, and finally the conversational history uses 390, how many can we use for the user prompt?
*** Full Messages ***
[{'role': 'system', 'content': "<<SYS>>\nAnswer the user's question using the following format:\n1) Describe the step by step reasoning to find the answer.\n2) Provide the direct answer to the user question.\n<</SYS>>\n\n"}, {'role': 'user', 'content': 'Q: Llama 2 has a context window of 4096 tokens. If we are reserving 512 of them for the LLM response, the system prompt uses 124, the chain of thought trigger uses only 11, and finally the conversational history uses 390, how many can 

In [None]:
3584 - (124 + 11 + 390)

3059

### User prompt + cot trigger

In [None]:
r4 = query_model(user_prompt, trigger=COT_TRIGGER)

*** System Prompt ***
You are a helpful assistant.
*** User Prompt ***
Q: Llama 2 has a context window of 4096 tokens. If we are reserving 512 of them for the LLM response, the system prompt uses 124, the chain of thought trigger uses only 11, and finally the conversational history uses 390, how many can we use for the user prompt?
*** Full Messages ***
[{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'Q: Llama 2 has a context window of 4096 tokens. If we are reserving 512 of them for the LLM response, the system prompt uses 124, the chain of thought trigger uses only 11, and finally the conversational history uses 390, how many can we use for the user prompt?'}]
*** Generation ***
To find out how many tokens can be used for the user prompt, we need to subtract the tokens used by the system prompt, chain of thought trigger, conversational history, and LLM response from the total context window.

Total context window: 4096 tokens
LLM response: 

### User prompt + "A:" trigger

In [None]:
r5 = query_model(user_prompt, trigger=A_TRIGGER)

*** System Prompt ***
You are a helpful assistant.
*** User Prompt ***
Q: Llama 2 has a context window of 4096 tokens. If we are reserving 512 of them for the LLM response, the system prompt uses 124, the chain of thought trigger uses only 11, and finally the conversational history uses 390, how many can we use for the user prompt?
*** Full Messages ***
[{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'Q: Llama 2 has a context window of 4096 tokens. If we are reserving 512 of them for the LLM response, the system prompt uses 124, the chain of thought trigger uses only 11, and finally the conversational history uses 390, how many can we use for the user prompt?'}]
*** Generation ***
To find out how many tokens can be used for the user prompt, we need to subtract the tokens used by the system prompt, chain of thought trigger, conversational history, and LLM response from the total context window.

Total context window: 4096 tokens
LLM response: 