In [20]:
import os
import json
import requests
import concurrent
from tqdm import tqdm
import numpy as np


import torch
import textgrad as tg
from textgrad.tasks import load_task


from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)


from dotenv import load_dotenv

load_dotenv(override=True)

True

In [21]:
# Endpoint configuration for the model under test.
# Both values below are placeholders and must be replaced with a real
# chat-completions URL and the model identifier that endpoint expects.

# LLAMA 3.1 8B
endpoint_url = "llama_3.1_8B_endpoint_url"
endpoint_model_name = "meta/llama-3.1-8b-instruct"


# LLAMA 3.1 70B (alternative; uncomment to evaluate the larger model instead)
# endpoint_url = "llama_3.1_70B_endpoint_url"
# endpoint_model_name = "meta/llama-3.1-70b-instruct"

# NOTE(review): this placeholder is a *set* literal (a single string inside
# braces), not a dict. It is passed as `headers=` to `requests.post`, which
# expects a mapping of header names to values -- replace it with a real dict
# before running.
headers = {"insert headers here"}


def get_response_from_endpoint(endpoint_url, endpoint_model_name, headers, messages):
    """Send one non-streaming chat request and return the reply text.

    Args:
        endpoint_url: URL the JSON payload is POSTed to.
        endpoint_model_name: Value sent in the payload's "model" field.
        headers: HTTP headers forwarded to ``requests.post``.
        messages: Chat messages sent in the payload's "messages" field.

    Returns:
        The ``content`` of the first choice's message, or ``None`` when the
        response carries no choices, no message, or no content.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    payload = {
        "model": endpoint_model_name,
        "messages": messages,
        "max_tokens": 2048,
        "stream": False,
    }
    response = requests.post(endpoint_url, headers=headers, json=payload)
    response_json = json.loads(response.text)

    # `.get("choices", [{}])[0]` only covers a *missing* key: an empty list
    # raised IndexError and a null value raised TypeError. Treat all of these
    # the same way as a missing key, i.e. "no content".
    choices = response_json.get("choices") or [{}]
    message = choices[0].get("message") or {}
    return message.get("content")

In [22]:
import platformdirs
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

from textgrad.engine.base import EngineLM, CachedEngine


class LLamaModel(EngineLM, CachedEngine):
    """textgrad engine that queries a chat endpoint over HTTP and caches replies.

    Replies are looked up and stored through ``_check_cache`` / ``_save_cache``
    under the key ``system_prompt + prompt``; the cache file lives under the
    ``textgrad`` user cache directory.
    """

    DEFAULT_SYSTEM_PROMPT = "You are a helpful, creative, and smart assistant."

    def __init__(
        self,
        endpoint_url,
        headers,
        endpoint_model_name="meta/llama-3.1-70b-instruct",
        system_prompt=DEFAULT_SYSTEM_PROMPT,
    ):
        """
        :param endpoint_url: URL that chat requests are POSTed to.
        :param headers: HTTP headers sent with every request.
        :param endpoint_model_name: value of the "model" field in the request
            payload; also embedded in the cache file name.
        :param system_prompt: system prompt used when ``generate`` is called
            without one.
        :raises ValueError: if the API_KEY environment variable is not set.
        """
        # Validate configuration first so a misconfigured environment fails
        # before the on-disk cache is created.
        if os.getenv("API_KEY") is None:
            raise ValueError(
                "Please set the API_KEY environment variable if you'd like to use the LLama model."
            )

        root = platformdirs.user_cache_dir("textgrad")
        cache_path = os.path.join(root, f"cache_llama_{endpoint_model_name}.db")
        super().__init__(cache_path=cache_path)

        self.system_prompt = system_prompt
        self.endpoint_url = endpoint_url
        self.headers = headers
        self.endpoint_model_name = endpoint_model_name

    def generate(
        self,
        prompt,
        system_prompt=None,
    ):
        """Return the model's reply to ``prompt``, using the cache when possible.

        :param prompt: user message content.
        :param system_prompt: optional override for the instance system prompt;
            a falsy value falls back to ``self.system_prompt``.
        :return: the first choice's message content, or ``None`` if the
            response has no message content.
        :raises requests.HTTPError: if the endpoint answers with a 4xx/5xx
            status.
        """
        sys_prompt_arg = system_prompt if system_prompt else self.system_prompt
        cache_key = sys_prompt_arg + prompt

        cache_or_none = self._check_cache(cache_key)
        if cache_or_none is not None:
            return cache_or_none

        json_data = {
            "model": self.endpoint_model_name,
            "messages": [
                {"role": "system", "content": sys_prompt_arg},
                {"role": "user", "content": prompt},
            ],
            "max_tokens": 2048,
            "stream": False,
        }
        http_response = requests.post(
            self.endpoint_url, headers=self.headers, json=json_data
        )
        # Without this, an error status with a JSON body (rate limit, bad
        # credentials, ...) was parsed into a silent `None` reply, so the retry
        # wrapper on __call__ never saw a failure to retry.
        http_response.raise_for_status()

        response_json = json.loads(http_response.text)
        content = (
            response_json.get("choices", [{}])[0].get("message", {}).get("content")
        )
        # A cached None is indistinguishable from a cache miss on lookup, so
        # storing it only adds a useless entry.
        if content is not None:
            self._save_cache(cache_key, content)
        return content

    @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(5))
    def __call__(self, prompt, **kwargs):
        """Call ``generate`` with randomized exponential backoff, up to 5 attempts."""
        return self.generate(prompt, **kwargs)

**Requirements:**

* You need an API key to run this notebook. Set it in the `API_KEY` environment variable (for example in a `.env` file, which `load_dotenv` reads in the first cell).


In [23]:
# Engine handed to `load_task` as `evaluation_api`. The URL is a placeholder
# and must be replaced with a real endpoint; `headers` comes from the
# endpoint configuration cell above.
eval_endpoint_url = "eval_endpoint_url"
eval_endpoint_model_name = "meta/llama-3.1-70b-instruct"
llm_api_eval = LLamaModel(
    endpoint_url=eval_endpoint_url,
    headers=headers,
    endpoint_model_name=eval_endpoint_model_name,
)

# Alternative: use an engine resolved by textgrad instead of LLamaModel.
# llm_api_eval = tg.get_engine(engine_name="")

# Alternative task: BBH object counting.
# train_set, val_set, test_set, eval_fn = load_task(
#     "BBH_object_counting", evaluation_api=llm_api_eval
# )

# Active task: BBH word sorting.
train_set, val_set, test_set, eval_fn = load_task(
    "BBH_word_sorting", evaluation_api=llm_api_eval
)

# Alternative task: GSM8K (DSPy split).
# train_set, val_set, test_set, eval_fn = load_task(
#     "GSM8K_DSPy", evaluation_api=llm_api_eval
# )

print("Train/Val/Test Set Lengths: ", len(train_set), len(val_set), len(test_set))
# The task description is used as the system prompt for every later
# evaluation cell (via the module-level `prompt`).
STARTING_SYSTEM_PROMPT = train_set.get_task_description()
prompt = STARTING_SYSTEM_PROMPT
print(STARTING_SYSTEM_PROMPT)

Train/Val/Test Set Lengths:  50 100 100
You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.


In [24]:
# Take the first validation item as a smoke-test example: element 0 is the
# question text, element 1 the expected answer.
input_str = val_set[0][0]
label_str = val_set[0][1]
input_str, label_str

('Sort the following words alphabetically: List: hat core sonnet discreet',
 'core discreet hat sonnet')

### Prepare model

In [25]:
def extract_prediction(text, model_name):
    """Pull the assistant's reply out of decoded model output.

    For Llama-family models the text between the first assistant header and
    the following ``<|eot_id|>`` is returned, stripped of surrounding
    whitespace. For any other model, or when no assistant header is present,
    ``text`` is returned unchanged.

    Args:
        text: Decoded model output, possibly still containing chat-template
            special tokens.
        model_name: Model identifier. Matched case-insensitively against
            "llama"; ``None`` is treated as "not a Llama model".

    Returns:
        The extracted reply, or ``text`` itself when nothing is extracted.
    """
    assistant_header = "<|start_header_id|>assistant<|end_header_id|>"
    end_of_turn = "<|eot_id|>"

    # `"llama" in model_name` raised TypeError for model_name=None and missed
    # names that only contain the capitalised form (e.g. "Llama-3.2-1B-Instruct").
    is_llama = model_name is not None and "llama" in model_name.lower()
    if not is_llama:
        return text

    segments = text.split(assistant_header)
    if len(segments) <= 1:
        return text
    return segments[1].split(end_of_turn)[0].strip()

In [26]:
def prepare_model(model_name):
    """Load the tokenizer and causal language model for ``model_name``.

    The tokenizer's pad token is set to its EOS token, and the model config's
    ``pad_token_id`` is set to the tokenizer's resulting pad id. The model is
    loaded in bfloat16 with ``trust_remote_code`` and ``offload_buffers``
    enabled.

    Args:
        model_name: Identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    # Must happen before the pad id is copied onto the model config below.
    tok.pad_token = tok.eos_token

    load_options = {
        "torch_dtype": torch.bfloat16,
        "trust_remote_code": True,
        "offload_buffers": True,
    }
    lm = AutoModelForCausalLM.from_pretrained(model_name, **load_options)
    lm.config.pad_token_id = tok.pad_token_id

    return tok, lm

### Call LLM

In [27]:
def call_LLM(
    messages,
    model,
    tokenizer,
    model_name,
    device="cuda",
    max_new_tokens=500,
):
    """Greedily generate a completion for one already-templated prompt.

    Args:
        messages: Prompt text passed to ``tokenizer.encode``. No special
            tokens are added here, so it should already be chat-templated
            (e.g. the string returned by ``prepare_input`` when a tokenizer
            is given).
        model: Model exposing ``to(device)`` and ``generate(...)``.
        tokenizer: Tokenizer exposing ``encode``, ``batch_decode`` and
            ``pad_token_id``.
        model_name: Forwarded to ``extract_prediction`` to choose how the
            reply is extracted.
        device: Device the inputs and the model are moved to.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        ``extract_prediction`` applied to the first decoded sequence.
    """
    encodeds = tokenizer.encode(
        messages, add_special_tokens=False, return_tensors="pt", padding=True
    )
    # Only one prompt is encoded per call, so there are no padding positions
    # to hide. Deriving the mask from `!= pad_token_id` masked out real prompt
    # tokens whenever the pad id doubles as a content token -- which is the
    # case here, since `prepare_model` sets the pad token to the EOS token.
    attention_mask = torch.ones_like(encodeds)

    model_inputs = encodeds.to(device)
    attention_mask = attention_mask.to(device)
    model.to(device)

    generated_ids = model.generate(
        model_inputs,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        # Greedy decoding; the sampling knobs are explicitly unset.
        do_sample=False,
        top_p=None,
        temperature=None,
        pad_token_id=tokenizer.pad_token_id,
    )

    decoded = tokenizer.batch_decode(generated_ids)
    return extract_prediction(decoded[0], model_name)

In [28]:
# model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# # model_name = "meta-llama/Llama-3.2-1B-Instruct"
# tokenizer_llm_call, model_llm_call = prepare_model(model_name)

### Eval zero-shot

In [29]:
def prepare_input(input_str, system_prompt, tokenizer=None):
    """Build the chat request for one question.

    Args:
        input_str: The question; embedded as ``"Input: {input_str}"`` in the
            user message.
        system_prompt: Content of the system message.
        tokenizer: Optional tokenizer. When given, the messages are rendered
            to a single string with ``apply_chat_template(tokenize=False)``.

    Returns:
        The list of message dicts when ``tokenizer`` is ``None`` (the shape
        sent to the HTTP endpoint), otherwise the templated prompt string.
    """
    # NOTE(review): the template is applied without `add_generation_prompt`;
    # confirm that is intended for local generation.
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Input: {input_str}"},
    ]
    if tokenizer is None:
        return chat
    return tokenizer.apply_chat_template(chat, tokenize=False)

In [31]:
# text_request = prepare_input(input_str, prompt, tokenizer_llm_call)
# output = call_LLM(text_request, model_llm_call, tokenizer_llm_call, model_name)
# print(output)

In [32]:
# Smoke test: send the sample question to the remote endpoint with the task
# description as system prompt. No tokenizer is passed, so `text_request` is
# the list-of-messages form rather than a templated string.
text_request = prepare_input(input_str, prompt)
output = get_response_from_endpoint(
    endpoint_url, endpoint_model_name, headers, text_request
)
print(output)

To sort the words alphabetically, I will follow these steps:

1. Compare the first letter of each word:
- hat: h
- core: c
- discreet: d
- sonnet: s

From left to right, the order is: c, d, h, s

2. If two or more words have the same first letter, compare the second letter:
- core: o
- discreet: i
- hat: a
- sonnet: o

From left to right, the order is still: c, d, h, s, but with i and a as additional comparisons.

3. If the second letters are also the same, compare the third letter:
- discreet: c
- hat: t

So, discreet comes before hat.

My comparison results in: c, d, d (discreet is before discreet because of its spelling), h, s

4. Now considering the unique combination, the spelling of discreet and its proper placement: c, d (discreet), discrete (which is the correct spelling for what was suggested - discreet but this is not the actual combination for which we were looking - rather disjoint), h, s.

Using my errors for reconsideration, my final ordering using the list provided----wh

In [33]:
# llm_api_test = tg.get_engine(engine_name="")
# # llm_api_test = llm_api_eval

# system_prompt = tg.Variable(prompt,
#                             requires_grad=True,
#                             role_description="structured system prompt to a somewhat capable language model that specifies the behavior and strategies for the QA task")
# model = tg.BlackboxLLM(llm_api_test, system_prompt)

In [34]:
# type(model)==tg.model.BlackboxLLM, type(model_llm_call)==transformers.models.llama.modeling_llama.LlamaForCausalLM

In [35]:
def eval_sample(
    item,
    eval_fn,
    model,
    prompt=None,
    tokenizer=None,
    model_name=None,
    use_endpoint=False,
):
    """Answer one dataset item and score the answer with ``eval_fn``.

    The answer comes from one of three sources:

    * ``use_endpoint=True``: the remote endpoint configured by the
      module-level ``endpoint_url``, ``endpoint_model_name`` and ``headers``;
    * ``model`` is a ``tg.model.BlackboxLLM`` (or subclass): ``model(x)``;
    * otherwise: a local model driven through ``call_LLM`` with ``tokenizer``
      and ``model_name``.

    Args:
        item: ``(question, answer)`` pair.
        eval_fn: Evaluation function. It is first called as
            ``eval_fn(inputs=dict(prediction=..., ground_truth_answer=...))``;
            if that raises, it is called as ``eval_fn([x, y, response])`` and
            the result is passed through ``eval_fn.parse_output``.
        model: Model used when ``use_endpoint`` is false; may be ``None``
            otherwise.
        prompt: System prompt for the endpoint / local-model paths.
        tokenizer: Tokenizer for the local-model path.
        model_name: Model identifier for the local-model path.
        use_endpoint: Query the remote endpoint instead of ``model``.

    Returns:
        The evaluation result converted to ``int``.
    """
    x, y = item
    x = tg.Variable(
        x, requires_grad=False, role_description="query to the language model"
    )
    y = tg.Variable(
        str(y), requires_grad=False, role_description="correct answer for the query"
    )

    if use_endpoint:
        text_request = prepare_input(x, prompt)
        response = get_response_from_endpoint(
            endpoint_url, endpoint_model_name, headers, text_request
        )
        response = tg.Variable(
            response,
            requires_grad=False,
            role_description="response from the language model",
        )
    elif isinstance(model, tg.model.BlackboxLLM):
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of BlackboxLLM.
        response = model(x)
    else:
        text_request = prepare_input(x, prompt, tokenizer)
        response = call_LLM(text_request, model, tokenizer, model_name)
        response = tg.Variable(
            response,
            requires_grad=False,
            role_description="response from the language model",
        )

    try:
        eval_output_variable = eval_fn(
            inputs=dict(prediction=response, ground_truth_answer=y)
        )
        return int(eval_output_variable.value)
    except Exception:
        # Fall back to the positional calling convention. Catching Exception
        # instead of using a bare `except:` lets KeyboardInterrupt and
        # SystemExit propagate instead of triggering a second evaluation.
        eval_output_variable = eval_fn([x, y, response])
        eval_output_parsed = eval_fn.parse_output(eval_output_variable)
        return int(eval_output_parsed)

In [36]:
# Single-sample sanity check. The two commented calls are the local-model and
# BlackboxLLM variants; the active call scores the first validation item via
# the remote endpoint.
# eval_sample(val_set[0], eval_fn, model_llm_call, prompt, tokenizer_llm_call, model_name)
# eval_sample(val_set[5], eval_fn, model)
eval_sample(val_set[0], eval_fn, model=None, prompt=prompt, use_endpoint=True)

1

In [37]:
def eval_dataset(
    test_set,
    eval_fn,
    model=None,
    prompt=None,
    tokenizer=None,
    model_name=None,
    max_samples: int = None,
):
    """Score up to ``max_samples`` items of ``test_set`` concurrently.

    Each item is evaluated with ``eval_sample`` on a pool of 12 worker
    threads. When ``model`` is ``None`` the remote endpoint is used;
    otherwise ``model``, ``tokenizer`` and ``model_name`` are forwarded so the
    given model is evaluated.

    Args:
        test_set: Iterable dataset of ``(question, answer)`` items supporting
            ``len()``.
        eval_fn: Evaluation function forwarded to ``eval_sample``.
        model: Model to evaluate, or ``None`` to query the remote endpoint.
        prompt: System prompt forwarded to ``eval_sample``.
        tokenizer: Tokenizer forwarded to ``eval_sample``.
        model_name: Model identifier forwarded to ``eval_sample``.
        max_samples: Maximum number of items to evaluate; ``None`` means the
            whole dataset.

    Returns:
        List of per-item scores in completion order (not dataset order).
    """
    # A plain `import concurrent` does not guarantee that the `futures`
    # submodule has been loaded; import it explicitly.
    import concurrent.futures
    from itertools import islice

    if max_samples is None:
        max_samples = len(test_set)
    # The previous submit-then-check loop always evaluated at least one item,
    # even for max_samples=0.
    max_samples = max(max_samples, 0)

    # `model`, `tokenizer` and `model_name` used to be accepted but ignored
    # (the endpoint was always used). Only fall back to the endpoint when no
    # model is supplied.
    use_endpoint = model is None

    accuracy_list = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=12) as executor:
        futures = [
            executor.submit(
                eval_sample,
                sample,
                eval_fn,
                model=model,
                prompt=prompt,
                tokenizer=tokenizer,
                model_name=model_name,
                use_endpoint=use_endpoint,
            )
            for sample in islice(test_set, max_samples)
        ]
        tqdm_loader = tqdm(
            concurrent.futures.as_completed(futures), total=len(futures), position=0
        )
        for future in tqdm_loader:
            accuracy_list.append(future.result())
            # Running mean shown in the progress bar.
            tqdm_loader.set_description(f"Accuracy: {np.mean(accuracy_list)}")
    return accuracy_list

In [None]:
# Zero-shot baseline: evaluate the unmodified task prompt on the test set via
# the remote endpoint and record the per-sample scores.
results = {"test_acc": [], "prompt": [], "validation_acc": []}
results["test_acc"].append(eval_dataset(test_set, eval_fn, model=None, prompt=prompt))

# Alternative runs using a BlackboxLLM `model`.
# # results["test_acc"].append(eval_dataset(test_set, eval_fn, model))
# # results["validation_acc"].append(eval_dataset(val_set, eval_fn, model))

# Alternative runs using the locally loaded model.
# results["test_acc"].append(eval_dataset(test_set, eval_fn, model_llm_call, prompt, tokenizer_llm_call, model_name))
# results["validation_acc"].append(eval_dataset(val_set, eval_fn, model_llm_call, prompt, tokenizer_llm_call, model_name))