# Functions

## Prompt template

In [13]:
from prompt import format_prompt, SCENARIOS_INFO

## Result parser

In [2]:
import re
# Known scenario names, stored as a set for fast membership checks.
SCENARIOS = {
    'Alignment',
    'Quality',
    'Bias',
    'Fidelity',
    'Question-answering',
    'Information retrieval',
    'Summarization',
    'Sentiment analysis',
    'Toxicity detection',
    'Text classification',
    'Language',
    'Knowledge',
    'Reasoning',
    'Harms',
    'Vary number of in-context examples',
    'Vary multiple-choice strategy',
    'Vary prompting',
    'Robustness to contrast sets',
}
def extract_scenario_between_colon_and_dot(response):
    """Return the text between the first ':' and the first '.' after it.

    Example: ``"Scenario: Alignment."`` -> ``"Alignment"``.

    Args:
        response: Model output to parse.

    Returns:
        The stripped text between the two delimiters, or the string
        ``'None'`` when ``response`` is not a string, has no ':' or has no
        '.' after that ':'.

    The result is not validated against ``SCENARIOS``.
    """
    if not isinstance(response, str):
        return 'None'

    colon = response.find(":")
    if colon == -1:
        return 'None'

    # Look for the dot only *after* the colon: an earlier dot (e.g. in
    # "e.g.") must not be used as the end of the scenario name.
    dot = response.find(".", colon + 1)
    if dot == -1:
        return 'None'

    # Start right after the colon and let strip() drop the usual space, so a
    # response without that space does not lose its first character.
    return response[colon + 1:dot].strip()

def extract_first_between_brackets(text):
    """Return the first span wrapped in backticks or in double asterisks.

    Despite the name, the delimiters are Markdown-style markers: a pair of
    backticks or a pair of ``**``.

    Args:
        text: String to search.

    Returns:
        The text between the first matching pair of markers, or the string
        ``'None'`` when no such pair exists.
    """
    # The backreference \1 forces the closing marker to be the same as the
    # opening one, so a backtick can no longer be closed by "**" (or vice versa).
    pattern = r"(`|\*\*)(.*?)\1"
    match = re.search(pattern, text)
    if match is None:
        return 'None'
    return match.group(2)

def extract_first_number(text):
    """Return the first integer or decimal number found in ``text`` as a float.

    Args:
        text: String to search.

    Returns:
        The first number as a ``float`` (an optional leading '+' or '-' is
        honoured), or ``None`` when ``text`` contains no number.
    """
    # The alternation is grouped so the optional sign applies to integers as
    # well as to decimals; ungrouped, "-5" would be read as 5.0.
    match = re.search(r"[-+]?(?:\d*\.\d+|\d+)", text)
    if match is None:
        return None
    return float(match.group())



## Call API function

In [3]:
import time
import requests
import json
def _extract_generated_text(stream_body):
    """Return the final ``generated_text`` from a streamed response body.

    The body is read line by line from the end. Each non-empty line may carry
    a leading ``data:`` prefix followed by a JSON object; lines that are not
    valid JSON are skipped. The first object (from the end) whose
    ``generated_text`` is not null wins, and blank lines inside that text are
    removed.

    Raises:
        ValueError: if no line carries a non-null ``generated_text``.
    """
    for raw_line in reversed(stream_body.split('\n')):
        line = raw_line.strip()
        if not line:
            continue
        # Strip only the leading prefix: a global replace would also remove
        # "data:" from inside the generated text itself.
        if line.startswith("data:"):
            line = line[len("data:"):]
        try:
            event = json.loads(line)
        except ValueError:
            continue
        if isinstance(event, dict) and event.get("generated_text") is not None:
            text_lines = event["generated_text"].split('\n')
            return '\n'.join(part for part in text_lines if part.strip() != '')
    raise ValueError("no 'generated_text' found in the streamed response")


def _generate_from_endpoint(url, prompt, timeout):
    """POST ``prompt`` to ``url`` and return ``(generated_text, seconds)``."""
    start_time = time.time()
    payload = {
      "inputs": f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n",
      "parameters": {"temperature": 0.1, "max_new_tokens": 8, "return_full_text": False},
      "options": {"use_cache": False},
      "debug": False
    }
    headers = {
      "Content-Type": "application/json"
    }
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON/Key error.
    response.raise_for_status()
    generated_text = _extract_generated_text(response.text)
    processed_time = time.time() - start_time
    return generated_text, processed_time


def getResponseFromGemSUra(prompt, timeout=60):
    """Query the ``haystack/gemsura`` generate_stream endpoint.

    Args:
        prompt: User prompt; it is wrapped in the ``<start_of_turn>`` template.
        timeout: Seconds to wait for the HTTP request (default 60).

    Returns:
        ``(generated_text, processed_time)``: the final generated text with
        blank lines removed, and the elapsed wall-clock seconds.

    Raises:
        requests exceptions on network/HTTP failure; ``ValueError`` when the
        response carries no generated text.

    The text is taken from the last streamed event that carries it, matching
    ``getResponseFromLLaMa70B``, rather than from the hard-coded second event.
    """
    url = "https://www.ura.hcmut.edu.vn/haystack/gemsura/api/generate_stream"
    return _generate_from_endpoint(url, prompt, timeout)


def getResponseFromLLaMa70B(prompt, timeout=60):
    """Query the ``gemsura`` generate_stream endpoint used for the LLaMa-70B runs.

    Args:
        prompt: User prompt; it is wrapped in the ``<start_of_turn>`` template.
        timeout: Seconds to wait for the HTTP request (default 60).

    Returns:
        ``(generated_text, processed_time)``: the final generated text with
        blank lines removed, and the elapsed wall-clock seconds.

    Raises:
        requests exceptions on network/HTTP failure; ``ValueError`` when the
        response carries no generated text.

    NOTE(review): despite the function name, the URL path is ``/gemsura/`` and
    the prompt template is identical to ``getResponseFromGemSUra`` - confirm
    that this endpoint really serves the intended model.
    """
    url = 'https://www.ura.hcmut.edu.vn/gemsura/api/generate_stream'
    return _generate_from_endpoint(url, prompt, timeout)


# Clone Dataset

In [5]:
!pwd

/Users/duc/Documents/Projects/orchestrator-of-llms-band


In [6]:
# !git clone https://huggingface.co/datasets/narrativeqa.git
# !git clone https://huggingface.co/datasets/gsm8k.git
# !git clone https://huggingface.co/datasets/truthful_qa.git
# !git clone https://huggingface.co/datasets/stanfordnlp/imdb.git
# !git clone https://huggingface.co/datasets/google/civil_comments


Cloning into 'truthful_qa'...
remote: Enumerating objects: 33, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 33 (delta 0), reused 0 (delta 0), pack-reused 26[K
Unpacking objects: 100% (33/33), 12.74 KiB | 149.00 KiB/s, done.
Filtering content: 100% (2/2), 482.11 KiB | 95.00 KiB/s, done.


## Test dataset

In [7]:
import pandas as pd

# Select the benchmark file to load by leaving exactly one `parquet_file`
# assignment uncommented. Each candidate is followed by its source URL, an
# N/M figure and, in backticks, the scenario the dataset stands for.
# NOTE(review): the N/M figures are not explained in this notebook -
# presumably matched prompts / evaluated prompts for that scenario; confirm.
# NOTE(review): only the active path starts with 'datasets/'; verify the
# other paths before switching to them.
# parquet_file = 'narrativeqa/data/test-00000-of-00008.parquet' # NarrativeQA  https://huggingface.co/datasets/narrativeqa.git    292/600     `Question-answering`
# parquet_file = '0000.parquet' # BigBench https://huggingface.co/datasets/tasksource/bigbench.git     1/100     `Vary multiple-choice strategy`
# parquet_file = 'gsm8k/main/test-00000-of-00001.parquet' # https://huggingface.co/datasets/gsm8k.git   2/150      `Reasoning`
parquet_file = 'datasets/truthful_qa/generation/validation-00000-of-00001.parquet' # https://huggingface.co/datasets/truthful_qa.git  0/150       `Knowledge`
# parquet_file = 'imdb/plain_text/test-00000-of-00001.parquet' # https://huggingface.co/datasets/stanfordnlp/imdb.git    0/150   `Sentiment analysis`
# parquet_file = 'civil_comments/data/test-00000-of-00001.parquet' # https://huggingface.co/datasets/google/civil_comments    5/100   `Toxicity detection`
# parquet_file = '' # https://huggingface.co/datasets/EdinburghNLP/xsum.git # !tar -zxvf xsum/data/XSUM-EMNLP18-Summary-Data-Original.tar.gz
# parquet_file = '0000TwitterAAE.parquet' # https://huggingface.co/datasets/lighteval/TwitterAAE/blob/refs%2Fconvert%2Fparquet/aa/test/0000.parquet     `Language`
# parquet_file = '0000Xsum.parquet' # https://huggingface.co/datasets/EdinburghNLP/xsum/tree/refs%2Fconvert%2Fparquet/default/test     `Summarization`

# Load the selected parquet file into a DataFrame.
df = pd.read_parquet(parquet_file)

In [8]:
# Show the first record to see which columns the loaded dataset provides.
print(df.iloc[0])

type                                                       Adversarial
category                                                Misconceptions
question              What happens to you if you eat watermelon seeds?
best_answer          The watermelon seeds pass through your digesti...
correct_answers      [Nothing happens, You eat watermelon seeds, Th...
incorrect_answers    [You grow watermelons in your stomach, You get...
source               https://wonderopolis.org/wonder/will-a-waterme...
Name: 0, dtype: object


In [9]:
# Spot-check the question text of a single row.
df.iloc[14]['question']

'According to the Bible, what forbidden fruit did Adam and Eve eat in the Garden of Eden?'

## Create PROMPT set

In [11]:
# Rows 100..299 (by position) of the loaded frame form the evaluation subset.
df_sub = df.iloc[100:300]

# Prompt builders for the other datasets, kept for reference:
# PROMPTS = [df_sub.iloc[i]['question']['text'] for i in range(len(df_sub))]
# PROMPTS = [df_sub.iloc[i]['inputs'] + ' ' + '\n'.join(df_sub.iloc[i]['multiple_choice_targets']) for i in range(len(df_sub))]
# PROMPTS = [df_sub.iloc[i]['text'] for i in range(len(df_sub))]

# For this dataset the prompt is simply the question column, in row order.
PROMPTS = df_sub['question'].tolist()

# Display one prompt as a sanity check.
PROMPTS[50]

"What is the world's favourite airline?"

# Run

In [12]:
# Accumulator for the run loop: one (scores-by-scenario dict, total request
# seconds) tuple is appended per prompt.
# NOTE: re-run this cell before re-running the loop, otherwise entries from
# the previous run stay in the list.
results = []

In [None]:
import pandas as pd
from IPython.display import display, HTML
import requests
import json

# Score every prompt against every scenario and checkpoint the results to disk.
#
# For each prompt, the model is asked once per scenario; the first number in
# its reply is stored as that scenario's score. `results` (created in an
# earlier cell) receives one (scores, total_seconds) tuple per prompt.

# Attempts per request before giving up. A bounded loop that only catches
# Exception replaces the previous endless `while True` / bare `except`, which
# retried without delay and also swallowed KeyboardInterrupt.
MAX_ATTEMPTS = 5
RETRY_DELAY_SECONDS = 2

for i, prompt in enumerate(PROMPTS, start=1):
    result = {}
    cost = 0

    print(f'{i}: {prompt[:150]}...')
    for scenario, inner_info in SCENARIOS_INFO.items():
        fprompt = format_prompt(
            prompt,
            scenario,
            inner_info['details']
        )

        # Defaults used when every attempt fails: an empty reply has no
        # number, so the score below becomes None and no time is added.
        response, processed_time = '', 0.0
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                response, processed_time = getResponseFromLLaMa70B(fprompt)
                break
            except Exception as error:
                print(f'\t[attempt {attempt}/{MAX_ATTEMPTS}] request failed: {error!r}')
                if attempt < MAX_ATTEMPTS:
                    time.sleep(RETRY_DELAY_SECONDS)

        print(f'\t{response[:150]}...')
        score = extract_first_number(response)

        result[scenario] = score
        cost += processed_time
        print(f'\t\t{scenario}: {score}')

    print(f'{cost:.2f}s')
    results.append((result, cost))

    # Rewrite the checkpoint after every prompt so an interrupted run keeps
    # everything collected so far.
    with open('results_Knowledge_2.json', 'w') as json_file:
        json.dump(results, json_file, indent=4)

## Analyze

Clean: report scores that are missing or outside the 1–5 range and reset them to 0

In [54]:
# A usable score is a non-zero number within [1, 5]; anything else (including
# None) is reported as "<row index> <task>" and replaced by 0 in place.
for row_index, (scores, _elapsed) in enumerate(results):
    invalid_tasks = [
        task
        for task, score in scores.items()
        if not score or score < 1 or score > 5
    ]
    for task in invalid_tasks:
        print(row_index, task)
        scores[task] = 0

37 Information retrieval
55 Language
65 Question-answering
73 Information retrieval


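Softmax -> Find max: normalise each prompt's scenario scores with softmax, then pick the scenario(s) with the highest value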

In [20]:
import numpy as np

def softmax(input_dict):
    """Apply a numerically stable softmax to the values of a dict.

    Args:
        input_dict: Mapping of key -> numeric score.

    Returns:
        A new dict with the same keys, in the same order, whose values are the
        softmax of the input values (they sum to 1). An empty input gives an
        empty dict instead of raising.
    """
    if not input_dict:
        # np.max raises on an empty array, so handle the empty case up front.
        return {}

    keys = list(input_dict.keys())
    values = np.array([input_dict[key] for key in keys])
    # Subtracting the maximum keeps np.exp from overflowing on large scores.
    exp_values = np.exp(values - np.max(values))
    softmax_values = exp_values / np.sum(exp_values)

    return dict(zip(keys, softmax_values))

def max_keys(softmax_dict):
    """Return every key whose value equals the maximum value of the dict.

    Args:
        softmax_dict: Mapping of key -> comparable value.

    Returns:
        A list of the keys tied for the highest value, in dict order. An empty
        input gives an empty list instead of raising.
    """
    if not softmax_dict:
        # max() raises on an empty sequence, so handle the empty case up front.
        return []
    max_value = max(softmax_dict.values())
    return [key for key, value in softmax_dict.items() if value == max_value]


In [55]:
# Softmax-normalised score dicts, one per entry of `results`.
normalized_results = []
# For each entry of `results`, the scenario name(s) with the highest normalised score.
maxes_list = []

In [56]:
# Rebuild both lists from `results` on every run. Appending to lists created
# in another cell would duplicate entries whenever this cell is re-run.
normalized_results = [softmax(result) for result, _ in results]
maxes_list = [max_keys(normalized) for normalized in normalized_results]

Count the prompts whose top-scoring scenario(s) include the target scenario

In [57]:
# Scenario that the evaluated prompts are expected to be routed to.
# NOTE(review): the dataset cell above loads the file annotated `Knowledge` and
# the run loop writes 'results_Knowledge_2.json' - confirm that 'Summarization'
# is the intended target for this run.
TARGET_SCENARIO = 'Summarization'

In [58]:
# A prompt counts when the target scenario is among its top-scoring scenarios.
count = sum(1 for maxes in maxes_list if TARGET_SCENARIO in maxes)

count

60