<a href="https://colab.research.google.com/github/enochcluk/Climate-Injustice-Hurricane-Harvey/blob/master/Together_spatial_tasks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install together
from together import Together


In [None]:
model_dict = {
    "Google Gemma 7B IT": "google/gemma-7b-it",
    "Google Gemma 2B IT": "google/gemma-2b-it",
    "Meta LLaMA-3 Chat (8B)": "meta-llama/Llama-3-8b-chat-hf",
    "Meta LLaMA-3 Chat (70B)": "meta-llama/Llama-3-70b-chat-hf",
    "Mixtral-8x22B Instruct (141B)": "mistralai/Mixtral-8x22B-Instruct-v0.1"
}

In [7]:
q1 = "You have been given a circular path consisting of 12 connected dots. At the start, you are positioned on the dot that is located at the top of the path, where you find a crash helmet. Moving in a clockwise direction from the crash helmet, the elements on the path are a trimaran, a sleeping bag, a dhole, a sulphur butterfly, a stove, a rotisserie, a stupa, a bagel, a gibbon, a fountain, and a Geoffroy's spider monkey. Starting from the rotisserie, you move around the ring by 11 steps in a counter-clockwise direction, and you move around the ring by 4 steps in a clockwise direction, and you move around the ring by 11 steps in a counter-clockwise direction, and you move around the ring by 7 steps in a counter-clockwise direction, and you move around the ring by 2 steps in a clockwise direction. What will you find?"

In [2]:
from google.colab import drive
drive.mount("/content/drive")

import re
import glob
import requests

import pandas as pd
pd.set_option("display.max_columns", None)
import numpy as np

from tqdm.notebook import tqdm

import transformers
import torch

datapath = "/content/drive/MyDrive/CS 159/data/raw/SpatialEvalLLM/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
api_key = "6c3bf809d032c5580344edbb1123cfbfd202520aeedfe946f70039c03f2a8d82"

class llama8b:
    def __init__(self, api_key):
        self.client = Together(api_key=api_key)

    def predict(self, prompt):
        response = self.client.chat.completions.create(
            model='meta-llama/Llama-3-8b-chat-hf',
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content

model = llama8b(api_key)

In [None]:
# class gemma2b: #how to make a language/code model client
#     def __init__(self, api_key):
#         self.client = Together(api_key=api_key)

#     def predict(self, prompt):
#         response = self.client.completions.create(
#             model='google/gemma-2b',
#             prompt=prompt,
#             max_tokens=1500
#         )
#         return response.choices[0].text

In [37]:
import glob
import pandas as pd
import re
from tqdm import tqdm

def extract_answer(response, use_code):
    # extract answer based on whether code is used
    if use_code:
        try:
            code_match = re.search(r"###code###(.*?)###endcode###", response, re.DOTALL)
            code_to_execute = code_match.group(1).strip()
            local_env = {}
            exec(code_to_execute, {}, local_env)
            return local_env.get("result", "No result produced")  # Assume the code defines 'result'
        except Exception as e:
            return str(e)
    else:
        ans = re.search(r"\*\*\*(.+?)\*\*\*", response)
        return ans.group(1).strip() if ans else "No answer found"

def evaluate_file(file, model, leading_prompt, trailing_prompt, use_code, store_intermediate=True):
    data_df = pd.read_json(file, lines=True)
    if store_intermediate:
        columns = ["Question", "Expected Answer", "Model Response", "Extracted Answer", "Is Correct"]
    else:
        columns = ["Is Correct"]

    results_df = pd.DataFrame(columns=columns)

    for index, row in data_df.iterrows():
        question = leading_prompt + row["question"] + trailing_prompt
        expected_answer = row["answer"].lower()  # ensure the answer is in lower case
        response = model.predict(question)
        extracted_answer = extract_answer(response, use_code)
        is_correct = extracted_answer.lower() == expected_answer

        if store_intermediate:
            results_df.loc[index] = [row["question"], expected_answer, response, extracted_answer, is_correct]
        else:
            results_df.loc[index] = [is_correct]

    return results_df

def evaluate_across_files(model, leading_prompt, data_path, which_files="all", use_code=False):
    map_global_files = glob.glob(data_path + "map_global/*") #describe the map top down
    map_local_files = glob.glob(data_path + "map_local/*") #don't

    if which_files == "all":
        these_files = map_global_files + map_local_files
    elif which_files == "local":
        these_files = map_local_files
    elif which_files == "global":
        these_files = map_global_files
    else:
        raise ValueError(f"which_files={which_files} not supported")

    # set appropriate trailing prompt
    if use_code:
        trailing_prompt = "Output a python script to find the answer using ###code### delimiters ###endcode###. No print statements. Set the variable 'result' to the answer, in lower case."
    else:
        trailing_prompt = "Place the answer, in lower case, no 'the' or 'a', in stars ***like this***."

    # evaluate each file and store the results
    scores_df = pd.DataFrame(columns=["Filename", "Correct", "Total", "Accuracy"])

    for file in tqdm(these_files, desc="Evaluating files"):
        result = evaluate_file(file, model, leading_prompt, trailing_prompt, use_code, False)
        num_correct = result["Is Correct"].sum()
        num_total = len(result)
        scores_df.loc[len(scores_df)] = [file, num_correct, num_total, num_correct / num_total]

    return scores_df


In [41]:
import time
leading_prompt = "Answer the following question. "
use_code = True
start = time.time()
if use_code:
        trailing_prompt = "Output a python script to find the answer using ###code### delimiters ###endcode###. No print statements. Set the variable 'result' to the answer, in lower case."
else:
    trailing_prompt = "Place the answer, in lower case, no 'the' or 'a', in stars ***like this***."

use_code = True

file_path = "/content/drive/MyDrive/CS 159/data/raw/SpatialEvalLLM/map_global/type-ring_size-12_steps-8_seed-12_n-100.jsonl"

results_df = evaluate_file(file_path, model, leading_prompt, trailing_prompt, use_code, store_intermediate=True)


print(results_df)
print(time.time() - start)

AttributeError: 'list' object has no attribute 'lower'

In [34]:
num_correct = results_df["Is Correct"].sum()
num_total = len(results_df)

In [35]:
num_correct/num_total

0.08