In [1]:
import os
os.chdir('../eval')
%load_ext autoreload

import hashlib
import time
import random
import json
import openai
import transformers
import fnmatch
import datasets
import pathlib
import torch

from functools import cache
from collections import Counter
from datasets import load_dataset
from warnings import warn
from abc import abstractmethod, ABC
from eval import tasks
from eval.utils import TokenizedDataset, complete_code
from eval.tasks.utils import evaluate, convert_to_nltk_rep
# from eval.generation import parallel_generations
from diskcache import Cache
from concurrent.futures import ThreadPoolExecutor


from torch.utils.data.dataloader import DataLoader
from transformers import StoppingCriteria, StoppingCriteriaList
from accelerate.utils import set_seed

from accelerate import Accelerator, DeepSpeedPlugin
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser

from eval import tasks
from eval.generation import parallel_generations
from eval.args import RunnerArguments, HFArguments, OAIArguments, GenerationArguments
# from eval.evaluator import HFEvaluator, OAIEvaluator
from eval.tasks import ALL_TASKS, TASK_REGISTRY


transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


[2024-04-03 17:53:04,470] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


# base.py

In [2]:

class Task(ABC):
    """A task represents an entire benchmark including its dataset, problems,
    answers, generation settings and evaluation methods.
    """

    # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub
    DATASET_PATH: str = None

    # The name of a subset within `DATASET_PATH`.
    DATASET_NAME: str = None

    def __init__(self, stop_words=None, requires_execution=True):
        """
        :param stop_words: list
            list of stop words if the generation uses a stopping criteria during generation
        :param requires_execution: bool
            whether the task requires code execution during evaluation or not
        """
        self.stop_words = stop_words
        self.requires_execution = requires_execution
        try:
            self.dataset = load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)
        except Exception:
            # Only ordinary errors are downgraded to a warning; KeyboardInterrupt and
            # SystemExit now propagate instead of being silently swallowed.
            # `self.dataset` stays unset in this case, exactly as before.
            warn(
                "This task will use a locally downloaded dataset, not from the HF hub."
            )

    @abstractmethod
    def get_dataset(self):
        """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
        return []

    def fewshot_examples(self):
        """Loads and returns the few-shot examples for the task if they exist."""
        pass

    @abstractmethod
    def get_prompt(self, doc):
        """Builds the prompt for the LM to generate from.
        :param doc: dict[str: str]
            sample from the test dataset
        """
        pass

    @abstractmethod
    def get_reference(self, doc):
        """Builds the reference solution for the doc.
        :param doc: dict[str: str]
            sample from the test dataset
        """
        pass

    @abstractmethod
    def postprocess_generation(self, generation, idx):
        """Defines the postprocessing for a LM generation.
        :param generation: str
            code generation from LM
        :param idx: int
            index of doc in the dataset to which the generation belongs
        """
        pass

    @abstractmethod
    def process_results(self, generations, references):
        """Takes the list of LM generations and evaluates them against ground truth references,
        returning the metric for the generations as in {"metric_name": result}.
        :param generations: list(list(str))
            list of lists containing generations
        :param references: list(str)
            list of str containing references
        :return: dict[str: float]
        """
        pass


class OWAFOLTask(Task):
    """An OWA (Open World Assumption) FOL (First Order Logic) Task is a Task in which the goal
    is to generate True/False/Uncertain answers to First Order Logic questions.

    The prompt is built from fixed instructions, up to `MAX_SHOTS` hand-annotated
    few-shot examples taken from the training split, and the test example.
    The `mode` passed to the constructor selects the prompt/answer format; the modes
    handled by `postprocess_generation` are "baseline", "scratchpad", "neurosymbolic"
    and "cot".
    """

    TRAIN_DATASET_PATH = "minimario/FOLIO"
    # Value returned by `postprocess_generation` when a generation cannot be parsed/evaluated.
    ERROR_TOKEN = "Error"
    # Number of annotated few-shot examples available (length of the index list in __init__).
    MAX_SHOTS = 16

    def __init__(self, mode, n):
        """
        :param mode: str
            prompt/answer format ("baseline", "scratchpad", "neurosymbolic" or "cot")
        :param n: int
            number of few-shot examples to put in the prompt (at most `MAX_SHOTS`)
        """
        assert n <= self.MAX_SHOTS, f"supports up to {self.MAX_SHOTS}-shot"
        super().__init__(
            stop_words=["</EVALUATE>"], requires_execution=True,
        )
        self._mode = mode
        self._nshot = n
        self.train_dataset = load_dataset(self.TRAIN_DATASET_PATH)["train"]
        self._train_dataset = self.reformat_fol_samples_train(self.train_dataset)
        self._train_dataset = self.add_conclusion_fols_train(self._train_dataset)
        self._train_dataset = self.add_cot_train(self._train_dataset)
        # The training split labels unknown conclusions "Unknown"; prompts use "Uncertain".
        self._train_dataset = self._train_dataset.map(
            lambda x: {"label": "Uncertain" if x["label"] == "Unknown" else x["label"]},
            remove_columns=["label"],
        )
        self._train_fewshot_indices_all = [
            125,
            23,
            60,
            275,
            148,
            261,
            263,
            683,
            299,
            684,
            850,
            853,
            886,
            892,
            930,
            980,
        ]
        # Labels:
        # 23 (True), 60 (False), 125 (Uncertain), 148 (False), 261 (True), 263 (True), 275 (Uncertain), 683 (Uncertain)
        # 299 (True), 684 (False), 850 (False), 853 (Uncertain), 886 (True), 892 (Uncertain), 930 (False), 980 (False)

        self._train_fewshot_indices = self._train_fewshot_indices_all[:n]
        self._train = self._train_dataset.select(self._train_fewshot_indices)

    def reformat_fol_samples_train(self, train_dataset):
        """Returns `train_dataset` with every entry of "premises-FOL" passed through
        `convert_to_nltk_rep`.
        """

        def reformat_fol_sample(sample):
            sample["premises-FOL"] = [
                convert_to_nltk_rep(premise) for premise in sample["premises-FOL"]
            ]
            return sample

        return train_dataset.map(reformat_fol_sample)

    def add_conclusion_fols_train(self, train_dataset):
        """Adds a "conclusion-FOL" column holding the hand-written FOL conclusion for the
        few-shot rows; every other row gets None.
        """
        train_conclusion_fols = {
            23: "HigherRank(RealMadrid, Barcelona)",
            60: "-OlympicGoldMedalWinner(Amy) -> NobelLaureate(Amy)",
            125: "-Dispensable(Worksheet)",
            148: "FolkSong(Inception)",
            261: "MakeGoodBreakfast(Luke)",
            263: "exists x. (Develops(Ets, x) & For(x, k-OneTwoandhighereducation)) & exists x. (Develops(Ets, x) & AssociatedWith(x, Entrytouseducationinstitutions))",
            275: "ContributeToCountry(James)",
            299: "GetRhythmRight(John)",
            683: "exists x. (BRICS(x) & Speaks(x, Hindi))",
            684: "Film(Hamilton)",
            850: "-Liked(Leo, Charlie) & -Cares(Charlie, Leo)",
            853: "Won(Threebodyproblem, Hugoaward)",
            886: "Dagfinn(DagfinnAarskog)",
            892: "PartOf(Minsk, Scottishpremiership)",
            930: "-Locate(Boves, Europe)",
            980: "(InvitedTakePhoto(James) & -HappyCommunicate(James)) | (-InvitedTakePhoto(James) & HappyCommunicate(James))",
        }
        conclusions = [None for _ in range(len(train_dataset))]
        for index, conclusion_fol in train_conclusion_fols.items():
            conclusions[index] = conclusion_fol
        train_dataset = train_dataset.add_column("conclusion-FOL", conclusions)
        return train_dataset

    def add_cot_train(self, train_dataset):
        """Adds a "cot" column holding the hand-written chain-of-thought answer for the
        few-shot rows; every other row gets None.
        """
        train_cots = {
            23: "Let's think step by step. We want to evaluate if in La Liga 2021-2022, Real Madrid ranks higher than Barcelona. From premise 1, we know that a La Liga soccer team ranks higher than another if it receives more points. From premise 4, we know that in La Liga 2021-2022, Real Madrid received more points than Barcelona. Therefore, in La Liga 2021-2022, Real Madrid received more points than Barcelona, so Real Madrid ranks higher than Barcelona, so the statement is true.\nANSWER:\tTrue",
            60: "Let's think step by step. We want to evaluate the statement \"if Amy is not an Olympic gold medal winner, then Amy is a Nobel laureate\". Let's assume that Amy is not an Olympic gold medal winner. This doesn't tell us anything about whether Amy is a Nobel laureate, so the statement isn't true, meaning it is either False or Uncertain. To distinguish between the two, notice that we could have a scenario where Amy is neither an Olympic gold medal winner nor a Nobel laureate. None of the premises are violated in this case. This means the statement must be false.\nANSWER:\tFalse",
            125: "Let's think step by step. We want to evaluate if a worksheet is not dispensable. From premise 6, we know that a worksheet is either paper or is environment-friendly. If it is paper, then from premise 3, a worksheet is woodware, and from premise 2, a worksheet is dispensable. If it is environment-friendly, we know it is good from premise 5, but we know nothing about whether it is dispensable. Therefore, we don't know if a worksheet is dispensible or not, so the statement is uncertain.\nANSWER:\tUncertain",
            148: "Let's think step by step. We want to evaluate if Inception is a folk song. We know that Inception is a sci-fi movie. Since all movies are videos and Inception is a movie, it is a video, which means it is visual. On the other hand, we know that all folk songs are songs, and no songs are visual, so no folk songs are visual. Therefore, since Inception is visual but no folk songs are visual, we know that Inception cannot be a folk song, so the statement is false.\nANSWER:\tFalse",
            261: "Let's think step by step. We want to evaluate if Luke can make a good breakfast. From the last premise, we know that Luke can make cookies, scrambled eggs, and muffins. Since Luke can make cookies and muffins, they are a baker. Now, combining the information we have, since Luke is a baker and can make scrambled eggs, this means that they can make a good breakfast. Therefore, Luke can make a good breakfast, so the statement is true.\nANSWER:\tTrue",
            263: "Let's think step by step. We want to evaluate if ETS develops assessments for K-12 statewide as well as entry to US tertiary and quaternary educatiand doon institutions. We know that ETS develops assessments for K-12 statewide. We also know that ETS develops assessments associated with entry to the US tertiary and quaternary education institutes. Therefore, both parts of the conclusion are true, and the statement is true.\nANSWER:\tTrue",
            275: "Let's think step by step. We want to evaluate if James contributes to the country. Let's think about what we know about James. First, we know that James was either sentenced for thief or stayed in prison. However, this doesn't tell us anything about whether James contributed to the country. Second, we know that James either had a bad record in the local state or that he was respected by others. However, the premises don't tell us anything about the relationship between having a bad record and contributing to the country. Therefore, it is uncertain whether James contributes to the country.\nANSWER:\tUncertain",
            299: "Let's think step by step. We want to evaluate if John can get the rhythms right. We know that John is a student learning piano. Since all students learning piano can strike the right notes, John can strike the right notes. Since all students who can strike the right notes can get the rhythms right and John can strike the right notes, John can get the rhythms right, so the conclusion is true.\nANSWER:\tTrue",
            683: "Let's think step by step. We want to evaluate if there is a person from BRICS speaking Hindi. We know that there is an Indian, and since India is one of BRICS, we know that there is an Indian in BRICS. Furthermore, we know that they speak either Hindi or English, however, we don't know which one. Therefore, there could be a person in BRICS speaking Hindi, or there could not. Therefore, it is uncertain whether there is a person from BRICS speaking Hindi.\nANSWER:\tUncertain",
            684: "Let's think step by step. We want to evaluate if Hamilton is a film. Since Daveed Diggs played two roles in the musical Hamilton, Hamilton is a musical. Since musicals are not films and Hamilton is a musical, Hamilton is not a film, and the conclusion is false.\nANSWER:\tFalse",
            850: "Let's think step by step. We want to evaluate if Charlie does not like Leo and does not care for Leo. Let's first evaluate if Charlie does not like Leo. We know Charlie has a naughty pet named Leo. Since pets who are naughty are not liked as much, Charlie does not like Leo. Now, let's evaluate if Charlie cares for Leo. We know that if a person has a pet, they care for that pet. Since Leo is Charlie's pet, Charlie cares for Leo. Therefore, Charlie does not like Leo but cares for Leo, so the second part of the conclusion is false, which means the entire conclusion is false.\nANSWER:\tFalse",
            853: "Let's think step by step. We want to evaluate if the Three Body Problem won the Hugo Award. The only thing we know about the Hugo Award is that some books that have won the Hugo Award were written by Cixin Liu. However, we know nothing about whether The Three Body Problem was written by Cixin Liu, so the conclusion is uncertain.\nANSWER:\tUncertain",
            886: "Let's think step by step. We want to evaluate if Dagfinn is Dagfinn Aarskog's given name. We know that Dagfinn is a given name, and that notable people with the given name Dagfinn includes Dagfinn Aarskog, which means that Dagfinn is Dagfinn Aarskog's given name, so the conclusion is true.\nANSWER:\tTrue",
            892: "Let's think step by step. We want to evaluate if Minsk joined the Scottish Premiership. We know that Minsk and St Johnstone are different teams and that St Johnstone is part of the Scottish Premiership, but we don't know anything about whether or not Minsk joined the Scottish Premiership from the premises. Therefore, the conclusion is uncertain.\nANSWER:\tUncertain",
            930: "Let's think step by step. We want to evaluate if Boves is not in Europe. We know that Boves is a railway station located in France. We also know that since France is a European country, France is located in Europe. Furthermore, we know that if A is located in B and B is located in C, then A is located in C. Therefore, we know that because Boves is located in France and France is located in Europe, that means Boves is located in Europe. Therefore, the conclusion is false.\nANSWER:\tFalse",
            980: "Let's think step by step. We want to evaluate if James is either invited to take a photo with the audience or happy to communicate with each other during the dinner. We know that James does not attend the conference in person and is not provided with souvenirs. There are no premises that apply to people who do not attend the conference. Since James is not provided with souvenirs, since all who attended the conference in person are provided with souvenirs, we know that James did not attend the conference in person. However, we don't know anything else, so it is possible that James was neither invited to take a photo with the audience nor happy to communicate during the dinner. Therefore, the conclusion is false.\nANSWER:\tFalse",
        }
        cots = [None for _ in range(len(train_dataset))]
        for index, cot in train_cots.items():
            cots[index] = cot
        train_dataset = train_dataset.add_column("cot", cots)
        return train_dataset

    def get_dataset(self):
        """Returns dataset for the task or an iterable of any object, that get_prompt can handle.

        NOTE(review): `self._test` is not assigned anywhere in this class; presumably
        concrete subclasses set it -- confirm.
        """
        return self._test

    def get_instructions(self):
        """Returns the instruction header of the prompt for the current mode, followed by
        a blank line.
        """
        instructions = ""
        instructions += "The following is a first-order logic (FOL) problem.\n"
        instructions += "The problem is to determine whether the conclusion follows from the premises.\n"
        instructions += "The premises are given in the form of a set of first-order logic sentences.\n"
        instructions += "The conclusion is given in the form of a single first-order logic sentence.\n"
        if self._mode == "baseline":
            instructions += "The task is to evaluate the conclusion as 'True', 'False', or 'Uncertain' given the premises."
        else:
            instructions += "The task is to translate each of the premises and conclusions into FOL expressions, "
            # NOTE(review): for any other mode (e.g. "cot") the sentence above is left
            # without a continuation -- confirm this is intended.
            if self._mode == "scratchpad":
                instructions += "and then to evaluate the conclusion as 'True', 'False', or 'Uncertain' given the premises."
            elif self._mode == "neurosymbolic":
                instructions += "so that the expressions can be evaluated by a theorem solver to determine whether the conclusion follows from the premises.\n"
                instructions += "Expressions should be adhere to the format of the Python NLTK package logic module."
        return instructions + "\n\n"

    def format_train_example(self, doc):
        """Formats one few-shot example: the test-style header followed by the
        mode-specific worked answer and the closing `</EVALUATE>` tag.
        """
        example = self.format_test_example(doc)
        if self._mode == "baseline":
            example += f"{doc['label'].strip()}\n"
        elif self._mode == "cot":
            example += f"{doc['cot']}\n"
        else:
            for premise, fol in zip(doc["premises"], doc["premises-FOL"]):
                example += f"TEXT:\t{premise.strip()}\nFOL:\t{fol.strip()}\n"
            example += f"TEXT:\t{doc['conclusion'].strip()}\nFOL:\t{doc['conclusion-FOL'].strip()}\n"
            if self._mode == "scratchpad":
                example += f"ANSWER:\t{doc['label'].strip()}\n"
        return example + "</EVALUATE>\n"

    def format_test_example(self, doc):
        """Formats the premises and conclusion of `doc` and opens the `<EVALUATE>` section
        that the model is expected to complete.
        """
        example = "<PREMISES>\n"
        for premise in doc["premises"]:
            example += f"{premise.strip()}\n"
        example += "</PREMISES>\n"
        example += f"<CONCLUSION>\n{doc['conclusion'].strip()}\n</CONCLUSION>\n"
        example += "<EVALUATE>\n"
        return example

    def get_prompt(self, doc):
        """
        Builds the prompt for the LM to generate from.
        :param doc: dict[str: str]
            sample from the test dataset
        :return: str
        """
        instructions = self.get_instructions()
        train = self.fewshot_examples()
        test = self.format_test_example(doc)
        prompt = "\n".join([instructions, train, test])
        return prompt

    def get_reference(self, doc):
        """
        Builds the reference solution for the doc (sample from the test dataset).
        :param doc: dict[str: str]
            sample from the test dataset
        :return: str
        """
        return doc["label"]

    def postprocess_generation(self, generation, idx, completion_only=False):
        """
        Defines the postprocessing for a LM generation.
        :param generation: str
            code generation from LM
        :param idx: int (if needed)
            index of doc in the dataset to which the generation belongs
        :param completion_only: bool
            if True, `generation` holds only the completion; otherwise it must start with
            the prompt, which is stripped together with anything after a stop word
        :return: str
            "True", "False" or "Uncertain"; `ERROR_TOKEN` if parsing or evaluation fails
        """
        try:
            if completion_only:
                gen = generation.strip()
            else:
                prefix = self.get_prompt(self.get_dataset()[idx])
                # Explicit raise instead of `assert` so the check survives `python -O`.
                if not generation.startswith(prefix):
                    raise ValueError(
                        "Increase `--max_length_generation` to avoid truncation"
                    )
                gen = generation[len(prefix) :].strip()
                for stop_word in self.stop_words:
                    gen = gen.split(stop_word)[0].strip()
            if self._mode == "baseline":
                resp = gen.strip()
            elif self._mode in ("scratchpad", "cot"):
                # Both modes put the final answer after the last "ANSWER:" marker.
                resp = gen.split("ANSWER:")[-1].strip()
            elif self._mode == "neurosymbolic":
                flag = "FOL:"
                parses = [
                    line.replace(flag, "").strip()
                    for line in gen.split("\n")
                    if flag in line
                ]
                if not parses:
                    raise ValueError("No FOL lines found in generation")
                # The last FOL line is the conclusion, everything before it the premises.
                premises, conclusion = parses[:-1], parses[-1]
                resp = evaluate(premises, conclusion)
            else:
                raise ValueError(f"Invalid mode: {self._mode}")
            if resp not in ("True", "False", "Uncertain"):
                raise ValueError(f"Invalid generation: {resp}")
            return resp
        except Exception as e:
            # TODO: explore failure cases and improve postprocessing
            print(f"Error in parsing and/or evaluating LLM output: {e}")
            return self.ERROR_TOKEN

    @staticmethod
    def metric(generations, references, error_token):
        """Majority-vote accuracy.

        For each problem the most common non-error generation is compared with the
        reference; problems whose generations are all `error_token` count as wrong.
        :param generations: list(list(str))
        :param references: list(str)
        :param error_token: str
        :return: dict[str: float]
            0.0 is reported when `references` is empty
        """
        if len(references) == 0:
            # Avoid ZeroDivisionError when there is nothing to score.
            return {"accuracy (pass@1 majority)": 0.0}
        correct = 0
        for gens, ref in zip(generations, references):
            gens = [gen for gen in gens if gen != error_token]
            if len(gens) > 0:
                majority = Counter(gens).most_common(1)[0][0]
                if majority == ref:
                    correct += 1
        return {"accuracy (pass@1 majority)": correct / len(references)}

    def process_results(self, generations, references):
        """
        Takes the list of LM generations and evaluates them against ground truth references,
        returning the metric for the generations as in {"metric_name": result}.
        We encourage to directly load the metric from `evaluate` library to keep the code concise.
        :param generations: list(list(str))
            list of lists containing generations
        :param references: list(str)
            list of str containing references
        :return: dict[str: float]
        """
        return self.metric(generations, references, self.ERROR_TOKEN)

    @cache
    def fewshot_examples(self):
        """
        Returns the few-shot section of the prompt: the first `self._nshot` rows of
        `self._train`, each rendered with `format_train_example` and joined by newlines.
        The result is memoised by `functools.cache`, keyed on the instance.
        :return: str
        """
        examples = []
        for doc in self._train.select(range(self._nshot)):
            examples.append(self.format_train_example(doc))
        return "\n".join(examples)


# evaluator.py

In [3]:

# Disclaimer text used as the ValueError message in `Evaluator.evaluate` when a task
# requires code execution but `allow_code_execution` is not enabled.
_WARNING = """
################################################################################
                                  !!!WARNING!!!
################################################################################
The task you are about to use executes untrusted model-generated code.
Although it is highly unlikely that model-generated code will do something
overtly malicious in response to this test suite, model-generated code may act
destructively due to a lack of model capability or alignment.
Users are strongly encouraged to sandbox this evaluation suite so that it
does not perform destructive actions on their host or network. For more
information on how OpenAI sandboxes its code, see the paper "Evaluating Large
Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
Once you have read this disclaimer and taken appropriate precautions, set the argument 
"allow_code_execution" to True.
################################################################################\
"""


class Evaluator(ABC):
    """Base class that runs generation for a task and scores the result.

    Subclasses implement `generate_text`; `evaluate` handles the code-execution
    safety check, optional saving of generations/references and metric computation.
    """

    def __init__(self, args):
        """
        :param args: namespace-like object
            must provide `allow_code_execution`; `evaluate` additionally reads
            `n_samples`, `generations_path` and the `save_*` flags and paths
        """
        self.args = args
        self.allow_code_execution = args.allow_code_execution

    @abstractmethod
    def generate_text(self, task_name):
        """Returns `(generations_prc, generations_raw, references)` for `task_name`."""
        pass

    def evaluate(self, task_name):
        """Generates for `task_name` and returns the task's metrics.

        :param task_name: str
            name resolved through `tasks.get_task`
        :return: whatever `task.process_results` returns on the main process (or when
            no accelerator is attached); None on other processes
        :raises ValueError: if the task requires code execution and it is not allowed
        """
        task = tasks.get_task(task_name)
        print(task.requires_execution, self.allow_code_execution)
        if task.requires_execution and not self.allow_code_execution:
            raise ValueError(_WARNING)

        generations_prc, generations_raw, references = self.generate_text(task_name)
        # Guard the [0] lookup so an empty generation list does not raise IndexError.
        if generations_prc and len(generations_prc[0]) != self.args.n_samples:
            generations_prc = [gens[: self.args.n_samples] for gens in generations_prc]
            # `warn` is what the file imports (`from warnings import warn`); the
            # previous `warnings.warn` raised NameError because `warnings` is not imported.
            warn(
                "Number of tasks wasn't proportional to number of devices, we removed extra predictions"
            )

        if not hasattr(self, "accelerator") or self.accelerator.is_main_process:
            # Nothing is saved when generations were loaded from `generations_path`.
            if not self.args.generations_path:
                if self.args.save_generations_raw:
                    with open(self.args.save_generations_raw_path, "w") as fp:
                        json.dump(generations_raw, fp)
                        print("raw generations were saved")
                if self.args.save_generations_prc:
                    with open(self.args.save_generations_prc_path, "w") as fp:
                        json.dump(generations_prc, fp)
                        print("processed generations were saved")
                if self.args.save_references:
                    with open(self.args.save_references_path, "w") as fp:
                        json.dump(references, fp)
                        print("references were saved")

            os.environ["TOKENIZERS_PARALLELISM"] = "false"
            if self.allow_code_execution and task.requires_execution:
                os.environ["HF_ALLOW_CODE_EVAL"] = "1"
            results = task.process_results(generations_prc, references)
            return results
        return None


class HFEvaluator(Evaluator):
    """Evaluator whose generations come from `parallel_generations` run with the
    given accelerator, model and tokenizer.
    """

    def __init__(self, accelerator, model, tokenizer, args):
        super().__init__(args)
        self.accelerator, self.model, self.tokenizer = accelerator, model, tokenizer

    def generate_text(self, task_name):
        """Generates for the first `n_tasks` problems of the task.

        `n_tasks` is `args.limit` when that is truthy, else the dataset length.
        :return: (generations_prc, generations_raw, references)
        """
        task = tasks.get_task(task_name)
        dataset = task.get_dataset()
        n_tasks = self.args.limit or len(dataset)

        generated = parallel_generations(
            task,
            dataset,
            self.accelerator,
            self.model,
            self.tokenizer,
            n_tasks=n_tasks,
            args=self.args,
        )
        generations_prc, generations_raw = generated

        print('evaluator HFEvaluator generate_text() generations_prc', generations_prc)
        print('evaluator HFEvaluator generate_text() generations_raw', generations_raw)

        references = []
        for position in range(n_tasks):
            references.append(task.get_reference(dataset[position]))
        return generations_prc, generations_raw, references


# generation.py

In [4]:
class EndOfFunctionCriteria(StoppingCriteria):
    """Custom `StoppingCriteria` that signals a stop once every sequence in the batch
    contains at least one of the end-of-function strings.
    """

    def __init__(self, start_length, eof_strings, tokenizer):
        self.tokenizer = tokenizer
        self.eof_strings = eof_strings
        # Token offset from which each sequence is decoded and searched.
        self.start_length = start_length

    def __call__(self, input_ids, scores, **kwargs):
        """Returns True when all decoded sequences contain any of the stop strings."""
        tail_ids = input_ids[:, self.start_length :]
        texts = self.tokenizer.batch_decode(tail_ids)
        for text in texts:
            hit = False
            for marker in self.eof_strings:
                if marker in text:
                    hit = True
                    break
            if not hit:
                return False
        return True


def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, args):
    """Produces generations for the first `n_tasks` problems of `dataset`.

    If `args.generations_path` is set, generations are read from that JSON file
    instead of being generated.

    :param task: task object; `task.stop_words` enables the stopping criterion
    :param dataset: dataset passed on to `TokenizedDataset`
    :param accelerator: provides `is_main_process`, `state.num_processes` and `prepare`
    :param model: model passed to `accelerator.prepare` and `complete_code`
    :param tokenizer: tokenizer used for tokenization and stop-string detection
    :param n_tasks: int
        number of problems to generate for
    :param args: namespace with the generation settings read below
    :return: tuple (generations_prc, generations_raw)
    """
    if args.generations_path:
        with open(args.generations_path) as fp:
            generations = json.load(fp)
            if accelerator.is_main_process:
                print(
                    f"generations loaded, {n_tasks} selected from {len(generations)} with {len(generations[0])} candidates"
                )
        # Return a pair like the generation path below: the caller unpacks two values,
        # which failed when a bare list was returned here.
        # NOTE(review): the file is assumed to hold already post-processed generations;
        # no separate raw copy exists, so the same list is returned in both positions.
        loaded = generations[:n_tasks]
        return loaded, loaded

    set_seed(args.seed, device_specific=True)

    gen_kwargs = {
        "do_sample": args.do_sample,
        "temperature": args.temperature,
        "top_p": args.top_p,
        "top_k": args.top_k,
        "max_length": args.max_length_generation,
    }
    if task.stop_words:
        # NOTE(review): start_length is 0 here, so the criterion also inspects the
        # decoded prompt unless it is updated later (e.g. inside complete_code) -- confirm.
        gen_kwargs["stopping_criteria"] = StoppingCriteriaList(
            [EndOfFunctionCriteria(0, task.stop_words, tokenizer)]
        )

    if accelerator.is_main_process:
        print(f"number of problems for this task is {n_tasks}")
    n_copies = args.n_samples // args.batch_size

    ds_tokenized = TokenizedDataset(
        task,
        dataset,
        tokenizer,
        num_devices=accelerator.state.num_processes,
        max_length=args.max_length_generation,
        n_tasks=n_tasks,
        n_copies=n_copies,
        prefix=args.prefix,
    )

    # do not confuse args.batch_size, which is actually the num_return_sequences
    ds_loader = DataLoader(ds_tokenized, batch_size=1)

    model, ds_loader = accelerator.prepare(model, ds_loader)

    generations_prc, generations_raw = complete_code(
        task,
        accelerator,
        model,
        tokenizer,
        ds_loader,
        n_tasks=n_tasks,
        batch_size=args.batch_size,
        prefix=args.prefix,
        postprocess=args.postprocess,
        **gen_kwargs,
    )
    return generations_prc, generations_raw

In [5]:
# Parse the argument dataclasses; the full parse_known_args() result is kept here,
# so the parsed namespace is element 0 of `args`.
args = HfArgumentParser(
        [RunnerArguments, HFArguments, OAIArguments, GenerationArguments]
    ).parse_known_args()
print(args[0].output_dir)

outputs


In [6]:
# Start with no model/tokenizer; the main cell only loads them while these are still None.
model = None
tokenizer = None

In [7]:
# # import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "3"
# Set MASTER_PORT in this process's environment before the Accelerator is created below.
# NOTE(review): presumably to avoid a clash on the default port -- confirm.
os.environ['MASTER_PORT'] = '29501'


In [8]:
# `!export ...` runs in a throwaway subshell, so it never changed this kernel's
# environment and the following `!echo` could not see the value.
# Set the variable in-process and show what the kernel actually has.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
print(os.environ["CUDA_VISIBLE_DEVICES"])




In [9]:
# Restrict CUDA device visibility for this kernel process.
# NOTE(review): torch is already imported in the first cell; this presumably only takes
# effect if CUDA has not been initialised yet -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"


In [10]:
# Build the Accelerator with a DeepSpeed plugin (zero_stage=3) and fp16 mixed precision.
# NOTE(review): the main cell sets args.precision = 'fp32' for model loading -- confirm
# that combining it with mixed_precision='fp16' is intended.
deepspeed_plugin = DeepSpeedPlugin(zero_stage=3, gradient_accumulation_steps=2)
accelerator = Accelerator(mixed_precision='fp16', deepspeed_plugin=deepspeed_plugin)

[2024-04-03 17:53:05,874] [INFO] [comm.py:637:init_distributed] cdb=None
[2024-04-03 17:53:05,875] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...


[2024-04-03 17:53:05,937] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=165.132.105.165, master_port=29500
[2024-04-03 17:53:05,938] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


In [11]:
# Keep only the parsed namespace (element 0 of the parse_known_args() result).
# NOTE(review): the main cell below repeats this exact parse.
args = HfArgumentParser(
    [RunnerArguments, HFArguments, OAIArguments, GenerationArguments]
).parse_known_args()[0]

# MAIN CODE

The cell below is based on `runner.py`.

In [12]:
# def main():
# Parse the argument dataclasses again so this cell works from a fresh
# namespace, then override individual fields below.
arg_parser = HfArgumentParser(
    [RunnerArguments, HFArguments, OAIArguments, GenerationArguments]
)
args, _unparsed_cli_args = arg_parser.parse_known_args()

__file__ = ''

# --- Run identity ------------------------------------------------------------
model_name = 'bigcode/starcoderplus'
base = 'folio'
shot = '1'
mode = 'neurosymbolic'
task = f'{base}-{mode}-{shot}shot'
# run_id='${model#*/}_${task}'
# NOTE: unlike the shell expression above, `model_name` keeps its "org/" prefix,
# so `run_id` contains a path separator and the output files below end up in a
# sub-directory of `args.output_dir` named after the organisation.
run_id = f"{model_name}_{task}"

# --- Notebook-level knobs (not written into `args` in this cell) --------------
batch_size = 5
max_length = 8192  # max model context including prompt

# --- Model / sampling settings -----------------------------------------------
args.model = model_name
args.precision = 'fp32'
args.top_k = 2
args.temperature = 0.8
args.max_length_generation = 1024
# args.generation_only = True

# --- Task selection and execution --------------------------------------------
args.tasks = task
args.allow_code_execution = True

# --- Output locations --------------------------------------------------------
args.output_dir = pathlib.Path('../output/')
args.save_generations_raw_path = args.output_dir / f'{run_id}_generations_raw.json'
args.save_generations_prc_path = args.output_dir / f'{run_id}_generations_prc.json'
args.save_references_path = args.output_dir / f'{run_id}_references.json'
args.save_results_path = args.output_dir / f'{run_id}_results.json'

# Make sure the directory of every output file exists before anything is written.
for output_path in (
    args.save_generations_raw_path,
    args.save_generations_prc_path,
    args.save_references_path,
    args.save_results_path,
):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# print(save_generations_raw_path)

# Resolve `args.tasks` (a comma-separated list of fnmatch-style patterns) into
# the concrete task names to run. With no selection, every registered task in
# ALL_TASKS is used.
if args.tasks is None:
    task_names = ALL_TASKS
else:
    # De-duplicate with dict.fromkeys rather than a set: a set of strings has
    # no stable iteration order (string hashing is randomised per process), so
    # the task order could differ between runs and between ranks. dict keys
    # keep first-seen order, i.e. pattern order, then ALL_TASKS order.
    task_names = list(
        dict.fromkeys(
            matching
            for pattern in args.tasks.split(",")
            for matching in fnmatch.filter(ALL_TASKS, pattern)
        )
    )

# accelerator = Accelerator()

# Only the main process reports the selection, so it is printed once.
if accelerator.is_main_process:
    print(f"Selected Tasks: {task_names}")

# Run every selected task and collect its metrics in `results`, keyed by task
# name. Two modes:
#   * `args.generations_path` set -> the evaluator is built without a model or
#     tokenizer and only `evaluate` is called;
#   * otherwise -> a model and tokenizer are loaded from the Hugging Face Hub
#     and each task is either generated only (`args.generation_only`) or
#     evaluated.
results = {}
if args.generations_path:
    if accelerator.is_main_process:
        print("Evaluation only mode")
    evaluator = HFEvaluator(accelerator, None, None, args)
    for task in task_names:
        results[task] = evaluator.evaluate(task)
else:
    dict_precisions = {
        "fp32": torch.float32,
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
    }
    if args.precision not in dict_precisions:
        raise ValueError(
            f"Non valid precision {args.precision}, choose from: fp16, fp32, bf16"
        )
    print(f"Loading the model and tokenizer from HF (in {args.precision})")

    # Reuse a model/tokenizer that is already present in the notebook namespace
    # (avoids reloading the weights when the cell is re-run), but do not fail
    # with a NameError on a fresh kernel where neither name exists yet.
    # NOTE: a cached object is reused even if `args.model` changed since it was
    # loaded; set `model = None` / `tokenizer = None` to force a reload.
    model = globals().get("model")
    tokenizer = globals().get("tokenizer")

    if model is None:
        model = AutoModelForCausalLM.from_pretrained(
            args.model,
            revision=args.revision,
            torch_dtype=dict_precisions[args.precision],
            trust_remote_code=args.trust_remote_code,
            use_auth_token=args.use_auth_token,
        )
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(
            args.model,
            revision=args.revision,
            use_auth_token=args.use_auth_token,
            truncation_side="left",
        )

    # An eos token is required because it is also used as the pad token below;
    # fall back to the bos token when the tokenizer defines no eos token.
    if not tokenizer.eos_token:
        if tokenizer.bos_token:
            tokenizer.eos_token = tokenizer.bos_token
            print("bos_token used as eos_token")
        else:
            raise ValueError("No eos_token or bos_token found")
    tokenizer.pad_token = tokenizer.eos_token

    evaluator = HFEvaluator(accelerator, model, tokenizer, args)
    print('Finish loading model from HF')

    for task in task_names:
        if args.generation_only:
            print('generation only ---')
            if accelerator.is_main_process:
                print("Generation mode only")
            generations_prc, generations_raw, references = evaluator.generate_text(
                task
            )
            # Only the main process writes the output files.
            # NOTE: the save paths do not depend on the loop variable, so with
            # several tasks each iteration overwrites the previous files.
            if accelerator.is_main_process:
                if args.save_generations_raw:
                    with open(args.save_generations_raw_path, "w") as fp:
                        json.dump(generations_raw, fp)
                        print("raw generations were saved")
                if args.save_generations_prc:
                    with open(args.save_generations_prc_path, "w") as fp:
                        json.dump(generations_prc, fp)
                        print("processed generations were saved")
                if args.save_references:
                    with open(args.save_references_path, "w") as fp:
                        json.dump(references, fp)
                        print("references were saved")
        else:
            print('evaluation only ---')

            results[task] = evaluator.evaluate(task)
    
    
    # print(args)
# if __name__ == "__main__":
#     main()

Selected Tasks: ['folio-neurosymbolic-1shot']
Loading the model and tokenizer from HF (in fp32)


In [13]:
# Attach the model identifier to the collected metrics, then print and
# (optionally) persist them as pretty-printed JSON. Nothing is dumped in
# generation-only mode.
results["config"] = {"model": args.model}
if not args.generation_only:
    dumped = json.dumps(results, indent=2, sort_keys=True)
    # Print and write from the main process only: with several processes every
    # rank would otherwise open and write the same results file concurrently.
    if accelerator.is_main_process:
        print(dumped)

        if args.save_results:
            with open(args.save_results_path, "w") as f:
                f.write(dumped)

{
  "config": {
    "model": "bigcode/starcoderplus"
  },
  "folio-neurosymbolic-1shot": {
    "accuracy (pass@1 majority)": 0.22549019607843138
  }
}


In [15]:
# t = tasks.get_task('folio-neurosymbolic-8shot')
# dataset = t.get_dataset()
# print(t.get_prompt(dataset[10]))
# # dataset[3]