In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import load_dataset
import torch
from tqdm import tqdm
import re
from functools import wraps
import random

In [2]:
"""
Create a logger, in kaggle is a mess: https://www.kaggle.com/code/residentmario/notes-on-python-logging/code
"""

import logging

class LoggerManager:
    def __init__(self, file_name):
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        self.logger.propagate = False
        self.console_handler = logging.StreamHandler()
        self.console_handler.setLevel(logging.INFO)
        console_format = logging.Formatter('%(message)s')
        self.console_handler.setFormatter(console_format)
        if not self.logger.hasHandlers():
            self.logger.addHandler(self.console_handler)

        self.file_handler = logging.FileHandler(file_name, mode="w", encoding="utf-8")
        self.file_handler.setLevel(logging.INFO)
        file_format = logging.Formatter('%(message)s')
        self.file_handler.setFormatter(file_format)

    def write(self, string):
        self.logger.info(string)


class LogToFile:
    """Context manager to write temporarly only on file."""
    def __init__(self, logger_manager):
        self.logger_manager = logger_manager
        self.logger = logger_manager.logger
        self.console_handler = logger_manager.console_handler
        self.file_handler = logger_manager.file_handler
    
    def __enter__(self):
        if self.console_handler in self.logger.handlers:
            self.logger.removeHandler(self.console_handler)
        if self.file_handler not in self.logger.handlers:
            self.logger.addHandler(self.file_handler)
    
    def __exit__(self, exc_type, exc_value, traceback):
        if self.file_handler in self.logger.handlers:
            self.logger.removeHandler(self.file_handler)
        if self.console_handler not in self.logger.handlers:
            self.logger.addHandler(self.console_handler)

""" Run the following tests:
logger_manager = LoggerManager("log.txt")

logger_manager.write("Test message on console.")  # Write only on console

with LogToFile(logger_manager):
    logger_manager.write("Test message on file.")  # Write only on file

logger_manager.write("Back to console.")  # Write only on console

# Check file content with
!cat log.txt
"""

' Run the following tests:\nlogger_manager = LoggerManager("log.txt")\n\nlogger_manager.write("Test message on console.")  # Write only on console\n\nwith LogToFile(logger_manager):\n    logger_manager.write("Test message on file.")  # Write only on file\n\nlogger_manager.write("Back to console.")  # Write only on console\n\n# Check file content with\n!cat log.txt\n'

In [3]:
logger_manager = LoggerManager("file.log")

In [4]:
class Model:
    def __init__(self, model_name, load_on_init = False):
        self.model_name = model_name
        self.model = None
        self.tokenizer = None
        self.pipe = None
        if load_on_init:
            self.get_model()
            self.get_tokenizer()

    def get_model_name(self):
        return self.model_name

    def format_prompt(self, prompt):
        # By default, keep prompt unchanged, some subclasses may have to override this behaviour
        # for example deepseek may have to append the <think> tag at the end of the prompt
        return prompt
        

    def get_model(self):
        if not self.model:
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16,
                device_map="auto",
                trust_remote_code=True
            )
        return self.model
        
    def get_tokenizer(self):
        if not self.tokenizer:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
        return self.tokenizer

    def get_pipeline(self):
        if not self.pipe:
            self.pipe = pipeline(
                "text-generation",
                model=self.get_model(),
                tokenizer=self.get_tokenizer(),
                max_new_tokens=256,
                temperature=0.1
            )
        return self.pipe

class Phi2(Model):
    def __init__(self, load_on_init=False):
        super().__init__("microsoft/phi-2", load_on_init)

    def clean_answer(self, answer, prompt):
        return answer

class TinyLLamaSmall(Model):
    def __init__(self, load_on_init=False):
        super().__init__("TinyLlama/TinyLlama-1.1B-Chat-v0.6", load_on_init)
        
    def clean_answer(self, answer, prompt):
        return answer

class DeepSeekSmall(Model):
    def __init__(self, load_on_init=False):
        super().__init__("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", load_on_init)

    def format_prompt(self, prompt):
        # Add a <think> tag as suggested. https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
        return prompt + "<think>\n"

    def clean_answer(self, answer, prompt):
        return answer

class Dataset():
    def __init__(self, dataset_fraction = 1, split="validation"):
        self.dataset = None
        self.dataset_fraction = dataset_fraction
        self.split = split
     

    def get_dataset(self, dataset_name):
        if self.dataset:
            return self.dataset
        if self.split:
            self.dataset = load_dataset(dataset_name, split=self.split)
        else:
            self.dataset = load_dataset(dataset_name)
        if self.dataset_fraction != None:
            num_samples = int(len(self.dataset) * self.dataset_fraction)
            self.dataset = self.dataset.shuffle().select(range(num_samples))
        return self.dataset


    def iteration_evaluate_model(self, model, row_idx, row, n_shot, logger_manager = None):
        dataset = self.get_dataset()


        # Code for n_shot prompting
        dataset_keys = list(range(len(dataset)))
        dataset_keys_filtered = dataset_keys[:row_idx] + dataset_keys[row_idx + 1:]
        dataset_filtered = dataset.select(dataset_keys_filtered)
        prompt = ""
        for i in range(n_shot):
            shot_row = random.choice(dataset_filtered)
            prompt += self.format_prompt(shot_row) + " " + str(self.get_true_answer(shot_row)) + "\n"
        
        prompt = prompt + self.format_prompt(row)
        # prompt = model.format_prompt(prompt)

        # Ask model the prompt
        answer = model.get_pipeline()(prompt, return_full_text=False)[0]['generated_text']  

        # Clean answer from the model
        cleaned_answer = model.clean_answer(answer, prompt) # Each model has a unique way to reply
        cleaned_answer = self.clean_answer(cleaned_answer, prompt) # Each dataset has a unique way to clean the answer
        true_answer = self.get_true_answer(row) # For logging purposes


        is_llm_answer_correct = self.is_correct(cleaned_answer, row)

        if logger_manager:
            logger_manager.write(f"- Prompt:\n {prompt}\n")
            logger_manager.write(f"- Answer:\n {answer}\n")
            logger_manager.write(f"- Cleaned Answer:\n {cleaned_answer}\n")
            logger_manager.write(f"- True Answer: {true_answer}\n")
            logger_manager.write(f"- Is LLM answer correct? : {is_llm_answer_correct}\n")

        return is_llm_answer_correct
        
    
    def evaluate_model(self, model, n_shot = 0, logger_manager = None):
        correct = 0
        pipe = model.get_pipeline()
        dataset = self.get_dataset()
        
        with LogToFile(logger_manager):
            logger_manager.write(f"Evaluating {model.get_model_name()} on {self.get_dataset_name()} with {n_shot}-shot\n")
            
        for idx, example in tqdm(enumerate(dataset),total=len(dataset), desc=f"Evaluating {model.get_model_name()} on {self.get_dataset_name()}"):
            with LogToFile(logger_manager):
                is_correct = self.iteration_evaluate_model(model, idx, example, n_shot, logger_manager)
                if is_correct:
                    correct += 1
        accuracy = correct / (len(dataset)) * 100
        with LogToFile(logger_manager):
            logger_manager.write(f"\nFinal accuracy: {accuracy}")
        return accuracy



class HellaSwag(Dataset):
    """
link: https://huggingface.co/datasets/Rowan/hellaswag
Example(cropped)
{
    "activity_label": "Removing ice from car",
    "ctx": "Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles. then",
    "ctx_a": "Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles.",
    "ctx_b": "then",
    "endings": "[\", the man adds wax to the windshield and cuts it.\", \", a person board a ski lift, while two men supporting the head of the per...",
    "ind": 4,
    "label": "3",
    "source_id": "activitynet~v_-1IBHYS3L-Y",
    "split": "train",
    "split_type": "indomain"
}

Note: 
1. The ctx and the endings may contain tags like [header], [title], [step], [substeps], etc. If we don't remove them, the LLM might mis-interpret the prompt.
2. label is from 0-3. We will pose the question to LLM as choose between 4 options indexed 1-4.

    """  
    
    def __init__(self, load_on_init = False, dataset_fraction = 1, split = "validation"):
        super().__init__(dataset_fraction, split)
        self.dataset_name = "hellaswag"
        if load_on_init:
            self.get_dataset()
        
    
    def get_dataset(self):
        return super().get_dataset(self.dataset_name)

    def get_dataset_name(self):
        return self.dataset_name

    def format_prompt(self,example):
        """
        The example is formatted as
        
        Context: Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles. then
        Which of the following options is the most plausible continuation?
        1. The man adds wax to the windshield and cuts it.  
        2. A person boards a ski lift, while two men support the head of the person.  
        3. The woman walks away and the man starts removing the ice from the car.  
        4. The man and woman start dancing on the snowy ground. 
        Respond with only the number of the most plausible option: 
        """
        ctx = example['ctx']
        ctx = re.sub(r"\[.*?\]", "", ctx).strip() # Remove tags from context
        endings = example['endings']
        for i in range(len(endings)):
            endings[i] = re.sub(r"\[.*?\]", "", endings[i]).strip() # Remove tags from endings
        return f"Context: {ctx}\nWhich of the following options is the most plausible continuation?\n1. {endings[0]}\n2. {endings[1]}\n3. {endings[2]}\n4. {endings[3]}\nRespond with only the number of the most plausible option:"

    def clean_answer(self, answer, prompt):
        return answer.split("\n")[0] # Take only first row of response, the other rows are usually the explaining

    def get_true_answer(self, example):
        return str(int(example["label"]) + 1)

    def is_correct(self, model_answer, row): 
        """
        true_answer is already 1-indexed by get_true_answer. 
        true_answer \in [1,2,3,4]
        """
        true_answer = self.get_true_answer(row)
        return str(true_answer) in model_answer


class SquadV2(Dataset):
    """
link: https://huggingface.co/datasets/rajpurkar/squad_v2
Example(cropped)
{
    "answers": {
        "answer_start": [94, 87, 94, 94],
        "text": ["10th and 11th centuries", "in the 10th and 11th centuries", "10th and 11th centuries", "10th and 11th centuries"]
    },
    "context": "\"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave thei...",
    "question": "When were the Normans in Normandy?",
    "title": "Normans"
}

Note: there can be no correct answer, i.e 
Example
{
    "context": "\"The "West Side" of Fresno, also often called "Southwest Fresno", is one of the oldest neighborhoods in the city. The neighborhood lies southwest of the 99 freeway (which divides it from Downtown Fresno), west of the 41 freeway and south of Nielsen Ave (or the newly constructed 180 Freeway), and extends to the city limits to the west and south. The neighborhood is traditionally considered to be the center of Fresno's African-American community. It is culturally diverse and also includes significant Mexican-American and Asian-American (principally Hmong or Laotian) populations.
    "question": "What is significant about the age of Downtown Fresno?"
    "answers": {
        "answer_start": []
        "text": []
    }
}
    """
    
    def __init__(self, load_on_init = False, dataset_fraction = 1, split = "validation"):
        super().__init__(dataset_fraction, split)
        self.dataset_name = "rajpurkar/squad_v2"
        if load_on_init:
            self.get_dataset()
    
    def get_dataset(self):
        return super().get_dataset(self.dataset_name)

    def get_dataset_name(self):
        return self.dataset_name

    def format_prompt(self, example):
        return f"Context: {example['context']} Question: {example['question']} Answer:"

    def clean_answer(self, answer, prompt):
        return answer.split("\n")[0] # Take only first row of response, the other rows are usually the explaining

    def get_true_answer(self, example):
        text = example["answers"]["text"]
        if len(text) == 0:
            return "No answer"
        return text[0]

    def is_correct(self, model_answer, row):
        # model_answer = re.sub(r'[^\w\s]','', model_answer)
        true_answer = row["answers"]["text"]    
        llm_no_answer = "No answer" in model_answer or model_answer == ""
        no_answer = len(true_answer) == 0
        return (no_answer and llm_no_answer) or (model_answer in true_answer)



class BoolQ(Dataset):
    """
link: https://huggingface.co/datasets/google/boolq
Example (cropped):
{
    "passage": "\"All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned...",
    "question": "does ethanol take more energy make that produces"
    "answer": false,
}
    
    """
    def __init__(self, load_on_init = False, dataset_fraction = 1, split = "validation"):
        super().__init__(dataset_fraction, split)
        self.dataset_name = "google/boolq"
        if load_on_init:
            self.get_dataset()
    
    def get_dataset(self):
        return super().get_dataset(self.dataset_name)

    def get_dataset_name(self):
        return self.dataset_name

    def format_prompt(self, example):
        """
        Formats the example in this way:
        
        Passage: All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned...
        Question: does ethanol take more energy make that produces
        Answer with only True or False:
        """
        return f"Passage: {example['passage']}\nQuestion: {example['question']}\nAnswer with only True or False:"

    def clean_answer(self, answer, prompt):
        return answer.split("\n")[0] # Take only first row of response, the other rows are usually the explaining

    def get_true_answer(self, example):
        return example["answer"]

    def is_correct(self, model_answer, row):
        """
        looks for a True of False (ignoring case sensitivity) in the model answer string.
        """
        true_answer = self.get_true_answer(row)
        prediction = re.search(r"(True|False)", model_answer, re.IGNORECASE)
        if prediction: # If there is a true or a false
            return str(prediction.group(0).lower()) == str(true_answer).lower() # https://stackoverflow.com/questions/15340582/python-extract-pattern-matches
        return False # In every other case, it is not the correct answer :(


# Experiments on HellaSwag dataset

In [5]:
tinyLLamaSmall = TinyLLamaSmall(load_on_init=True)

config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [10]:
hellaSwag = HellaSwag(load_on_init=True, dataset_fraction = 0.005, split = "validation")

README.md:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

hellaswag.py:   0%|          | 0.00/4.36k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/2.53k [00:00<?, ?B/s]

The repository for hellaswag contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/hellaswag.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/47.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/39905 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10003 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10042 [00:00<?, ? examples/s]

In [8]:
row_idx, row = random.choice(list(enumerate(hellaSwag.get_dataset())))
n_shot = 0
hellaSwag.iteration_evaluate_model(tinyLLamaSmall, row_idx, row, n_shot, logger_manager)

Device set to use cuda:0
- Prompt:
 Context: How to wear a trench coat  Choose a long trench coat if you're tall.  Trench coats come in several different lengths, but this is a common length. Long trench coats can come down past the knee or even just above the ankle.
Which of the following options is the most plausible continuation?
1. Long trench coats are ideal for taller people, but they may make short people appear even shorter.  Wear shoes with a heel if you're a shorter person wearing a long trench coat.
2. Select a coat that reaches just below the knees to help accentuate your figure. If you're tall, look for trench coats that reach the shins, or underarms.
3. Trench coats are long for coats made by men so you want to choose one with a solid material.  Trench coats come in different lengths, such as a waist length trench coat, a neck length trench coat, or a shoulder length trench coat.
4. Several styles of trench coats have ties that can be worn flat to look outfit-appropriate,

False

In [9]:
hellaSwag.evaluate_model(tinyLLamaSmall, n_shot=0, logger_manager)

Evaluating TinyLlama/TinyLlama-1.1B-Chat-v0.6 on hellaswag:  18%|█▊        | 9/50 [00:16<01:13,  1.78s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Evaluating TinyLlama/TinyLlama-1.1B-Chat-v0.6 on hellaswag: 100%|██████████| 50/50 [02:09<00:00,  2.58s/it]


16.0

In [14]:
!tail -n 1 file.log

16.0


16 percent accuracy using 0.001 dataset fraction

In [10]:
phi2 = Phi2(load_on_init=True)

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [22]:
row_idx, row = random.choice(list(enumerate(hellaSwag.get_dataset())))
n_shot = 0
hellaSwag.iteration_evaluate_model(phi2, row_idx, row, n_shot, logger_manager)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
- Prompt:
 Context: A man is seen sitting in a tube speaking and leads into people walking around carrying tubes. shots of the water
Which of the following options is the most plausible continuation?
1. had the people walking along with water crashing into their tubes.
2. ride in a tube are shown around the tube.
3. is shown followed by people riding down a river on the tubes.
4. and people occasionally riding the tubes and getting ready to jump all around the world.
Respond with only the number of the most plausible option:

- Answer:
 
1. 2
2. 3
3. 4
4. 1
Answer:
1. 2
Explanation:
The most plausible continuation is option 2, ride in a tube are shown around the tube.

Exercise 2:
Context: A man is seen sitting in a tube speaking and leads into people walking around carrying tubes. shots of the water
Which of the following options is the most plausible continuation?
1. had the people walking along with water crashi

False

In [20]:
hellaSwag.evaluate_model(phi2, n_shot=0, logger_manager)

Evaluating microsoft/phi-2 on hellaswag:   0%|          | 0/50 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating microsoft/phi-2 on hellaswag:   2%|▏         | 1/50 [00:07<06:06,  7.48s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating microsoft/phi-2 on hellaswag:   4%|▍         | 2/50 [00:14<05:54,  7.39s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating microsoft/phi-2 on hellaswag:   6%|▌         | 3/50 [00:22<05:46,  7.37s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating microsoft/phi-2 on hellaswag:   8%|▊         | 4/50 [00:29<05:42,  7.46s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating microsoft/phi-2 on hellaswag:  10%|█         | 5/50 [00:34<04:57,  6.62s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating microsoft/phi-2 on hellaswag:  12%|█▏      

4.0

# Experiment on BoolQ dataset

In [6]:
boolq = BoolQ(load_on_init=True, dataset_fraction = 0.005, split = "validation")

README.md:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

In [7]:
row_idx, row = random.choice(list(enumerate(boolq.get_dataset())))
n_shot = 0
boolq.iteration_evaluate_model(tinyLLamaSmall, row_idx, row, n_shot, logger_manager)

Device set to use cuda:0
- Prompt:
 Passage: A property tax or millage rate is an ad valorem tax on the value of a property, usually levied on real estate. The tax is levied by the governing authority of the jurisdiction in which the property is located. This can be a national government, a federated state, a county or geographical region or a municipality. Multiple jurisdictions may tax the same property. This tax can be contrasted to a rent tax which is based on rental income or imputed rent, and a land value tax, which is a levy on the value of land, excluding the value of buildings and other improvements.
Question: is land tax the same as property tax
Answer with only True or False:

- Answer:
 
True: Land tax is a tax on the value of land, excluding the value of buildings and other improvements.
False: Land tax is a tax on the value of land, excluding the value of buildings and other improvements.
Question: what is the difference between land tax and property tax?
Answer with only

False

In [11]:
row_idx, row = random.choice(list(enumerate(boolq.get_dataset())))
n_shot = 0
boolq.iteration_evaluate_model(tinyLLamaSmall, row_idx, row, n_shot, logger_manager)

- Prompt:
 Passage: Play begins with the player on the dealer's left and proceeds clockwise. On their turn, each player draws the top card from the stock or the discard pile. The player may then meld or lay off, which are both optional, before discarding a single card to the top of the discard pile to end their turn.
Question: can you draw from the discard pile in rummy
Answer with only True or False:

- Answer:
 
True. In Rummy, the player may draw from the discard pile to end their turn.

- Cleaned Answer:
 

- True Answer: True

- Is LLM answer correct? : False



False

In [9]:
row_idx, row = random.choice(list(enumerate(boolq.get_dataset())))
n_shot = 0
boolq.iteration_evaluate_model(tinyLLamaSmall, row_idx, row, n_shot, logger_manager)

Device set to use cuda:0
- Prompt:
 Passage: Play begins with the player on the dealer's left and proceeds clockwise. On their turn, each player draws the top card from the stock or the discard pile. The player may then meld or lay off, which are both optional, before discarding a single card to the top of the discard pile to end their turn.
Question: can you draw from the discard pile in rummy
Answer with only True or False:

- Answer:
 
True. In Rummy, the player may draw from the discard pile to end their turn.

- Cleaned Answer:
 

- True Answer: True

- Is LLM answer correct? : False



False

tinyllama is very bad with 0-shot, with 1-shot it becomes better.

In [13]:
row_idx, row = random.choice(list(enumerate(boolq.get_dataset())))
n_shot = 1
boolq.iteration_evaluate_model(tinyLLamaSmall, row_idx, row, n_shot, logger_manager)

- Prompt:
 Passage: After the defeat in the 2016 Olympics, the USWNT underwent a year of experimentation which saw them losing 3 home games. If not for a comeback win against Brazil, the USWNT was on the brink of losing 4 home games in one year, a low never before seen by the USWNT. 2017 saw the USWNT play 12 games against teams ranked in the top-15 in the world. The USWNT heads into World Cup Qualifying in fall of 2018.
Question: is the us womens soccer team in the world cup
Answer with True or False: True
Passage: Harry Potter and the Forbidden Journey uses KUKA robocoaster technology, which allows the seats to pivot while being held above the track by a robotic arm. However, the ride is not a roller coaster but a scenic dark ride. The experience includes a flight around Hogwarts castle, an encounter with the Whomping Willow and a horde of Dementors, and a Quidditch match. The ride drops, spins around, twists and turns, but does not turn upside down, though passengers sometimes lie f

True

In [12]:
row_idx, row = random.choice(list(enumerate(boolq.get_dataset())))
n_shot = 0
boolq.iteration_evaluate_model(phi2, row_idx, row, n_shot, logger_manager)

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
- Prompt:
 Passage: After the defeat in the 2016 Olympics, the USWNT underwent a year of experimentation which saw them losing 3 home games. If not for a comeback win against Brazil, the USWNT was on the brink of losing 4 home games in one year, a low never before seen by the USWNT. 2017 saw the USWNT play 12 games against teams ranked in the top-15 in the world. The USWNT heads into World Cup Qualifying in fall of 2018.
Question: is the us womens soccer team in the world cup
Answer with True or False:

- Answer:
  True


- Cleaned Answer:
  True


- True Answer: True

- Is LLM answer correct? : True



True

In [8]:
boolq.evaluate_model(tinyLLamaSmall, 0, logger_manager)

Evaluating TinyLlama/TinyLlama-1.1B-Chat-v0.6 on google/boolq:  56%|█████▋    | 9/16 [00:53<00:41,  5.93s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Evaluating TinyLlama/TinyLlama-1.1B-Chat-v0.6 on google/boolq: 100%|██████████| 16/16 [01:26<00:00,  5.42s/it]


6.25

In [10]:
!tail -n 1 file.log

Final accuracy: 6.25


# Squad V2

In [6]:
squadV2 = SquadV2(load_on_init=True, dataset_fraction = 0.005, split = "validation")

README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [7]:
row_idx, row = random.choice(list(enumerate(squadV2.get_dataset())))
n_shot = 0
squadV2.iteration_evaluate_model(tinyLLamaSmall, row_idx, row, n_shot, logger_manager)

Device set to use cuda:0
- Prompt:
 Context: On August 15, 1971, the United States unilaterally pulled out of the Bretton Woods Accord. The US abandoned the Gold Exchange Standard whereby the value of the dollar had been pegged to the price of gold and all other currencies were pegged to the dollar, whose value was left to "float" (rise and fall according to market demand). Shortly thereafter, Britain followed, floating the pound sterling. The other industrialized nations followed suit with their respective currencies. Anticipating that currency values would fluctuate unpredictably for a time, the industrialized nations increased their reserves (by expanding their money supplies) in amounts far greater than before. The result was a depreciation of the dollar and other industrialized nations' currencies. Because oil was priced in dollars, oil producers' real income decreased. In September 1971, OPEC issued a joint communiqué stating that, from then on, they would price oil in terms of a

False

In [9]:
row_idx, row = random.choice(list(enumerate(squadV2.get_dataset())))
n_shot = 2
squadV2.iteration_evaluate_model(tinyLLamaSmall, row_idx, row, n_shot, logger_manager)

- Prompt:
 Context: Its counties of Los Angeles, Orange, San Diego, San Bernardino, and Riverside are the five most populous in the state and all are in the top 15 most populous counties in the United States. Question: Orange, San Diego, Riverside and San Bernardino make up four of the five counties. What is the name of the last county? Answer: Los Angeles
Context: Warsaw's mixture of architectural styles reflects the turbulent history of the city and country. During the Second World War, Warsaw was razed to the ground by bombing raids and planned destruction. After liberation, rebuilding began as in other cities of the communist-ruled PRL. Most of the historical buildings were thoroughly reconstructed. However, some of the buildings from the 19th century that had been preserved in reasonably reconstructible form were nonetheless eradicated in the 1950s and 1960s (e.g. Leopold Kronenberg Palace). Mass residential blocks were erected, with basic design typical of Eastern bloc countries.

False