# Set Up

In [1]:
# This cell will authenticate you and mount your Drive in the Colab.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


Install libraries

In [15]:
# Install HuggingFace transformers
%pip install transformers evaluate peft trl bitsandbytes rouge_score

# Install LLM as a Judge
#%pip install prometheus-eval triton vllm


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting trl
  Downloading trl-0.12.2-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.12.2-py3-none-any.whl (365 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, logging
from peft import LoraConfig
from trl import SFTTrainer
from transformers import Trainer

import pandas as pd
import numpy as np
import torch
from transformers import set_seed
from transformers import Trainer
from transformers import TrainingArguments
import random
from sklearn.model_selection import train_test_split
import evaluate
import re
from datasets import Dataset
from torch.utils.data import DataLoader, TensorDataset


# # LLM as Eval Judge
# from prometheus_eval.vllm import VLLM
# from prometheus_eval import PrometheusEval
# from prometheus_eval.prompts import ABSOLUTE_PROMPT, SCORE_RUBRIC_TEMPLATE


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [12]:
# Disable WANDB logging
import os

os.environ["WANDB_DISABLED"] = "true"
# # To avoid GPU running out of memory, reduce memory fragmentation
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
base_path = './drive/MyDrive/EndOfStory/'

Output_DIR = base_path + '/results_A100_run6/'

# Set Random Seed


In [17]:
# Set seeds for reproducibility
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_value)
set_seed(seed_value)

# Datasets

In [18]:
# Datasets file paths, may change per envrionments like local vs Google Colab

ROCStoriesSpring2016_FILE_PATH = base_path +'ROCStoriesSpring2016.csv'
ROCStoriesWinter2017_FILE_PATH = base_path +'ROCStoriesWinter2017.csv'
ClozeTestWinter2018ValData_FILE_PATH = base_path +'ClozeTestWinter2018ValData.csv'
ClozeTestWinter2018TestData_FILE_PATH = base_path +'ClozeTestWinter2018TestData.csv'



In [19]:
# Load data
ROCStoriesSpring2016 = pd.read_csv(ROCStoriesSpring2016_FILE_PATH)
ROCStoriesWinter2017 = pd.read_csv(ROCStoriesWinter2017_FILE_PATH)
ClozeTestWinter2018ValData = pd.read_csv(ClozeTestWinter2018ValData_FILE_PATH)
ClozeTestWinter2018TestData = pd.read_csv(ClozeTestWinter2018TestData_FILE_PATH)


In [20]:
print(ROCStoriesSpring2016.shape)
ROCStoriesSpring2016.head()

(45496, 7)


Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,9a51198e-96f1-42c3-b09d-a3e1e067d803,Overweight Kid,Dan's parents were overweight.,Dan was overweight as well.,The doctors told his parents it was unhealthy.,His parents understood and decided to make a c...,They got themselves and Dan on a diet.
1,617e7ada-3878-488d-bd56-40695b91f053,The Bike Accident,Carrie had just learned how to ride a bike.,She didn't have a bike of her own.,Carrie would sneak rides on her sister's bike.,She got nervous on a hill and crashed into a w...,The bike frame bent and Carrie got a deep gash...
2,79b0da1f-e460-4173-ba58-8c9e2553c53a,Beach,Morgan enjoyed long walks on the beach.,She and her boyfriend decided to go for a long...,"After walking for over a mile, something happe...",Morgan decided to propose to her boyfriend.,Her boyfriend was upset he didn't propose to h...
3,d173b7de-4611-4cdf-934c-912834755e41,The bad customer.,Jane was working at a diner.,"Suddenly, a customer barged up to the counter.",He began yelling about how long his food was t...,Jane didn't know how to react.,"Luckily, her coworker intervened and calmed th..."
4,af0fd5a4-de36-47ba-8aa2-e99d10986d7a,Being Patient,I was talking to my crush today.,She continued to complain about guys flirting ...,I decided to agree with what she says and list...,"After I got home, I got a text from her.",She asked if we can hang out tomorrow.


In [21]:
print(ROCStoriesWinter2017.shape)
ROCStoriesWinter2017.head()

(52665, 7)


Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,David noticed he had put on a lot of weight re...,He examined his habits to try and figure out t...,He realized he'd been eating too much fast foo...,He stopped going to burger places and started ...,"After a few weeks, he started to feel much bet..."
1,0beabab2-fb49-460e-a6e6-f35a202e3348,Frustration,Tom had a very short temper.,One day a guest made him very angry.,He punched a hole in the wall of his house.,Tom's guest became afraid and left quickly.,Tom sat on his couch filled with regret about ...
2,87da1a22-df0b-410c-b186-439700b70ba6,Marcus Buys Khakis,Marcus needed clothing for a business casual e...,All of his clothes were either too formal or t...,He decided to buy a pair of khakis.,The pair he bought fit him perfectly.,Marcus was happy to have the right clothes for...
3,2d16bcd6-692a-4fc0-8e7c-4a6f81d9efa9,Different Opinions,Bobby thought Bill should buy a trailer and ha...,Bill thought a truck would be better for what ...,Bobby pointed out two vehicles were much more ...,Bill was set in his ways with conventional thi...,He ended up buying the truck he wanted despite...
4,c71bb23b-7731-4233-8298-76ba6886cee1,Overcoming shortcomings,John was a pastor with a very bad memory.,He tried to memorize his sermons many days in ...,He decided to learn to sing to overcome his ha...,He then made all his sermons into music and sa...,His congregation was delighted and so was he.


In [22]:
print(ClozeTestWinter2018ValData.shape)
ClozeTestWinter2018ValData.head()

(1571, 8)


Unnamed: 0,InputStoryid,InputSentence1,InputSentence2,InputSentence3,InputSentence4,RandomFifthSentenceQuiz1,RandomFifthSentenceQuiz2,AnswerRightEnding
0,138d5bfb-05cc-41e3-bf2c-fa85ebad14e2,Rick grew up in a troubled household.,"He never found good support in family, and tur...",It wasn't long before Rick got shot in a robbery.,The incident caused him to turn a new leaf.,He is happy now.,He joined a gang.,1
1,bff9f820-9605-4875-b9af-fe6f14d04256,Laverne needs to prepare something for her fri...,She decides to bake a batch of brownies.,She chooses a recipe and follows it closely.,Laverne tests one of the brownies to make sure...,The brownies are so delicious Laverne eats two...,Laverne doesn't go to her friend's party.,1
2,e8f628d5-9f97-40ed-8611-fc0e774673c4,Sarah had been dreaming of visiting Europe for...,She had finally saved enough for the trip.,She landed in Spain and traveled east across t...,She didn't like how different everything was.,Sarah then decided to move to Europe.,Sarah decided that she preferred her home over...,2
3,f5226bfe-9f26-4377-b05f-3d9568dbdec1,Gina was worried the cookie dough in the tube ...,She was very happy to find she was wrong.,The cookies from the tube were as good as from...,Gina intended to only eat 2 cookies and save t...,Gina liked the cookies so much she ate them al...,Gina gave the cookies away at her church.,1
4,69ac9b05-b956-402f-9fff-1f926ef9176b,It was my final performance in marching band.,I was playing the snare drum in the band.,We played Thriller and Radar Love.,The performance was flawless.,I was very proud of my performance.,I was very ashamed of my performance.,1


In [23]:
print(ClozeTestWinter2018TestData.shape)
ClozeTestWinter2018TestData.head()

(1571, 7)


Unnamed: 0,InputStoryid,InputSentence1,InputSentence2,InputSentence3,InputSentence4,RandomFifthSentenceQuiz1,RandomFifthSentenceQuiz2
0,f6aad64a-e34c-415d-b895-dbfa187ed43e,Bob was bored at his job as a school teacher.,He had been working so hard this past month.,He decided to treat himself with something spe...,He ordered tickets for a weekend snowboarding ...,He was looking forward to getting away.,His boss told him he had to work this weekend.
1,0fedd90d-5295-4b79-b2d0-15a2bad624ee,Olivia went out with Harry on a date.,Harry thought the date was going well.,Olivia thinks he is a complete jerk and never ...,Harry keeps raving about their chemistry.,Olivia is about to leave.,"Olivia had her friend call her, to tell her th..."
2,018152fd-f984-4d05-ad1e-12f1fb7eceb6,Jack and Ferris always fought for headphones.,One day Jack broke Ferris' headphones while jo...,Ferris was furious at Jack.,Their parents yelled at them.,Jack promised Ferris to buy him new headphones.,Jack promised to take Ferris jogging.
3,feef76df-b75a-4501-9c1a-f8a7b6ee442f,I needed someone to help me move a bed across ...,"I called a couple of friends, but they were busy.","Finally, I called my grandson, who came right ...",He helped me move the bed to the right spot.,I made him an ice cream sundae for his efforts.,"He was tired from moving it, so he took a nap ..."
4,929eaf8b-a175-4460-a885-43be8a89ca62,Hannah had a beautiful cat that she loved very...,"However, she noticed that her cat was getting ...","One day, her cat ran away and never came back.",Hannah was devastated.,She never saw her beautiful cat again.,"The next day, she saw the cat walking down the..."


In [24]:
pd.merge(ROCStoriesSpring2016, ROCStoriesWinter2017, how='inner').empty

True

In [25]:
# In case some rows have different story ids but the same sentences
pd.merge(ROCStoriesSpring2016, ROCStoriesWinter2017, how='inner', on=["sentence1", "sentence2"]).empty

True

We can see that two datasets have no duplicate. We can safely combine them

In [26]:
ROCStories = pd.concat([ROCStoriesSpring2016, ROCStoriesWinter2017])
ROCStories

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,9a51198e-96f1-42c3-b09d-a3e1e067d803,Overweight Kid,Dan's parents were overweight.,Dan was overweight as well.,The doctors told his parents it was unhealthy.,His parents understood and decided to make a c...,They got themselves and Dan on a diet.
1,617e7ada-3878-488d-bd56-40695b91f053,The Bike Accident,Carrie had just learned how to ride a bike.,She didn't have a bike of her own.,Carrie would sneak rides on her sister's bike.,She got nervous on a hill and crashed into a w...,The bike frame bent and Carrie got a deep gash...
2,79b0da1f-e460-4173-ba58-8c9e2553c53a,Beach,Morgan enjoyed long walks on the beach.,She and her boyfriend decided to go for a long...,"After walking for over a mile, something happe...",Morgan decided to propose to her boyfriend.,Her boyfriend was upset he didn't propose to h...
3,d173b7de-4611-4cdf-934c-912834755e41,The bad customer.,Jane was working at a diner.,"Suddenly, a customer barged up to the counter.",He began yelling about how long his food was t...,Jane didn't know how to react.,"Luckily, her coworker intervened and calmed th..."
4,af0fd5a4-de36-47ba-8aa2-e99d10986d7a,Being Patient,I was talking to my crush today.,She continued to complain about guys flirting ...,I decided to agree with what she says and list...,"After I got home, I got a text from her.",She asked if we can hang out tomorrow.
...,...,...,...,...,...,...,...
52660,134e8636-3617-43d8-ba6a-9a11b3b115b1,Flavor,The man liked the flavor.,He tried to recreate it at home.,He could not get the flavor right.,He asked the owner of the recipe for help.,The owner of the flavor sold him the recipe.
52661,4c317f76-ca42-4024-a4c2-12ec911cf89b,After Death,"After my friend's dad's funeral, I got in trou...",The principal said I wasn't allowed to leave s...,He found out I had my friend sign me out.,He told me I was getting detention.,I skipped detention all week.
52662,a18fd0d2-4d0c-4316-befe-e3d827fe699b,Janice breaks her wrist,Janice was out exercising for her big soccer g...,She was doing some drills with her legs.,While working out and exercising she slips on ...,She falls down and uses her wrist to break her...,She breaks her wrist in the process and goes t...
52663,2c14252b-4080-4fca-8765-537772018508,Jamie marries for love,Jamie is an american girl.,Jamie wants to get married to a mexican man.,Her family assumes it's because the man wants ...,Jamie insist that she is marrying him out of l...,Jamie gets married and they spent the rest of ...


# Utility Functions

In [27]:
def combine_sentences(row, sentence_columns_to_be_combined=["sentence1", "sentence2", "sentence3", "sentence4"]):
  assert len(sentence_columns_to_be_combined) > 0, "Sentence columns to be combined list length has to be larger than 0"
  combined_sentence = row[sentence_columns_to_be_combined[0]]
  for i in range(1, len(sentence_columns_to_be_combined)):
    combined_sentence += ' ' + row[sentence_columns_to_be_combined[i]]
  return combined_sentence


def combine_selected_columns(row, columns=["sentence1", "sentence2", "sentence3", "sentence4"]):
    return ' '.join(str(row[col]) for col in columns if not pd.isna(row[col]))



In [28]:
def remove_first_four_sentences(paragraph):
    # Split the paragraph into sentences using regex
    sentences = re.split(r'(?<=[.!?])[\s"]+', paragraph)
    # Keep all sentences except the first 4
    remaining_sentences = sentences[4:]
    # Rejoin the sentences into a paragraph
    joined_sentences = ""
    for sentence in remaining_sentences:
      joined_sentences += sentence
      joined_sentences += " "
    joined_sentences = joined_sentences[:-1]
    return joined_sentences

def vectorized_remove_first_four_sentences(paragraphs):
    return np.vectorize(remove_first_four_sentences)(np.array(paragraphs))

def concat_story_body_with_endings(story_body, story_ending):
  return [f"{p1} {p2}" for p1, p2 in zip(story_body, story_ending)]

In [29]:
# def apply_prompt_template(input_text):
#   instruction = "Complete the story in a single sentence, based on the story beginning provided."
#   prompt_input = f"Below is an instruction that describes a task, paired with an input that provides further context. " \
#         "Write a response that appropriately completes the request.\n\n" \
#         f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response: "
#   return prompt_input

def apply_prompt_template(input_text):
  instruction = "Complete the story using a single sentence, based on the story beginning provided."
  prompt_input = f"Below is an instruction that describes a task, paired with an input that provides further context. " \
        "Write a response that appropriately completes the request.\n\n" \
        f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response: "
  return prompt_input

def remove_newline(input_string):
  #cleaned_string = re.sub(r'[^\w\s]', '', input_string)  # Remove special characters
  cleaned_string = input_string.replace('\n', '').replace('\r', '').replace('<br>','')  # Remove newline characters
  return cleaned_string

def remove_prompt_from_generated_text(input):
  input = remove_newline(input)
  #print(input)
  story_context_start_idx = input.find('### Input:')
  story_context_end_idx = input.find('### Response:')

  out = ""
  out += input[story_context_start_idx + 10: story_context_end_idx]
  out += input[story_context_end_idx + 13:]

  return out

def extract_story_beginning_from_generated_text(input):
  input = remove_newline(input)

  story_context_start_idx = input.find('### Input:')
  story_context_end_idx = input.find('### Response:')

  out = ""
  out += input[story_context_start_idx + 10: story_context_end_idx]

  return out


def extract_story_ending_from_generated_text(input):
  input = remove_newline(input)

  response_start_idx = input.find('### Response:')

  out = ""
  out += input[response_start_idx + 13: ]
  return remove_newline(out)

def vectorized_extract_story_beginning_from_generated_text(paragraphs):
    return np.vectorize(extract_story_beginning_from_generated_text)(np.array(paragraphs))

def vectorized_extract_story_ending_from_generated_text(paragraphs):
    return np.vectorize(extract_story_ending_from_generated_text)(np.array(paragraphs))


def vectorized_remove_prompt_from_generated_text(paragraphs):
    return np.vectorize(remove_prompt_from_generated_text)(np.array(paragraphs))

# Train, Val & Test Split

In [30]:
ROCStories_Y = ROCStories["sentence5"]
ROCStories_X = ROCStories.drop(columns=["storyid", "storytitle", "sentence5"])

ROCStories_X = ROCStories_X.apply(combine_selected_columns, axis=1)



In [31]:
ROCStories_X = ROCStories_X.apply(apply_prompt_template)

ROCStories_X.iloc[0]

"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the story using a single sentence, based on the story beginning provided.\n\n### Input:\nDan's parents were overweight. Dan was overweight as well. The doctors told his parents it was unhealthy. His parents understood and decided to make a change.\n\n### Response: "

In [32]:
ROCStories_X.shape

(98161,)

In [33]:
ROCStories_X.shape

(98161,)

In [34]:
ROCStories_Y

Unnamed: 0,sentence5
0,They got themselves and Dan on a diet.
1,The bike frame bent and Carrie got a deep gash...
2,Her boyfriend was upset he didn't propose to h...
3,"Luckily, her coworker intervened and calmed th..."
4,She asked if we can hang out tomorrow.
...,...
52660,The owner of the flavor sold him the recipe.
52661,I skipped detention all week.
52662,She breaks her wrist in the process and goes t...
52663,Jamie gets married and they spent the rest of ...


In [35]:
ROCStories_Y.shape

(98161,)

We decided to combine some of ROCStories data and the Story Cloze 2018 Validation datasets as the final train datasets. The reasoning to additionally include Story Cloze dataset is that Story Cloze 2018 Validation datasets have the correct endings, therefore can be used for both classification or story ending generation evaluation. It also has a potential to be a bit different from the original ROCStories datasets that is mainly used as training data (in other words, increasing the variance of the test datasets for generalization).

In [36]:
ROCX_train_val, ROCX_test, ROCy_train_val, ROCy_test = train_test_split(ROCStories_X, ROCStories_Y, test_size=0.02, random_state=seed_value)

In [37]:
print(ROCX_train_val.shape)
print(ROCX_test.shape)
print(ROCy_train_val.shape)
print(ROCy_test.shape)

(96197,)
(1964,)
(96197,)
(1964,)


In [38]:
ClozeX_test = ClozeTestWinter2018ValData.drop(columns=["InputStoryid", "RandomFifthSentenceQuiz1", "RandomFifthSentenceQuiz2", "AnswerRightEnding"])
ClozeX_test = ClozeX_test.rename(columns={"InputSentence1": "sentence1", "InputSentence2": "sentence2", "InputSentence3": "sentence3", "InputSentence4": "sentence4"})
ClozeX_test = ClozeX_test.apply(combine_selected_columns, axis=1)
ClozeX_test = ClozeX_test.apply(apply_prompt_template)

ClozeX_test

Unnamed: 0,0
0,"Below is an instruction that describes a task,..."
1,"Below is an instruction that describes a task,..."
2,"Below is an instruction that describes a task,..."
3,"Below is an instruction that describes a task,..."
4,"Below is an instruction that describes a task,..."
...,...
1566,"Below is an instruction that describes a task,..."
1567,"Below is an instruction that describes a task,..."
1568,"Below is an instruction that describes a task,..."
1569,"Below is an instruction that describes a task,..."


In [39]:
Clozey_test = np.where(ClozeTestWinter2018ValData["AnswerRightEnding"]==1, ClozeTestWinter2018ValData["RandomFifthSentenceQuiz1"], ClozeTestWinter2018ValData["RandomFifthSentenceQuiz2"])
Clozey_test = pd.DataFrame(Clozey_test, columns=["sentence5"])
Clozey_test

Unnamed: 0,sentence5
0,He is happy now.
1,The brownies are so delicious Laverne eats two...
2,Sarah decided that she preferred her home over...
3,Gina liked the cookies so much she ate them al...
4,I was very proud of my performance.
...,...
1566,I have very fond memories of checkers.
1567,She loved her new phone.
1568,They were on sale.
1569,She was offered the new job at a higher salary.


In [40]:
ClozeX_test.iloc[0]

"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the story using a single sentence, based on the story beginning provided.\n\n### Input:\nRick grew up in a troubled household. He never found good support in family, and turned to gangs. It wasn't long before Rick got shot in a robbery. The incident caused him to turn a new leaf.\n\n### Response: "

In [41]:
X_test = pd.concat([ROCX_test, ClozeX_test])
X_test

Unnamed: 0,0
20391,"Below is an instruction that describes a task,..."
32389,"Below is an instruction that describes a task,..."
8700,"Below is an instruction that describes a task,..."
45048,"Below is an instruction that describes a task,..."
23146,"Below is an instruction that describes a task,..."
...,...
1566,"Below is an instruction that describes a task,..."
1567,"Below is an instruction that describes a task,..."
1568,"Below is an instruction that describes a task,..."
1569,"Below is an instruction that describes a task,..."


In [42]:
y_test = pd.concat([ROCy_test, Clozey_test])
y_test

Unnamed: 0,sentence5
20391,Evan knew he looked cool in the new car.
32389,Together they were able to surprise him perfec...
8700,Fred apologized to the man.
45048,Talking with him not only made my day it made ...
23146,Tom's friends thought it was annoying.
...,...
1566,I have very fond memories of checkers.
1567,She loved her new phone.
1568,They were on sale.
1569,She was offered the new job at a higher salary.


In [43]:
ROCX_train, ROCX_val, ROCy_train, ROCy_val = train_test_split(ROCX_train_val, ROCy_train_val, test_size=0.2, random_state=seed_value)

In [44]:
ROCX_train = ROCX_train.to_list()
ROCX_val = ROCX_val.to_list()
ROCy_train = ROCy_train.to_list()
ROCy_val = ROCy_val.to_list()
X_test = X_test.to_list()
y_test = y_test["sentence5"].to_list()

In [45]:
print(len(ROCX_train))
print(len(ROCX_val))
print(len(ROCy_train))
print(len(ROCy_val))
print(len(X_test))
print(len(y_test))

76957
19240
76957
19240
3535
3535


In [46]:
X_test[0]

'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the story using a single sentence, based on the story beginning provided.\n\n### Input:\nEvan had been saving for years. He went to the dealership and bought a really fancy BMW. Evan was so proud of his new car. He showed it off around town.\n\n### Response: '

# LLaMA Model and Baseline

TinyLlama 1.1B model from  https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T

In [None]:
base_model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
# the model will be saved with this name
new_model_name = "llama-1.1B-rocstories_lora_6"

In [None]:
# Load the model
model = AutoModelForCausalLM.from_pretrained(base_model_name)
model.config.use_cache = False
model.config.pretraining_tp = 1 # parallelism = false cuz we have only 1 device.


In [None]:
model.to("cuda")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): 

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token #pad sequences
tokenizer.padding_side = 'right'

In [None]:
X_test[0]

'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the story in a single sentence, based on the story beginning provided.\n\n### Input:\nEvan had been saving for years. He went to the dealership and bought a really fancy BMW. Evan was so proud of his new car. He showed it off around town.\n\n### Response: '

Generating story endings with baseline model

In [None]:
print("=====Start of results with Baseline model")

for i in range(5):
  print("Story", i)
  # Use the first row of dataset as a toy example
  story_beginning = X_test[i]

  device = "cuda" if torch.cuda.is_available() else "cpu"

  ## BOTH model and inputs need to be on the same device - CUDA or CPU otherwise there will be errors

  # Load the model and tokenizer, and move the model to the device
  pipe = pipeline(task="text-generation", model=model.to(device), tokenizer=tokenizer)

  # Prepare the input and move it to the correct device
  inputs = tokenizer(story_beginning, return_tensors="pt").input_ids.to(device)

  # Generate the story ending using beam search
  result = model.generate(
      inputs,
      max_new_tokens=25,                # Limiting to one sentence
      no_repeat_ngram_size=3,           # To avoid repetition
      early_stopping=True           # End generation when all beams finish
  )

  # Decode the generated ending
  generated_text = tokenizer.decode(result[0], skip_special_tokens=True)

  # Print the generated conclusion
  print(generated_text)
  print("Reference:",y_test[i])
  print("-----------\n")

print("=====End of results with Baseline model")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


=====Start of results with Baseline model
Story 0


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Complete the story using a single sentence, based on the story beginning provided.

### Input:
Evan had been saving for years. He went to the dealership and bought a really fancy BMW. Evan was so proud of his new car. He showed it off around town.

### Response: 
Even though Evan had saved for years, he still had to pay for the car. Evean had to
Reference: Evan knew he looked cool in the new car.
-----------

Story 1


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Complete the story using a single sentence, based on the story beginning provided.

### Input:
Serena was planning a surprise for her husband's birthday. She wanted to throw him a party, but his schedule was tough. He would always arrive home at widely different times. To get around it, she worked with his co-workers.

### Response: 

```
Serenna was planning to throw a surprise party for her husbands birthday, but he was
Reference: Together they were able to surprise him perfectly.
-----------

Story 2


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Complete the story using a single sentence, based on the story beginning provided.

### Input:
Fred slapped another man's butt. He thought he knew the man. The man turned around. It was not the man he knew.

### Response: 
FRED SLAPPED ANOTHER MAN'S BUTT. HE THOUGHT HE K
Reference: Fred apologized to the man.
-----------

Story 3


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Complete the story using a single sentence, based on the story beginning provided.

### Input:
I used to lived in Phoenix Arizona. On my way home from work, i met my favorite actor. I was shocked by surprise when he actually spoke to me. We had lunch at the Olive Garden.

### Response: 
I lived in Arizona. I met my actor friend at the airport. We went to lunch and he was very
Reference: Talking with him not only made my day it made my week.
-----------

Story 4
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Complete the story using a single sentence, based on the story beginning provided.

### Input:
Tom thought he was really strong. He challenged all his friends to arm wrestle. Tom was

In [None]:
y_test[0]

'Evan knew he looked cool in the new car.'

In [None]:
print(remove_prompt_from_generated_text(generated_text))

Tom thought he was really strong. He challenged all his friends to arm wrestle. Tom was able to beat them all. He would brag about it often. Tom was really good at arm wrestling. He was able beat all his friend's. He could beat them easily


In [None]:
print(extract_story_ending_from_generated_text(generated_text))

 Tom was really good at arm wrestling. He was able beat all his friend's. He could beat them easily


In [None]:
# Tokenize the input text
inputs = tokenizer(X_test, padding=True, return_tensors="pt")
inputs

{'input_ids': tensor([[    1, 13866,   338,  ...,     2,     2,     2],
        [    1, 13866,   338,  ...,     2,     2,     2],
        [    1, 13866,   338,  ...,     2,     2,     2],
        ...,
        [    1, 13866,   338,  ...,     2,     2,     2],
        [    1, 13866,   338,  ...,     2,     2,     2],
        [    1, 13866,   338,  ...,     2,     2,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
inputs['input_ids'].device

device(type='cpu')

Both the model and the inputs need to be on GPU.

In [None]:
inputs["input_ids"] = inputs["input_ids"].to("cuda")
inputs["attention_mask"] = inputs["attention_mask"].to("cuda")

TODO: It's very consuming to do generate. So let's use only a small subset of test data for now.

In [None]:
%%time
# Generate the story ending
with torch.no_grad():
  generated_ids = model.generate(inputs["input_ids"][:50], max_new_tokens=25)
generated_ids

CPU times: user 1.49 s, sys: 4.69 ms, total: 1.49 s
Wall time: 1.51 s


tensor([[    1, 13866,   338,  ..., 29896, 29929,  4259],
        [    1, 13866,   338,  ..., 29929,  4259,   338],
        [    1, 13866,   338,  ..., 29896, 29929,  4259],
        ...,
        [    1, 13866,   338,  ..., 29896, 29929,  4259],
        [    1, 13866,   338,  ..., 29929,  4259,   338],
        [    1, 13866,   338,  ...,   313, 29940, 13367]], device='cuda:0')

In [None]:
# Decode the generated ending
# Please note that currently generated text includes both the input, story beginning, and the generated ending
full_story_with_ending_test = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
full_story_with_ending_test[0]

'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the story using a single sentence, based on the story beginning provided.\n\n### Input:\nEvan had been saving for years. He went to the dealership and bought a really fancy BMW. Evan was so proud of his new car. He showed it off around town.\n\n### Response:  The 2018-19 season is in the books for the 2018-19 season'

# Batch processing with TinyLlama Baseline

:Move inputs to CUDA so that model.generate(**inputs) will use GPU. Once you do this, you can see GPU RAM increases and RAM (CPU RAM) is relatively stable. Otherwise, only CPU is used, which is slow, roughly taking 3 to 4 hours for generating ids for 3500ish test data.

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

inputs["input_ids"] = inputs["input_ids"].to(device)
inputs["attention_mask"] = inputs["attention_mask"].to(device)

In [None]:
inputs['input_ids'].device

device(type='cuda', index=0)

In [52]:
batch_size = 32

# Using a batch size of 32 during inferencing actually helps if there are errors
inputs_dataloader = DataLoader(TensorDataset(inputs["input_ids"], inputs["attention_mask"]), batch_size=batch_size)
original_story_with_endings_dataloader = DataLoader(concat_story_body_with_endings(X_test, y_test), batch_size=batch_size)
story_endings_dataloader = DataLoader(y_test, batch_size=batch_size)

NameError: name 'inputs' is not defined

# Perplexity with original stories

In [None]:
%%time


perplexity = evaluate.load("perplexity", module_type="metric")
final_perplexity_original_story = 0
for reference in original_story_with_endings_dataloader:
  weight_factor = len(reference) / len(X_test)
  final_perplexity_original_story += weight_factor * perplexity.compute(predictions=reference, model_id=base_model_name)["mean_perplexity"];

final_perplexity_original_story

# Perplexity and ROUGE with pre-trained TinyLlama as baseline

In [None]:
%%time

rouge = evaluate.load('rouge')
perplexity = evaluate.load("perplexity", module_type="metric")

generated_text_baseline = []

final_perplexity_baseline = 0
final_rouge1_baseline = 0
final_rouge2_baseline = 0
final_rougeL_baseline = 0
with torch.no_grad():
  for input_and_attention_mask, story_reference, ending_reference in zip(inputs_dataloader, original_story_with_endings_dataloader, story_endings_dataloader):

    generated_ids = model.generate(inputs=input_and_attention_mask[0],
                                   attention_mask=input_and_attention_mask[1],
                                   max_new_tokens=25,
                                   early_stopping=True
                                  )
    full_story_with_ending_test = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    for text in full_story_with_ending_test:
      generated_text_baseline.append(text)

    # Calculate perplexity using full story, after removing prompt
    weight_factor = len(ending_reference) / len(X_test)
    final_perplexity_baseline += weight_factor * perplexity.compute(predictions=vectorized_remove_prompt_from_generated_text(full_story_with_ending_test), model_id='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T')["mean_perplexity"]
    # Calculate rouge using only endings
    # Pop empty strings given in some rare cases, when model doesn't copy down the exact prompt, especially missing punctuations
    empty_indices = [i for i, s in enumerate(vectorized_remove_prompt_from_generated_text(full_story_with_ending_test)) if s == ""]
    if len(empty_indices) > 0:
      print(empty_indices)
    # empty_indices.sort(reverse=True)
    # for index in empty_indices:
    #   full_story_with_ending_test.pop(index)
    #   story_reference.pop(index)

    rouge_results = rouge.compute(predictions=vectorized_extract_story_ending_from_generated_text(full_story_with_ending_test), references=ending_reference)
    final_rouge1_baseline += weight_factor * rouge_results["rouge1"]
    final_rouge2_baseline += weight_factor * rouge_results["rouge2"]
    final_rougeL_baseline += weight_factor * rouge_results["rougeL"]



Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.46k [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 5min 42s, sys: 34.3 s, total: 6min 16s
Wall time: 6min 34s


In [None]:
# Store stories into text file

if not os.path.exists(Output_DIR):
  os.makedirs(Output_DIR)


filepaths = [os.path.join(Output_DIR, 'generated_stories_beginning_baseline.txt'),
                          os.path.join(Output_DIR, 'generated_stories_ending_baseline.txt'),
            os.path.join(Output_DIR, 'generated_text_baseline.txt')]

print(filepaths)

with open(filepaths[0], 'w') as f_beg , open(filepaths[1], 'w') as f_end, open(filepaths[2], 'w') as f_text:
  for text in generated_text_baseline:
    f_text.write(remove_newline(text) +'\n')
    f_beg.write(extract_story_beginning_from_generated_text(text) +'\n')
    f_end.write(extract_story_ending_from_generated_text(text) +'\n')


['./drive/MyDrive/EndOfStory//results_A100_run6/generated_stories_beginning_baseline.txt', './drive/MyDrive/EndOfStory//results_A100_run6/generated_stories_ending_baseline.txt', './drive/MyDrive/EndOfStory//results_A100_run6/generated_text_baseline.txt']


In [None]:
print(final_perplexity_baseline)
print(final_rouge1_baseline)
print(final_rouge2_baseline)
print(final_rougeL_baseline)

12.814715036340951
0.17196993372661393
0.024782822437946706
0.14831231254775165


# TinyLlama model LoRA Fine Tuning

In [None]:
train_encodings = tokenizer(ROCX_train, padding=True)
val_encodings = tokenizer(ROCX_val, padding=True)

In [None]:
train_labels = tokenizer(ROCy_train, padding=True, max_length=128, return_tensors="pt")['input_ids']
val_labels = tokenizer(ROCy_val, padding=True, max_length=128, return_tensors="pt")['input_ids']



In [None]:
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels
})

In [None]:
from peft import get_peft_model, LoraConfig
from peft import TaskType
from peft import PeftModel


Clearing up GPU memory

In [None]:
#Garbage collection
import gc
gc.collect()
for _ in range(11):
  torch.cuda.empty_cache()


I would like to thank the maintainers of this repository for helping us find good LoRA parameters. After multiple iterations of failures, we finally have something that gives a good output. https://github.com/AviSoori1x/Tuning-the-Finetuning/blob/main/Step%202%20Fine%20tuning%20using%20%20LoRA.py

In [None]:

#If only targeting attention blocks of the model
target_modules_only_attention = ["q_proj", "v_proj", "k_proj", "o_proj"]

#If targeting all linear layers
target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']

# LoRA config
lora_config = LoraConfig(lora_alpha=16, #multiplier of lora output when it's added to the full forward output, recommendation is 2* rank
                        lora_dropout = 0.05, #with a probability of 5% it will set random lora outputs to 0
                        r=8, # most important
                        target_modules = target_modules_only_attention,
                        bias="none",
                        task_type="CAUSAL_LM"
)


# Integrate LoRA with the model
model = get_peft_model(model, lora_config)


per_device_train_batch_size = 48
gradient_accumulation_steps = 2
optim = 'adamw_hf'
learning_rate = 1e-4 # earlier it was 5e-4 which is a little high
max_grad_norm = 1.0 #commonly used
warmup_ratio = 0.03
lr_scheduler_type = "cosine"
steps_quanta = 500

# SFT Arguments
training_args = TrainingArguments(output_dir=Output_DIR,
                                  logging_dir="./logs",
                                  overwrite_output_dir=True,
                                    num_train_epochs=3.0,
                                    per_device_train_batch_size=per_device_train_batch_size,
                                    gradient_accumulation_steps=gradient_accumulation_steps,  # total batch size = 32 * 2 = 64
                                    optim=optim,
                                    save_steps=steps_quanta, #checkpoint every X steps
                                    logging_steps=steps_quanta, #log every step
                                    learning_rate=learning_rate, #step size in optimizer update
                                    weight_decay=0.01, #higher regularization
                                    fp16=True, #enable mixed precision
                                    max_grad_norm=max_grad_norm, #gradient clipping to improve convergence,
                                    max_steps=-1, #using epochs, not steps
                                    group_by_length=True,
                                    warmup_ratio=warmup_ratio,
                                    lr_scheduler_type=lr_scheduler_type, #linear scheduler
                                    evaluation_strategy="steps",
                                    save_strategy="steps",
                                    load_best_model_at_end=True,
                                    eval_steps=steps_quanta,  # Evaluate frequently
                                    save_total_limit=1, #keep only 1 checkpoint
                                  )

trainer=SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=lora_config, #LORA
    dataset_text_field="text",
    args=training_args,
    )

#Train the model
trainer.train()



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Step,Training Loss,Validation Loss
500,1.0455,0.885196
1000,0.8744,0.879028
1500,0.8688,0.876151
2000,0.8672,0.875009


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=2406, training_loss=0.9054629864141729, metrics={'train_runtime': 2891.8421, 'train_samples_per_second': 79.835, 'train_steps_per_second': 0.832, 'total_flos': 2.513269655350272e+17, 'train_loss': 0.9054629864141729, 'epoch': 3.0})

Let's check the Peft config we have in our model.

In [None]:
if isinstance(model, PeftModel):
    print("This model has been fine-tuned using PEFT.")
else:
    print("This model is not fine-tuned.")

This model has been fine-tuned using PEFT.


In [None]:
model.print_trainable_parameters()

trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.2044


Test the fine tuned model for a single inference

In [None]:
lora_model = model

In [None]:
# from peft import PeftConfig

# new_base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

# # Check Google drive for the correct path and checkpoint folder
# lora_config_file_path = os.path.join(Output_DIR +'checkpoint-3606/')
# print(lora_config_file_path)
# config = PeftConfig.from_pretrained(Output_DIR +'checkpoint-3606/')
# print(config)
# reloaded_model = PeftModel.from_pretrained(new_base_model,
#             lora_config_file_path,
#             lora_config=config,
#             is_trainable=True # 👈 necessary
# )
# tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# reloaded_model.to("cuda")
# # eval mode
# reloaded_model.eval()

Generating text with LoRA Fine tuned model


In [None]:
print("=====Start of results with LoRA fine tuned model")



for i in range(5):
  print("Story", i)
  # Use the first row of dataset as a toy example
  story_beginning = X_test[i]

  device = "cuda" if torch.cuda.is_available() else "cpu"

  # Prepare the input and move it to the correct device
  inputs = tokenizer(story_beginning, return_tensors="pt").input_ids.to(device)

  # Generate the story ending using beam search
  result = lora_model.generate(
      inputs,
      max_new_tokens=25,                # Limiting to one sentence
  )

  # Decode the generated ending
  generated_text = tokenizer.decode(result[0], skip_special_tokens=True)

  # Print the generated conclusion
  print(generated_text)
  print("Reference:",y_test[i])
  print("-----------\n")

print("=====End of results with LoRA fine tuned model")


=====Start of results with LoRA fine tuned model
Story 0
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Complete the story using a single sentence, based on the story beginning provided.

### Input:
Evan had been saving for years. He went to the dealership and bought a really fancy BMW. Evan was so proud of his new car. He showed it off around town.

### Response: 
Evan was driving home one day. He saw a car in front of him. It was a police car. E
Reference: Evan knew he looked cool in the new car.
-----------

Story 1
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Complete the story using a single sentence, based on the story beginning provided.

### Input:
Serena was planning a surprise for her husband's birthday. She wanted to th

# Perplexity and ROUGE with the LoRA Fine Tuned Model

In [None]:
%%time

rouge = evaluate.load('rouge')
perplexity = evaluate.load("perplexity", module_type="metric")

generated_text_lora = []
final_perplexity_lora = 0
final_rouge1_lora = 0
final_rouge2_lora = 0
final_rougeL_lora = 0
with torch.no_grad():
  for input_and_attention_mask, story_reference, ending_reference in zip(inputs_dataloader, original_story_with_endings_dataloader, story_endings_dataloader):
    generated_ids = model.generate(inputs=input_and_attention_mask[0],
                                   attention_mask=input_and_attention_mask[1],
                                   max_new_tokens=25,
                                  # num_beams=4,                  # Using beam search with 4 beams
                                  # no_repeat_ngram_size=2,       # To avoid repetition
                                  # early_stopping=True           # End generation when all beams finish
                                  )
    full_story_with_ending_test = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    for text in full_story_with_ending_test:
      generated_text_lora.append(text)
    # Calculate perplexity using full story, after removing prompt
    weight_factor = len(ending_reference) / len(X_test)
    final_perplexity_lora += weight_factor * perplexity.compute(predictions=vectorized_remove_prompt_from_generated_text(full_story_with_ending_test), model_id='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T')["mean_perplexity"]
    # Calculate rouge using only endings
    # Pop empty strings given in some rare cases, when model doesn't copy down the exact prompt, especially missing punctuations
    empty_indices = [i for i, s in enumerate(vectorized_remove_prompt_from_generated_text(full_story_with_ending_test)) if s == ""]
    empty_indices.sort(reverse=True)
    for index in empty_indices:
      full_story_with_ending_test.pop(index)
      story_reference.pop(index)

    rouge_results = rouge.compute(predictions=vectorized_extract_story_ending_from_generated_text(full_story_with_ending_test), references=ending_reference)
    final_rouge1_lora += weight_factor * rouge_results["rouge1"]
    final_rouge2_lora += weight_factor * rouge_results["rouge2"]
    final_rougeL_lora += weight_factor * rouge_results["rougeL"]


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 6min 13s, sys: 38.6 s, total: 6min 52s
Wall time: 7min 6s


In [None]:
print(final_perplexity_lora)
print(final_rouge1_lora)
print(final_rouge2_lora)
print(final_rougeL_lora)
print(len(generated_text_lora))

14.495527136072917
0.1895889209571719
0.03555246622416554
0.162905555535987
3535


In [None]:
# Store stories into text file

if not os.path.exists(Output_DIR):
  os.makedirs(Output_DIR)


filepaths = [os.path.join(Output_DIR, 'generated_stories_beginning_lora_no_beam.txt'),
                          os.path.join(Output_DIR, 'generated_stories_ending_lora_no_beam.txt'),
            os.path.join(Output_DIR, 'generated_text_lora_no_beam.txt')]

print(filepaths)

with open(filepaths[0], 'w') as f_beg , open(filepaths[1], 'w') as f_end, open(filepaths[2], 'w') as f_text:
  for text in generated_text_lora:
    f_text.write(remove_newline(text) +'\n')
    f_beg.write(extract_story_beginning_from_generated_text(text) +'\n')
    f_end.write(extract_story_ending_from_generated_text(text) +'\n')


['./drive/MyDrive/EndOfStory//results_A100_run6/generated_stories_beginning_lora_no_beam.txt', './drive/MyDrive/EndOfStory//results_A100_run6/generated_stories_ending_lora_no_beam.txt', './drive/MyDrive/EndOfStory//results_A100_run6/generated_text_lora_no_beam.txt']


In [None]:

# To Stop Google Colab GPU after training is done
# from google.colab import runtime

# runtime.unassign()

In [2]:
%pip install prometheus-eval
%pip install triton
%pip install vllm


Collecting prometheus-eval
  Downloading prometheus_eval-0.1.20-py3-none-any.whl.metadata (26 kB)
Collecting aiolimiter<2.0.0,>=1.1.0 (from prometheus-eval)
  Downloading aiolimiter-1.2.1-py3-none-any.whl.metadata (4.5 kB)
Collecting fastchat<0.2.0,>=0.1.0 (from prometheus-eval)
  Downloading fastchat-0.1.0-py3-none-any.whl.metadata (195 bytes)
Collecting litellm<2.0.0,>=1.40.0 (from prometheus-eval)
  Downloading litellm-1.54.0-py3-none-any.whl.metadata (33 kB)
Collecting httpx<0.28.0,>=0.23.0 (from litellm<2.0.0,>=1.40.0->prometheus-eval)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting openai>=1.55.3 (from litellm<2.0.0,>=1.40.0->prometheus-eval)
  Downloading openai-1.57.0-py3-none-any.whl.metadata (24 kB)
Collecting python-dotenv>=0.2.0 (from litellm<2.0.0,>=1.40.0->prometheus-eval)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting tiktoken>=0.7.0 (from litellm<2.0.0,>=1.40.0->prometheus-eval)
  Downloading tiktoken-0.8.0-cp310-

In [3]:
# LLM as Eval Judge
from prometheus_eval.vllm import VLLM
from prometheus_eval import PrometheusEval
from prometheus_eval.prompts import ABSOLUTE_PROMPT, SCORE_RUBRIC_TEMPLATE

In [4]:
judge_model = VLLM(model="prometheus-eval/prometheus-7b-v2.0", gpu_memory_utilization=0.5, enforce_eager=True, max_model_len=2048)

judge = PrometheusEval(model=judge_model, absolute_grade_template=ABSOLUTE_PROMPT)
rubric_data = {
  "criteria":"Does the model successfully generate a response that is a suitable ending to the provided story body, in terms of language fluency, semantics coherence, and story flow",
  "score1_description":"The ending is riddled with language errors, is incoherent or disconnected from the story body, and disrupts the narrative flow. It leaves the reader confused or unsatisfied.",
  "score2_description":"The ending has noticeable language issues, inconsistencies, or rushed transitions. While it ties some loose ends, it feels incomplete or awkward.",
  "score3_description":"The ending is adequately written with minor language errors and mostly logical progression, but it lacks emotional impact or creativity in its resolution.",
  "score4_description":"The ending is well-crafted, fluent, and coherent, with a fitting and satisfying resolution. It enhances the story’s themes and characters, though it might lack exceptional originality or depth.",
  "score5_description":"The ending is flawless in language fluency, beautifully integrates with the story, and delivers a compelling, imaginative, and emotionally resonant conclusion that elevates the entire narrative."
}
score_rubric = SCORE_RUBRIC_TEMPLATE.format(**rubric_data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

INFO 12-08 19:17:13 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.
INFO 12-08 19:17:13 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='prometheus-eval/prometheus-7b-v2.0', speculative_config=None, tokenizer='prometheus-eval/prometheus-7b-v2.0', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=prometheus-

tokenizer_config.json:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

INFO 12-08 19:17:14 selector.py:135] Using Flash Attention backend.
INFO 12-08 19:17:15 model_runner.py:1072] Starting to load model prometheus-eval/prometheus-7b-v2.0...
INFO 12-08 19:17:16 weight_utils.py:243] Using model weights format ['*.safetensors']


model-00003-of-00008.safetensors:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/789M [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.92G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/22.8k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]


INFO 12-08 19:18:08 model_runner.py:1077] Loading model weights took 13.4966 GB
INFO 12-08 19:18:09 worker.py:232] Memory profiling results: total_gpu_memory=39.56GiB initial_memory_usage=14.00GiB peak_torch_memory=13.81GiB memory_usage_post_profile=14.01GiB non_torch_memory=0.51GiB kv_cache_size=5.46GiB gpu_memory_utilization=0.50
INFO 12-08 19:18:10 gpu_executor.py:113] # GPU blocks: 2796, # CPU blocks: 2048
INFO 12-08 19:18:10 gpu_executor.py:117] Maximum concurrency for 2048 tokens per request: 21.84x


In [None]:
X_test[0]

In [None]:
original_story_bodies = X_test
original_story_endings_reference = y_test

In [None]:
X_test_story_only = vectorized_extract_story_beginning_from_generated_text(X_test)

X_test_story_only[0]

In [None]:
len(X_test_story_only[0])

In [None]:
len(y_test[0])

In [None]:
%%time
LLM_judge_score_original_story = 0
for original_stories, reference in zip(original_story_with_endings_dataloader, story_endings_dataloader):
  print(len(original_stories), len(reference))
  feedback, score = judge.absolute_grade(
      instructions=original_stories,
      responses=reference,
      rubric=score_rubric,
      reference_answers=reference
    )
  LLM_judge_score_original_story += sum(score)
LLM_judge_score_original_story = LLM_judge_score_original_story/len(X_test)
LLM_judge_score_original_story

In [None]:
generated_story_endings = []

for text in generated_text_lora:
  ending = extract_story_ending_from_generated_text(text)
  generated_story_endings.append(ending)

len(generated_story_endings)

In [None]:
generated_story_endings[0]

In [None]:
original_story_with_endings_dataloader

In [None]:
instructions_dataloader = DataLoader(X_test, batch_size=batch_size)
reference_dataloader = DataLoader(y_test, batch_size=batch_size)
generated_endings_dataloader = DataLoader(generated_story_endings, batch_size=batch_size)

In [None]:
%%time
LLM_judge_score_fine_tuning = 0
i = 0
for instruction, reference, gen_ending in zip(instructions_dataloader, reference_dataloader,generated_endings_dataloader):
  i += 1
  feedback, score = judge.absolute_grade(
      instructions=instruction,
      responses=gen_ending,
      rubric=score_rubric,
      reference_answers=reference
    )
  print("Iteration", i, score)
  LLM_judge_score_fine_tuning += sum(score)
LLM_judge_score_fine_tuning = LLM_judge_score_fine_tuning/len(X_test)
LLM_judge_score_fine_tuning

# LLM as a judge eval on Fine tuned model

In [None]:
generated_story_endings = []

for text in generated_text_lora:
  ending = extract_story_ending_from_generated_text(text)
  generated_story_endings.append(ending)

len(generated_story_endings)

After removing extra params (beam search, early_stop from generate)


In [None]:
%%time
LLM_judge_score_fine_tuning = 0
for instruction, reference, gen_ending in zip(instructions_dataloader, reference_dataloader,generated_endings_dataloader):
  print(len(original_stories), len(reference))
  feedback, score = judge.absolute_grade(
      instructions=instruction,
      responses=gen_ending,
      rubric=score_rubric,
      reference_answers=reference
    )
  LLM_judge_score_fine_tuning += sum(score)
LLM_judge_score_fine_tuning = LLM_judge_score_fine_tuning/len(X_test)
LLM_judge_score_fine_tuning

# LLM as a judge for Baseline

In [47]:
import os
filepaths = [os.path.join(Output_DIR, 'generated_stories_beginning_baseline.txt'),
             os.path.join(Output_DIR, 'generated_stories_ending_baseline.txt'),
             os.path.join(Output_DIR, 'generated_text_baseline.txt')]

# Read the beginnings
with open(filepaths[0], 'r', encoding='utf-8') as f_beg:
    beginnings = f_beg.read().splitlines()  # This returns a list of strings, one per line

# Read the endings
with open(filepaths[1], 'r', encoding='utf-8') as f_end:
    endings = f_end.read().splitlines()

# Read the full texts
with open(filepaths[2], 'r', encoding='utf-8') as f_text:
    full_texts = f_text.read().splitlines()

# Now you have three arrays: beginnings, endings, and full_texts.
print(beginnings[:5])  # print first 5 items for a quick check
print(endings[:5])
print(full_texts[:5])

['Evan had been saving for years. He went to the dealership and bought a really fancy BMW. Evan was so proud of his new car. He showed it off around town.', "Serena was planning a surprise for her husband's birthday. She wanted to throw him a party, but his schedule was tough. He would always arrive home at widely different times. To get around it, she worked with his co-workers.", "Fred slapped another man's butt. He thought he knew the man. The man turned around. It was not the man he knew.", 'I used to lived in Phoenix Arizona. On my way home from work, i met my favorite actor. I was shocked by surprise when he actually spoke to me. We had lunch at the Olive Garden.', 'Tom thought he was really strong. He challenged all his friends to arm wrestle. Tom was able to beat them all. He would brag about it often.']
['    Evan had been saving for years. He went to the dealership and bought a really fancy BMW. E', ' ###### Instruction:Complete the story using a single sentence, based on the

In [49]:
generated_text_baseline = full_texts

In [53]:
generated_story_endings = []

for text in generated_text_baseline:
  ending = extract_story_ending_from_generated_text(text)
  generated_story_endings.append(ending)

len(generated_story_endings)

3535

In [54]:
instructions_dataloader = DataLoader(X_test, batch_size=batch_size)
reference_dataloader = DataLoader(y_test, batch_size=batch_size)
generated_endings_dataloader = DataLoader(generated_story_endings, batch_size=batch_size)

In [55]:
%%time
LLM_judge_score_baseline = 0
i = 0
for instruction, reference, gen_ending in zip(instructions_dataloader, reference_dataloader,generated_endings_dataloader):
  i+= 1
  feedback, score = judge.absolute_grade(
      instructions=instruction,
      responses=gen_ending,
      rubric=score_rubric,
      reference_answers=reference
    )
  LLM_judge_score_baseline += sum(score)
  print("iteration", i, "score:", score)
LLM_judge_score_baseline = LLM_judge_score_baseline/len(X_test)
LLM_judge_score_baseline

Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.86it/s, est. speed input: 2389.16 toks/s, output: 629.75 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12186.10it/s]


iteration 1 score: [1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:11<00:00,  2.76it/s, est. speed input: 1703.64 toks/s, output: 446.54 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12206.05it/s]


iteration 2 score: [1, 1, 1, 1, 1, 5, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.43it/s, est. speed input: 2125.78 toks/s, output: 542.00 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12609.71it/s]


iteration 3 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.52it/s, est. speed input: 2793.46 toks/s, output: 738.36 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10680.17it/s]


iteration 4 score: [1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.57it/s, est. speed input: 2196.29 toks/s, output: 585.87 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11957.04it/s]


iteration 5 score: [1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.63it/s, est. speed input: 2247.06 toks/s, output: 606.26 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11158.77it/s]


iteration 6 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 3, 4, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.31it/s, est. speed input: 2663.34 toks/s, output: 713.18 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11862.98it/s]


iteration 7 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4]


Processed prompts: 100%|██████████| 32/32 [00:12<00:00,  2.52it/s, est. speed input: 1553.76 toks/s, output: 433.19 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10888.11it/s]


iteration 8 score: [1, 3, 2, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 5, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.98it/s, est. speed input: 2451.42 toks/s, output: 626.71 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7474.40it/s]


iteration 9 score: [1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.96it/s, est. speed input: 3074.78 toks/s, output: 723.62 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 13262.62it/s]


iteration 10 score: [1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.83it/s, est. speed input: 2367.60 toks/s, output: 620.07 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11501.09it/s]


iteration 11 score: [3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.63it/s, est. speed input: 2238.26 toks/s, output: 562.31 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.76s/it, est. speed input: 164.42 toks/s, output: 42.83 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12911.76it/s]


iteration 12 score: [1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.41it/s, est. speed input: 2714.74 toks/s, output: 734.96 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12181.68it/s]


iteration 13 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.54it/s, est. speed input: 2189.45 toks/s, output: 571.48 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11827.43it/s]


iteration 14 score: [1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.58it/s, est. speed input: 2218.32 toks/s, output: 553.12 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.45s/it, est. speed input: 179.83 toks/s, output: 44.02 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10933.34it/s]


iteration 15 score: [3, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.42it/s, est. speed input: 2736.16 toks/s, output: 678.62 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12303.39it/s]


iteration 16 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.92it/s, est. speed input: 2415.07 toks/s, output: 627.07 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11051.27it/s]


iteration 17 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.71it/s, est. speed input: 2925.46 toks/s, output: 717.43 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12412.63it/s]


iteration 18 score: [1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 4, 3, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.06it/s, est. speed input: 2499.39 toks/s, output: 635.16 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12562.50it/s]


iteration 19 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.31it/s, est. speed input: 2663.07 toks/s, output: 688.61 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12425.27it/s]


iteration 20 score: [1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.24it/s, est. speed input: 2610.18 toks/s, output: 670.49 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 2/2 [00:04<00:00,  2.01s/it, est. speed input: 306.52 toks/s, output: 78.24 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12402.30it/s]


iteration 21 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s, est. speed input: 2512.43 toks/s, output: 660.14 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12275.26it/s]


iteration 22 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.91it/s, est. speed input: 3036.49 toks/s, output: 711.73 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.23s/it, est. speed input: 143.16 toks/s, output: 44.65 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11574.48it/s]


iteration 23 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.23it/s, est. speed input: 2599.87 toks/s, output: 667.84 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 2/2 [00:04<00:00,  2.30s/it, est. speed input: 262.91 toks/s, output: 78.16 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11746.69it/s]


iteration 24 score: [1, 1, 1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 4, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.22it/s, est. speed input: 2598.22 toks/s, output: 715.10 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10780.54it/s]


iteration 25 score: [1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.92it/s, est. speed input: 2414.63 toks/s, output: 616.66 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11049.45it/s]


iteration 26 score: [1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.22it/s, est. speed input: 1986.07 toks/s, output: 530.33 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12117.89it/s]


iteration 27 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.57it/s, est. speed input: 2817.87 toks/s, output: 709.57 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.03s/it, est. speed input: 300.70 toks/s, output: 42.82 toks/s]


Retrying failed batches: Attempt 2/10


Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.23s/it, est. speed input: 144.57 toks/s, output: 42.59 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11362.83it/s]


iteration 28 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  5.05it/s, est. speed input: 3135.59 toks/s, output: 752.80 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12745.01it/s]


iteration 29 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.74it/s, est. speed input: 2304.84 toks/s, output: 596.28 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 2/2 [00:03<00:00,  1.91s/it, est. speed input: 317.08 toks/s, output: 80.19 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12330.52it/s]


iteration 30 score: [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.42it/s, est. speed input: 2718.33 toks/s, output: 671.39 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12633.45it/s]


iteration 31 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  5.00it/s, est. speed input: 3087.97 toks/s, output: 781.80 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12560.15it/s]


iteration 32 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  5.02it/s, est. speed input: 3103.10 toks/s, output: 760.51 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 2/2 [00:05<00:00,  2.70s/it, est. speed input: 229.68 toks/s, output: 72.86 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12288.75it/s]


iteration 33 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.94it/s, est. speed input: 3054.13 toks/s, output: 728.70 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 9993.87it/s]


iteration 34 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.25it/s, est. speed input: 2627.41 toks/s, output: 657.52 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10238.59it/s]


iteration 35 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.39it/s, est. speed input: 2094.48 toks/s, output: 543.22 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11455.93it/s]


iteration 36 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:13<00:00,  2.46it/s, est. speed input: 1520.17 toks/s, output: 421.23 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 9408.22it/s]


iteration 37 score: [1, 1, 1, 2, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.63it/s, est. speed input: 2856.75 toks/s, output: 722.94 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12561.32it/s]


iteration 38 score: [1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.90it/s, est. speed input: 3017.75 toks/s, output: 747.97 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.50s/it, est. speed input: 244.00 toks/s, output: 44.00 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11550.58it/s]


iteration 39 score: [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.33it/s, est. speed input: 2065.13 toks/s, output: 539.46 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11239.13it/s]


iteration 40 score: [1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.80it/s, est. speed input: 2958.21 toks/s, output: 739.06 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 13035.91it/s]


iteration 41 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.44it/s, est. speed input: 2731.29 toks/s, output: 730.17 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11511.94it/s]


iteration 42 score: [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.24it/s, est. speed input: 2630.70 toks/s, output: 683.39 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 6757.17it/s]


iteration 43 score: [1, 1, 2, 1, 1, 1, 1, 3, 1, 1, 3, 1, 1, 1, 4, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.46it/s, est. speed input: 2749.74 toks/s, output: 689.98 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 9689.41it/s]


iteration 44 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:11<00:00,  2.88it/s, est. speed input: 1776.83 toks/s, output: 473.02 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10533.49it/s]


iteration 45 score: [1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.82it/s, est. speed input: 2363.28 toks/s, output: 624.45 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10881.93it/s]


iteration 46 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 3, 1, 1, 1, 1, 1, 1, 4, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:10<00:00,  3.19it/s, est. speed input: 1968.39 toks/s, output: 539.20 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.03s/it, est. speed input: 200.79 toks/s, output: 44.18 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10906.69it/s]


iteration 47 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.62it/s, est. speed input: 2862.60 toks/s, output: 716.19 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12366.88it/s]


iteration 48 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.95it/s, est. speed input: 3056.15 toks/s, output: 741.75 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11615.55it/s]


iteration 49 score: [3, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 3, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.29it/s, est. speed input: 2640.68 toks/s, output: 670.45 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12170.63it/s]


iteration 50 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.65it/s, est. speed input: 2256.80 toks/s, output: 610.99 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 8216.57it/s]


iteration 51 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.31it/s, est. speed input: 2655.98 toks/s, output: 686.40 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12193.85it/s]


iteration 52 score: [1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.20it/s, est. speed input: 1969.62 toks/s, output: 508.62 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 8941.29it/s]


iteration 53 score: [1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.63it/s, est. speed input: 2857.59 toks/s, output: 721.08 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 6440.08it/s]


iteration 54 score: [1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  5.16it/s, est. speed input: 3198.76 toks/s, output: 790.33 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.11s/it, est. speed input: 200.27 toks/s, output: 45.08 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11359.94it/s]


iteration 55 score: [1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.12it/s, est. speed input: 2544.80 toks/s, output: 677.49 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.02s/it, est. speed input: 154.48 toks/s, output: 44.78 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10460.43it/s]


iteration 56 score: [1, 1, 1, 1, 1, 4, 1, 1, 3, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 1, 3, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.02it/s, est. speed input: 2475.46 toks/s, output: 642.42 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12665.63it/s]


iteration 57 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.61it/s, est. speed input: 2839.02 toks/s, output: 706.80 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 8171.05it/s]


iteration 58 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  5.20it/s, est. speed input: 3207.18 toks/s, output: 824.55 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11860.88it/s]


iteration 59 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 3, 4, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.67it/s, est. speed input: 2883.78 toks/s, output: 731.99 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.90s/it, est. speed input: 127.42 toks/s, output: 44.72 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10840.62it/s]


iteration 60 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.97it/s, est. speed input: 3056.92 toks/s, output: 747.46 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11529.74it/s]


iteration 61 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:10<00:00,  2.91it/s, est. speed input: 1802.19 toks/s, output: 479.35 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11731.29it/s]


iteration 62 score: [1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.67it/s, est. speed input: 2871.86 toks/s, output: 734.66 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7593.22it/s]


iteration 63 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.52it/s, est. speed input: 2176.47 toks/s, output: 558.99 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.72s/it, est. speed input: 131.49 toks/s, output: 43.83 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11633.68it/s]


iteration 64 score: [1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.33it/s, est. speed input: 2049.21 toks/s, output: 576.53 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10905.80it/s]


iteration 65 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 3, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.53it/s, est. speed input: 2793.09 toks/s, output: 726.17 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.10s/it, est. speed input: 196.91 toks/s, output: 44.22 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7238.58it/s]


iteration 66 score: [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 5, 1, 3, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.33it/s, est. speed input: 2662.64 toks/s, output: 696.35 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10707.44it/s]


iteration 67 score: [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.32it/s, est. speed input: 2661.36 toks/s, output: 688.10 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12368.02it/s]


iteration 68 score: [1, 2, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 4]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.60it/s, est. speed input: 2214.79 toks/s, output: 570.86 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 9883.49it/s]


iteration 69 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.21it/s, est. speed input: 2596.44 toks/s, output: 682.81 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.61s/it, est. speed input: 132.16 toks/s, output: 40.15 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10883.70it/s]


iteration 70 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 3, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:10<00:00,  3.10it/s, est. speed input: 1911.90 toks/s, output: 538.62 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10540.94it/s]


iteration 71 score: [1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.28it/s, est. speed input: 2022.80 toks/s, output: 536.00 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11869.27it/s]


iteration 72 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.19it/s, est. speed input: 2585.06 toks/s, output: 698.64 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12112.42it/s]


iteration 73 score: [1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.89it/s, est. speed input: 2391.93 toks/s, output: 627.71 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.38s/it, est. speed input: 183.62 toks/s, output: 44.94 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11587.48it/s]


iteration 74 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.65it/s, est. speed input: 2247.37 toks/s, output: 628.71 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.02s/it, est. speed input: 203.83 toks/s, output: 43.68 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10262.08it/s]


iteration 75 score: [1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 5, 3, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.74it/s, est. speed input: 2300.68 toks/s, output: 632.55 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10859.92it/s]


iteration 76 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.81it/s, est. speed input: 2969.74 toks/s, output: 743.48 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12114.61it/s]


iteration 77 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.68it/s, est. speed input: 2875.88 toks/s, output: 736.08 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11690.42it/s]


iteration 78 score: [1, 1, 1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  5.21it/s, est. speed input: 3218.28 toks/s, output: 781.04 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.06s/it, est. speed input: 198.91 toks/s, output: 44.42 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11664.01it/s]


iteration 79 score: [1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.99it/s, est. speed input: 3078.27 toks/s, output: 762.28 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12178.36it/s]


iteration 80 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.60it/s, est. speed input: 2835.74 toks/s, output: 720.26 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.26s/it, est. speed input: 119.04 toks/s, output: 44.50 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12180.57it/s]


iteration 81 score: [1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.35it/s, est. speed input: 2687.86 toks/s, output: 689.95 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7308.34it/s]


iteration 82 score: [1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.73it/s, est. speed input: 2923.79 toks/s, output: 779.11 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it, est. speed input: 108.88 toks/s, output: 44.11 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10156.47it/s]


iteration 83 score: [1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  5.32it/s, est. speed input: 3273.08 toks/s, output: 791.02 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7743.48it/s]


iteration 84 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 5, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.67it/s, est. speed input: 2883.77 toks/s, output: 754.48 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 2/2 [00:04<00:00,  2.13s/it, est. speed input: 287.74 toks/s, output: 77.45 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11787.96it/s]


iteration 85 score: [1, 1, 2, 1, 1, 4, 5, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.98it/s, est. speed input: 3074.65 toks/s, output: 761.50 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.92s/it, est. speed input: 212.33 toks/s, output: 43.22 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12154.10it/s]


iteration 86 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  5.08it/s, est. speed input: 3139.32 toks/s, output: 741.20 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 9865.32it/s]


iteration 87 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.68it/s, est. speed input: 2895.57 toks/s, output: 716.61 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7538.63it/s]


iteration 88 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  5.07it/s, est. speed input: 3119.82 toks/s, output: 800.40 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12452.93it/s]


iteration 89 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.94it/s, est. speed input: 3056.77 toks/s, output: 757.04 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12547.23it/s]


iteration 90 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.53it/s, est. speed input: 2188.08 toks/s, output: 596.46 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 9903.17it/s]


iteration 91 score: [1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:13<00:00,  2.41it/s, est. speed input: 1479.85 toks/s, output: 424.14 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10869.59it/s]


iteration 92 score: [1, 1, 1, 1, 1, 5, 1, 1, 1, 5, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.71it/s, est. speed input: 2905.39 toks/s, output: 761.39 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12066.68it/s]


iteration 93 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.15it/s, est. speed input: 2560.73 toks/s, output: 682.29 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12022.37it/s]


iteration 94 score: [1, 1, 3, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.82it/s, est. speed input: 2980.81 toks/s, output: 723.24 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.68s/it, est. speed input: 228.21 toks/s, output: 44.37 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12662.05it/s]


iteration 95 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.79it/s, est. speed input: 2337.98 toks/s, output: 617.41 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11982.66it/s]


iteration 96 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:10<00:00,  2.98it/s, est. speed input: 1839.53 toks/s, output: 480.99 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12233.86it/s]


iteration 97 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.64it/s, est. speed input: 2251.03 toks/s, output: 581.05 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.62s/it, est. speed input: 169.86 toks/s, output: 45.02 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11629.64it/s]


iteration 98 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  5.27it/s, est. speed input: 3256.70 toks/s, output: 790.00 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.26s/it, est. speed input: 268.54 toks/s, output: 44.24 toks/s]


Retrying failed batches: Attempt 2/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.75s/it, est. speed input: 162.00 toks/s, output: 44.30 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11526.77it/s]


iteration 99 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.88it/s, est. speed input: 2393.98 toks/s, output: 600.79 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.88s/it, est. speed input: 161.50 toks/s, output: 44.56 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11544.62it/s]


iteration 100 score: [1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.62it/s, est. speed input: 2841.89 toks/s, output: 720.36 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.24s/it, est. speed input: 146.82 toks/s, output: 44.07 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12191.64it/s]


iteration 101 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.88it/s, est. speed input: 3006.95 toks/s, output: 776.48 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11958.10it/s]


iteration 102 score: [1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 4, 1, 3, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.88it/s, est. speed input: 3013.87 toks/s, output: 774.88 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12223.84it/s]


iteration 103 score: [1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.88it/s, est. speed input: 2384.83 toks/s, output: 649.94 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 6894.27it/s]


iteration 104 score: [1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 1, 3, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.36it/s, est. speed input: 2063.48 toks/s, output: 548.04 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12181.68it/s]


iteration 105 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 3, 4, 1, 1, 1, 1, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.12it/s, est. speed input: 2536.06 toks/s, output: 645.69 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11047.64it/s]


iteration 106 score: [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.28it/s, est. speed input: 2027.82 toks/s, output: 534.13 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12082.98it/s]


iteration 107 score: [1, 1, 1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.92it/s, est. speed input: 3027.23 toks/s, output: 759.69 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11726.17it/s]


iteration 108 score: [1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  5.04it/s, est. speed input: 3108.72 toks/s, output: 775.88 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12307.91it/s]


iteration 109 score: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.06it/s, est. speed input: 2501.64 toks/s, output: 641.07 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11545.61it/s]


iteration 110 score: [1, 1, 1, 1, 3, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1]


Processed prompts: 100%|██████████| 15/15 [00:05<00:00,  2.69it/s, est. speed input: 1665.92 toks/s, output: 427.61 toks/s]


Processed 15/15 instances.


Finalizing: 100%|██████████| 15/15 [00:00<00:00, 11635.76it/s]

iteration 111 score: [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1]
CPU times: user 16min 40s, sys: 2.42 s, total: 16min 42s
Wall time: 16min 36s





1.1796322489391797

In [56]:
print(LLM_judge_score_baseline)

1.1796322489391797


In [57]:
filepaths = [os.path.join(Output_DIR, 'generated_stories_beginning_lora_no_beam.txt'),
             os.path.join(Output_DIR, 'generated_stories_ending_lora_no_beam.txt'),
             os.path.join(Output_DIR, 'generated_text_lora_no_beam.txt')]

# Read the beginnings
with open(filepaths[0], 'r', encoding='utf-8') as f_beg:
    beginnings = f_beg.read().splitlines()  # This returns a list of strings, one per line

# Read the endings
with open(filepaths[1], 'r', encoding='utf-8') as f_end:
    endings = f_end.read().splitlines()

# Read the full texts
with open(filepaths[2], 'r', encoding='utf-8') as f_text:
    full_texts = f_text.read().splitlines()

# Now you have three arrays: beginnings, endings, and full_texts.
print(beginnings[:5])  # print first 5 items for a quick check
print(endings[:5])
print(full_texts[:5])

['Evan had been saving for years. He went to the dealership and bought a really fancy BMW. Evan was so proud of his new car. He showed it off around town.', "Serena was planning a surprise for her husband's birthday. She wanted to throw him a party, but his schedule was tough. He would always arrive home at widely different times. To get around it, she worked with his co-workers.", "Fred slapped another man's butt. He thought he knew the man. The man turned around. It was not the man he knew.", 'I used to lived in Phoenix Arizona. On my way home from work, i met my favorite actor. I was shocked by surprise when he actually spoke to me. We had lunch at the Olive Garden.', 'Tom thought he was really strong. He challenged all his friends to arm wrestle. Tom was able to beat them all. He would brag about it often.']
[' ###Evan was driving home one day. He was driving very fast. He hit a patch of ice. He', " her husband surprised her with a surprise party. He arrived home at 10pm. He was so

In [58]:
generated_text_lora = full_texts

In [59]:
generated_story_endings = []

for text in generated_text_lora:
  ending = extract_story_ending_from_generated_text(text)
  generated_story_endings.append(ending)

len(generated_story_endings)

3535

In [60]:
%%time
# Prepare the dataloaders
instructions_dataloader = DataLoader(X_test, batch_size=batch_size)
reference_dataloader = DataLoader(y_test, batch_size=batch_size)
generated_endings_dataloader = DataLoader(generated_story_endings, batch_size=batch_size)

LLM_judge_score_lora = 0
i = 0
for instruction, reference, gen_ending in zip(instructions_dataloader, reference_dataloader,generated_endings_dataloader):
  i+= 1
  feedback, score = judge.absolute_grade(
      instructions=instruction,
      responses=gen_ending,
      rubric=score_rubric,
      reference_answers=reference
    )
  LLM_judge_score_lora += sum(score)
  print("iteration", i, "score:", score)
LLM_judge_score_lora = LLM_judge_score_lora/len(X_test)
LLM_judge_score_lora

Processed prompts: 100%|██████████| 32/32 [00:11<00:00,  2.76it/s, est. speed input: 1709.43 toks/s, output: 480.09 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11506.02it/s]


iteration 1 score: [1, 1, 1, 3, 3, 2, 4, 4, 1, 2, 1, 3, 1, 3, 3, 1, 1, 3, 1, 4, 1, 3, 1, 1, 1, 1, 3, 1, 3, 1, 2, 2]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.66it/s, est. speed input: 2264.57 toks/s, output: 644.94 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10501.35it/s]


iteration 2 score: [1, 1, 3, 1, 2, 1, 3, 1, 3, 1, 2, 1, 3, 1, 3, 3, 1, 1, 2, 2, 2, 2, 3, 3, 1, 1, 3, 2, 1, 1, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.30it/s, est. speed input: 2668.41 toks/s, output: 782.12 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7970.17it/s]


iteration 3 score: [2, 1, 1, 3, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 2, 1, 2, 1, 1, 4, 4, 1, 1, 3, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.09it/s, est. speed input: 2531.22 toks/s, output: 691.56 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10441.71it/s]


iteration 4 score: [3, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 2, 1, 4, 3, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 4, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.21it/s, est. speed input: 2588.98 toks/s, output: 693.16 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11332.13it/s]


iteration 5 score: [4, 2, 1, 4, 3, 2, 3, 4, 3, 1, 1, 2, 1, 1, 3, 3, 3, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.28it/s, est. speed input: 2647.62 toks/s, output: 744.78 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11515.89it/s]


iteration 6 score: [3, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 3, 1, 2, 2, 3, 1, 1, 1, 3, 2, 2, 1, 3, 3, 4, 3, 1, 4]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.18it/s, est. speed input: 2582.00 toks/s, output: 735.57 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10868.71it/s]


iteration 7 score: [4, 3, 2, 3, 2, 1, 3, 1, 3, 1, 1, 3, 3, 1, 1, 1, 1, 2, 1, 1, 3, 3, 3, 3, 1, 1, 3, 1, 3, 1, 3, 2]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.51it/s, est. speed input: 2166.69 toks/s, output: 623.32 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11471.60it/s]


iteration 8 score: [3, 3, 3, 4, 1, 4, 3, 3, 1, 4, 1, 1, 1, 2, 1, 1, 1, 2, 3, 1, 1, 3, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.74it/s, est. speed input: 2309.61 toks/s, output: 679.45 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11238.19it/s]


iteration 9 score: [3, 3, 1, 3, 1, 1, 1, 2, 1, 3, 1, 3, 2, 1, 1, 2, 2, 3, 2, 3, 1, 1, 1, 1, 3, 1, 1, 1, 1, 3, 3, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.18it/s, est. speed input: 2591.89 toks/s, output: 714.01 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11285.44it/s]


iteration 10 score: [1, 1, 1, 1, 4, 3, 1, 1, 1, 1, 3, 2, 3, 1, 2, 1, 2, 3, 3, 1, 2, 1, 3, 1, 3, 3, 3, 2, 3, 1, 3, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.28it/s, est. speed input: 2645.69 toks/s, output: 705.44 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11227.85it/s]


iteration 11 score: [2, 2, 1, 2, 2, 1, 3, 2, 1, 3, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 3, 3, 2, 3, 4, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.37it/s, est. speed input: 2697.99 toks/s, output: 763.92 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10154.16it/s]


iteration 12 score: [1, 1, 1, 3, 1, 3, 1, 3, 2, 1, 3, 2, 3, 3, 3, 2, 1, 3, 1, 1, 1, 1, 3, 2, 3, 1, 1, 1, 2, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.48it/s, est. speed input: 2763.15 toks/s, output: 743.31 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11269.33it/s]


iteration 13 score: [3, 3, 1, 3, 3, 1, 1, 3, 3, 3, 2, 1, 1, 1, 4, 1, 1, 3, 3, 3, 1, 4, 2, 3, 4, 2, 1, 1, 1, 3, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.22it/s, est. speed input: 2612.95 toks/s, output: 727.43 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10421.44it/s]


iteration 14 score: [2, 1, 1, 4, 2, 1, 3, 1, 3, 2, 1, 1, 3, 2, 1, 1, 3, 1, 2, 1, 3, 1, 2, 1, 3, 3, 3, 1, 1, 3, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.51it/s, est. speed input: 2793.78 toks/s, output: 744.42 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11707.76it/s]


iteration 15 score: [1, 1, 3, 3, 3, 4, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 1, 3, 1, 2, 4, 2, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.25it/s, est. speed input: 2015.83 toks/s, output: 577.85 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11463.76it/s]


iteration 16 score: [1, 2, 1, 1, 3, 1, 3, 1, 1, 1, 2, 2, 3, 3, 2, 1, 3, 1, 2, 2, 1, 1, 2, 3, 3, 3, 2, 1, 3, 3, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.81it/s, est. speed input: 2354.03 toks/s, output: 691.70 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11070.42it/s]


iteration 17 score: [3, 1, 1, 1, 1, 2, 3, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 2, 4, 1, 3, 2, 3, 3, 3, 3]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.50it/s, est. speed input: 2180.74 toks/s, output: 608.33 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11511.94it/s]


iteration 18 score: [1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 3, 1, 1, 3, 1, 1, 2, 2, 2, 1, 3, 1, 1, 3, 1, 3, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.11it/s, est. speed input: 2526.94 toks/s, output: 731.22 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 9969.38it/s]


iteration 19 score: [3, 3, 2, 3, 1, 4, 1, 3, 2, 1, 2, 3, 1, 2, 2, 2, 1, 1, 3, 1, 3, 2, 1, 3, 1, 2, 2, 1, 3, 4, 4, 1]


Processed prompts: 100%|██████████| 32/32 [00:10<00:00,  3.11it/s, est. speed input: 1921.72 toks/s, output: 574.97 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10767.57it/s]


iteration 20 score: [2, 1, 3, 3, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 2, 5, 1, 1, 2, 1, 1, 2, 5, 3, 1, 3, 2, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.33it/s, est. speed input: 2666.27 toks/s, output: 738.63 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11734.37it/s]


iteration 21 score: [2, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 3, 1, 2, 1, 2, 1, 2, 3, 1, 1, 2, 2, 1, 1, 4, 1, 3, 1, 1, 4, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.23it/s, est. speed input: 2607.03 toks/s, output: 707.63 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10912.90it/s]


iteration 22 score: [3, 1, 2, 1, 3, 3, 1, 2, 2, 3, 3, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.91it/s, est. speed input: 3036.95 toks/s, output: 809.38 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11013.19it/s]


iteration 23 score: [1, 4, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 2, 3, 1, 1, 3, 2, 3, 1, 3, 3, 3, 1, 1, 3, 3, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.49it/s, est. speed input: 2761.29 toks/s, output: 783.25 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 6401.38it/s]


iteration 24 score: [1, 1, 3, 2, 1, 3, 1, 1, 2, 3, 1, 1, 2, 5, 1, 2, 1, 2, 1, 1, 1, 2, 1, 4, 2, 3, 2, 1, 1, 2, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.38it/s, est. speed input: 2694.24 toks/s, output: 759.75 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10818.78it/s]


iteration 25 score: [1, 1, 1, 3, 1, 2, 3, 1, 1, 1, 3, 1, 2, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 3, 4, 1, 3, 3, 3, 1, 3, 3]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.76it/s, est. speed input: 2315.92 toks/s, output: 653.59 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10427.11it/s]


iteration 26 score: [1, 1, 1, 3, 1, 1, 1, 2, 3, 3, 1, 3, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 3, 1, 1, 4, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.30it/s, est. speed input: 2646.81 toks/s, output: 720.23 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10742.57it/s]


iteration 27 score: [3, 1, 3, 3, 1, 2, 1, 1, 3, 1, 2, 2, 3, 1, 2, 1, 1, 2, 2, 3, 3, 2, 1, 1, 1, 3, 1, 3, 4, 1, 3, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.54it/s, est. speed input: 2184.99 toks/s, output: 625.09 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11111.66it/s]


iteration 28 score: [1, 1, 1, 1, 3, 1, 2, 1, 3, 1, 3, 3, 2, 3, 3, 3, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.27it/s, est. speed input: 2649.93 toks/s, output: 704.45 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10729.69it/s]


iteration 29 score: [3, 1, 1, 3, 3, 1, 3, 1, 1, 2, 3, 1, 1, 1, 2, 3, 1, 3, 1, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.49it/s, est. speed input: 2761.55 toks/s, output: 772.15 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10915.56it/s]


iteration 30 score: [1, 3, 1, 3, 1, 2, 1, 1, 1, 1, 5, 2, 1, 1, 3, 2, 1, 3, 3, 2, 1, 2, 1, 3, 3, 1, 2, 2, 1, 3, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.85it/s, est. speed input: 2981.71 toks/s, output: 797.41 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10734.84it/s]


iteration 31 score: [2, 1, 3, 1, 3, 1, 1, 1, 3, 3, 3, 3, 1, 3, 3, 1, 1, 3, 1, 3, 3, 1, 1, 1, 1, 2, 3, 1, 3, 1, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.20it/s, est. speed input: 2594.22 toks/s, output: 705.16 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11893.46it/s]


iteration 32 score: [4, 1, 1, 3, 3, 1, 4, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 3, 2, 1, 3, 1, 1, 4, 1, 2, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:10<00:00,  3.16it/s, est. speed input: 1953.68 toks/s, output: 556.09 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10296.72it/s]


iteration 33 score: [1, 1, 2, 1, 1, 3, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 3, 1, 1, 3, 1, 1, 2, 2, 4, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.67it/s, est. speed input: 2272.12 toks/s, output: 613.58 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10049.25it/s]


iteration 34 score: [3, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 4, 1, 1, 3, 1, 3, 1, 3, 5, 1, 3, 3, 1, 3, 4, 3, 1, 1, 3, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.51it/s, est. speed input: 2786.30 toks/s, output: 742.90 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.53s/it, est. speed input: 172.06 toks/s, output: 44.79 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11638.72it/s]


iteration 35 score: [4, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 3, 3, 2, 1, 1, 1, 1, 3, 2, 3, 1, 2, 1, 1, 2, 1, 3, 3, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.15it/s, est. speed input: 2568.55 toks/s, output: 696.34 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11510.95it/s]


iteration 36 score: [3, 1, 2, 2, 1, 3, 4, 3, 3, 1, 1, 3, 1, 3, 2, 2, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 2, 3, 1, 1, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.21it/s, est. speed input: 2602.94 toks/s, output: 740.76 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.75s/it, est. speed input: 167.32 toks/s, output: 44.30 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 9459.28it/s]


iteration 37 score: [3, 1, 3, 1, 3, 1, 1, 3, 3, 1, 1, 1, 1, 4, 1, 4, 3, 1, 1, 2, 1, 2, 3, 2, 1, 3, 3, 1, 1, 2, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.47it/s, est. speed input: 2761.00 toks/s, output: 739.36 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11599.49it/s]


iteration 38 score: [1, 1, 2, 2, 1, 2, 3, 1, 4, 1, 2, 3, 2, 1, 3, 3, 2, 1, 1, 1, 3, 3, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.53it/s, est. speed input: 2179.87 toks/s, output: 643.37 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 8452.00it/s]


iteration 39 score: [2, 1, 1, 2, 3, 1, 1, 2, 1, 1, 3, 1, 3, 1, 3, 2, 3, 1, 4, 3, 1, 3, 5, 2, 1, 2, 1, 3, 1, 2, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.42it/s, est. speed input: 2747.09 toks/s, output: 763.64 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10624.37it/s]


iteration 40 score: [1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 3, 3, 3, 5, 3, 1, 4, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.38it/s, est. speed input: 2699.74 toks/s, output: 746.99 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11314.93it/s]


iteration 41 score: [1, 1, 3, 3, 2, 1, 1, 1, 1, 2, 3, 2, 1, 2, 3, 1, 1, 1, 1, 1, 2, 1, 2, 3, 3, 3, 1, 2, 3, 1, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.88it/s, est. speed input: 3008.86 toks/s, output: 780.53 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12471.45it/s]


iteration 42 score: [1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 3, 5, 1, 1, 2, 1, 1, 1, 1, 2, 4, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 4]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.19it/s, est. speed input: 2598.77 toks/s, output: 666.23 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11921.99it/s]


iteration 43 score: [1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 2, 1, 3, 1, 1, 1, 3, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3, 1, 3, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.37it/s, est. speed input: 2695.25 toks/s, output: 751.71 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11245.73it/s]


iteration 44 score: [3, 1, 4, 3, 3, 2, 1, 2, 1, 4, 2, 1, 3, 2, 3, 2, 1, 1, 2, 3, 1, 1, 2, 1, 2, 1, 3, 1, 3, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.33it/s, est. speed input: 2670.93 toks/s, output: 769.03 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11311.12it/s]


iteration 45 score: [1, 1, 1, 3, 3, 2, 3, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 2, 3, 2, 2, 3]


Processed prompts: 100%|██████████| 32/32 [00:10<00:00,  3.19it/s, est. speed input: 1979.81 toks/s, output: 550.72 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 9025.47it/s]


iteration 46 score: [3, 1, 2, 1, 3, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 4, 1, 2, 2, 1, 1, 3, 4, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.66it/s, est. speed input: 2257.54 toks/s, output: 642.84 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10538.45it/s]


iteration 47 score: [3, 3, 1, 2, 2, 3, 1, 3, 2, 1, 1, 3, 2, 1, 1, 3, 1, 1, 1, 3, 3, 3, 1, 3, 3, 1, 1, 3, 1, 4, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.55it/s, est. speed input: 2818.07 toks/s, output: 720.94 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11770.39it/s]


iteration 48 score: [3, 2, 1, 1, 1, 3, 2, 2, 3, 1, 2, 1, 3, 2, 1, 1, 1, 1, 1, 3, 1, 3, 3, 3, 1, 2, 4, 1, 4, 2, 3, 2]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.59it/s, est. speed input: 2218.74 toks/s, output: 633.49 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11362.83it/s]


iteration 49 score: [1, 3, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 3, 2, 3, 3, 2, 3, 1, 2, 1, 2, 2, 1, 2, 4, 1, 1, 2, 2, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.33it/s, est. speed input: 2670.55 toks/s, output: 732.42 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 8737.56it/s]


iteration 50 score: [4, 1, 1, 1, 1, 3, 1, 1, 1, 3, 3, 1, 4, 1, 1, 1, 3, 1, 1, 3, 1, 1, 1, 2, 3, 1, 1, 1, 1, 3, 3, 2]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.52it/s, est. speed input: 2799.18 toks/s, output: 767.37 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11156.92it/s]


iteration 51 score: [1, 1, 3, 1, 3, 1, 1, 3, 3, 1, 1, 3, 2, 1, 1, 3, 1, 3, 1, 1, 1, 3, 1, 4, 3, 1, 1, 1, 2, 3, 3, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.66it/s, est. speed input: 2869.20 toks/s, output: 742.65 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10580.82it/s]


iteration 52 score: [3, 1, 1, 1, 1, 2, 1, 3, 2, 2, 1, 3, 1, 3, 1, 3, 1, 2, 3, 1, 1, 3, 1, 1, 1, 1, 1, 2, 3, 3, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.75it/s, est. speed input: 2309.37 toks/s, output: 631.63 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 6759.21it/s]


iteration 53 score: [1, 3, 2, 1, 1, 2, 1, 3, 3, 3, 1, 3, 1, 1, 1, 3, 1, 2, 3, 3, 1, 2, 1, 1, 1, 1, 1, 1, 4, 1, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.78it/s, est. speed input: 2340.80 toks/s, output: 655.71 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10106.00it/s]


iteration 54 score: [1, 3, 1, 2, 1, 1, 4, 1, 3, 3, 2, 3, 3, 3, 2, 3, 5, 3, 3, 3, 1, 3, 2, 1, 1, 1, 3, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.68it/s, est. speed input: 2286.12 toks/s, output: 674.45 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7844.86it/s]


iteration 55 score: [1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 1, 2, 3, 1, 1, 1, 2, 1, 1, 1, 3, 1, 3, 1, 3, 3, 2, 3, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.43it/s, est. speed input: 2742.29 toks/s, output: 731.31 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10600.88it/s]


iteration 56 score: [1, 1, 1, 1, 3, 3, 3, 2, 1, 2, 1, 3, 1, 3, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.93it/s, est. speed input: 3028.90 toks/s, output: 804.28 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12211.60it/s]


iteration 57 score: [1, 1, 2, 3, 1, 3, 2, 1, 1, 1, 3, 3, 3, 1, 3, 3, 1, 4, 1, 1, 3, 1, 1, 1, 1, 4, 3, 3, 3, 2, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.70it/s, est. speed input: 2895.32 toks/s, output: 754.73 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12355.49it/s]


iteration 58 score: [2, 2, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.18it/s, est. speed input: 2577.67 toks/s, output: 718.58 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10599.20it/s]


iteration 59 score: [1, 1, 1, 1, 1, 3, 1, 1, 1, 4, 1, 3, 3, 1, 1, 1, 2, 1, 3, 2, 3, 1, 1, 1, 1, 1, 3, 1, 3, 2, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.66it/s, est. speed input: 2263.33 toks/s, output: 649.44 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11335.96it/s]


iteration 60 score: [1, 1, 3, 1, 1, 3, 1, 3, 2, 2, 3, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 2, 3, 1, 1, 3, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:10<00:00,  3.01it/s, est. speed input: 1852.90 toks/s, output: 579.24 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10271.50it/s]


iteration 61 score: [3, 3, 1, 1, 2, 1, 3, 1, 3, 1, 2, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 3, 1, 3, 3, 3, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.38it/s, est. speed input: 2716.14 toks/s, output: 751.58 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11086.88it/s]


iteration 62 score: [3, 1, 1, 3, 3, 1, 1, 1, 2, 3, 2, 2, 4, 3, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.27it/s, est. speed input: 2629.78 toks/s, output: 719.51 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.34s/it, est. speed input: 186.12 toks/s, output: 43.16 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 9290.35it/s]


iteration 63 score: [2, 1, 1, 2, 1, 1, 1, 2, 2, 3, 1, 1, 1, 1, 1, 3, 1, 2, 3, 1, 2, 1, 3, 1, 2, 3, 3, 1, 3, 2, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.05it/s, est. speed input: 2500.14 toks/s, output: 714.21 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10435.21it/s]


iteration 64 score: [1, 4, 1, 1, 2, 1, 1, 4, 3, 1, 2, 3, 1, 3, 1, 2, 2, 3, 1, 1, 3, 3, 2, 4, 1, 3, 1, 1, 2, 1, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.70it/s, est. speed input: 2277.87 toks/s, output: 637.72 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11572.49it/s]


iteration 65 score: [1, 2, 4, 1, 1, 1, 1, 3, 2, 3, 1, 3, 1, 2, 1, 1, 3, 1, 2, 1, 1, 2, 1, 1, 2, 3, 2, 2, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.95it/s, est. speed input: 2435.64 toks/s, output: 698.73 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 8591.58it/s]


iteration 66 score: [1, 3, 1, 2, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 3, 3, 1, 1, 1, 2, 2, 3, 3, 1, 1, 3, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.22it/s, est. speed input: 2601.41 toks/s, output: 719.29 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11053.09it/s]


iteration 67 score: [1, 1, 1, 1, 1, 3, 4, 3, 1, 2, 4, 3, 1, 1, 5, 3, 1, 3, 1, 1, 2, 3, 1, 2, 2, 2, 2, 2, 3, 4, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.44it/s, est. speed input: 2742.06 toks/s, output: 781.28 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 9637.23it/s]


iteration 68 score: [4, 1, 3, 3, 2, 3, 1, 3, 1, 1, 1, 2, 3, 3, 3, 1, 1, 1, 2, 1, 3, 1, 3, 3, 2, 1, 2, 1, 1, 1, 1, 4]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.86it/s, est. speed input: 2995.38 toks/s, output: 770.65 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7878.48it/s]


iteration 69 score: [2, 1, 1, 1, 1, 4, 1, 1, 3, 1, 3, 1, 3, 3, 1, 1, 1, 3, 3, 2, 3, 4, 1, 1, 3, 3, 3, 1, 1, 2, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.62it/s, est. speed input: 2845.95 toks/s, output: 772.02 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10784.01it/s]


iteration 70 score: [3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 1, 1, 3, 1, 1, 1, 3, 4, 2, 1, 3, 1, 1, 3, 3, 3, 3]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.42it/s, est. speed input: 2117.50 toks/s, output: 586.92 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11552.57it/s]


iteration 71 score: [4, 3, 2, 3, 1, 4, 3, 1, 1, 3, 1, 1, 3, 1, 3, 3, 3, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  5.10it/s, est. speed input: 3151.25 toks/s, output: 830.88 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11898.73it/s]


iteration 72 score: [1, 1, 2, 1, 3, 3, 1, 1, 1, 1, 1, 2, 1, 1, 2, 3, 1, 1, 4, 3, 3, 2, 2, 3, 3, 1, 1, 1, 3, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.01it/s, est. speed input: 2473.20 toks/s, output: 684.60 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11723.10it/s]


iteration 73 score: [2, 1, 2, 2, 3, 1, 1, 1, 2, 2, 3, 1, 3, 1, 2, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.29it/s, est. speed input: 2647.04 toks/s, output: 717.61 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10419.82it/s]


iteration 74 score: [1, 1, 1, 1, 2, 1, 2, 3, 3, 1, 1, 2, 3, 1, 2, 3, 2, 1, 3, 1, 4, 1, 3, 3, 1, 1, 1, 1, 4, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  4.00it/s, est. speed input: 2466.12 toks/s, output: 677.41 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.06s/it, est. speed input: 201.62 toks/s, output: 43.86 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11532.71it/s]


iteration 75 score: [1, 1, 1, 2, 1, 1, 1, 3, 3, 3, 4, 1, 1, 1, 1, 1, 2, 4, 1, 1, 1, 3, 1, 5, 1, 3, 3, 4, 1, 3, 3, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.49it/s, est. speed input: 2761.79 toks/s, output: 771.99 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11074.98it/s]


iteration 76 score: [1, 3, 1, 4, 2, 2, 1, 1, 1, 2, 1, 2, 1, 3, 5, 2, 2, 3, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 3, 3, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.47it/s, est. speed input: 2761.74 toks/s, output: 693.75 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 9013.95it/s]


iteration 77 score: [3, 3, 1, 2, 2, 1, 1, 3, 1, 1, 3, 3, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1, 3, 1, 1, 1, 3, 3, 1, 3, 2, 4]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.74it/s, est. speed input: 2299.65 toks/s, output: 677.64 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7879.40it/s]


iteration 78 score: [1, 1, 1, 1, 1, 1, 2, 3, 2, 1, 3, 2, 1, 1, 3, 2, 2, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 3, 3, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.13it/s, est. speed input: 2550.62 toks/s, output: 707.74 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11058.56it/s]


iteration 79 score: [3, 3, 3, 1, 4, 1, 1, 1, 2, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 1, 2, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.75it/s, est. speed input: 2932.56 toks/s, output: 789.77 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7823.37it/s]


iteration 80 score: [1, 1, 1, 1, 2, 1, 1, 3, 1, 1, 3, 3, 1, 3, 1, 3, 4, 1, 3, 3, 1, 1, 1, 1, 1, 2, 3, 1, 1, 3, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.60it/s, est. speed input: 2223.83 toks/s, output: 614.40 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11602.50it/s]


iteration 81 score: [2, 4, 3, 1, 1, 3, 4, 3, 1, 1, 3, 3, 2, 3, 3, 2, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.48it/s, est. speed input: 2770.83 toks/s, output: 741.88 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7132.79it/s]


iteration 82 score: [1, 1, 3, 1, 3, 1, 1, 2, 2, 1, 2, 1, 1, 3, 2, 1, 1, 1, 3, 3, 2, 1, 1, 4, 1, 4, 1, 1, 3, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.69it/s, est. speed input: 2903.40 toks/s, output: 765.89 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.45s/it, est. speed input: 178.19 toks/s, output: 42.01 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 6629.02it/s]


iteration 83 score: [3, 3, 1, 1, 2, 1, 1, 2, 1, 1, 3, 1, 1, 1, 4, 1, 1, 1, 3, 1, 3, 3, 1, 2, 1, 5, 3, 3, 3, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.23it/s, est. speed input: 2602.59 toks/s, output: 730.01 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11505.03it/s]


iteration 84 score: [1, 1, 1, 1, 1, 3, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 2, 3, 3, 2, 2, 1, 3, 3, 1, 2, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.31it/s, est. speed input: 2046.13 toks/s, output: 562.63 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11335.00it/s]


iteration 85 score: [1, 4, 2, 3, 2, 1, 1, 3, 3, 1, 3, 3, 1, 3, 1, 1, 4, 1, 1, 1, 1, 4, 4, 3, 1, 2, 1, 1, 1, 2, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.61it/s, est. speed input: 2228.52 toks/s, output: 625.49 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 9806.94it/s]


iteration 86 score: [1, 3, 1, 1, 1, 1, 1, 2, 3, 4, 1, 1, 1, 1, 3, 1, 1, 1, 3, 3, 3, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.87it/s, est. speed input: 2391.60 toks/s, output: 690.47 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11081.38it/s]


iteration 87 score: [1, 1, 2, 1, 2, 1, 2, 3, 3, 2, 3, 2, 2, 2, 3, 1, 1, 1, 3, 1, 2, 1, 1, 1, 3, 1, 3, 3, 4, 2, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.69it/s, est. speed input: 2904.59 toks/s, output: 768.58 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11121.79it/s]


iteration 88 score: [3, 1, 1, 3, 1, 1, 1, 3, 3, 3, 2, 1, 1, 3, 3, 1, 1, 1, 2, 1, 4, 1, 4, 1, 4, 3, 1, 1, 1, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.32it/s, est. speed input: 2039.50 toks/s, output: 576.80 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10479.21it/s]


iteration 89 score: [1, 3, 2, 1, 1, 1, 1, 2, 3, 3, 1, 1, 3, 1, 2, 1, 1, 1, 1, 3, 1, 3, 2, 1, 1, 1, 1, 3, 2, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.67it/s, est. speed input: 2269.38 toks/s, output: 626.99 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7817.45it/s]


iteration 90 score: [2, 1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 3, 1, 1, 3, 1, 4, 1, 4, 3, 1, 2, 1, 1, 1, 3, 1, 3, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.47it/s, est. speed input: 2769.01 toks/s, output: 752.07 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11540.65it/s]


iteration 91 score: [3, 3, 3, 1, 1, 3, 4, 3, 3, 1, 1, 1, 4, 2, 3, 2, 2, 1, 1, 1, 4, 1, 1, 1, 1, 2, 2, 1, 1, 4, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.93it/s, est. speed input: 3032.29 toks/s, output: 814.36 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10728.84it/s]


iteration 92 score: [1, 1, 3, 1, 4, 1, 2, 1, 4, 1, 3, 2, 3, 3, 4, 1, 3, 2, 2, 3, 1, 3, 3, 1, 1, 1, 1, 2, 3, 1, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.05it/s, est. speed input: 2501.67 toks/s, output: 694.57 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7890.52it/s]


iteration 93 score: [2, 3, 3, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 2, 1, 1, 3, 2, 2, 1, 2, 3, 2, 1, 3, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.32it/s, est. speed input: 2670.44 toks/s, output: 726.63 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.44s/it, est. speed input: 181.42 toks/s, output: 44.26 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11480.43it/s]


iteration 94 score: [3, 3, 3, 3, 1, 3, 2, 1, 1, 2, 3, 1, 1, 4, 1, 1, 3, 1, 1, 2, 1, 1, 3, 1, 3, 1, 3, 2, 1, 4, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.90it/s, est. speed input: 2411.96 toks/s, output: 696.23 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11171.78it/s]


iteration 95 score: [1, 1, 2, 1, 1, 2, 1, 1, 3, 2, 1, 2, 1, 1, 3, 1, 2, 1, 3, 4, 3, 1, 3, 2, 1, 3, 2, 1, 3, 1, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.49it/s, est. speed input: 2771.12 toks/s, output: 739.86 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 9322.62it/s]


iteration 96 score: [1, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 3, 2, 4, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 4, 3, 1, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:09<00:00,  3.28it/s, est. speed input: 2026.50 toks/s, output: 569.60 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11147.65it/s]


iteration 97 score: [1, 1, 3, 3, 1, 1, 4, 3, 2, 3, 1, 2, 3, 1, 1, 1, 1, 3, 3, 1, 5, 4, 1, 1, 3, 1, 1, 4, 3, 4, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.14it/s, est. speed input: 2562.65 toks/s, output: 734.60 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11085.05it/s]


iteration 98 score: [2, 1, 1, 2, 1, 3, 1, 1, 3, 3, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 1, 4, 1, 1, 2, 3, 3, 3, 1, 4]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.45it/s, est. speed input: 2751.53 toks/s, output: 768.61 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11229.73it/s]


iteration 99 score: [1, 1, 1, 2, 1, 3, 3, 3, 4, 3, 1, 1, 2, 1, 1, 1, 3, 3, 2, 1, 2, 3, 1, 3, 1, 2, 1, 2, 3, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:08<00:00,  3.95it/s, est. speed input: 2435.54 toks/s, output: 668.56 toks/s]


Retrying failed batches: Attempt 1/10


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.70s/it, est. speed input: 166.33 toks/s, output: 44.62 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10859.04it/s]


iteration 100 score: [1, 1, 2, 3, 1, 1, 1, 4, 1, 3, 1, 1, 4, 1, 1, 2, 1, 3, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 3, 1, 2, 3]


Processed prompts: 100%|██████████| 32/32 [00:10<00:00,  2.96it/s, est. speed input: 1821.02 toks/s, output: 515.76 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10453.91it/s]


iteration 101 score: [2, 1, 1, 1, 2, 3, 2, 1, 1, 1, 3, 1, 1, 1, 1, 2, 4, 3, 1, 1, 1, 2, 3, 3, 2, 1, 1, 3, 3, 2, 3, 2]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.09it/s, est. speed input: 2522.96 toks/s, output: 687.82 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10384.35it/s]


iteration 102 score: [1, 1, 3, 4, 1, 1, 5, 3, 1, 3, 3, 3, 3, 2, 1, 2, 2, 3, 2, 1, 1, 1, 3, 3, 1, 3, 2, 1, 3, 3, 4, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.65it/s, est. speed input: 2877.10 toks/s, output: 774.86 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10585.83it/s]


iteration 103 score: [3, 1, 1, 1, 1, 2, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 3, 3, 1, 2, 1, 1, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.71it/s, est. speed input: 2893.14 toks/s, output: 742.05 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 12431.02it/s]


iteration 104 score: [3, 1, 3, 1, 3, 3, 4, 1, 1, 1, 1, 4, 1, 1, 3, 1, 3, 2, 3, 1, 1, 1, 1, 3, 3, 1, 2, 1, 1, 2, 1, 2]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.67it/s, est. speed input: 2861.39 toks/s, output: 822.30 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10100.67it/s]


iteration 105 score: [1, 1, 1, 2, 1, 3, 1, 1, 2, 3, 1, 1, 1, 1, 1, 2, 3, 1, 1, 2, 1, 2, 1, 3, 4, 1, 1, 1, 2, 1, 1, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.24it/s, est. speed input: 2612.14 toks/s, output: 723.42 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 8487.27it/s]


iteration 106 score: [1, 3, 1, 1, 3, 1, 3, 2, 2, 3, 2, 3, 3, 1, 1, 3, 3, 1, 1, 1, 1, 2, 3, 1, 2, 1, 1, 1, 3, 1, 3, 1]


Processed prompts: 100%|██████████| 32/32 [00:06<00:00,  4.73it/s, est. speed input: 2924.56 toks/s, output: 773.20 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 11835.78it/s]


iteration 107 score: [1, 3, 1, 3, 1, 1, 1, 3, 1, 1, 1, 1, 1, 2, 4, 1, 1, 2, 2, 3, 2, 3, 1, 1, 1, 1, 3, 3, 1, 2, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.44it/s, est. speed input: 2733.95 toks/s, output: 766.04 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7080.11it/s]


iteration 108 score: [3, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 2, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 1, 3]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.10it/s, est. speed input: 2531.33 toks/s, output: 685.86 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 7214.46it/s]


iteration 109 score: [1, 3, 3, 2, 1, 3, 1, 2, 3, 1, 1, 1, 3, 1, 4, 1, 1, 1, 3, 1, 2, 2, 4, 1, 1, 3, 2, 1, 3, 3, 2, 1]


Processed prompts: 100%|██████████| 32/32 [00:07<00:00,  4.10it/s, est. speed input: 2525.87 toks/s, output: 733.26 toks/s]


Processed 32/32 instances.


Finalizing: 100%|██████████| 32/32 [00:00<00:00, 10574.15it/s]


iteration 110 score: [1, 3, 2, 3, 3, 3, 1, 1, 1, 1, 3, 4, 2, 1, 1, 1, 3, 3, 2, 1, 1, 1, 2, 3, 1, 1, 1, 3, 1, 3, 2, 2]


Processed prompts: 100%|██████████| 15/15 [00:05<00:00,  2.58it/s, est. speed input: 1598.24 toks/s, output: 446.02 toks/s]


Processed 15/15 instances.


Finalizing: 100%|██████████| 15/15 [00:00<00:00, 11076.51it/s]

iteration 111 score: [1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 1, 1, 3]
CPU times: user 15min 7s, sys: 3.32 s, total: 15min 11s
Wall time: 15min 4s





1.8147100424328146

In [61]:
from google.colab import runtime
runtime.unassign()