In [None]:
!pip install openai datasets gspread_pandas



In [None]:
import os

from openai import OpenAI

# Take the API key from the environment instead of embedding it in the
# notebook: a literal key would leak with the file, and an empty-string
# placeholder cannot authenticate any request.
# Set OPENAI_API_KEY before running this cell.
client_OpenAI = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

def get_eval_gpt4o(prompt, system_prompt, model="gpt-4o"):
    """Send one system + user message pair to a chat model and return the reply text.

    Args:
        prompt: Content of the user message.
        system_prompt: Content of the system message, sent before the user message.
        model: Name of the chat-completions model to call.

    Returns:
        The content of the first returned choice with surrounding whitespace
        stripped, or an empty string if that message has no content.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    response = client_OpenAI.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.0,
        max_tokens=50,
    )
    # The message content is optional in the API response; calling .strip()
    # directly on None would raise AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()

In [None]:
import pandas as pd
from tqdm.notebook import tqdm
from google.colab import auth
from google.auth import default
import gspread_pandas

# Authenticate the Colab user, then obtain the default Google credentials
# that the spreadsheet client below is constructed with.
auth.authenticate_user()
creds, _ = default()
# NOTE(review): create_spread=True presumably creates the spreadsheet when no
# spreadsheet with this title is found, so a typo in the title would produce a
# new empty spreadsheet rather than an error -- confirm this is intended.
spread = gspread_pandas.Spread('Temporal biases of Foundation Models', creds=creds, create_spread=True)
# spread.create_sheet('Raw Data v1 Egyptian')
# Open the evaluation worksheet; the read below passes no sheet name, so it
# presumably operates on this opened sheet -- TODO confirm.
spread.open_sheet('Actual Evals v3')

# spread.df_to_sheet(dfss, index=False, headers=True, start='A1',)

# Load the worksheet into a DataFrame (index=False: presumably no sheet column
# is used as the DataFrame index -- confirm against gspread_pandas).
dfss = spread.sheet_to_df(index=False)

# Preview the first rows of the loaded table.
dfss.head()

Unnamed: 0,Type of Q,Question,Extracted Date,Answer,Time Period,Date Format,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-8B-Instruct,Qwen/Qwen2.5-72B-Instruct,meta-llama/Llama-3.2-3B-Instruct,meta-llama/Llama-2-7b-chat-hf,microsoft/Phi-3.5-mini-instruct,mistralai/Mistral-7B-Instruct-v0.3
0,Factual,Which from the following famous people died \n...,23041616,William Shakespeare,Past,DDMMYYYY,Loading...,Loading...,Loading...,Loading...,Loading...,Loading...,Loading...
1,Factual,Which from the following famous people died \n...,4231616,William Shakespeare,Past,MMDDYYYY,Loading...,Loading...,Loading...,Loading...,Loading...,Loading...,Loading...
2,Factual,Which from the following famous people died \n...,23April1616,William Shakespeare,Past,DDMonYYYY,Loading...,Loading...,Loading...,Loading...,Loading...,Loading...,Loading...
3,Factual,Which from the following famous people died \n...,23-04-16,William Shakespeare,Past,DD-MM-YY,Loading...,Loading...,Loading...,Loading...,Loading...,Loading...,Loading...
4,Factual,Which from the following famous people died \n...,"1616, April 23",William Shakespeare,Past,"YYYY, Mon DD",Loading...,Loading...,Loading...,Loading...,Loading...,Loading...,Loading...


In [None]:
def run_batches(dataset, model="gpt-4-turbo", max_tokens=100):
    """Submit one OpenAI batch job containing a chat request per row of `dataset`.

    The requests are serialised to a JSON Lines file, that file is uploaded
    with purpose "batch", and a batch targeting /v1/chat/completions is created
    from it.

    Args:
        dataset: DataFrame with a 'Question' column (used as the user message)
            and a 'custom_id' column (written into each request line). The
            caller's frame is not modified; a re-indexed copy is used.
        model: Model name placed in every request body.
        max_tokens: Token limit placed in every request body.

    Returns:
        The id of the created batch.
    """
    # reset_index returns a new frame, so the columns added below do not leak
    # into the slice passed in by the caller.
    dataset = dataset.reset_index(drop=True)

    system_prompt = """Be concise and answer in less than 15 words:"""

    dataset['body'] = [
        {
            "model": model,
            "max_tokens": max_tokens,
            "messages": [
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': f"{question}"},
            ],
        }
        for question in dataset['Question']
    ]
    dataset['method'] = 'POST'
    dataset["url"] = "/v1/chat/completions"

    openai_dataset = dataset[['body', 'custom_id', 'method', 'url']]

    # Write and read the request file through the same path. Previously the
    # file was written relative to the working directory but read back from
    # "/content/...", which only worked when the two happened to coincide.
    input_path = 'eval_data.jsonl'
    openai_dataset.to_json(input_path, orient='records', lines=True)

    # Close the file handle once the upload call has returned.
    with open(input_path, "rb") as input_file:
        batch_input_file = client_OpenAI.files.create(
            file=input_file,
            purpose="batch",
        )

    batch = client_OpenAI.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": "nightly eval job"
        },
    )

    return batch.id

In [None]:
import time

BATCH_SIZE = 100               # rows submitted per batch job
PAUSE_BETWEEN_BATCHES_S = 90   # seconds to wait between two submissions

# `dataset` is the same object as `dfss`, so the custom_id column added here
# also appears on `dfss`.
dataset = dfss

# One identifier per row, assigned before splitting into batches so that ids
# are unique across all submitted batches.
dataset['custom_id'] = ["Request-"+str(i) for i in range(len(dataset))]

batch_ids = []

# Work on a copy that is consumed from the front, BATCH_SIZE rows at a time.
deldataset = dataset.copy()

while len(deldataset) > 0:
    # Take the first BATCH_SIZE rows, or all remaining rows if fewer are left.
    batch = deldataset.head(BATCH_SIZE)

    # Submit the batch and remember its id for collecting the results later.
    batch_id = run_batches(batch)
    batch_ids.append(batch_id)

    # Drop the rows that were just submitted.
    deldataset = deldataset.iloc[BATCH_SIZE:]

    print(f"Processed batch of {len(batch)} items. {len(deldataset)} items remaining.")

    # Pause only when another submission follows; sleeping after the final
    # batch just delays the notebook.
    if len(deldataset) > 0:
        time.sleep(PAUSE_BETWEEN_BATCHES_S)

Processed batch of 100 items. 90 items remaining.
Processed batch of 90 items. 0 items remaining.


In [None]:
from io import StringIO

def get_output(batch_id):
    """Return the output of a batch job as a DataFrame, or None if it has none yet.

    Retrieves the batch by id. When the batch has no output file id, a notice
    is printed and None is returned; otherwise the output file is downloaded
    and its text is parsed as JSON Lines.

    Args:
        batch_id: Id of the batch to look up.

    Returns:
        A DataFrame with one row per line of the output file, or None.
    """
    output_file_id = client_OpenAI.batches.retrieve(batch_id).output_file_id
    if output_file_id is None:
        print("Output file not ready yet. Waiting...")
        return None

    raw_jsonl = client_OpenAI.files.content(output_file_id).text
    return pd.read_json(StringIO(raw_jsonl), lines=True)


def _message_content(response):
    """Return the assistant text stored in one batch-output `response` value, or None."""
    try:
        return response['body']['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        # No usable completion: the value is missing (NaN after the left
        # merge below) or its body does not contain a choices list.
        return None


# Use the ids recorded when the batches were submitted. Listing the account's
# most recent batches is only a fallback when none were recorded: the
# hard-coded limit of 2 silently picks the wrong jobs whenever a different
# number of batches was submitted or other jobs ran in between.
if not batch_ids:
    batch_ids = [batch.id for batch in client_OpenAI.batches.list(limit=2).data]

dfs = []
for batch_id in batch_ids:
    data_new = get_output(batch_id)
    if data_new is None:
        # get_output has already printed a notice; rows belonging to this
        # batch stay empty in the result column.
        continue
    dfs.append(data_new)

if not dfs:
    raise ValueError("None of the batches has an output file yet; re-run this cell later.")

data_new = pd.concat(dfs, ignore_index=True)

# Left merge keeps every question row even when its result is missing, so rows
# are never dropped and renumbered.
dfssa = pd.merge(dataset, data_new, on='custom_id', how='left')
dfssa['GPT-4-turbo'] = dfssa['response'].apply(_message_content)

# Attach answers by custom_id rather than by row position: with an inner merge
# and positional index alignment, a single missing result shifted every later
# answer onto the wrong question.
answers = dfssa.drop_duplicates('custom_id').set_index('custom_id')['GPT-4-turbo']
dfss['GPT-4-turbo'] = dataset['custom_id'].map(answers)

In [None]:
# Re-open the evaluation worksheet and write the table back to it starting at
# cell A1, with a header row and without the DataFrame index.
spread.open_sheet('Actual Evals v3')
spread.df_to_sheet(
    dfss,
    index=False,
    headers=True,
    start='A1',
)
