In [None]:
%%capture
!pip install mistralai anthropic langdetect

In [None]:
from google.colab import userdata
from google.colab import drive
from google.genai import Client as GenaiClient
from openai import OpenAI as OpenAIClient
from mistralai import Mistral as MistralClient
from anthropic import Anthropic as AnthropicClient
from huggingface_hub import InferenceClient as HFClient
from typing import TypedDict
import os
import sys
import pandas as pd
import json
from uuid import uuid4
import datetime
from enum import Enum
import nltk
import langdetect

In [None]:
# Mount point handed to drive.mount() in a later cell.
DRIVE_MOUNT_PATH = '/content/drive'
# Project folder on Drive. This value already starts with DRIVE_MOUNT_PATH,
# so it is an absolute path and must not be prefixed with the mount path again.
PROJECT_ROOT = f'{DRIVE_MOUNT_PATH}/MyDrive/ifeval'
# Path fragment (with a leading slash) appended directly to PROJECT_ROOT below.
LIB_DIR = '/instruction_following_eval'
# JSONL file the prompts are read from.
INPUT_FILEPATH = f'{PROJECT_ROOT}{LIB_DIR}/data/input_data.jsonl'
# Directory (with trailing slash) where interim batch files and batch results are written.
OUTPUT_DIR = f'{PROJECT_ROOT}{LIB_DIR}/data/output/'
# Fetch the NLTK 'punkt_tab' package.
# NOTE(review): presumably required by the evaluation code run later — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Provider credentials are read from the Colab `userdata` store rather than
# being hard-coded in the notebook.
# In the visible cells only OPENAI_API_KEY, MISTRAL_API_KEY and HF_TOKEN are used.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
GENAI_API_KEY = userdata.get('GENAI_API_KEY')
MISTRAL_API_KEY = userdata.get('MISTRAL_API_KEY')
ANTHROPIC_API_KEY = userdata.get('ANTHROPIC_API_KEY')
HF_TOKEN = userdata.get('HF_TOKEN')

In [None]:
%%capture
drive.mount(DRIVE_MOUNT_PATH)

In [None]:
%%capture
%cd {DRIVE_MOUNT_PATH}{PROJECT_ROOT}

In [None]:
%%capture
!pip install -r instruction_following_eval/requirements.txt

In [None]:
# Read the prompts from INPUT_FILEPATH, one JSON object per line.
# Lines that are not valid JSON are reported on stderr and left out.
questions = []
with open(INPUT_FILEPATH, 'r') as f:
    for line in f:
        try:
            record = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Skipping invalid JSON line: {line.strip()} - Error: {e}", file=sys.stderr)
        else:
            # A record without a 'prompt' key contributes None.
            questions.append(record.get('prompt'))

In [None]:
class ChatMessage(TypedDict):
    """One chat message: a role string and the message text."""
    role: str
    content: str

# A conversation is a list of chat messages.
Chat = list[ChatMessage]

class ProviderID(Enum):
  """Identifiers for model providers; passed as the ``id`` of a ``Provider``."""
  OPENAI = 'openai'
  GOOGLE = 'google'
  ANTHROPIC = 'anthropic'
  MISTRAL = 'mistral'
  HF = 'hf'

class BatchStatus(Enum):
  """Named states of a batch job.

  NOTE(review): not referenced anywhere in the visible part of this notebook;
  presumably meant as a provider-independent batch status — confirm.
  """
  CREATED = 'created'
  PENDING = 'pending'
  RUNNING = 'running'
  COMPLETED = 'completed'
  FAILED = 'failed'
  CANCELED = 'canceled'
  UNEXISTING = 'unexisting'

class InferenceMode(Enum):
  """Named inference modes.

  NOTE(review): not referenced in the visible cells; the two values presumably
  correspond to ``get_completion`` and ``get_response`` — confirm.
  """
  COMPLETIONS = 'completions'
  RESPONSES = 'responses'

class ModelClient:
  """Base class for a single model served through a provider SDK client.

  Every operation raises ``NotImplementedError`` here; subclasses override the
  ones their provider supports.

  Attributes:
    id: model identifier sent to the provider.
    provider_client: the provider SDK client object used to issue requests.
    hyperparameters: dict of extra settings (empty dict when none are given).
  """

  def __init__(self, id: str, provider_client, hyperparameters: dict = None):
    self.id: str = id
    # Left un-annotated on purpose: subclasses pass clients of different SDKs.
    self.provider_client = provider_client
    self.hyperparameters: dict = hyperparameters if hyperparameters else dict()

  def _unsupported(self, operation: str) -> NotImplementedError:
    """Build the error raised by operations the concrete class does not provide."""
    return NotImplementedError(f"{type(self).__name__} does not implement {operation}()")

  def get_completion(self, messages: "Chat"):
    """Return the model's reply to a chat (list of role/content messages)."""
    raise self._unsupported('get_completion')

  def get_response(self, question: str, instruction: str = None):
    """Return the model's reply to a single question, with optional instructions."""
    raise self._unsupported('get_response')

  def create_batch(self, model_id: str, questions: list):
    """Start a batch job.

    NOTE(review): ``OpenAIModelClient.create_batch`` in this file takes a single
    ``batch_file_id`` argument instead; the signatures do not agree.
    """
    raise self._unsupported('create_batch')

  def get_batch_by_id(self, batch_id: str):
    """Fetch the batch job with the given id."""
    raise self._unsupported('get_batch_by_id')

  def retrieve_batch_result(self, batch_id: str):
    """Fetch the results of the batch job with the given id."""
    raise self._unsupported('retrieve_batch_result')

  def create_batch_file(self, questions: list, batch_id: str = None):
    """Create the provider-side input file for a batch job."""
    raise self._unsupported('create_batch_file')

  def cancel_batch(self, batch_id: str):
    """Cancel the batch job with the given id."""
    raise self._unsupported('cancel_batch')

class OpenAIModelClient(ModelClient):
  """ModelClient that talks to OpenAI through the ``openai`` SDK client.

  Supports single chat completions, single Responses-API calls and the Batch
  API (create input file, start batch, poll, download results, cancel).

  NOTE(review): ``self.hyperparameters`` is stored but not sent with any request.
  """

  # Endpoint written into every batch request line and used for the batch job.
  BATCH_ENDPOINT = "/v1/chat/completions"

  def __init__(self, id: str, provider_client: OpenAIClient, hyperparameters: dict = None):
    super().__init__(id, provider_client, hyperparameters)

  def get_completion(self, messages: Chat):
    """Send ``messages`` to the chat completions endpoint and return the text of the first choice."""
    return self.provider_client.chat.completions.create(
      model=self.id,
      messages=messages
    ).choices[0].message.content

  def get_response(self, question: str, instruction: str = None):
    """Send ``question`` (with optional ``instruction``) to the Responses API and return ``output_text``."""
    return self.provider_client.responses.create(
      model=self.id,
      instructions=instruction,
      input=question
    ).output_text

  def create_batch_file(self, questions: list, batch_id: str = None):
    """Write a Batch API input file for ``questions`` and upload it.

    Args:
      questions: either plain prompt strings (each becomes a one-message user
        chat) or ready-made chats (lists of role/content dicts). The type of
        the first element decides which.
      batch_id: prefix for the file name and for each request's ``custom_id``
        (``"<prefix>-<index>"``); a random ``batch-xxxxxxxx`` is used if omitted.

    Returns:
      The id of the uploaded file.

    Raises:
      ValueError: if ``questions`` is empty.
      Exception: if the uploaded size does not match the local file size; the
        local file is kept in that case.
    """
    if not questions:
      raise ValueError("Questions cannot be empty")
    if isinstance(questions[0], str):
      questions = [[{'role': 'user', 'content': question}] for question in questions]
    file_stem = batch_id if batch_id else f"batch-{str(uuid4())[-8:]}"
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    interim_filename = f"{OUTPUT_DIR}{file_stem}.jsonl"
    with open(interim_filename, 'w') as f_iter:
      for i, messages in enumerate(questions):
        f_iter.write(json.dumps({"custom_id": f"{file_stem}-{i}",
                                 "method": "POST",
                                 "url": self.BATCH_ENDPOINT,
                                 "body": {"model": self.id,
                                          "messages": messages}
                                 }))
        f_iter.write('\n')

    # Close the upload handle deterministically instead of leaking it.
    with open(interim_filename, "rb") as f_upload:
      interim_resp = self.provider_client.files.create(
        file=f_upload,
        purpose="batch"
      )

    # The response is an object, not a dict: read the size as an attribute.
    uploaded_bytes = getattr(interim_resp, 'bytes', None)
    if uploaded_bytes and uploaded_bytes == os.path.getsize(interim_filename):
      os.remove(interim_filename)
      return interim_resp.id
    raise Exception(f"Wrong upload file size \"{interim_filename}\" has {uploaded_bytes}")

  def create_batch(self, batch_file_id: str):
    """Start a 24h batch job over an uploaded input file.

    Returns:
      A ``(batch, batch_file_id)`` tuple, where ``batch`` is the object returned by the SDK.

    Raises:
      ValueError: if ``batch_file_id`` is empty.
    """
    if not batch_file_id:
      raise ValueError("Batch file id cannot be empty")

    return self.provider_client.batches.create(
      input_file_id=batch_file_id,
      completion_window='24h',
      endpoint=self.BATCH_ENDPOINT,
    ), batch_file_id

  def get_batch_by_id(self, batch_id: str):
    """Return the batch object for ``batch_id``."""
    return self.provider_client.batches.retrieve(batch_id)

  @staticmethod
  def _prompt_index(record: dict, position: int) -> int:
    """Return the prompt index for one batch output record.

    The index is the numeric suffix of ``custom_id`` as written by
    ``create_batch_file`` (``"<prefix>-<index>"``). If there is no such suffix,
    fall back to the record's position in the output file.
    NOTE(review): assumes the input file was produced by ``create_batch_file``
    (zero-based suffix) — confirm for files uploaded by other means.
    """
    custom_id = record.get('custom_id') or ''
    suffix = custom_id.rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdecimal() else position

  def retrieve_batch_result(self, batch_id: str, prompts: list = None):
    """Download a finished batch's output and save prompt/response pairs as JSONL.

    Args:
      batch_id: id of the batch job.
      prompts: prompts the batch was built from, indexed like the requests;
        defaults to the module-level ``questions`` list.

    Returns:
      ``None`` if the batch has no output file yet, otherwise a one-element
      list holding the path of the JSONL file that was written.
    """
    batch = self.get_batch_by_id(batch_id)
    if not batch.output_file_id:
      return None
    if prompts is None:
      prompts = questions
    output_path = f"{OUTPUT_DIR}output-{self.id}-{batch_id}.jsonl"
    # Use this instance's own client rather than a global provider and a hard-coded model.
    raw_output = self.provider_client.files.content(batch.output_file_id).text
    with open(output_path, 'w') as f:
      for position, line in enumerate(raw_output.strip().split("\n")):
        if not line:
          continue
        record = json.loads(line)
        content = record.get('response').get('body').get('choices')[0].get('message').get('content')
        f.write(
            json.dumps({
                # Batch output order is not guaranteed to match input order,
                # so match each record to its prompt via custom_id.
                'prompt': prompts[self._prompt_index(record, position)],
                # Store the text itself; encoding it a second time would wrap
                # it in quotes and escape its newlines.
                'response': content
            })
        )
        f.write('\n')
    # Report the file that was actually written.
    return [output_path]

  def cancel_batch(self, batch_id: str):
    """Request cancellation of the batch job and return the SDK's batch object."""
    return self.provider_client.batches.cancel(batch_id)


class HFModelClient(ModelClient):
  """ModelClient variant whose provider client is a Hugging Face ``InferenceClient``.

  No operation is implemented here: ``create_batch`` raises
  ``NotImplementedError`` explicitly and every other operation is inherited
  unchanged from ``ModelClient``.
  """

  def __init__(self, id: str, provider_client: HFClient, hyperparameters: dict = None):
    super().__init__(id, provider_client, hyperparameters)

  def create_batch(self, model_id: str, questions: list):
    # Explicit stub; behaves the same as the inherited method.
    raise NotImplementedError(self)

class MistralAIClient(OpenAIClient):
  """OpenAI SDK client subclass used in this notebook with a Mistral API key.

  All constructor arguments are forwarded unchanged to ``OpenAIClient``.

  NOTE(review): no ``base_url`` is set here, so unless the caller passes one
  the client presumably uses the SDK's default endpoint — confirm this is
  intended when authenticating with a Mistral key.
  """

  def __init__(self, *args, **kwargs):
    # `self` must be a named parameter: zero-argument super() needs it, and
    # without it instantiation fails with "RuntimeError: super(): no arguments".
    super().__init__(*args, **kwargs)



class Provider:
  """Lazily builds and caches one SDK client and its model listing for a provider.

  Attributes:
    id: the provider identifier given at construction.
    cls: the SDK client class to instantiate.
    api_key: credential passed to ``cls`` as ``api_key``.
    cls_args: extra keyword arguments passed to ``cls``.
    client: cached client instance, ``None`` until first requested.
    models: cached result of ``_get_models()``, ``None`` until first requested.
  """

  def __init__(self, id: "ProviderID", cls: type, api_key: str, cls_args: dict = None):
    self.id = id
    self.cls: type = cls
    self.api_key: str = api_key
    self.cls_args: dict = cls_args if cls_args else dict()
    self.client = None
    self.models: dict = None

  def get_client(self):
    """Return the SDK client, creating it on first use."""
    # Test against None so the cache does not depend on the object's truthiness.
    if self.client is None:
      self.client = self._get_client()
    return self.client

  def get_models(self):
    """Return the provider's models, fetching them on first use."""
    # An empty listing is still a valid cached result and must not be refetched.
    if self.models is None:
      self.models = self._get_models()
    return self.models

  def _get_client(self):
    """Instantiate ``cls`` with the API key and any extra keyword arguments."""
    return self.cls(api_key=self.api_key, **self.cls_args)

  def _get_models(self):
    """Fetch the provider's models; must be overridden by subclasses."""
    raise NotImplementedError(f"{type(self).__name__} must implement _get_models()")

class OpenAIProvider(Provider):
  """Provider bound to the OpenAI SDK client class."""

  def __init__(self, api_key: str):
    super().__init__(ProviderID.OPENAI, OpenAIClient, api_key)

  def _get_models(self):
    """Return a dict mapping each listed model id to an ``OpenAIModelClient`` for it."""
    listing = self.get_client().models.list().data
    return {
      entry.id: OpenAIModelClient(entry.id, self.get_client(), dict())
      for entry in listing
    }

In [None]:
p = OpenAIProvider(OPENAI_API_KEY)

In [None]:
p.get_models()

{'gpt-4o-realtime-preview-2024-12-17': <__main__.OpenAIModelClient at 0x7855c0ccf650>,
 'gpt-4o-audio-preview-2024-12-17': <__main__.OpenAIModelClient at 0x7855c091da90>,
 'dall-e-3': <__main__.OpenAIModelClient at 0x7855c3913dd0>,
 'dall-e-2': <__main__.OpenAIModelClient at 0x7855ccbc4310>,
 'gpt-4o-audio-preview-2024-10-01': <__main__.OpenAIModelClient at 0x7855c0951990>,
 'gpt-4-turbo-preview': <__main__.OpenAIModelClient at 0x7855c09533d0>,
 'text-embedding-3-small': <__main__.OpenAIModelClient at 0x7855c0939390>,
 'babbage-002': <__main__.OpenAIModelClient at 0x7855c0939450>,
 'o1-mini-2024-09-12': <__main__.OpenAIModelClient at 0x7855c0939810>,
 'o1-mini': <__main__.OpenAIModelClient at 0x7855c0939a50>,
 'gpt-4': <__main__.OpenAIModelClient at 0x7855c0cbb6d0>,
 'text-embedding-ada-002': <__main__.OpenAIModelClient at 0x7855c0c87750>,
 'chatgpt-4o-latest': <__main__.OpenAIModelClient at 0x7855c090c210>,
 'text-embedding-3-large': <__main__.OpenAIModelClient at 0x7855c090c290>,
 'g

In [None]:
OPENAI_MODEL_IDS = ['ft:gpt-4o-mini-2024-07-18:personal:week-46-cif:BWWS7Ka7']

In [None]:
def save_batch_result(batch_id: str, model_id: str, tuned: bool):
  """Convert a raw batch output file into prompt/response JSONL.

  Reads the raw file under ``instruction_following_eval/data`` line by line,
  takes the content of the first choice's message from each record, and writes
  it together with ``questions[i]`` (``i`` being the line number) to a file
  directly under ``PROJECT_ROOT``.

  Args:
    batch_id: not used by this function.
    model_id: model tag embedded in both file names.
    tuned: selects the ``it`` (True) or ``pt`` (False) file-name suffix.

  NOTE(review): the input name has no hyphen between ``input-response`` and the
  model id while the output name does — confirm which one is intended.
  NOTE(review): prompts are paired by line position; this presumably relies on
  the raw file being in the same order as ``questions`` — confirm.
  """
  suffix = 'it' if tuned else 'pt'
  raw_path = f'{PROJECT_ROOT}/instruction_following_eval/data/input-response{model_id}-{suffix}.raw.jsonl'
  out_path = f"{PROJECT_ROOT}/input-response-{model_id}-{suffix}.raw.jsonl"
  # Open the input first so a missing input does not leave an empty output file behind.
  with open(raw_path, 'r') as fin, open(out_path, 'w') as f:
    for i, line in enumerate(fin):
      if not line.strip():
        # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
        continue
      f.write(
        json.dumps({
          'prompt': questions[i],
          'response': json.loads(line).get('response').get('body').get('choices')[0].get('message').get('content')
        }))
      f.write('\n')

In [None]:
p.get_models()['gpt-4o-mini'].get_completion([{'role': 'user', 'content': 'Ehere is Bragança Pta?'}])

'Bragança Paulista, often referred to simply as Bragança, is a municipality located in the state of São Paulo, Brazil. It is situated about 90 kilometers (approximately 56 miles) northwest of the city of São Paulo. Bragança Paulista is known for its lakes, green areas, and a mix of rural and urban environments. The city has historical and cultural significance, as well as various attractions for visitors. If you need more specific information about Bragança Paulista, feel free to ask!'

In [None]:
p.get_models()['gpt-4o-mini'].get_response('Where is Bragança Pta?')

'Bragança Pta, also known as Bragança Paulista, is a municipality located in the state of São Paulo, Brazil. It is situated approximately 90 kilometers (about 56 miles) northeast of the city of São Paulo. The city is known for its historical buildings, natural beauty, and cultural events. If you need more specific information about Bragança Pta, feel free to ask!'

In [None]:
bid, fid = p.get_models()['gpt-3.5-turbo-0125'].create_batch('file-NFXqKXAyLSjUkR7NWcx3BR')

In [None]:
bid, fid

(Batch(id='batch_68227b46392c8190bf3c49daebfb4ffd', completion_window='24h', created_at=1747090246, endpoint='/v1/chat/completions', input_file_id='file-NFXqKXAyLSjUkR7NWcx3BR', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1747176646, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0)),
 'file-NFXqKXAyLSjUkR7NWcx3BR')

In [None]:
p.get_models()

{'gpt-4o-realtime-preview-2024-12-17': <__main__.OpenAIModelClient at 0x7d9451daf8d0>,
 'gpt-4o-audio-preview-2024-12-17': <__main__.OpenAIModelClient at 0x7d9451949690>,
 'dall-e-3': <__main__.OpenAIModelClient at 0x7d9451948c90>,
 'dall-e-2': <__main__.OpenAIModelClient at 0x7d945194a510>,
 'gpt-4o-audio-preview-2024-10-01': <__main__.OpenAIModelClient at 0x7d9451f798d0>,
 'gpt-4-turbo-preview': <__main__.OpenAIModelClient at 0x7d9451b22ad0>,
 'text-embedding-3-small': <__main__.OpenAIModelClient at 0x7d9451b22950>,
 'babbage-002': <__main__.OpenAIModelClient at 0x7d9451b21310>,
 'o1-mini-2024-09-12': <__main__.OpenAIModelClient at 0x7d9451b20590>,
 'o1-mini': <__main__.OpenAIModelClient at 0x7d9451b200d0>,
 'gpt-4': <__main__.OpenAIModelClient at 0x7d9451b23bd0>,
 'text-embedding-ada-002': <__main__.OpenAIModelClient at 0x7d9451dc4dd0>,
 'chatgpt-4o-latest': <__main__.OpenAIModelClient at 0x7d9451dc6050>,
 'text-embedding-3-large': <__main__.OpenAIModelClient at 0x7d9451dc6b90>,
 'g

In [None]:
p.get_models()['gpt-4o'].get_batch_by_id('batch_6822636e0ac88190a46a8d6876d4bd51')

Batch(id='batch_6822636e0ac88190a46a8d6876d4bd51', completion_window='24h', created_at=1747084142, endpoint='/v1/chat/completions', input_file_id='file-NFXqKXAyLSjUkR7NWcx3BR', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1747170542, failed_at=None, finalizing_at=None, in_progress_at=1747084204, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=128, failed=0, total=541))

In [None]:
p.get_models()['gpt-4o-mini'].retrieve_batch_result(bid.id)

['/content/drive/MyDrive/ifeval/instruction_following_eval/data/output/batch_68224476eae081908d8c545546d771da.jsonl']

In [None]:
bid,fid

(Batch(id='batch_68222d24352c8190a6cc36e6ddf96a66', completion_window='24h', created_at=1747070244, endpoint='/v1/responses', input_file_id='file-FbduykPYRoUP8JpWNN24Qn', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1747156644, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0)),
 'file-FbduykPYRoUP8JpWNN24Qn')

In [None]:
p.get_models()['gpt-4o'].cancel_batch('batch_68222d24352c8190a6cc36e6ddf96a66')

Batch(id='batch_68222d24352c8190a6cc36e6ddf96a66', completion_window='24h', created_at=1747070244, endpoint='/v1/responses', input_file_id='file-FbduykPYRoUP8JpWNN24Qn', object='batch', status='cancelling', cancelled_at=None, cancelling_at=1747070857, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1747156644, failed_at=None, finalizing_at=None, in_progress_at=1747070306, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=491, total=541))

In [None]:
MistralAIClient(api_key=MISTRAL_API_KEY).evals.run('eval_6823dcc9259c819099880608ddf54fad')

In [None]:
!cwd /content

/bin/bash: line 1: cwd: command not found


In [None]:
!python3 -m instruction_following_eval.evaluation_main \
  --input_data=/content/drive/MyDrive/ifeval/instruction_following_eval/data/input_data.jsonl \
  --input_response_data=/content/drive/MyDrive/ifeval/instruction_following_eval/data/input-response-4o-ifeval-pt.jsonl \
  --output_dir=/content/drive/MyDrive/ifeval/instruction_following_eval/data/output/

/usr/bin/python3: Error while finding module specification for 'instruction_following_eval.evaluation_main' (ModuleNotFoundError: No module named 'instruction_following_eval')


 response -> body -> choices[0] -> message -> content

In [None]:
!pip install vllm



In [None]:

!HF_TOKEN={HF_TOKEN} vllm serve "giovannioliveira/gemma-2-2B-it-w46-cif-sft"

/bin/bash: line 1: huggingface_hub: command not found
