In [None]:
from google.colab import drive
import os
import sys

# Mount Google Drive inside the Colab runtime
mount_point = '/content/drive'
drive.mount(mount_point)

# Location of the project folder on the mounted drive
dl_project_path ='MyDrive/ETH/DL_PROJECT/MAIN'
env_path = f'{mount_point}/{dl_project_path}'

# Put the project folder on sys.path (only once) so modules stored there can be imported
already_registered = env_path in sys.path
if not already_registered:
    sys.path.append(env_path)

In [None]:
# Installation of the OpenAI Python client
!pip install openai

In [None]:
import os
import numpy as np

from openai import OpenAI

In [None]:
# Load the baseline prompts and responses from the project folder
import pickle

# The baseline (pre-trained) model may live in a different folder than the custom SASHA model
# when the number of attributes was set to 3; when text was generated 1 attribute at a time,
# baseline and SASHA outputs share the same folder.
baseline_path = os.path.join(env_path, 'inference_all_1_attr') # folder holding the baseline pickles

baseline_prompts_file = os.path.join(baseline_path, 'prompts.pkl')
baseline_responses_file = os.path.join(baseline_path, 'responses.pkl')

with open(baseline_prompts_file, 'rb') as prompts_fh:
  baseline_prompts = pickle.load(prompts_fh)

# responses.pkl stores a dict of responses; only the 'base_model' entry is kept here
with open(baseline_responses_file, 'rb') as responses_fh:
  baseline_responses = pickle.load(responses_fh)['base_model']

In [None]:
# Load the custom-model prompts and responses from the project folder
inference_folder = 'inference_all_1_attr' # folder holding the custom model pickles
inference_path = os.path.join(env_path, inference_folder)

custom_prompts_file = os.path.join(inference_path, 'prompts.pkl')
custom_responses_file = os.path.join(inference_path, 'responses.pkl')

with open(custom_prompts_file, 'rb') as prompts_fh:
  custom_prompts = pickle.load(prompts_fh)

with open(custom_responses_file, 'rb') as responses_fh:
  custom_responses = pickle.load(responses_fh)

# The keys are the names of the models whose responses are stored in this folder
print(f'Responses keys: {custom_responses.keys()}')

In [None]:
# print samples of baseline and custom prompts to ensure correct loading
#print(baseline_prompts[0])
#print(baseline_prompts[2])

#print(custom_prompts[0])
#print(custom_prompts[2])

In [None]:
# Compare the baseline against ONE custom (SASHA) model at a time:
# `model` selects which entry of `custom_responses` is evaluated.
model = 'custom_model_samestart_adaptive_v3' # TODO

# The baseline prompts are the ones shown to the judge
prompts = baseline_prompts

# Both response lists side by side, keyed by the label used in the rest of the notebook
responses = {
  'Baseline': baseline_responses,
  'Custom': custom_responses[model],
}

In [None]:
from pydantic import BaseModel, validator, ValidationError

# Definition of response format object
class Response(BaseModel):
  # Structured output expected from the judge: one integer score per response.
  # (Documented with comments rather than a class docstring so the JSON schema
  # generated from this model stays unchanged.)
  score_A: int
  score_B: int

  @validator("score_A", "score_B")
  def scores_between_zero_and_ten(cls, v):
      """Accept a score only if it lies in the inclusive range 0..10.

      Returns the score unchanged when valid. Raises ValueError otherwise;
      pydantic collects it and reports it to the caller as a ValidationError.
      """
      if not (0 <= v <= 10):
          # Validators must raise ValueError/AssertionError. ValidationError cannot be
          # constructed from a plain message, so raising it here would itself fail.
          raise ValueError("Score must be between 0 and 10 inclusive.")
      return v


In [None]:
# One parsed evaluation per successfully judged prompt
results = []

# Instructions given to the judge model; the JSON structure mirrors the `Response` fields
system_message = """You are a helpful assistant that evaluates how well each response aligns with the given prompt, provided the specific attributes (each ranging between 0 and 4) included before the prompt itself.
You must output valid JSON with the following structure:

{
  "score_A": 0,
  "score_B": 0
}

No additional keys or text.
All scores must be integers between 0 and 10, with perfect alignment having score 10.
"""

# Read the key inside a context manager so the file handle is closed, and strip
# surrounding whitespace: a trailing newline in the key file would otherwise be
# sent as part of the credential.
with open(f'{env_path}/OpenAI_API_key.txt', 'r') as key_file:
  api_key = key_file.read().strip()

client = OpenAI(api_key=api_key)

# responses_order[k] describes results[k]: 0 if the baseline response was shown as A, 1 otherwise.
# An entry is appended ONLY when the matching evaluation is stored, so both lists stay
# index-aligned even when some prompts are skipped.
responses_order = []

for i in range(len(prompts)):
  prompt = prompts[i]
  # randomly select the order of the baseline model answer and the custom model answer
  baseline_first = np.random.rand() < 0.5
  if baseline_first:
    response_A = responses['Baseline'][i]
    response_B = responses['Custom'][i]
  else:
    response_A = responses['Custom'][i]
    response_B = responses['Baseline'][i]
  messages = [{'role': 'system', 'content': system_message},
             {'role': 'user', 'content': f'PROMPT:\n{prompt}\nRESPONSE A:\n{response_A}\nRESPONSE B:\n{response_B}'}]

  try:
    completion = client.beta.chat.completions.parse(
      model="gpt-4o",
      messages=messages,
      response_format=Response)
  except ValidationError:
    # The judge output did not satisfy `Response`; skip this prompt entirely
    print(f'Skipping prompt {i}: judge output failed validation')
    continue

  evaluation = completion.choices[0].message.parsed
  if evaluation is None:
    # `parsed` can be None (e.g. when the model refuses to answer); nothing to score
    print(f'Skipping prompt {i}: no parsed evaluation returned')
    continue

  # Record the evaluation and its presentation order together
  results.append(evaluation)
  responses_order.append(0 if baseline_first else 1)


In [None]:
# Sanity check: number of prompts for which a parsed evaluation was stored
n_evaluated = len(results)
print(n_evaluated)

In [None]:
# extraction of the evaluation results
# Each evaluation is mapped back to its model using the order in which the two
# responses were shown to the judge (0: baseline was A, otherwise: custom was A).

# The mapping is positional, so the two lists must describe the same evaluations.
# Fail loudly instead of silently attributing scores to the wrong model.
if len(results) != len(responses_order):
  raise ValueError(
    f'results ({len(results)}) and responses_order ({len(responses_order)}) '
    'have different lengths; scores cannot be attributed reliably')

scores = {'Baseline': [], 'Custom': []}

for result, order in zip(results, responses_order):
  if order == 0:
    baseline_score, custom_score = result.score_A, result.score_B
  else:
    baseline_score, custom_score = result.score_B, result.score_A
  scores['Baseline'].append(baseline_score)
  scores['Custom'].append(custom_score)

In [None]:
# computation of the average score and standard deviation for each one of the models
# The per-model statistics are built with comprehensions whose variables stay local,
# so the notebook-level `model` selection is not overwritten by this cell.
mean_scores = {name: np.mean(values) for name, values in scores.items()}
std_scores = {name: np.std(values) for name, values in scores.items()}

# Convert once; the comparisons below are element-wise over paired evaluations
custom_scores = np.array(scores['Custom'])
baseline_scores = np.array(scores['Baseline'])

# Percentage of evaluations where the custom model scored strictly higher / exactly equal
custom_win_rate = 100 * np.sum(custom_scores > baseline_scores) / len(custom_scores)
ties_percentage = 100 * np.sum(custom_scores == baseline_scores) / len(custom_scores)
print(f'Mean scores: {mean_scores}')
print(f'Standard deviation of scores: {std_scores}')
print(f'Custom model win rate: {custom_win_rate}%')
print(f'Ties percentage: {ties_percentage}%')


In [None]:
# save scores, custom win rate and ties in the evaluation folder
evaluation_folder = 'evaluation_4_4o_1attr'
evaluation_path = os.path.join(env_path, evaluation_folder)

# exist_ok=True creates the folder if needed without a separate existence check
os.makedirs(evaluation_path, exist_ok=True)

# file name -> object to pickle into that file
artifacts = {
  'scores.pkl': scores,
  'mean_scores.pkl': mean_scores,
  'std_scores.pkl': std_scores,
  'custom_win_rate.pkl': custom_win_rate,
  'ties_percentage.pkl': ties_percentage,
}

for file_name, artifact in artifacts.items():
  with open(os.path.join(evaluation_path, file_name), 'wb') as f:
    pickle.dump(artifact, f)