In [190]:
import asyncio
import logging
import sys
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import time
import os
from datetime import datetime, timedelta
import json

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import google.generativeai as genai

current_path = os.getcwd()

sys.path.append(os.path.join(current_path, "../.."))
from helpers.constants import (
    MODEL_TOKEN_LIMITS,
    MODEL_NAME_TO_SOURCE,
)
from helpers.llm_prompts import SCRATCH_PAD_PROMPT, REFORMAT_SINGLE_PROMPT

from helpers import model_eval, data_utils, dates, decorator, keys  # noqa: E402


logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [191]:
models = {
    "gpt_4o": {"source": "OAI", "full_name": "gpt-4o"},
    "gpt_4_turbo_0409": {"source": "OAI", "full_name": "gpt-4-turbo-2024-04-09"},
    "llama_3_70b": {
        "source": "TOGETHER",
        "full_name": "meta-llama/Llama-3-70b-chat-hf",
    },
    "mistral_large": {
        "source": "MISTRAL",
        "full_name": "mistral-large-latest",
    },
    "claude_3_opus": {"source": "ANTHROPIC", "full_name": "claude-3-opus-20240229"},
    "qwen_1p5_110b": {
        "source": "TOGETHER",
        "full_name": "Qwen/Qwen1.5-110B-Chat",
    },
    # "gemini_pro": {"source": "GOOGLE", "full_name": "gemini-pro"},
}

### Load data

In [192]:
from datasets import load_dataset, Dataset

dataset_name = "YuehHanChen/forecasting"

# Load the dataset, specifying the splits if necessary
dataset = load_dataset(path="YuehHanChen/forecasting")

mini_val = list(dataset["test"])[:]
len(mini_val)



  0%|          | 0/3 [00:00<?, ?it/s]

914

### General Functions

In [193]:
def read_jsonl(file_path):
    data = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            if line.strip():
                json_object = json.loads(line)
                data.append(json_object)
    return data

In [194]:
def worker(index, prompt, model_name, save_dict):
    if save_dict[index] != "":
        return

    logger.info(f"Starting question: {index}")
    prompt = SCRATCH_PAD_PROMPT.format(
        question=mini_val[index]["question"],
        background=mini_val[index]["background"],
        resolution_criteria=mini_val[index]["resolution_criteria"],
        close_date=mini_val[index]["date_resolve_at"],
    )

    response = model_eval.get_response_from_model(
        prompt=prompt,
        max_tokens=1300,
        model_name=models[model_name]["full_name"],
        temperature=0,
        wait_time=30,
    )

    save_dict[index] = model_eval.reformat_answers(response=response, single=True)

    logger.info(f"finished question: {index}, forecast: {save_dict[index]}")

    return None


def executor(max_workers, prompt, model_name, save_dict):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:

        worker_with_args = partial(
            worker, prompt=prompt, model_name=model_name, save_dict=save_dict
        )
        return list(executor.map(worker_with_args, range(len(questions_list))))

In [195]:
base = "llm_crowd_candidate_eval/"
all_prompts = [SCRATCH_PAD_PROMPT]

results = {}
questions_list = [d["question"] for d in mini_val]
model_result_loaded = {}

for prompt_index in range(len(all_prompts)):
    for model in models:
        if model not in model_result_loaded.keys():
            model_result_loaded[model] = {}
        model_result_loaded[model][f"prompt_{prompt_index}"] = False

for prompt_index in range(len(all_prompts)):
    for model in models:
        file_path = f"{base}/{prompt_index}/{model}.jsonl"

        if model not in results.keys():
            results[model] = {}
        try:
            results[model] = read_jsonl(file_path)
            model_result_loaded[model][
                f"prompt_{prompt_index}"
            ] = True  # Set flag to True if loaded successfully
        except:
            results[model][f"prompt_{prompt_index}"] = {i: "" for i in range(len(questions_list))}

for prompt_index in range(len(all_prompts)):
    for model, info in models.items():
        # only execute the model if we have not had its results yet
        logger.info(f"Running {model}")
        if not model_result_loaded[model][f"prompt_{prompt_index}"]:
            executor_count = 30
            executor(
                executor_count,
                all_prompts[prompt_index],
                model,
                results[model][f"prompt_{prompt_index}"],
            )

for prompt_index in range(len(all_prompts)):
    for model in models:
        file_path = f"{base}/{prompt_index}/{model}.jsonl"
        if not model_result_loaded[model][f"prompt_{prompt_index}"]:
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, "w") as f:
                json.dump(results[model][f"prompt_{prompt_index}"], f)

2024-05-15 10:47:27,280 INFO root: Running gpt_4o
2024-05-15 10:47:27,281 INFO root: Running gpt_4_turbo_0409
2024-05-15 10:47:27,282 INFO root: Running llama_3_70b
2024-05-15 10:47:27,282 INFO root: Running mistral_large
2024-05-15 10:47:27,282 INFO root: Running claude_3_opus
2024-05-15 10:47:27,283 INFO root: Running qwen_1p5_110b


### Evaluation

In [196]:
for model in results.keys():
    refuse_to_answer_cnt = 0
    for key, answer in results[model][0].items():
        if answer == None:
            answer = 0.5
            refuse_to_answer_cnt += 1
        results[model][0][key] = answer

    results[model][0]["refuse_to_answer_cnt"] = refuse_to_answer_cnt

for model in results.keys():
    results[model] = results[model][0]


def brier_score(prediction, answer):
    return (prediction - answer) ** 2


# aggregation
agg_results = {}
for agg_type in ["mean", "median", "trimmed_mean", "geometric_mean", "geometric_mean_log_odds"]:
    agg_results[agg_type] = {}
    for model in results.keys():
        for key, answer in results[model].items():
            if key not in agg_results[agg_type]:
                agg_results[agg_type][key] = [answer]
            else:
                agg_results[agg_type][key].append(answer)

In [197]:
import math


def trimmed_mean(probabilities):
    # Sort the list of probabilities

    sorted_probs = sorted(probabilities)

    # Remove the smallest and largest probabilities
    trimmed_probs = sorted_probs[1:-1]

    # Calculate the mean of the remaining probabilities
    trimmed_mean_value = sum(trimmed_probs) / len(trimmed_probs)

    return trimmed_mean_value


def geometric_mean(numbers):
    if not numbers:
        return 0  # Return 0 for an empty list to avoid math domain error
    product = 1.0
    for number in numbers:
        product *= number
    return product ** (1 / len(numbers))


def geometric_mean_log_odds(probs):
    # Convert probabilities to log odds
    log_odds = np.log(np.array(probs) / (1 - np.array(probs)))

    # Compute the geometric mean of the log odds
    mean_log_odds = np.mean(log_odds)

    # Convert the mean log odds back to probability
    combined_prob = np.exp(mean_log_odds) / (1 + np.exp(mean_log_odds))

    return combined_prob

In [198]:
# aggregation
for agg_type in ["mean", "median", "trimmed_mean", "geometric_mean", "geometric_mean_log_odds"]:
    for key, answers in agg_results[agg_type].items():
        if key != "refuse_to_answer_cnt":
            if agg_type == "mean":
                agg_results[agg_type][key] = np.mean(answers)
            elif agg_type == "median":
                agg_results[agg_type][key] = np.median(answers)
            elif agg_type == "trimmed_mean":
                agg_results[agg_type][key] = trimmed_mean(answers)
            elif agg_type == "geometric_mean":
                agg_results[agg_type][key] = geometric_mean(answers)
            elif agg_type == "geometric_mean_log_odds":
                agg_results[agg_type][key] = geometric_mean_log_odds(answers)

In [199]:
brier_scores = pd.DataFrame()

results.update(agg_results)

for model in results.keys():
    brier_scores_model = []
    for question_id, prediction in results[model].items():
        if question_id != "refuse_to_answer_cnt":
            brier_score_value = brier_score(
                float(prediction), mini_val[int(question_id)]["resolution"]
            )
            brier_scores_model.append(brier_score_value)

    avg_brier_score = sum(brier_scores_model) / len(brier_scores_model)
    brier_scores.at[model, f"Scratchpad"] = avg_brier_score

In [200]:
for model in results:
    print(f"{model}'s refuse-to-answer count: {results[model]['refuse_to_answer_cnt']}")

gpt_4o's refuse-to-answer count: 0
gpt_4_turbo_0409's refuse-to-answer count: 0
llama_3_70b's refuse-to-answer count: 1
mistral_large's refuse-to-answer count: 1
claude_3_opus's refuse-to-answer count: 7
qwen_1p5_110b's refuse-to-answer count: 0
mean's refuse-to-answer count: [0, 0, 1, 1, 7, 0]
median's refuse-to-answer count: [0, 0, 1, 1, 7, 0]
trimmed_mean's refuse-to-answer count: [0, 0, 1, 1, 7, 0]
geometric_mean's refuse-to-answer count: [0, 0, 1, 1, 7, 0]
geometric_mean_log_odds's refuse-to-answer count: [0, 0, 1, 1, 7, 0]


In [201]:
brier_scores

Unnamed: 0,Scratchpad
gpt_4o,0.18739
gpt_4_turbo_0409,0.20457
llama_3_70b,0.219266
mistral_large,0.224858
claude_3_opus,0.215378
qwen_1p5_110b,0.209576
mean,0.201707
median,0.201373
trimmed_mean,0.201978
geometric_mean,0.199454
