In [None]:
import asyncio
import logging
import sys
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import time
import os
from datetime import datetime, timedelta
import json

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import google.generativeai as genai

# Paths below are resolved against the current working directory of the kernel.
current_path = os.getcwd()

# Put the directory two levels above the working directory on sys.path so the
# `helpers` package imported below can be found. This depends on where the
# kernel was started (presumably the notebook's own folder — TODO confirm).
sys.path.append(os.path.join(current_path, "../.."))
from helpers.constants import (
    #     S3,
    #     S3_BUCKET_NAME,
    MODEL_TOKEN_LIMITS,
    MODEL_NAME_TO_SOURCE,
)
from helpers.prompts import (
    ZERO_SHOT_PROMPT,
    ZERO_SHOT_ACLED_PROMPT,
    ZERO_SHOT_JOINT_QUESTION_PROMPT,
    SCRATCH_PAD_PROMPT,
    SCRATCH_PAD_ACLED_PROMPT,
    SCRATCH_PAD_JOINT_QUESTION_PROMPT,
    REFORMAT_SINGLE_PROMPT,
    REFORMAT_PROMPT,
    HUMAN_JOINT_PROMPT_1,
    HUMAN_JOINT_PROMPT_2,
    HUMAN_JOINT_PROMPT_3,
    HUMAN_JOINT_PROMPT_4,
)
from helpers import model_utils, model_eval, data_utils, dates, decorator, keys  # noqa: E402


# Use the root logger (no name given) and lower its threshold to INFO so the
# per-question progress messages logged with logger.info are not filtered out.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [29]:
# The four HUMAN_JOINT_PROMPT_* templates imported from helpers.prompts, in
# numeric order. Not referenced by the other cells visible in this notebook.
human_joint_prompts = [
    HUMAN_JOINT_PROMPT_1,
    HUMAN_JOINT_PROMPT_2,
    HUMAN_JOINT_PROMPT_3,
    HUMAN_JOINT_PROMPT_4,
]

# Models to evaluate. Each key is a short identifier that is also used for the
# result dictionaries and the output file names; "full_name" is the model string
# handed to model_eval.get_response_from_model. Commented-out entries are
# disabled for this run.
# NOTE(review): "source" presumably names the API provider; it is not read
# anywhere in the visible cells — confirm whether it is still needed.
models = {
    "gpt_3p5_turbo_0125": {"source": "OAI", "full_name": "gpt-3.5-turbo-0125"},
    #     "gpt_4": {"source": "OAI", "full_name": "gpt-4"},
    #     "gpt_4_turbo_0409": {"source": "OAI", "full_name": "gpt-4-turbo-2024-04-09"},
    #     "llama_2_70b": {
    #         "source": "TOGETHER",
    #         "full_name": "meta-llama/Llama-2-70b-chat-hf",
    #     },
    "llama_3_8b": {
        "source": "TOGETHER",
        "full_name": "meta-llama/Llama-3-8b-chat-hf",
    },
    #     "llama_3_70b": {
    #         "source": "TOGETHER",
    #         "full_name": "meta-llama/Llama-3-70b-chat-hf",
    #     },
    "mistral_8x7b_instruct": {
        "source": "TOGETHER",
        "full_name": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    },
    #     "claude_3_opus": {"source": "ANTHROPIC", "full_name": "claude-3-opus-20240229"},
    #     "claude_3_sonnet": {"source": "ANTHROPIC", "full_name": "claude-3-sonnet-20240229"},
    # "gemini_pro": {"source": "GOOGLE", "full_name": "gemini-pro"},
}

### Load data

In [32]:
# log in hf
hf_access_token = "your_token"
!huggingface-cli login --token $hf_access_token

dataset_name = "YuehHanChen/forecasting"
from datasets import load_dataset, Dataset

# Load the dataset, specifying the splits if necessary
dataset = load_dataset(path="YuehHanChen/forecasting")

mini_val = list(dataset["validation"])[:100]
len(mini_val)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/apple/.cache/huggingface/token
Login successful




  0%|          | 0/3 [00:00<?, ?it/s]

100

### General Functions

In [33]:
def read_jsonl(file_path):
    """Parse a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        # Whitespace-only lines carry no record and are skipped.
        return [json.loads(raw_line) for raw_line in handle if raw_line.strip()]

### Scratchpad prompt candidates

In [34]:
# Candidate 0 — baseline scratchpad: reasons for "no", reasons for "yes",
# aggregate, then the answer wrapped in asterisks.
# Single-braced names are str.format fields; the doubled braces are escapes
# that render as literal "{ ... }" slots for the model to fill in.
SCRATCHPD_PROMPT_0 = """
Question:
{question}

Question Background:
{background}

Resolution Criteria:
{resolution_criteria}

Question close date: {close_date}

Instructions:
1. Provide reasons why the answer might be no.
{{ Insert your thoughts }}

2. Provide reasons why the answer might be yes.
{{ Insert your thoughts }}

3. Aggregate your considerations.
{{ Insert your aggregated considerations }}

4. Output your answer (a number between 0 and 1) with an asterisk at the beginning and end of the decimal.
{{ Insert your answer }}
"""

# Candidate 1 — asks for at least three reasons on each side and adds an
# explicit step that rates the strength of every reason before aggregating.
SCRATCHPD_PROMPT_1 = """
Question:
{question}

Question Background:
{background}

Resolution Criteria:
{resolution_criteria}

Question close date: {close_date}

Instructions:
1. Provide at least 3 reasons why the answer might be no.
{{ Insert your thoughts }}

2. Provide at least 3 reasons why the answer might be yes.
{{ Insert your thoughts }}

3. Rate the strength of each of the reasons given in the last two responses. Think like a superforecaster (e.g. Nate Silver).
{{ Insert your rating of the strength of each reason }}

4. Aggregate your considerations.
{{ Insert your aggregated considerations }}

5. Output your answer (a number between 0 and 1) with an asterisk at the beginning and end of the decimal.
{{ Insert your answer }}
"""

# Candidate 2 — starts by asking the model to write down extra facts it already
# knows, then asks for at least three reasons per side, a strength rating,
# aggregation, and the final answer.
SCRATCHPD_PROMPT_2 = """
Question:
{question}

Question Background:
{background}

Resolution Criteria:
{resolution_criteria}

Question close date: {close_date}

Instructions:
1. Write down any additional relevant information that is not included above. This should be specific facts that you already know the answer to, rather than information that needs to be looked up.
{{ Insert additional information }}

2. Provide at least 3 reasons why the answer might be no.
{{ Insert your thoughts }}

3. Provide at least 3 reasons why the answer might be yes.
{{ Insert your thoughts }}

4. Rate the strength of each of the reasons given in the last two responses. Think like a superforecaster (e.g. Nate Silver).
{{ Insert your rating of the strength of each reason }}

5. Aggregate your considerations.
{{ Insert your aggregated considerations }}

6. Output your answer (a number between 0 and 1) with an asterisk at the beginning and end of the decimal.
{{ Insert your answer }}
"""

# Candidate 3 — compact form without a numbered instruction list: think step by
# step, aggregate, then give the answer, each as a labelled slot.
SCRATCHPD_PROMPT_3 = """
Question:
{question}

Question Background:
{background}

Resolution Criteria:
{resolution_criteria}

Question close date: {close_date}

Think step by step: {{ Insert your step by step consideration }}
Aggregating considerations: {{ Aggregate your considerations }}
Answer: {{ Output your answer (a number between 0 and 1) with an asterisk at the beginning and end of the decimal }}
"""

# Candidate 4 — rephrase the question first, give rated reasons for "no" and
# "yes" separately, aggregate, state an initial probability, then check
# over/under-confidence and missed factors before the final answer.
SCRATCHPD_PROMPT_4 = """
Question:
{question}

Question Background:
{background}

Resolution Criteria:
{resolution_criteria}

Question close date: {close_date}

Instructions:
1. Given the above question, rephrase and expand it to help you do better answering. Maintain all information in the original question.
{{ Insert rephrased and expanded question.}}

2. Provide a few reasons why the answer might be no. Rate the strength of each reason.
{{ Insert your thoughts }}

3. Provide a few reasons why the answer might be yes. Rate the strength of each reason.
{{ Insert your thoughts }}

4. Aggregate your considerations. Think like a superforecaster (e.g. Nate Silver).
{{ Insert your aggregated considerations }}

5. Output an initial probability (prediction) given steps 1-4.
{{ Insert initial probability. }}

6. Evaluate whether your calculated probability is excessively confident or not confident enough. Also, consider anything else that might affect the forecast that you did not before consider.
{{ Insert your thoughts }}

7. Output your final prediction (a number between 0 and 1) with an asterisk at the beginning and end of the decimal.
{{ Insert your answer }}
"""

# Candidate 5 — rephrase the question, give rated reasons for "yes" or "no" in
# a single step, consider anything missed, aggregate, state an initial
# probability, then check over/under-confidence before the final answer.
SCRATCHPD_PROMPT_5 = """
Question:
{question}

Question Background:
{background}

Resolution Criteria:
{resolution_criteria}

Question close date: {close_date}


Instructions:
1. Given the above question, rephrase and expand it to help you do better answering. Maintain all information in the original question.
{{ Insert rephrased and expanded question.}}

2. Provide a few reasons why the answer might be "yes" or "no". Rate the strength of each reason.
{{ Insert your thoughts }}

3. Consider anything else that might affect the forecast that you did not before consider.
{{ Insert your thoughts }}

4. Aggregate your considerations. Think like a superforecaster (e.g. Nate Silver).
{{ Insert your aggregated considerations }}

5. Output an initial probability (prediction) given steps 1-4.
{{ Insert initial probability. }}

6. Evaluate whether your calculated probability is excessively confident or not confident enough.
{{ Insert your thoughts }}

7. Output your final prediction (a number between 0 and 1) with an asterisk at the beginning and end of the decimal.
{{ Insert your answer }}
"""

# All scratchpad prompt candidates under comparison. Order matters: the position
# in this list is the prompt index used for the "prompt_<i>" result keys, the
# per-prompt output folders, and the "Prompt <i>" columns of the score table.
all_prompts = [
    SCRATCHPD_PROMPT_0,
    SCRATCHPD_PROMPT_1,
    SCRATCHPD_PROMPT_2,
    SCRATCHPD_PROMPT_3,
    SCRATCHPD_PROMPT_4,
    SCRATCHPD_PROMPT_5,
]

In [35]:
def worker(index, prompt, model_name, save_dict):
    """Ask one model one validation question and store its raw response.

    Args:
        index: Position of the question in ``mini_val``; also the key written
            in ``save_dict``.
        prompt: Prompt template containing the ``{question}``, ``{background}``,
            ``{resolution_criteria}`` and ``{close_date}`` format fields.
        model_name: Key into ``models``; the entry's ``full_name`` is passed to
            ``model_eval.get_response_from_model``.
        save_dict: Mapping of question index to response, updated in place. An
            empty string marks a question that has no response yet.

    Returns:
        None. The response is stored in ``save_dict[index]``.
    """
    # Skip questions that already have a response so a re-run does not repeat
    # the request.
    if save_dict[index] != "":
        return None

    logger.info(f"Starting question: {index}")

    question = mini_val[index]
    # Fill in the template that was passed in. The previous code formatted
    # SCRATCH_PAD_PROMPT here regardless of the argument, so every prompt
    # candidate sent exactly the same text to the model.
    filled_prompt = prompt.format(
        question=question["question"],
        background=question["background"],
        resolution_criteria=question["resolution_criteria"],
        close_date=question["date_resolve_at"],
    )

    response = model_eval.get_response_from_model(
        prompt=filled_prompt,
        max_tokens=1300,
        model_name=models[model_name]["full_name"],
        temperature=0,
        wait_time=30,
    )

    logger.info(f"Finished question: {index}")

    save_dict[index] = response

    return None


def executor(max_workers, prompt, model_name, save_dict):
    """Run ``worker`` for every question slot in ``save_dict`` on a thread pool.

    Args:
        max_workers: Maximum number of threads in the pool.
        prompt: Prompt template forwarded to each ``worker`` call.
        model_name: Key into ``models`` forwarded to each ``worker`` call.
        save_dict: Mapping of question index (0..n-1) to response; each worker
            writes its own slot in place.

    Returns:
        A list with one entry per question holding the workers' return values.
    """
    worker_with_args = partial(
        worker, prompt=prompt, model_name=model_name, save_dict=save_dict
    )
    # Size the run from the dictionary being filled instead of the notebook
    # global `questions_list`, which is only defined in a later cell.
    question_indices = range(len(save_dict))

    # The pool gets its own name so it no longer shadows this function.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(worker_with_args, question_indices))

In [53]:
# Folder (relative to the working directory) with one sub-folder per prompt
# index, each holding one <model>.jsonl file of raw responses.
base = "zero_shot_prompt_eval/"
# Number of worker threads used for each (prompt, model) run.
executor_count = 50

questions_list = [d["question"] for d in mini_val]
# results[model]["prompt_<i>"] -> {question index: raw response text}
results = {model: {} for model in models}
# model_result_loaded[model]["prompt_<i>"] -> True when read back from disk
model_result_loaded = {model: {} for model in models}


def _result_path(prompt_index, model):
    """Return the results file path for one (prompt, model) pair."""
    return os.path.join(base, str(prompt_index), f"{model}.jsonl")


def _load_saved_responses(file_path):
    """Read a saved results file back into an ``{index: response}`` dict.

    The save step writes each dict with ``json.dump``: the whole dict lands on
    one line and the integer question indices become strings. ``read_jsonl``
    therefore returns a one-element list; this merges the record(s) and turns
    numeric keys back into ints so loaded and fresh results have the same shape.
    """
    saved = {}
    for record in read_jsonl(file_path):
        for key, value in record.items():
            saved[int(key) if str(key).isdigit() else key] = value
    return saved


# 1) Load whatever is already on disk; everything else starts out empty.
for prompt_index in range(len(all_prompts)):
    prompt_key = f"prompt_{prompt_index}"
    for model in models:
        file_path = _result_path(prompt_index, model)
        results[model][prompt_key] = {i: "" for i in range(len(questions_list))}
        model_result_loaded[model][prompt_key] = False
        try:
            # Store under the prompt key. The previous code assigned to
            # results[model] itself, replacing the per-model dict with a list.
            results[model][prompt_key] = _load_saved_responses(file_path)
        except FileNotFoundError:
            # Nothing saved for this pair yet: it will be generated below.
            continue
        except (OSError, ValueError, AttributeError) as err:
            # Unreadable or malformed file: regenerate, but say so instead of
            # silently swallowing the error.
            logger.warning(f"Ignoring unreadable results file {file_path}: {err}")
            continue
        model_result_loaded[model][prompt_key] = True

# 2) Query the models only for pairs without saved results, and save each pair
#    as soon as it finishes so an interrupted run keeps its completed work.
for prompt_index in range(len(all_prompts)):
    prompt_key = f"prompt_{prompt_index}"
    for model in models:
        if model_result_loaded[model][prompt_key]:
            continue
        executor(
            executor_count,
            all_prompts[prompt_index],
            model,
            results[model][prompt_key],
        )
        file_path = _result_path(prompt_index, model)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(results[model][prompt_key], f)

### Evaluation

In [39]:
import re


def extract_probability(text):
    """Return the last valid probability written between asterisks in ``text``.

    A candidate is a run of digits and dots wrapped in asterisks, optionally
    followed by a percent sign inside the asterisks (``*0.7*`` or ``*70%*``).
    Candidates are checked from the end of the text backwards, because the
    prompts ask for the final answer last.

    Args:
        text: The model response. Anything that is not a string (for example
            ``None`` from a failed request) is treated as having no answer.

    Returns:
        The probability as a float in [0, 1], or ``None`` when no candidate
        parses to a value in that range.
    """
    if not isinstance(text, str):
        return None

    # Capture the percent sign together with its own number. The previous code
    # looked at an unrelated character of the text to decide whether to divide.
    candidates = re.findall(r"\*([\d\.]+)(%?)\*", text)

    for digits, percent_sign in reversed(candidates):
        try:
            number = float(digits)
        except ValueError:
            # Matches such as "1.2.3" or "..." fit the pattern but are not numbers.
            continue

        # Convert from percentage to decimal if necessary
        if percent_sign:
            number /= 100

        if 0 <= number <= 1:
            return number

    return None

In [None]:
# Replace every raw response in `results` with the probability parsed from it.
# NOTE: this rewrites `results` in place and adds a "refuse_to_answer_cnt"
# entry next to the predictions, so the cell must run exactly once after the
# responses have been loaded or generated.
for prompt_index in range(len(all_prompts)):
    prompt_key = f"prompt_{prompt_index}"
    for model in results.keys():
        model_responses = results[model][prompt_key]
        refuse_to_answer_cnt = 0
        for key, response in model_responses.items():
            answer = extract_probability(response)
            # No parsable probability: fall back to 0.5 and count the question
            # as unanswered. Identity test, since None is a singleton.
            if answer is None:
                answer = 0.5
                refuse_to_answer_cnt += 1

            # Assigning to an existing key does not resize the dict, which
            # keeps the iteration above valid.
            model_responses[key] = answer
        model_responses["refuse_to_answer_cnt"] = refuse_to_answer_cnt


def brier_score(prediction, answer):
    """Return the squared difference between a forecast and the actual outcome.

    Args:
        prediction: Forecast value (the notebook passes a probability in [0, 1]).
        answer: Outcome value the forecast is compared against.

    Returns:
        ``(prediction - answer)`` squared.
    """
    error = prediction - answer
    return error ** 2


# Mean Brier score per model (row) and prompt candidate (column).
brier_scores = pd.DataFrame()

for prompt_index in range(len(all_prompts)):
    prompt_key = f"prompt_{prompt_index}"
    for model in results.keys():
        # One score per question; the refusal counter is stored in the same
        # dict as the predictions, so it has to be skipped here.
        brier_scores_model = [
            brier_score(float(prediction), mini_val[question_id]["resolution"])
            for question_id, prediction in results[model][prompt_key].items()
            if question_id != "refuse_to_answer_cnt"
        ]

        avg_brier_score = sum(brier_scores_model) / len(brier_scores_model)
        # (A stray "ç" statement used to sit here and raised NameError.)
        brier_scores.at[model, f"Prompt {prompt_index}"] = avg_brier_score

In [54]:
# Average Brier score for each model (row) and prompt candidate (column);
# being a squared error, lower is better.
brier_scores

Unnamed: 0,Prompt 0,Prompt 1,Prompt 2,Prompt 3,Prompt 4,Prompt 5
gpt_3p5_turbo_0125,0.2718,0.2713,0.2724,0.277525,0.25785,0.27845
llama_3_8b,0.249894,0.251463,0.251,0.256625,0.255944,0.250775
mistral_8x7b_instruct,0.26475,0.26458,0.260813,0.262,0.2698,0.264475


In [60]:
# Column-wise mean: the average over all evaluated models for each prompt candidate.
brier_scores.mean()

Prompt 0    0.262148
Prompt 1    0.262448
Prompt 2    0.261404
Prompt 3    0.265383
Prompt 4    0.261198
Prompt 5    0.264567
dtype: float64