In [1]:
import asyncio
import logging
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
import time
import os
from datetime import datetime, timedelta
import json
import pickle

import random
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

current_path = os.getcwd()

sys.path.append(os.path.join(current_path, "../.."))
from helpers.llm_prompts import (
    ZERO_SHOT_MARKET_PROMPT,
    ZERO_SHOT_NON_MARKET_PROMPT,
    ZERO_SHOT_JOINT_QUESTION_PROMPT,
    SCRATCH_PAD_MARKET_PROMPT,
    SCRATCH_PAD_NON_MARKET_PROMPT,
    SCRATCH_PAD_JOINT_QUESTION_PROMPT,
    SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT,
    SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT,
    SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT,
    REFORMAT_SINGLE_PROMPT,
    REFORMAT_PROMPT,
    HUMAN_JOINT_PROMPT_1,
    HUMAN_JOINT_PROMPT_2,
    HUMAN_JOINT_PROMPT_3,
    HUMAN_JOINT_PROMPT_4,
)

from helpers import model_eval


# Root logger at INFO level; the worker functions below report their progress
# through `logger.info`.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Human-written instructions for joint (combination) questions. The worker
# functions index this list with a question's `combo_index` (0-3).
human_joint_prompts = [
    HUMAN_JOINT_PROMPT_1,
    HUMAN_JOINT_PROMPT_2,
    HUMAN_JOINT_PROMPT_3,
    HUMAN_JOINT_PROMPT_4,
]

# Registry of models to evaluate. The short key is used in output file names;
# `full_name` is the identifier passed to `model_eval.get_response_from_model`
# and `source` is a provider tag (checked below to tune the worker count).
models = {
    "gpt_3p5_turbo_0125": {"source": "OAI", "full_name": "gpt-3.5-turbo-0125"},
    "gpt_4": {"source": "OAI", "full_name": "gpt-4"},
    "gpt_4_turbo_0409": {"source": "OAI", "full_name": "gpt-4-turbo-2024-04-09"},
    "gpt_4_1106_preview": {"source": "OAI", "full_name": "gpt-4-1106-preview"},
    "gpt_4_0125_preview": {"source": "OAI", "full_name": "gpt-4-0125-preview"},
    "gpt_4o": {"source": "OAI", "full_name": "gpt-4o"},
    "llama_2_70b": {
        "source": "TOGETHER",
        "full_name": "meta-llama/Llama-2-70b-chat-hf",
    },
    "llama_3_8b": {
        "source": "TOGETHER",
        "full_name": "meta-llama/Llama-3-8b-chat-hf",
    },
    "llama_3_70b": {
        "source": "TOGETHER",
        "full_name": "meta-llama/Llama-3-70b-chat-hf",
    },
    "mistral_8x7b_instruct": {
        "source": "TOGETHER",
        "full_name": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    },
    "mistral_8x22b_instruct": {
        "source": "TOGETHER",
        "full_name": "mistralai/Mixtral-8x22B-Instruct-v0.1",
    },
    "mistral_large": {
        "source": "MISTRAL",
        "full_name": "mistral-large-latest",
    },
    "qwen_1p5_110b": {
        "source": "TOGETHER",
        "full_name": "Qwen/Qwen1.5-110B-Chat",
    },
    "claude_2p1": {"source": "ANTHROPIC", "full_name": "claude-2.1"},
    "claude_3_opus": {"source": "ANTHROPIC", "full_name": "claude-3-opus-20240229"},
    "claude_3_sonnet": {"source": "ANTHROPIC", "full_name": "claude-3-sonnet-20240229"},
    "claude_3_haiku": {"source": "ANTHROPIC", "full_name": "claude-3-haiku-20240307"},
    #     "gemini_pro": {"source": "GOOGLE", "full_name": "gemini-pro"},
}

### Useful functions

In [3]:
def capitalize_substrings(model_name):
    """Return a display form of a hyphen-separated model name.

    Every occurrence of "gpt" becomes "GPT"; then each hyphen-separated piece
    that is non-empty and does not start with a digit gets its first character
    upper-cased. Pieces starting with a digit, and empty pieces, are kept as-is.
    """
    display_parts = []
    # str.replace is a no-op when "gpt" is absent, so no membership test is needed.
    for part in model_name.replace('gpt', 'GPT').split('-'):
        if part and not part[0].isdigit():
            part = part[0].upper() + part[1:]
        display_parts.append(part)
    return '-'.join(display_parts)

def _organization_for_model(model):
    """Return the organization label for a model key, chosen by family substring.

    The families are checked in a fixed order (gpt, llama, mistral, claude,
    qwen) and the first substring found in `model` wins.

    Raises:
        ValueError: if `model` contains none of the known family substrings.
    """
    known_families = (
        ("gpt", "OpenAI"),
        ("llama", "Meta"),
        ("mistral", "Mistral AI"),
        ("claude", "Anthropic"),
        ("qwen", "Qwen"),
    )
    for family, organization in known_families:
        if family in model:
            return organization
    raise ValueError(f"Cannot determine the organization for model {model!r}")


def generage_final_forecast_files(deadline, forecast_date, prompt_type):
    """Merge per-group forecasts and write one submission file per model.

    For every key in the module-level `models_to_test`:

    1. the `non_acled`, `acled` and `combo` JSONL files found under
       `{prompt_type}/` are concatenated (in that order) into
       `{prompt_type}/final/{model}`;
    2. that merged file is wrapped with metadata and written to
       `{prompt_type}/final_submit/{deadline}.{org}.{model}_{prompt_type}.json`.

    Note: the function name is misspelled ("generage"); it is kept because the
    cells below call it by this name.

    Args:
        deadline: string used in the output file name and in `question_set`.
        forecast_date: stored verbatim under `forecast_date`.
        prompt_type: folder holding the per-group outputs; also part of the
            output file name and (underscores replaced by spaces) of the
            reported model label.

    Raises:
        ValueError: if a model key matches no known organization. Previously
            `org` was left unassigned in that case, which either raised
            UnboundLocalError or silently reused the previous model's
            organization.
    """
    # Resolve every organization before touching the disk, so an unknown
    # model fails fast instead of producing mislabeled submission files.
    organizations = {model: _organization_for_model(model) for model in models_to_test}

    for model in models_to_test:
        current_model_forecasts = []
        for test_type in [f"{prompt_type}/non_acled", f"{prompt_type}/acled", f"{prompt_type}/combo"]:
            current_model_forecasts.extend(read_jsonl(f"{test_type}/{model}.jsonl"))

        final_file_name = f"{prompt_type}/final/{model}"
        os.makedirs(os.path.dirname(final_file_name), exist_ok=True)
        # Explicit UTF-8 to match how read_jsonl reads the file back.
        with open(final_file_name, "w", encoding="utf-8") as file:
            for entry in current_model_forecasts:
                file.write(json.dumps(entry) + "\n")

    for model in models_to_test:
        questions = read_jsonl(f"{prompt_type}/final/{model}")
        org = organizations[model]

        directory = f"{prompt_type}/final_submit"
        os.makedirs(directory, exist_ok=True)

        new_file_name = f"{directory}/{deadline}.{org}.{model}_{prompt_type}.json"

        # Drop the "vendor/" prefix from names such as "meta-llama/Llama-3-8b-chat-hf".
        full_name = models[model]['full_name']
        model_name = full_name.split('/')[1] if '/' in full_name else full_name

        forecast_file = {
            "organization": org,
            "model": f"{capitalize_substrings(model_name)} ({prompt_type.replace('_', ' ')})",
            "question_set": f"{deadline}-llm.jsonl",
            "forecast_date": forecast_date,
            "forecasts": questions,
        }

        with open(new_file_name, "w", encoding="utf-8") as f:
            json.dump(forecast_file, f, indent=4)

### Load data

In [4]:
import json


def read_jsonl(file_path):
    """Parse a UTF-8 JSON Lines file into a list, one object per non-blank line.

    Lines that are empty or whitespace-only are skipped; every other line is
    decoded with `json.loads`, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return [json.loads(raw_line) for raw_line in handle if raw_line.strip()]


file_path = "2024-05-03-llm.jsonl"
questions = read_jsonl(file_path)

# Partition the question set into the three groups evaluated below:
#   - single questions whose source is not "acled"
#   - single questions whose source is "acled"
#   - combination questions whose source is "acled"
# A question is "single" when its `combination_of` field is the string "N/A".
# NOTE(review): combination questions with a non-"acled" source fall into no
# group — confirm that this is intended.
single_non_acled_questions = []
single_acled_questions = []
combo_questions = []

for q in questions:
    is_acled = q["source"] == "acled"
    if q["combination_of"] == "N/A":
        if is_acled:
            single_acled_questions.append(q)
        else:
            single_non_acled_questions.append(q)
    elif is_acled:
        combo_questions.append(q)

In [5]:
# Expand every combination question into four shallow copies, one per
# `combo_index` (0-3). The nested `combination_of` entries are shared between
# the copies because the copy is shallow.
combo_questions_unrolled = [
    dict(q, combo_index=i) for q in combo_questions for i in range(4)
]

In [6]:
# Number of questions in each group: single non-ACLED, single ACLED, unrolled combos.
len(single_non_acled_questions), len(single_acled_questions), len(combo_questions_unrolled)

(339, 162, 644)

### Zero Shot Eval

In [7]:
def worker(index, model_name, save_dict, questions_to_eval, rate_limit=False):
    """Forecast one question with the zero-shot prompts and store the answer.

    Args:
        index: position of the question in `questions_to_eval`; also the key
            written in `save_dict`.
        model_name: key into the module-level `models` registry.
        save_dict: mapping index -> result. An entry different from "" marks
            the question as already answered and is left untouched.
        questions_to_eval: sequence of question dicts.
        rate_limit: when true, sleep so that the call lasts at least one second.

    Returns:
        None. The result is written to `save_dict[index]`: the output of
        `model_eval.extract_probability` for non-"acled" questions, otherwise
        the output of `model_eval.reformat_answers`.
    """
    if save_dict[index] != "":
        return

    logger.info(f"{model_name} - {index}")

    if rate_limit:
        start_time = datetime.now()

    question = questions_to_eval[index]

    def background_with_model_info(entry):
        # Background text followed by the model-facing resolution criteria.
        return entry["background"] + "\n" + entry["model_info_resolution_criteria"]

    def query_model(prompt, max_tokens):
        return model_eval.get_response_from_model(
            prompt=prompt,
            max_tokens=max_tokens,
            model_name=models[model_name]["full_name"],
            temperature=0,
            wait_time=30,
        )

    if question["source"] != "acled":
        prompt = ZERO_SHOT_MARKET_PROMPT.format(
            question=question["question"],
            background=background_with_model_info(question),
            resolution_criteria=question["resolution_criteria"],
            close_date=question["model_info_close_datetime"],
        )
        save_dict[index] = model_eval.extract_probability(query_model(prompt, 100))
    else:
        # One resolution date per horizon: freeze date + 7 days + horizon days.
        all_resolution_dates = [
            (
                datetime.fromisoformat(question["freeze_datetime"])
                + timedelta(days=7 + horizon)
            ).isoformat()
            for horizon in question["forecast_horizons"]
        ]

        if question["combination_of"] == "N/A":
            prompt = ZERO_SHOT_NON_MARKET_PROMPT.format(
                question=question["question"],
                background=background_with_model_info(question),
                resolution_criteria=question["resolution_criteria"],
                freeze_datetime=question["freeze_datetime"],
                freeze_datetime_value=question["freeze_datetime_value"],
                freeze_datetime_value_explanation=question["freeze_datetime_value_explanation"],
                list_of_resolution_dates=all_resolution_dates,
            )
            response = query_model(prompt, 100)
        else:
            parts = question["combination_of"]
            prompt = ZERO_SHOT_JOINT_QUESTION_PROMPT.format(
                human_prompt=human_joint_prompts[question["combo_index"]],
                question_1=parts[0]["question"],
                question_2=parts[1]["question"],
                background_1=background_with_model_info(parts[0]),
                background_2=background_with_model_info(parts[1]),
                resolution_criteria_1=parts[0]["resolution_criteria"],
                resolution_criteria_2=parts[1]["resolution_criteria"],
                freeze_datetime_1=parts[0]["freeze_datetime"],
                freeze_datetime_2=parts[1]["freeze_datetime"],
                freeze_datetime_value_1=parts[0]["freeze_datetime_value"],
                freeze_datetime_value_2=parts[1]["freeze_datetime_value"],
                freeze_datetime_value_explanation_1=parts[0]["freeze_datetime_value_explanation"],
                freeze_datetime_value_explanation_2=parts[1]["freeze_datetime_value_explanation"],
                list_of_resolution_dates=all_resolution_dates,
            )
            response = query_model(prompt, 500)

        save_dict[index] = model_eval.reformat_answers(
            response=response, prompt=prompt, question=question
        )

    logger.info(save_dict[index])

    if rate_limit:
        elapsed_time = (datetime.now() - start_time).total_seconds()
        if elapsed_time < 1:
            # Ensure at least 1 second per request to stay within rate limits
            time.sleep(1 - elapsed_time)

    return None


def executor(max_workers, model_name, save_dict, questions_to_eval, use_gemini=False):
    """Run `worker` for every question index on a thread pool.

    Args:
        max_workers: size of the thread pool.
        model_name, save_dict, questions_to_eval: forwarded to `worker`.
        use_gemini: accepted for the caller's convenience; not used here.

    Returns:
        The list of `worker` return values, in question order.
    """
    run_one = partial(
        worker, model_name=model_name, save_dict=save_dict, questions_to_eval=questions_to_eval
    )
    question_indices = range(len(questions_to_eval))
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(run_one, question_indices))

In [8]:
results = {}
model_result_loaded = {}
models_to_test = list(models.keys())

# Direction vector stored with each combo forecast, keyed by `combo_index`.
# Any other index falls back to [-1, -1].
COMBO_DIRECTIONS = {0: [1, 1], 1: [1, -1], 2: [-1, 1]}

# Each question group is paired with its output folder explicitly rather than
# inferring the folder from the first question, so an empty group no longer
# raises IndexError.
question_groups = [
    ("zero_shot/non_acled", single_non_acled_questions),
    ("zero_shot/acled", single_acled_questions),
    ("zero_shot/combo", combo_questions_unrolled),
]

for test_type, questions_to_eval in question_groups:
    # Reuse forecasts already saved on disk; otherwise start from an empty
    # per-question result table that the workers fill in.
    for model in models_to_test:
        file_path = f"{test_type}/{model}.jsonl"
        try:
            results[model] = read_jsonl(file_path)
            model_result_loaded[model] = True
        except (OSError, ValueError):
            # Missing or unreadable file (JSON and Unicode decode errors are
            # ValueErrors). Unlike a bare `except:`, this lets
            # KeyboardInterrupt and programming errors propagate.
            results[model] = {i: "" for i in range(len(questions_to_eval))}
            model_result_loaded[model] = False

    for model in models_to_test:
        if model_result_loaded[model]:
            continue

        file_path = f"{test_type}/{model}.jsonl"
        executor_count = 50
        use_gemini = False
        if models[model]["source"] == "GOOGLE":
            executor_count = 1
            use_gemini = True

        executor(executor_count, model, results[model], questions_to_eval, use_gemini)

        current_model_forecasts = []
        for index, question in enumerate(questions_to_eval):
            if question["source"] == "acled":
                # One output row per (forecast, horizon) pair.
                for forecast, horizon in zip(results[model][index], question["forecast_horizons"]):
                    current_forecast = {
                        "id": question["id"],
                        "source": question["source"],
                        "forecast": forecast,
                        "horizon": horizon,
                        "reasoning": None,
                    }
                    if question["combination_of"] != "N/A":
                        current_forecast["direction"] = list(
                            COMBO_DIRECTIONS.get(question["combo_index"], [-1, -1])
                        )
                    current_model_forecasts.append(current_forecast)
            else:
                current_model_forecasts.append(
                    {
                        "id": question["id"],
                        "source": question["source"],
                        "forecast": results[model][index],
                        "reasoning": None,
                    }
                )

        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, "w") as file:
            for entry in current_model_forecasts:
                file.write(json.dumps(entry) + "\n")

In [9]:
# Merge the zero-shot per-group outputs and write one submission file per model.
generage_final_forecast_files(deadline='2024-05-03', 
                              forecast_date='2024-05-03', 
                              prompt_type="zero_shot")

In [10]:
# # fix directions
# prompt_type= "zero_shot"
# directory= prompt_type + '/final_submit'
# for model in models_to_test:
#     if "gpt" in model:
#         org = "OpenAI"
#     elif "llama" in model:
#         org = "Meta"
#     elif "mistral" in model:
#         org = "Mistral AI"
#     elif "claude" in model:
#         org = "Anthropic"
#     elif "qwen" in model:
#         org = "Qwen"

#     file_path = f"{directory}/2024-05-03.{org}.{model}_{prompt_type}.json"

#     with open(file_path, "r") as file:
#         questions = json.load(file)
            
#     for q in questions['forecasts']:
#         if 'direction' in q:
#             q['direction'] = [-1 if item == 0 else item for item in q['direction']]          
    
#     new_file_path = f"{prompt_type}/direction_fix/2024-05-03.{org}.{model}_{prompt_type}.json"
#     os.makedirs(os.path.dirname(new_file_path), exist_ok=True)

#     with open(new_file_path, 'w') as json_file:
#         json.dump(questions, json_file, indent=4)

### Scratchpad

In [11]:
def worker(index, model_name, save_dict, questions_to_eval):
    """Forecast one question with the scratchpad prompts and store the answer.

    Args:
        index: position of the question in `questions_to_eval`; also the key
            written in `save_dict`.
        model_name: key into the module-level `models` registry.
        save_dict: mapping index -> result. An entry different from "" marks
            the question as already answered and is left untouched.
        questions_to_eval: sequence of question dicts.

    Returns:
        None. `save_dict[index]` receives a tuple
        `(output of model_eval.reformat_answers, raw model response)`.
    """
    if save_dict[index] != "":
        return

    logger.info(f"Starting {model_name} - {index}")

    question = questions_to_eval[index]

    def background_with_model_info(entry):
        # Background text followed by the model-facing resolution criteria.
        return entry["background"] + "\n" + entry["model_info_resolution_criteria"]

    def query_model(prompt, max_tokens):
        return model_eval.get_response_from_model(
            prompt=prompt,
            max_tokens=max_tokens,
            model_name=models[model_name]["full_name"],
            temperature=0,
            wait_time=30,
        )

    if question["source"] != "acled":
        prompt = SCRATCH_PAD_MARKET_PROMPT.format(
            question=question["question"],
            background=background_with_model_info(question),
            resolution_criteria=question["resolution_criteria"],
            close_date=question["model_info_close_datetime"],
        )
        response = query_model(prompt, 1300)
        answer = model_eval.reformat_answers(response=response, single=True)
    else:
        # One resolution date per horizon: freeze date + 7 days + horizon days.
        all_resolution_dates = [
            (
                datetime.fromisoformat(question["freeze_datetime"])
                + timedelta(days=7 + horizon)
            ).isoformat()
            for horizon in question["forecast_horizons"]
        ]

        if question["combination_of"] == "N/A":
            prompt = SCRATCH_PAD_NON_MARKET_PROMPT.format(
                question=question["question"],
                background=background_with_model_info(question),
                resolution_criteria=question["resolution_criteria"],
                freeze_datetime=question["freeze_datetime"],
                freeze_datetime_value=question["freeze_datetime_value"],
                freeze_datetime_value_explanation=question["freeze_datetime_value_explanation"],
                list_of_resolution_dates=all_resolution_dates,
            )
        else:
            parts = question["combination_of"]
            prompt = SCRATCH_PAD_JOINT_QUESTION_PROMPT.format(
                human_prompt=human_joint_prompts[question["combo_index"]],
                question_1=parts[0]["question"],
                question_2=parts[1]["question"],
                background_1=background_with_model_info(parts[0]),
                background_2=background_with_model_info(parts[1]),
                resolution_criteria_1=parts[0]["resolution_criteria"],
                resolution_criteria_2=parts[1]["resolution_criteria"],
                freeze_datetime_1=parts[0]["freeze_datetime"],
                freeze_datetime_2=parts[1]["freeze_datetime"],
                freeze_datetime_value_1=parts[0]["freeze_datetime_value"],
                freeze_datetime_value_2=parts[1]["freeze_datetime_value"],
                freeze_datetime_value_explanation_1=parts[0]["freeze_datetime_value_explanation"],
                freeze_datetime_value_explanation_2=parts[1]["freeze_datetime_value_explanation"],
                list_of_resolution_dates=all_resolution_dates,
            )

        response = query_model(prompt, 2000)
        answer = model_eval.reformat_answers(
            response=response, prompt=prompt, question=question
        )

    save_dict[index] = (answer, response)

    logger.info(f"Answer {save_dict[index][0]}")

    return None


def executor(max_workers, model_name, save_dict, questions_to_eval):
    """Run `worker` for every question index on a thread pool.

    Args:
        max_workers: size of the thread pool.
        model_name, save_dict, questions_to_eval: forwarded to `worker`.

    Returns:
        The list of `worker` return values, in question order.
    """
    run_one = partial(
        worker, model_name=model_name, save_dict=save_dict, questions_to_eval=questions_to_eval
    )
    question_indices = range(len(questions_to_eval))
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(run_one, question_indices))

In [12]:
results = {}
model_result_loaded = {}
models_to_test = list(models.keys())

# Direction vector stored with each combo forecast, keyed by `combo_index`.
# Any other index falls back to [-1, -1].
COMBO_DIRECTIONS = {0: [1, 1], 1: [1, -1], 2: [-1, 1]}

# Each question group is paired with its output folder explicitly rather than
# inferring the folder from the first question, so an empty group no longer
# raises IndexError.
question_groups = [
    ("scratchpad/acled", single_acled_questions),
    ("scratchpad/non_acled", single_non_acled_questions),
    ("scratchpad/combo", combo_questions_unrolled),
]

for test_type, questions_to_eval in question_groups:
    # Reuse forecasts already saved on disk; otherwise start from an empty
    # per-question result table that the workers fill in.
    for model in models_to_test:
        file_path = f"{test_type}/{model}.jsonl"
        try:
            results[model] = read_jsonl(file_path)
            model_result_loaded[model] = True
        except (OSError, ValueError):
            # Missing or unreadable file (JSON and Unicode decode errors are
            # ValueErrors). Unlike a bare `except:`, this lets
            # KeyboardInterrupt and programming errors propagate.
            results[model] = {i: "" for i in range(len(questions_to_eval))}
            model_result_loaded[model] = False

    for model in models_to_test:
        if model_result_loaded[model]:
            continue

        file_path = f"{test_type}/{model}.jsonl"
        executor_count = 50
        if models[model]["source"] == "ANTHROPIC":
            executor_count = 30

        executor(executor_count, model, results[model], questions_to_eval)

        current_model_forecasts = []
        for index, question in enumerate(questions_to_eval):
            # Each worker result is a (forecast(s), raw response) tuple.
            if question["source"] == "acled":
                # One output row per (forecast, horizon) pair.
                for forecast, horizon in zip(results[model][index][0], question["forecast_horizons"]):
                    current_forecast = {
                        "id": question["id"],
                        "source": question["source"],
                        "forecast": forecast,
                        "horizon": horizon,
                        "reasoning": results[model][index][1],
                    }
                    if question["combination_of"] != "N/A":
                        current_forecast["direction"] = list(
                            COMBO_DIRECTIONS.get(question["combo_index"], [-1, -1])
                        )
                    current_model_forecasts.append(current_forecast)
            else:
                current_model_forecasts.append(
                    {
                        "id": question["id"],
                        "source": question["source"],
                        "forecast": results[model][index][0],
                        "reasoning": results[model][index][1],
                    }
                )

        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, "w") as file:
            for entry in current_model_forecasts:
                file.write(json.dumps(entry) + "\n")

In [13]:
# Merge the scratchpad per-group outputs and write one submission file per model.
generage_final_forecast_files(deadline='2024-05-03', 
                              forecast_date='2024-05-03', 
                              prompt_type="scratchpad")

In [14]:
# fix directions
# prompt_type= "scratchpad"
# directory= prompt_type + '/final_submit'
# for model in models_to_test:
#     if "gpt" in model:
#         org = "OpenAI"
#     elif "llama" in model:
#         org = "Meta"
#     elif "mistral" in model:
#         org = "Mistral AI"
#     elif "claude" in model:
#         org = "Anthropic"
#     elif "qwen" in model:
#         org = "Qwen"

#     file_path = f"{directory}/2024-05-03.{org}.{model}_{prompt_type}.json"

#     with open(file_path, "r") as file:
#         questions = json.load(file)
            
#     for q in questions['forecasts']:
#         if 'direction' in q:
#             q['direction'] = [-1 if item == 0 else item for item in q['direction']]
          
    
#     new_file_path = f"{prompt_type}/direction_fix/2024-05-03.{org}.{model}_{prompt_type}.json"
#     os.makedirs(os.path.dirname(new_file_path), exist_ok=True)

#     with open(new_file_path, 'w') as json_file:
#         json.dump(questions, json_file, indent=4)

### Scratchpad + Retrieval

In [15]:
# Rebinds the module-level `models` registry for the scratchpad + retrieval
# run. Compared with the registry defined at the top of the notebook, this
# one leaves out `gpt_4` and the three llama entries.
models = {
    "gpt_3p5_turbo_0125": {"source": "OAI", "full_name": "gpt-3.5-turbo-0125"},
    "gpt_4_turbo_0409": {"source": "OAI", "full_name": "gpt-4-turbo-2024-04-09"},
    "gpt_4_1106_preview": {"source": "OAI", "full_name": "gpt-4-1106-preview"},
    "gpt_4_0125_preview": {"source": "OAI", "full_name": "gpt-4-0125-preview"},
    "gpt_4o": {"source": "OAI", "full_name": "gpt-4o"},
    "mistral_8x7b_instruct": {
        "source": "TOGETHER",
        "full_name": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    },
    "mistral_8x22b_instruct": {
        "source": "TOGETHER",
        "full_name": "mistralai/Mixtral-8x22B-Instruct-v0.1",
    },
    "mistral_large": {
        "source": "MISTRAL",
        "full_name": "mistral-large-latest",
    },
    "qwen_1p5_110b": {
        "source": "TOGETHER",
        "full_name": "Qwen/Qwen1.5-110B-Chat",
    },
    "claude_2p1": {"source": "ANTHROPIC", "full_name": "claude-2.1"},
    "claude_3_opus": {"source": "ANTHROPIC", "full_name": "claude-3-opus-20240229"},
    "claude_3_sonnet": {"source": "ANTHROPIC", "full_name": "claude-3-sonnet-20240229"},
    "claude_3_haiku": {"source": "ANTHROPIC", "full_name": "claude-3-haiku-20240307"},
}

In [16]:
# Mapping retrieved summaries back into questions
def _load_news(question_id):
    """Load the pickled retrieval results stored for one question id.

    The file is `news/<id>.pickle`, with every "/" in the id replaced by "_".
    NOTE(review): `pickle.load` can execute arbitrary code from the file; this
    is only safe if the `news/` folder comes from a trusted pipeline — confirm.
    """
    reformatted_id = question_id.replace("/", "_")
    with open(f"news/{reformatted_id}.pickle", "rb") as file:
        return pickle.load(file)


# Single questions carry their news directly on the question dict.
for q in single_non_acled_questions + single_acled_questions:
    q["news"] = _load_news(q["id"])

# Combination questions carry news on each of their sub-questions.
for q in combo_questions_unrolled:
    for sub_q in q["combination_of"]:
        sub_q["news"] = _load_news(sub_q["id"])

In [17]:
def get_all_retrieved_info(all_retrieved_info):
    """Render retrieved article summaries as one text block for a prompt.

    Each item must provide `title` and `summary` keys. Every item produces an
    "Article title: ..." line and a "Summary: ..." line followed by a blank
    line. An empty input yields an empty string.
    """
    return "".join(
        f"Article title: {article['title']}" + "\n" + f"Summary: {article['summary']}" + "\n\n"
        for article in all_retrieved_info
    )


def worker(index, model_name, save_dict, questions_to_eval):
    """Forecast one question with the scratchpad + retrieved-summaries prompts.

    Args:
        index: position of the question in `questions_to_eval`; also the key
            written in `save_dict`.
        model_name: key into the module-level `models` registry.
        save_dict: mapping index -> result. An entry different from "" marks
            the question as already answered and is left untouched.
        questions_to_eval: sequence of question dicts. Single questions need a
            `news` entry; combination questions need one on each sub-question.

    Returns:
        None. `save_dict[index]` receives a tuple
        `(output of model_eval.reformat_answers, raw model response)`.
    """
    if save_dict[index] != "":
        return

    logger.info(f"Starting {model_name} - {index}")

    question = questions_to_eval[index]

    def background_with_model_info(entry):
        # Background text followed by the model-facing resolution criteria.
        return entry["background"] + "\n" + entry["model_info_resolution_criteria"]

    def query_model(prompt):
        return model_eval.get_response_from_model(
            prompt=prompt,
            max_tokens=2000,
            model_name=models[model_name]["full_name"],
            temperature=0,
            wait_time=30,
        )

    if question["source"] != "acled":
        prompt = SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT.format(
            question=question["question"],
            background=background_with_model_info(question),
            resolution_criteria=question["resolution_criteria"],
            close_date=question["model_info_close_datetime"],
            retrieved_info=get_all_retrieved_info(question["news"]),
        )
        response = query_model(prompt)
        answer = model_eval.reformat_answers(response=response, single=True)
    else:
        # One resolution date per horizon: freeze date + 7 days + horizon days.
        all_resolution_dates = [
            (
                datetime.fromisoformat(question["freeze_datetime"])
                + timedelta(days=7 + horizon)
            ).isoformat()
            for horizon in question["forecast_horizons"]
        ]

        if question["combination_of"] == "N/A":
            prompt = SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT.format(
                question=question["question"],
                background=background_with_model_info(question),
                resolution_criteria=question["resolution_criteria"],
                freeze_datetime=question["freeze_datetime"],
                freeze_datetime_value=question["freeze_datetime_value"],
                freeze_datetime_value_explanation=question["freeze_datetime_value_explanation"],
                retrieved_info=get_all_retrieved_info(question["news"]),
                list_of_resolution_dates=all_resolution_dates,
            )
        else:
            parts = question["combination_of"]
            prompt = SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT.format(
                human_prompt=human_joint_prompts[question["combo_index"]],
                question_1=parts[0]["question"],
                question_2=parts[1]["question"],
                background_1=background_with_model_info(parts[0]),
                background_2=background_with_model_info(parts[1]),
                resolution_criteria_1=parts[0]["resolution_criteria"],
                resolution_criteria_2=parts[1]["resolution_criteria"],
                freeze_datetime_1=parts[0]["freeze_datetime"],
                freeze_datetime_2=parts[1]["freeze_datetime"],
                freeze_datetime_value_1=parts[0]["freeze_datetime_value"],
                freeze_datetime_value_2=parts[1]["freeze_datetime_value"],
                freeze_datetime_value_explanation_1=parts[0]["freeze_datetime_value_explanation"],
                freeze_datetime_value_explanation_2=parts[1]["freeze_datetime_value_explanation"],
                retrieved_info_1=get_all_retrieved_info(parts[0]["news"]),
                retrieved_info_2=get_all_retrieved_info(parts[1]["news"]),
                list_of_resolution_dates=all_resolution_dates,
            )

        response = query_model(prompt)
        answer = model_eval.reformat_answers(
            response=response, prompt=prompt, question=question
        )

    save_dict[index] = (answer, response)

    logger.info(f"Answer: {save_dict[index][0]}")

    return None


def executor(max_workers, model_name, save_dict, questions_to_eval):
    """Run `worker` once per question index on a thread pool.

    Args:
        max_workers: Maximum number of threads in the pool.
        model_name: Passed to every `worker` call as `model_name`.
        save_dict: Passed to every `worker` call as `save_dict`.
        questions_to_eval: Questions to process; one `worker` call is made
            for each index in `range(len(questions_to_eval))`.

    Returns:
        A list with the return value of each `worker` call, in index order.
    """
    question_indices = range(len(questions_to_eval))
    run_one_question = partial(
        worker,
        model_name=model_name,
        save_dict=save_dict,
        questions_to_eval=questions_to_eval,
    )
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # list() drains the lazy map so every task finishes (and any worker
        # exception surfaces) before the pool is shut down.
        return list(pool.map(run_one_question, question_indices))

In [18]:
# Forecast every question set with every model in `models_to_test`. Results
# already saved under "scratchpad_with_info_retrieval/<set>/<model>.jsonl" are
# reused; otherwise the model is queried and its forecasts are written there.
results = {}
model_result_loaded = {}
models_to_test = list(models.keys())

# "direction" stored for each combo_index; any other index maps to [-1, -1].
combo_directions = {0: [1, 1], 1: [1, -1], 2: [-1, 1]}

for question in [single_acled_questions, combo_questions_unrolled, single_non_acled_questions]:
    questions_to_eval = question

    # The first question of a set decides the output sub-directory for the whole set.
    first_question = question[0]
    if first_question["source"] != "acled":
        test_type = "scratchpad_with_info_retrieval/non_acled"
    elif first_question["combination_of"] == "N/A":
        test_type = "scratchpad_with_info_retrieval/acled"
    else:
        test_type = "scratchpad_with_info_retrieval/combo"

    # Pass 1: try to load saved results for each model. On any failure, fall
    # back to an empty placeholder per question and mark the model as not loaded.
    for model in models_to_test:
        file_path = f"{test_type}/{model}.jsonl"
        try:
            results[model] = read_jsonl(file_path)
        except Exception as load_error:
            # `Exception` (not a bare `except`) so Ctrl-C / SystemExit still
            # interrupt the cell instead of being treated as "no saved file".
            logger.info(f"Could not load {file_path} ({load_error!r}); querying {model}.")
            results[model] = {i: "" for i in range(len(questions_to_eval))}
            model_result_loaded[model] = False
        else:
            model_result_loaded[model] = True

    # Pass 2: query every model whose results were not loaded, then save them.
    for model in models_to_test:
        file_path = f"{test_type}/{model}.jsonl"
        if model_result_loaded[model]:
            continue

        # Fewer concurrent requests for models whose source is "ANTHROPIC".
        executor_count = 30 if models[model]["source"] == "ANTHROPIC" else 50

        # `executor` fills results[model][index] with a pair that is read
        # below as (forecast(s), reasoning).
        executor(executor_count, model, results[model], questions_to_eval)

        current_model_forecasts = []
        for index, current_question in enumerate(questions_to_eval):
            if current_question["source"] == "acled":
                # One output record per (forecast, horizon) pair.
                for forecast, horizon in zip(
                    results[model][index][0], current_question["forecast_horizons"]
                ):
                    current_forecast = {
                        "id": current_question["id"],
                        "source": current_question["source"],
                        "forecast": forecast,
                        "horizon": horizon,
                        "reasoning": results[model][index][1],
                    }

                    if current_question["combination_of"] != "N/A":
                        combo_index = current_question["combo_index"]
                        # Copy so that records never share one list object.
                        current_forecast["direction"] = list(
                            combo_directions.get(combo_index, [-1, -1])
                        )

                    current_model_forecasts.append(current_forecast)

            else:
                current_forecast = {
                    "id": current_question["id"],
                    "source": current_question["source"],
                    "forecast": results[model][index][0],
                    "reasoning": results[model][index][1],
                }
                current_model_forecasts.append(current_forecast)

        # Write one JSON object per line (JSONL), creating the directory if needed.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, "w") as file:
            for entry in current_model_forecasts:
                file.write(json.dumps(entry) + "\n")

In [19]:
# Build the final forecast files for the "scratchpad_with_info_retrieval" run.
# NOTE(review): `generage_final_forecast_files` is presumably defined in an
# earlier cell; its spelling is kept as-is so the call still resolves.
generage_final_forecast_files(
    deadline="2024-05-03",
    forecast_date="2024-05-03",
    prompt_type="scratchpad_with_info_retrieval",
)

In [20]:
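# fix directions (one-off, left disabled): rewrite any 0 in a forecast's "direction" as -1
# and save the corrected files under <prompt_type>/direction_fix/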
# prompt_type= "scratchpad_with_info_retrieval"
# directory= prompt_type + '/final_submit'
# for model in models_to_test:
#     if "gpt" in model:
#         org = "OpenAI"
#     elif "llama" in model:
#         org = "Meta"
#     elif "mistral" in model:
#         org = "Mistral AI"
#     elif "claude" in model:
#         org = "Anthropic"
#     elif "qwen" in model:
#         org = "Qwen"

#     file_path = f"{directory}/2024-05-03.{org}.{model}_{prompt_type}.json"

#     with open(file_path, "r") as file:
#         questions = json.load(file)
            
#     for q in questions['forecasts']:
#         if 'direction' in q:
#             q['direction'] = [-1 if item == 0 else item for item in q['direction']]

#     new_file_path = f"{prompt_type}/direction_fix/2024-05-03.{org}.{model}_{prompt_type}.json"
#     os.makedirs(os.path.dirname(new_file_path), exist_ok=True)

#     with open(new_file_path, 'w') as json_file:
#         json.dump(questions, json_file, indent=4)