In [1]:
import asyncio
import logging
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
import time
import os
from datetime import datetime, timedelta
import json
import pickle

import random
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

# Put the directory two levels above the working directory on the import path.
# This has to run before the `helpers` imports below (presumably that is where
# the `helpers` package lives -- confirm against the repository layout).
current_path = os.getcwd()

sys.path.append(os.path.join(current_path, "../.."))
from helpers.llm_prompts import (
    REFORMAT_SINGLE_PROMPT,
    REFORMAT_PROMPT,
    HUMAN_JOINT_PROMPT_1,
    HUMAN_JOINT_PROMPT_2,
    HUMAN_JOINT_PROMPT_3,
    HUMAN_JOINT_PROMPT_4,
)

from helpers import model_eval


# Root logger (no name given) with its threshold set to INFO; `worker` logs
# progress and parsed answers through it.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Joint-question prompt fragments. `worker` indexes this list with a combo
# question's "combo_index" (0-3), so the order of the entries matters.
human_joint_prompts = [
    HUMAN_JOINT_PROMPT_1,
    HUMAN_JOINT_PROMPT_2,
    HUMAN_JOINT_PROMPT_3,
    HUMAN_JOINT_PROMPT_4,
]

### Load data

In [3]:
import json


def read_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of a UTF-8 encoded file holding one JSON document per
            line.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        Lines that are empty or whitespace-only are ignored.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return [json.loads(raw_line) for raw_line in handle if raw_line.strip()]


# Load the question set and split it, in one pass, into the three groups that
# are evaluated separately further down.
file_path = "2024-05-03-llm.jsonl"
questions = read_jsonl(file_path)

single_non_acled_questions = []
single_acled_questions = []
combo_questions = []

for q in questions:
    is_single = q["combination_of"] == "N/A"
    is_acled = q["source"] == "acled"
    if is_single and not is_acled:
        single_non_acled_questions.append(q)
    elif is_single:
        single_acled_questions.append(q)
    elif is_acled:
        # Combination questions whose source is not "acled" are dropped.
        combo_questions.append(q)

In [4]:
# Expand every combination question into four entries tagged with
# combo_index 0..3. The copies are shallow: nested values such as the
# "combination_of" list are shared between the four entries and the original.
combo_questions_unrolled = [
    {**q, "combo_index": i} for q in combo_questions for i in range(4)
]

In [5]:
# Size of each evaluation set (combination questions counted after unrolling).
len(single_non_acled_questions), len(single_acled_questions), len(combo_questions_unrolled)

(339, 162, 644)

### Wisdom of Crowd: Scratchpad + Retrieval

In [6]:
from helpers.llm_crowd_prompts import (
    SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT_1,
    SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT_1,
    SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT_1,
    SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT_2,
    SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT_2,
    SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT_2,
    SUPERFORECASTER_MARKET_PROMPT_3,
    SUPERFORECASTER_NON_MARKET_PROMPT_3,
    SUPERFORECASTER_JOINT_QUESTION_PROMPT_3,
    SUPERFORECASTER_MARKET_PROMPT_4,
    SUPERFORECASTER_NON_MARKET_PROMPT_4,
    SUPERFORECASTER_JOINT_QUESTION_PROMPT_4,
    SUPERFORECASTER_MARKET_PROMPT_5,
    SUPERFORECASTER_NON_MARKET_PROMPT_5,
    SUPERFORECASTER_JOINT_QUESTION_PROMPT_5,
)

# Prompt variants keyed by prompt index. Each value is a 3-tuple ordered as
# (market prompt, non-market prompt, joint-question prompt); the evaluation
# loop unpacks the tuple positionally, so this order must be preserved.
all_llm_crowd_prompts = {
    1: (
        SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT_1,
        SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT_1,
        SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT_1,
    ),
    2: (
        SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT_2,
        SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT_2,
        SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT_2,
    ),
    3: (
        SUPERFORECASTER_MARKET_PROMPT_3,
        SUPERFORECASTER_NON_MARKET_PROMPT_3,
        SUPERFORECASTER_JOINT_QUESTION_PROMPT_3,
    ),
    4: (
        SUPERFORECASTER_MARKET_PROMPT_4,
        SUPERFORECASTER_NON_MARKET_PROMPT_4,
        SUPERFORECASTER_JOINT_QUESTION_PROMPT_4,
    ),
    5: (
        SUPERFORECASTER_MARKET_PROMPT_5,
        SUPERFORECASTER_NON_MARKET_PROMPT_5,
        SUPERFORECASTER_JOINT_QUESTION_PROMPT_5,
    ),
}

In [7]:
# Models queried for the crowd, keyed by the short name used in output paths.
#   "source":    provider tag; the evaluation loop checks it to pick the
#                thread-pool size.
#   "full_name": identifier passed to model_eval.get_response_from_model.
models = {
    "gpt_4o": {"source": "OAI", "full_name": "gpt-4o"},
    "mistral_large": {
        "source": "MISTRAL",
        "full_name": "mistral-large-latest",
    },
    "qwen_1p5_110b": {
        "source": "TOGETHER",
        "full_name": "Qwen/Qwen1.5-110B-Chat",
    },
}

In [8]:
# Mapping retrieved summaries back into questions

# Cache of already-loaded pickles, keyed by file name. It is re-created every
# time this cell runs. The unrolled combo questions are shallow copies that
# share their sub-question dicts, so without the cache every sub-question
# pickle would be read from disk at least four times.
_retrieved_info_cache = {}


def _load_retrieved_info(question_id):
    """Return the retrieved-article summaries stored for ``question_id``.

    The summaries are read from ``info_retrieval/<id>.pickle`` where every
    "/" in the id is replaced by "_". Repeated requests for the same id
    return the same cached object.

    Raises:
        OSError: If the pickle file cannot be opened.
    """
    reformatted_id = question_id.replace("/", "_")
    filename = f"info_retrieval/{reformatted_id}.pickle"
    if filename not in _retrieved_info_cache:
        # NOTE(security): pickle.load can execute arbitrary code. Only run this
        # against pickle files produced by a trusted retrieval step.
        with open(filename, "rb") as file:
            _retrieved_info_cache[filename] = pickle.load(file)
    return _retrieved_info_cache[filename]


for question_source in [single_non_acled_questions, single_acled_questions]:
    for q in question_source:
        q["info_retrieval"] = _load_retrieved_info(q["id"])

# Combination questions carry their retrieval results on each sub-question.
for q in combo_questions_unrolled:
    for sub_q in q["combination_of"]:
        sub_q["info_retrieval"] = _load_retrieved_info(sub_q["id"])

In [9]:
def get_all_retrieved_info(all_retrieved_info):
    """Render article summaries as one text block for insertion into a prompt.

    Args:
        all_retrieved_info: Iterable of mappings, each providing a "title" and
            a "summary" entry.

    Returns:
        A string with one "Article title: ...", "Summary: ..." pair per
        article, each pair followed by a blank line. Empty input yields "".
    """
    return "".join(
        f"Article title: {article['title']}\nSummary: {article['summary']}\n\n"
        for article in all_retrieved_info
    )


def worker(
    index,
    model_name,
    save_dict,
    questions_to_eval,
    article_random_seed,
    SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT,
    SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT,
    SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT,
):
    """Forecast one question with one model and store the result in ``save_dict``.

    The question at ``questions_to_eval[index]`` is turned into a prompt using
    one of the three templates, depending on its kind:

    * source is not "acled"                      -> market template
    * source is "acled", "combination_of" "N/A"  -> non-market template
    * source is "acled", otherwise               -> joint-question template

    Up to 10 retrieved article summaries are sampled per (sub-)question and
    embedded in the prompt. The prompt is sent to the model and
    ``save_dict[index]`` is set to ``(reformatted_answer, raw_response)``.

    Args:
        index: Position of the question in ``questions_to_eval``; also the key
            written in ``save_dict``.
        model_name: Key into the module-level ``models`` dict.
        save_dict: Mapping from index to result. An entry other than "" means
            the question was already answered and is skipped.
        questions_to_eval: Sequence of question dicts that already carry
            "info_retrieval" (on the sub-questions for combination questions).
        article_random_seed: Seed that determines which articles are sampled.
        SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT: Template for market questions.
        SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT: Template for single
            non-market questions.
        SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT: Template for
            combination questions.

    Returns:
        None. The result is delivered through ``save_dict``.
    """
    if save_dict[index] != "":
        return None

    logger.info(f"Starting {model_name} - {index}")

    # Use a private generator rather than random.seed()/random.sample(): this
    # function runs concurrently in many threads, and re-seeding the shared
    # module-level generator lets threads interleave between the seed and the
    # sample calls, which makes the article selection irreproducible.
    # random.Random(seed) yields the same samples the seeded module-level
    # generator would yield in a single thread.
    rng = random.Random(article_random_seed)
    question = questions_to_eval[index]

    def sample_articles(articles):
        # At most 10 summaries go into a prompt.
        return rng.sample(articles, min(10, len(articles)))

    def background_with_source_criteria(q):
        return q["background"] + "\n" + q["source_resolution_criteria"]

    def query_model(prompt):
        return model_eval.get_response_from_model(
            prompt=prompt,
            max_tokens=2000,
            model_name=models[model_name]["full_name"],
            temperature=0,
            wait_time=30,
        )

    if question["source"] != "acled":
        selected_articles = sample_articles(question["info_retrieval"])
        prompt = SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT.format(
            question=question["question"],
            background=background_with_source_criteria(question),
            resolution_criteria=question["resolution_criteria"],
            close_date=question["source_close_datetime"],
            retrieved_info=get_all_retrieved_info(selected_articles),
        )

        response = query_model(prompt)

        save_dict[index] = (model_eval.reformat_answers(response=response, single=True), response)

    else:
        # One resolution date per horizon: freeze date + 7 days + horizon days.
        all_resolution_dates = [
            (
                datetime.fromisoformat(question["freeze_datetime"])
                + timedelta(days=7 + horizon)
            ).isoformat()
            for horizon in question["forecast_horizons"]
        ]

        if question["combination_of"] == "N/A":
            selected_articles = sample_articles(question["info_retrieval"])
            prompt = SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT.format(
                question=question["question"],
                background=background_with_source_criteria(question),
                resolution_criteria=question["resolution_criteria"],
                freeze_datetime=question["freeze_datetime"],
                value_at_freeze_datetime=question["value_at_freeze_datetime"],
                value_at_freeze_datetime_explanation=question[
                    "value_at_freeze_datetime_explanation"
                ],
                retrieved_info=get_all_retrieved_info(selected_articles),
                list_of_resolution_dates=all_resolution_dates,
            )
        else:
            # Sample for sub-question 1 first, then sub-question 2, so both
            # draws come from the same seeded stream in a fixed order.
            selected_articles_1 = sample_articles(question["combination_of"][0]["info_retrieval"])
            selected_articles_2 = sample_articles(question["combination_of"][1]["info_retrieval"])
            sub_q_1 = question["combination_of"][0]
            sub_q_2 = question["combination_of"][1]

            prompt = SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT.format(
                human_prompt=human_joint_prompts[question["combo_index"]],
                question_1=sub_q_1["question"],
                question_2=sub_q_2["question"],
                background_1=background_with_source_criteria(sub_q_1),
                background_2=background_with_source_criteria(sub_q_2),
                resolution_criteria_1=sub_q_1["resolution_criteria"],
                resolution_criteria_2=sub_q_2["resolution_criteria"],
                freeze_datetime_1=sub_q_1["freeze_datetime"],
                freeze_datetime_2=sub_q_2["freeze_datetime"],
                value_at_freeze_datetime_1=sub_q_1["value_at_freeze_datetime"],
                value_at_freeze_datetime_2=sub_q_2["value_at_freeze_datetime"],
                value_at_freeze_datetime_explanation_1=sub_q_1[
                    "value_at_freeze_datetime_explanation"
                ],
                value_at_freeze_datetime_explanation_2=sub_q_2[
                    "value_at_freeze_datetime_explanation"
                ],
                retrieved_info_1=get_all_retrieved_info(selected_articles_1),
                retrieved_info_2=get_all_retrieved_info(selected_articles_2),
                list_of_resolution_dates=all_resolution_dates,
            )

        response = query_model(prompt)

        save_dict[index] = (
            model_eval.reformat_answers(response=response, prompt=prompt, question=question),
            response,
        )

    logger.info(f"Answer: {save_dict[index][0]}")

    return None


def executor(
    max_workers,
    model_name,
    save_dict,
    questions_to_eval,
    article_random_seed,
    SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT,
    SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT,
    SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT,
):
    """Run ``worker`` for every question index on a thread pool.

    Args:
        max_workers: Size of the thread pool.
        model_name: Forwarded to ``worker``.
        save_dict: Forwarded to ``worker``, which writes its results into it.
        questions_to_eval: Questions to process; one task is submitted per
            index of this sequence.
        article_random_seed: Forwarded to ``worker``.
        SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT: Forwarded to ``worker``.
        SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT: Forwarded to ``worker``.
        SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT: Forwarded to
            ``worker``.

    Returns:
        The list of ``worker`` return values, in index order. An exception
        raised inside a worker is re-raised while this list is being built.
    """
    # Everything except the question index is fixed for the whole run.
    bound_worker = partial(
        worker,
        model_name=model_name,
        save_dict=save_dict,
        questions_to_eval=questions_to_eval,
        article_random_seed=article_random_seed,
        SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT=SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT,
        SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT=SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT,
        SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT=SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT,
    )
    question_indices = range(len(questions_to_eval))
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(bound_worker, question_indices))

In [10]:
results = {}
model_result_loaded = {}
models_to_test = list(models.keys())[:]
prompt_type = "llm_crowd"

# "direction" pair written for each combo_index; any other index falls back
# to [-1, -1].
# NOTE(review): presumably +1 / -1 says whether each sub-question is taken as
# stated or negated, matching HUMAN_JOINT_PROMPT_1..4 -- confirm.
combo_directions = {0: [1, 1], 1: [1, -1], 2: [-1, 1]}

# Evaluate every (question set, article sample, prompt variant, model)
# combination. Each combination is cached in its own .jsonl file, so
# re-running the cell only queries the models for combinations whose file is
# missing or unreadable.
for question in [single_acled_questions, combo_questions_unrolled, single_non_acled_questions]:
    for article_random_seed in range(1, 4):
        for llm_crowd_prompt_index in range(3, 6):
            questions_to_eval = question
            (
                SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT,
                SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT,
                SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT,
            ) = all_llm_crowd_prompts[llm_crowd_prompt_index]

            # The first question decides the set's kind; each set is homogeneous.
            if question[0]["source"] != "acled":
                q_type = "non_acled"
            elif question[0]["source"] == "acled" and question[0]["combination_of"] == "N/A":
                q_type = "acled"
            else:
                q_type = "combo"

            test_type = f"{prompt_type}/{q_type}/article_set_{article_random_seed}/prompt_{llm_crowd_prompt_index}"

            for model in models_to_test:
                file_path = f"{test_type}/{model}.jsonl"
                try:
                    results[model] = read_jsonl(file_path)
                    model_result_loaded[model] = True  # Set flag to True if loaded successfully
                except (OSError, ValueError):
                    # Missing or unreadable/corrupt cache file: start from an
                    # empty result slot per question so the model is queried
                    # below. Only I/O and decoding errors are caught, so
                    # interrupts and programming errors are no longer hidden.
                    results[model] = {i: "" for i in range(len(questions_to_eval))}
                    model_result_loaded[model] = False

            for model in models_to_test:
                file_path = f"{test_type}/{model}.jsonl"
                if not model_result_loaded[model]:
                    executor_count = 50
                    if models[model]["source"] == "ANTHROPIC":
                        executor_count = 30

                    executor(
                        executor_count,
                        model,
                        results[model],
                        questions_to_eval,
                        article_random_seed,
                        SCRATCH_PAD_WITH_SUMMARIES_MARKET_PROMPT,
                        SCRATCH_PAD_WITH_SUMMARIES_NON_MARKET_PROMPT,
                        SCRATCH_PAD_WITH_SUMMARIES_JOINT_QUESTION_PROMPT,
                    )

                    # Flatten (answer, reasoning) pairs into one record per
                    # forecast; "acled" questions get one record per horizon.
                    current_model_forecasts = []
                    for index in range(len(questions_to_eval)):
                        if questions_to_eval[index]["source"] == "acled":
                            for forecast, horizon in zip(
                                results[model][index][0],
                                questions_to_eval[index]["forecast_horizons"],
                            ):
                                current_forecast = {
                                    "id": questions_to_eval[index]["id"],
                                    "source": questions_to_eval[index]["source"],
                                    "forecast": forecast,
                                    "horizon": horizon,
                                    "reasoning": results[model][index][1],
                                }

                                if questions_to_eval[index]["combination_of"] != "N/A":
                                    combo_index = questions_to_eval[index]["combo_index"]
                                    # Copy so records never share one list object.
                                    current_forecast["direction"] = list(
                                        combo_directions.get(combo_index, [-1, -1])
                                    )

                                current_model_forecasts.append(current_forecast)

                        else:
                            current_forecast = {
                                "id": questions_to_eval[index]["id"],
                                "source": questions_to_eval[index]["source"],
                                "forecast": results[model][index][0],
                                "reasoning": results[model][index][1],
                            }
                            current_model_forecasts.append(current_forecast)

                    os.makedirs(os.path.dirname(file_path), exist_ok=True)
                    with open(file_path, "w") as file:
                        for entry in current_model_forecasts:
                            json_line = json.dumps(entry)
                            file.write(json_line + "\n")

In [11]:
prompt_type = "llm_crowd"

# Step 1: for every (model, article set, prompt) merge the three per-question-
# type result files into one file under "<prompt_type>/final/".
for model in models_to_test:
    for article_random_seed in range(1, 4):
        for llm_crowd_prompt_index in range(3, 6):
            # No leading slash: the joined paths then match the ones the
            # evaluation cell wrote, without a doubled "//".
            detailed_index = f"article_set_{article_random_seed}/prompt_{llm_crowd_prompt_index}"
            current_model_forecasts = []
            for test_type in [
                f"{prompt_type}/non_acled/{detailed_index}",
                f"{prompt_type}/acled/{detailed_index}",
                f"{prompt_type}/combo/{detailed_index}",
            ]:
                file_path = f"{test_type}/{model}.jsonl"
                current_model_forecasts.extend(read_jsonl(file_path))

            final_file_name = f"{prompt_type}/final/{detailed_index}/{model}"
            os.makedirs(os.path.dirname(final_file_name), exist_ok=True)
            with open(final_file_name, "w") as file:
                for entry in current_model_forecasts:
                    json_line = json.dumps(entry)
                    file.write(json_line + "\n")

# Step 2: wrap every merged file in the submission envelope and write it to
# "<prompt_type>/final_submit/".
for model in models_to_test:
    # The organization depends on the model name only.
    if "gpt" in model:
        org = "OPENAI"
    elif "llama" in model:
        org = "META"
    elif "mistral" in model:
        org = "MISTRAL"
    elif "claude" in model:
        # NOTE(review): "ANTRHOPIC" looks like a typo for "ANTHROPIC". It is
        # kept as is because the crowd-aggregation cell rebuilds the same file
        # name from the same string; both places must change together.
        org = "ANTRHOPIC"
    elif "qwen" in model:
        org = "QWEN"
    else:
        # Without this branch `org` would be unbound or, worse, silently
        # carry over the previous model's organization.
        raise ValueError(f"Cannot determine the organization for model {model!r}")

    for article_random_seed in range(1, 4):
        for llm_crowd_prompt_index in range(3, 6):
            detailed_index = f"article_set_{article_random_seed}/prompt_{llm_crowd_prompt_index}"
            file_path = f"{prompt_type}/final/{detailed_index}/{model}"
            questions = read_jsonl(file_path)

            directory = f"{prompt_type}/final_submit"
            os.makedirs(directory, exist_ok=True)

            new_file_name = f"{directory}/2024-05-03.{org}.{model}_{prompt_type}.article_set_{article_random_seed}.prompt_{llm_crowd_prompt_index}.json"

            forecast_file = {
                "organization": org,
                "model": f"{model.replace('_', ' ')} {prompt_type.replace('_', ' ')} article_set_{article_random_seed} prompt_{llm_crowd_prompt_index}",
                "question_set": "2024-05-03-llm.jsonl",
                "forecast_date": "2024-05-03",
                "forecasts": questions,
            }

            with open(new_file_name, "w") as f:
                json.dump(forecast_file, f, indent=4)

### Generate the final crowd prediction files
- One with the median
- One with the geometric mean of odds (i.e. the mean of the log odds)
- One with the geometric mean of the probabilities

In [12]:
def geometric_mean_log_odds(probs, epsilon=1e-10):
    """Pool probabilities by averaging them in log-odds space.

    Each probability is converted to log odds, the log odds are averaged
    arithmetically, and the average is mapped back to a probability. This is
    the same as taking the geometric mean of the odds.

    Args:
        probs: Array-like of probabilities.
        epsilon: Probabilities are clipped to ``[epsilon, 1 - epsilon]`` before
            the conversion so that 0 and 1 do not produce infinite log odds.

    Returns:
        The pooled probability as a NumPy float. An empty input gives nan
        (the mean of no values).
    """
    # Ensure probabilities are within (0, 1) to avoid log(0) issues
    clipped = np.clip(probs, epsilon, 1 - epsilon)

    # Convert probabilities to log odds
    log_odds = np.log(clipped / (1 - clipped))

    # Arithmetic mean of the log odds (== log of the geometric mean of the odds)
    mean_log_odds = np.mean(log_odds)

    # Map back with the logistic function written as 1 / (1 + e^-x). The form
    # e^x / (1 + e^x) evaluates to inf / inf = nan when the mean log odds is
    # +inf (possible with epsilon=0); this form returns 1.0 instead.
    return 1.0 / (1.0 + np.exp(-mean_log_odds))

def geometric_mean(numbers):
    """Return the geometric mean of ``numbers``.

    Args:
        numbers: Sequence of numbers (needs a length and a truth value, e.g.
            a list).

    Returns:
        The n-th root of the product of the n values, or 0 when the sequence
        is empty.
    """
    if numbers:
        running_product = 1.0
        for value in numbers:
            running_product = running_product * value
        return running_product ** (1 / len(numbers))
    # Empty input: 0 is returned instead of raising.
    return 0

In [13]:
def _crowd_submission(aggregation_label):
    """Return an empty submission envelope for one crowd aggregation method.

    ``aggregation_label`` is the leading part of the "model" description;
    "forecasts" starts as None and is filled in when the file is written.
    """
    return {
        "organization": "CROWD",
        "model": f"{aggregation_label} of (gpt 4o + mistral large + qwen 1p5 110b) using 3 superforecaster prompts and 3 random sets of articles",
        "question_set": "2024-05-03-llm.jsonl",
        "forecast_date": "2024-05-03",
        "forecasts": None,
    }


# One envelope per aggregation method.
median_crowd = _crowd_submission("Median")

geometric_mean_log_odds_crowd = _crowd_submission("Geometric mean of log odds")

geometric_mean_crowd = _crowd_submission("Geometric mean")

forecasts = []

# Directory the per-model submission files were written to. It is set here so
# this cell does not depend on a variable left behind by an earlier cell.
directory = f"{prompt_type}/final_submit"

for model in models_to_test:
    # The organization is part of the submission file name.
    if "gpt" in model:
        org = "OPENAI"
    elif "llama" in model:
        org = "META"
    elif "mistral" in model:
        org = "MISTRAL"
    elif "claude" in model:
        # NOTE(review): "ANTRHOPIC" looks like a typo for "ANTHROPIC". It is
        # kept as is because it must match the file names produced by the
        # cell that writes the per-model submission files.
        org = "ANTRHOPIC"
    elif "qwen" in model:
        org = "QWEN"
    else:
        # Without this branch `org` would be unbound or carry over the
        # previous model's value and the wrong file would be read.
        raise ValueError(f"Cannot determine the organization for model {model!r}")

    for article_random_seed in range(1, 4):
        for llm_crowd_prompt_index in range(3, 6):
            file_path = f"{directory}/2024-05-03.{org}.{model}_{prompt_type}.article_set_{article_random_seed}.prompt_{llm_crowd_prompt_index}.json"

            with open(file_path, "r") as file:
                questions = json.load(file)
            df = pd.DataFrame(questions["forecasts"])
            # A missing forecast is replaced by 0.5 before aggregation.
            df["forecast"] = df["forecast"].fillna(0.5)
            forecasts.append(df)


# Concatenate all DataFrames into one
combined_df = pd.concat(forecasts)

# groupby drops rows whose key is NaN and cannot hash lists, so missing keys
# get a "NaN" placeholder and list-valued keys are turned into tuples.
combined_df["horizon"] = combined_df["horizon"].fillna("NaN")
combined_df["direction"] = combined_df["direction"].apply(
    lambda x: tuple(x) if isinstance(x, list) else ("NaN", "NaN") if pd.isna(x) else x
)
combined_df["id"] = combined_df["id"].apply(lambda x: tuple(x) if isinstance(x, list) else x)

# Group by id, source, horizon, direction and aggregate forecasts into a list
# ("reasoning" is not a key and is dropped here)
result_df = combined_df.groupby(["id", "source", "horizon", "direction"], as_index=False).agg(
    {"forecast": list}
)

In [14]:
# Add one aggregate column per pooling method; every method receives the list
# of individual forecasts collected for that row.
for aggregate_column, aggregate_fn in (
    ("median_forecast", np.median),
    ("geometric_mean_log_odds_forecast", geometric_mean_log_odds),
    ("geometric_mean_forecast", geometric_mean),
):
    result_df[aggregate_column] = result_df["forecast"].apply(aggregate_fn)

In [18]:
prompt_type = "llm_crowd"
fields = ["id", "source", "horizon", "direction"]

# Aggregate column -> submission envelope it belongs to. The explicit table
# replaces an if/else that also stored the plain geometric-mean forecasts in
# `geometric_mean_log_odds_crowd`, leaving that dict wrong in memory after the
# loop (the files written to disk were not affected).
crowd_by_agg_type = {
    "median_forecast": median_crowd,
    "geometric_mean_log_odds_forecast": geometric_mean_log_odds_crowd,
    "geometric_mean_forecast": geometric_mean_crowd,
}

directory = f"{prompt_type}/final_submit"
os.makedirs(directory, exist_ok=True)

for agg_type, crowd_submission in crowd_by_agg_type.items():
    # convert back into required format
    df_with_wanted_fields = result_df[fields + [agg_type]].copy()
    df_with_wanted_fields = df_with_wanted_fields.rename(columns={agg_type: "forecast"})
    # Undo the grouping placeholders: "NaN" markers become None and tuples
    # become lists again.
    # NOTE(review): pandas may coerce None to NaN during these assignments;
    # check that the emitted JSON has the null/number types the consumer
    # expects.
    df_with_wanted_fields.loc[:, "horizon"] = df_with_wanted_fields["horizon"].map(
        lambda x: None if x == "NaN" else x
    )
    df_with_wanted_fields.loc[:, "direction"] = df_with_wanted_fields["direction"].apply(
        lambda x: list(x) if isinstance(x, tuple) and x != ("NaN", "NaN") else None
    )
    df_with_wanted_fields.loc[:, "id"] = df_with_wanted_fields["id"].apply(
        lambda x: list(x) if isinstance(x, tuple) else x
    )
    list_of_forecasts = df_with_wanted_fields.to_dict(orient="records")
    crowd_submission["forecasts"] = list_of_forecasts

    # save as json
    new_file_name = f"{directory}/2024-05-03.CROWD.{agg_type}.json"
    with open(new_file_name, "w") as f:
        json.dump(crowd_submission, f, indent=4)