# In [16]:
"""Notebook for getting the llm crowd forecasts."""

# You must put the candidate forecast files in the same directory.

import json
import os
from statistics import median

import numpy as np


def geometric_mean(numbers):
    """Return the geometric mean of ``numbers``, skipping None entries.

    Yields 0 when no values remain after the None entries are dropped.
    """
    present = [value for value in numbers if value is not None]
    count = len(present)
    if count == 0:
        # Nothing to average: fall back to 0 rather than dividing by zero
        # in the exponent below.
        return 0
    running = 1.0
    for value in present:
        running = running * value
    # The n-th root of the product is the geometric mean.
    return running ** (1 / count)


def geometric_mean_log_odds(probs):
    """Combine probabilities via the mean of their log odds, ignoring None values.

    Averaging log odds and mapping the result back to a probability is the
    same as taking the geometric mean of the odds.

    Args:
        probs: Iterable of probabilities in [0, 1]; None entries are skipped.

    Returns:
        The combined probability as a float, or 0 when no values remain.
        A forecast of exactly 0 (or exactly 1) drives the result to 0.0
        (or 1.0). When both extremes are present the aggregate is undefined,
        so the symmetric value 0.5 is returned instead of NaN.
    """
    probs = [p for p in probs if p is not None]
    if not probs:
        return 0  # Nothing to aggregate
    values = np.asarray(probs, dtype=float)

    # +inf and -inf log odds cannot be averaged; without this guard the
    # result would be NaN, which json.dump writes as the invalid token "NaN".
    has_certain_no = bool(np.any(values == 0))
    has_certain_yes = bool(np.any(values == 1))
    if has_certain_no and has_certain_yes:
        return 0.5

    # p == 0 and p == 1 legitimately map to -inf / +inf log odds here, so the
    # divide-by-zero warnings are expected and silenced.
    with np.errstate(divide="ignore"):
        log_odds = np.log(values / (1 - values))
    mean_log_odds = float(np.mean(log_odds))

    # Logistic function written so that exp() only ever sees a non-positive
    # argument: this avoids the inf / inf = NaN of exp(x) / (1 + exp(x)) when
    # x is +inf, and returns exactly 1.0 / 0.0 at the infinite limits.
    if mean_log_odds >= 0:
        return 1.0 / (1.0 + float(np.exp(-mean_log_odds)))
    odds = float(np.exp(mean_log_odds))
    return odds / (1.0 + odds)


def process_files(date):
    """Aggregate the candidate forecast files for ``date`` into crowd forecast files.

    Reads every ``{date}*.json`` file in the current working directory, groups
    the forecasts by question id, and writes one crowd file per aggregation
    method, named ``{date}.ForecastBench.llm_crowd_{method}_with_news.json``.

    Args:
        date: Forecast due date string, used both as the input-file prefix and
            in the output metadata (e.g. "2024-07-21").
    """
    methods = ("median", "geometric_mean", "geometric_mean_log_odds")
    output_names = {
        method: f"{date}.ForecastBench.llm_crowd_{method}_with_news.json" for method in methods
    }

    # The crowd files written below also match the "{date}*.json" pattern, so
    # exclude them; otherwise re-running this function would feed earlier crowd
    # forecasts back in as if they were candidate forecasts. Sorting keeps the
    # choice of the "first" forecast per question independent of directory order.
    previous_outputs = set(output_names.values())
    json_files = sorted(
        name
        for name in os.listdir(".")
        if name.startswith(date) and name.endswith(".json") and name not in previous_outputs
    )

    # Group forecasts by question. Combination questions carry a list of ids,
    # which is not hashable, so a tuple is used as the dictionary key.
    forecasts_by_question = {}
    for file in json_files:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
        for forecast in data["forecasts"]:
            question_id = forecast["id"]
            key = tuple(question_id) if isinstance(question_id, list) else question_id
            forecasts_by_question.setdefault(key, []).append(
                {
                    "forecast": forecast["forecast"],
                    "direction": forecast.get("direction", None),
                    "resolution_date": forecast["resolution_date"],
                    "source": forecast["source"],
                }
            )

    aggregators = {
        # statistics.median raises on empty data; return 0 instead, matching
        # what the two geometric-mean helpers return for empty input.
        "median": lambda values: median(values) if values else 0,
        "geometric_mean": geometric_mean,
        "geometric_mean_log_odds": geometric_mean_log_odds,
    }

    # Create one crowd forecast file per aggregation method
    for method in methods:
        aggregate = aggregators[method]
        crowd_forecasts = []
        for key, forecasts in forecasts_by_question.items():
            # Only missing forecasts are dropped; a forecast of 0 is a real
            # value and must not be filtered out by a truthiness test.
            forecast_values = [
                entry["forecast"] for entry in forecasts if entry["forecast"] is not None
            ]
            first = forecasts[0]
            crowd_forecasts.append(
                {
                    # Restore the list form for combination questions
                    "id": list(key) if isinstance(key, tuple) else key,
                    "source": first["source"],
                    "forecast": aggregate(forecast_values),
                    # Keep only the leading 10 characters of the date string
                    "resolution_date": first["resolution_date"][:10],
                    "reasoning": "N/A",
                    "direction": first["direction"],
                }
            )

        output = {
            "organization": "ForecastBench",
            "model": "LLM Crowd (gpt-4o, claude-3.5-sonnet, gemini-1.5-pro) with news",
            "question_set": f"{date}-llm.json",
            "forecast_due_date": date,
            "forecasts": crowd_forecasts,
        }

        with open(output_names[method], "w", encoding="utf-8") as f:
            json.dump(output, f, indent=2)


# Usage: build the crowd forecast files for the 2024-07-21 forecast due date.
# This call runs as soon as the cell/module is executed; it reads the candidate
# files from, and writes the crowd files to, the current working directory.
process_files("2024-07-21")