In [13]:
import sys
from common.path_utils import get_src_path, get_data_path
sys.path.append(str(get_src_path()))


from common.datatypes import ForecastingQuestion_stripped, ForecastingQuestion
import json

# llm_forecasting imports
from forecasters.llm_forecasting.prompts.prompts import PROMPT_DICT
from forecasters.llm_forecasting.utils.time_utils import get_todays_date, subtract_days_from_date
from forecasters.llm_forecasting import ranking, summarize, ensemble



### Load Data

In [16]:
# Read the JSON Lines file of cleaned questions: one JSON object per line.
# - The encoding is pinned to UTF-8 so parsing does not depend on the platform default.
# - Whitespace-only lines (e.g. a trailing blank line) are skipped instead of
#   making json.loads raise json.JSONDecodeError.
with open(
    get_data_path() / "fq/real/questions_cleaned_formatted.jsonl", "r", encoding="utf-8"
) as file:
    data = [json.loads(line) for line in file if line.strip()]

In [18]:
# Use the first loaded record as the running example for the rest of the notebook.
sample_question = data[0]
# Title on its own first, then the complete raw record for reference.
print(sample_question["title"])
print(sample_question)

Will SpaceX land people on Mars before 2030?
{'id': 'bd976cc6-e170-4e47-a8d7-9e2f7baed9c7', 'title': 'Will SpaceX land people on Mars before 2030?', 'body': 'Resolution Criteria\nThis question will resolve as Yes if a SpaceX-branded mission successfully lands one or more living human beings on the surface of Mars before 2030. The landing itself of the human crew on Mars must occur before January 1, 2030, 00:00 UTC.\nAt least one person aboard the lander must survive the landing, however it is not necessary for the person to survive long-term or make a return trip to Earth, nor is it necessary for the mission to intend a return or long-term survival.\nA "SpaceX-branded" mission is defined to mean that the SpaceX-associated logos on the spacecraft involved (both the boosters and the Mars-bound craft) have a larger surface area than the logos of any other entity\n', 'resolution_date': '2029-12-31 00:00:00+00:00', 'question_type': 'binary', 'data_source': 'metaculus', 'url': 'https://www.m

In [20]:
# Build a ForecastingQuestion from the raw record's key/value pairs.
fq = ForecastingQuestion(**sample_question)

### Testing "Advanced Forecaster"

In [25]:
from forecasters.advanced_forecaster import AdvancedForecaster

# Override the two retrieval keyword limits and use two gpt-3.5 base reasoning
# models in place of the forecaster's defaults.
af = AdvancedForecaster(
    MAX_WORDS_NEWSCATCHER=5,
    MAX_WORDS_GNEWS=8,
    BASE_REASONING_MODEL_NAMES=["gpt-3.5-turbo-1106"] * 2,
)

Loading AdvancedForecaster...
Overriding retrieval_config: MAX_WORDS_NEWSCATCHER=5
Overriding retrieval_config: MAX_WORDS_GNEWS=8
Overriding reasoning_config: BASE_REASONING_MODEL_NAMES:=['gpt-3.5-turbo-1106', 'gpt-3.5-turbo-1106']
Initialized forecaster with settings:


In [27]:
# Forecast the sample question with the forecaster configured above; the result
# is printed in the next cell. Top-level `await` relies on the notebook's
# async support.
final_prob = await af.call_async(sentence=fq)

Running functools.partial(<function get_async_response at 0x135e5f1a0>, model_name='gpt-4-1106-preview', temperature=0.0) on 4 datapoints with 20 concurrent queries
Calling models through OpenRouter
{'model': 'gpt-4-1106-preview', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 660
{'model': 'gpt-4-1106-preview', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 614
{'model': 'gpt-4-1106-preview', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 660
{'model': 'gpt-4-1106-preview', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 614


2024-06-29 20:27:29,726 ERROR information_retrieval: Skipping Newscatcher since no key is set.


An error occurred while fetching the article: expected string or bytes-like object, got 'dict'
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1347
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1344
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1337
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1333
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1338
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1339
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1345
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1340
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1332
{'model': 'gpt-3.5-turbo-1106', 'temperat

In [28]:
# Show the value returned by af.call_async for the sample question.
print("Final LLM probability", final_prob)

Final LLM probability 0.65


Now we test the two procedures that make up AdvancedForecaster: retrieval and reasoning.


### Retrieval

In [31]:
# Settings passed as `config` to ranking.retrieve_summarize_and_rank_articles.
RETRIEVAL_CONFIG = {
    # --- search-query generation ---
    "NUM_SEARCH_QUERY_KEYWORDS": 3,
    "MAX_WORDS_NEWSCATCHER": 5,
    "MAX_WORDS_GNEWS": 8,
    "SEARCH_QUERY_MODEL_NAME": "gpt-4-1106-preview",
    "SEARCH_QUERY_TEMPERATURE": 0.0,
    "SEARCH_QUERY_PROMPT_TEMPLATES": [
        PROMPT_DICT["search_query"]["0"],
        PROMPT_DICT["search_query"]["1"],
    ],
    "NUM_ARTICLES_PER_QUERY": 5,
    # --- summarization ---
    "SUMMARIZATION_MODEL_NAME": "gpt-3.5-turbo-1106",
    "SUMMARIZATION_TEMPERATURE": 0.2,
    "SUMMARIZATION_PROMPT_TEMPLATE": PROMPT_DICT["summarization"]["9"],
    # This key used to appear twice in the literal (10 here, 20 further down).
    # Python keeps the last value for a duplicated key, so 20 was always the
    # effective setting; only that single entry is kept.
    "NUM_SUMMARIES_THRESHOLD": 20,
    # --- filtering and ranking ---
    "PRE_FILTER_WITH_EMBEDDING": True,
    "PRE_FILTER_WITH_EMBEDDING_THRESHOLD": 0.32,
    "RANKING_MODEL_NAME": "gpt-3.5-turbo-1106",
    "RANKING_TEMPERATURE": 0.0,
    "RANKING_PROMPT_TEMPLATE": PROMPT_DICT["ranking"]["0"],
    "RANKING_RELEVANCE_THRESHOLD": 4,
    "RANKING_COSINE_SIMILARITY_THRESHOLD": 0.5,
    "SORT_BY": "date",
    "RANKING_METHOD": "llm-rating",
    "RANKING_METHOD_LLM": "title_250_tokens",
    "EXTRACT_BACKGROUND_URLS": True,
}

In [33]:
# Pull the prompt inputs off the ForecastingQuestion.
question = fq.title
background_info = fq.metadata["background_info"]
# |body| holds the resolution criteria along with other question details.
resolution_criteria = fq.body

# Retrieval window: from 30 days before today through today.
# The question's open date is not consulted here; if the data structure
# provides one, the start of the window could be moved to it instead.
today_date = get_todays_date()
retrieval_dates = (subtract_days_from_date(today_date, 30), today_date)

In [35]:
# Run retrieval for the sample question over `retrieval_dates` using
# RETRIEVAL_CONFIG, with an empty `urls` list. With return_intermediates=True
# the result is unpacked into four values; of these, only `ranked_articles`
# is used by the cells visible below.
(
    ranked_articles,
    all_articles,
    search_queries_list_gnews,
    search_queries_list_nc,
) = await ranking.retrieve_summarize_and_rank_articles(
    question,
    background_info,
    resolution_criteria,
    retrieval_dates,
    urls=[],
    config=RETRIEVAL_CONFIG,
    return_intermediates=True,
)

Running functools.partial(<function get_async_response at 0x135e5f1a0>, model_name='gpt-4-1106-preview', temperature=0.0) on 4 datapoints with 20 concurrent queries
{'model': 'gpt-4-1106-preview', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 660
{'model': 'gpt-4-1106-preview', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 614
{'model': 'gpt-4-1106-preview', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 660
{'model': 'gpt-4-1106-preview', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 614


2024-06-29 20:38:25,737 ERROR information_retrieval: Skipping Newscatcher since no key is set.


An error occurred while fetching the article: expected string or bytes-like object, got 'dict'
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1340
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1332
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1346
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1336
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1333
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1338
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1338
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1345
{'model': 'gpt-3.5-turbo-1106', 'temperature': 0.0, 'max_tokens': 4000} Approx num tokens: 1341
{'model': 'gpt-3.5-turbo-1106', 'temperat

In [36]:
# Concatenate the summaries of the first NUM_SUMMARIES_THRESHOLD ranked
# articles into one text block (passed later as `retrieved_info`).
all_summaries = summarize.concat_summaries(
    ranked_articles[: RETRIEVAL_CONFIG["NUM_SUMMARIES_THRESHOLD"]]
)

# Preview only the first 3000 characters to keep the cell output short.
print(all_summaries[:3000], "...")

---
ARTICLES
[1] China takes small step towards the moon with rocket test (published on 2024-06-17)
Summary: China's main space contractor has successfully tested a Long March 10 rocket first stage designed for moon missions. The test, conducted in Beijing, is a step towards China's goal of putting astronauts on the moon before 2030. The rocket's first stage will be powered by seven YF-100K engines, and it will be used to carry astronauts to the moon and back. The successful test verified various aspects of the rocket and laid a solid foundation for China's manned lunar exploration program. The Long March 10 will also be used for sending crew and cargo to the Tiangong space station.

[2] Elon Musk wants SpaceX's Starship to land on the Moon, Mars — and Uranus (published on 2024-06-10)
Summary: SpaceX's Starship has completed four live tests and aims to land on Mars by 2027, with the first launch to Mars expected in less than three years. Elon Musk also expressed a goal of sending 1,000

### Reasoning

In [38]:
# Reasoning-stage settings; the ensemble.meta_reason cell reads these keys.
REASONING_CONFIG = {
    # Two base-model entries and two template lists
    # (presumably one template list per model; confirm against meta_reason).
    "BASE_REASONING_MODEL_NAMES": ["gpt-4-1106-preview"] * 2,
    "BASE_REASONING_TEMPERATURE": 1.0,
    "BASE_REASONING_PROMPT_TEMPLATES": [
        [PROMPT_DICT["binary"]["scratch_pad"][variant] for variant in ("1", "2")],
        [PROMPT_DICT["binary"]["scratch_pad"][variant] for variant in ("new_3", "new_6")],
    ],
    # Aggregation of the base forecasts into the final answer.
    "AGGREGATION_METHOD": "meta",
    "AGGREGATION_PROMPT_TEMPLATE": PROMPT_DICT["meta_reasoning"]["0"],
    "AGGREGATION_TEMPERATURE": 0.2,
    "AGGREGATION_MODEL_NAME": "gpt-4",
    # The spelling of this key (sic) must match the lookup in the meta_reason call.
    "AGGREGATION_WEIGTHTS": None,
}

In [39]:
# Run the reasoning stage on the retrieved summaries using REASONING_CONFIG.
close_date = "N/A"  # data doesn't have explicit close date, so set to N/A
today_to_close_date = [today_date, close_date]

ensemble_dict = await ensemble.meta_reason(
    question=question,
    background_info=background_info,
    resolution_criteria=resolution_criteria,
    today_to_close_date_range=today_to_close_date,
    retrieved_info=all_summaries,
    reasoning_prompt_templates=REASONING_CONFIG["BASE_REASONING_PROMPT_TEMPLATES"],
    base_model_names=REASONING_CONFIG["BASE_REASONING_MODEL_NAMES"],
    base_temperature=REASONING_CONFIG["BASE_REASONING_TEMPERATURE"],
    aggregation_method=REASONING_CONFIG["AGGREGATION_METHOD"],
    # NOTE: "AGGREGATION_WEIGTHTS" is misspelled, but it matches the key defined
    # in REASONING_CONFIG; rename both together or this lookup raises KeyError.
    weights=REASONING_CONFIG["AGGREGATION_WEIGTHTS"],
    meta_model_name=REASONING_CONFIG["AGGREGATION_MODEL_NAME"],
    meta_prompt_template=REASONING_CONFIG["AGGREGATION_PROMPT_TEMPLATE"],
    meta_temperature=REASONING_CONFIG["AGGREGATION_TEMPERATURE"],
)

Running functools.partial(<function get_async_response at 0x135e5f1a0>, model_name='gpt-4-1106-preview', temperature=1.0) on 2 datapoints with 20 concurrent queries
{'model': 'gpt-4-1106-preview', 'temperature': 1.0, 'max_tokens': 4000} Approx num tokens: 2502
{'model': 'gpt-4-1106-preview', 'temperature': 1.0, 'max_tokens': 4000} Approx num tokens: 2314
Running functools.partial(<function get_async_response at 0x135e5f1a0>, model_name='gpt-4-1106-preview', temperature=1.0) on 2 datapoints with 20 concurrent queries
{'model': 'gpt-4-1106-preview', 'temperature': 1.0, 'max_tokens': 4000} Approx num tokens: 2622
{'model': 'gpt-4-1106-preview', 'temperature': 1.0, 'max_tokens': 4000} Approx num tokens: 2634
{'model': 'gpt-4', 'max_tokens': 2000, 'temperature': 0.2} Approx num tokens: 6754


In [40]:
# Print the aggregated ("meta") reasoning text and the final prediction from
# the ensemble result. sep="" keeps the "=" underline flush with its heading;
# print's default single-space separator would otherwise indent the underline
# by one character.
print("REASONING\n", "=================", sep="")
print(ensemble_dict["meta_reasoning"])

print("PROBABILITY\n", "=================", sep="")
print(ensemble_dict["meta_prediction"])

REASONING
1. Reasons why the answer might be no:
- SpaceX has a history of optimistic timelines that often get pushed back. The ambitious goal of landing humans on Mars by 2027 may be delayed due to technical, financial, or regulatory challenges.
- The recent tests of SpaceX's Starship have shown progress, but there are still significant issues to be resolved, such as the reentry challenges.
- The complexity of the mission, including the need for in-situ fuel generation on Mars, presents significant technical hurdles that have not yet been overcome.
- External factors such as changes in global economic conditions, regulatory constraints, or geopolitical events could impact the timeline and resources available for the mission.

2. Reasons why the answer might be yes:
- SpaceX has made significant strides in its Mars mission plans, with successful tests of its Starship rocket and plans for further improvements.
- Elon Musk's determination and track record of achieving long-term goals, de