In [1]:
%load_ext autoreload
%autoreload 2    

# Setup

In [2]:
# Path of the database file shared by every memory store created below.
DB_NAME = "./memories.db"

In [3]:
from broai.prompt_management.core import PromptGenerator
from broai.prompt_management.interface import Persona, Instructions, Examples, Example
from pydantic import BaseModel, Field
from typing import List
from broai.experiments.bro_agent import BroAgent
import json
from broai.interface import Context, Contexts
from broai.experiments.vector_store import DuckVectorStore

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from package.jargon_store import JargonStore, JargonRecord

In [5]:
from broai.experiments.cross_encoder import ReRanker
# Re-ranker instance; later cells call rr.run(...) to order retrieved contexts against a query.
rr = ReRanker()

  rr = ReRanker()


In [6]:
from broai.experiments.huggingface_embedding import BAAIEmbedding, EmbeddingDimension
# Embedding model instance handed to each DuckVectorStore table below.
baai_em = BAAIEmbedding()

  baai_em = BAAIEmbedding()
Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 202623.38it/s]


In [7]:
# The three vector tables share one database file and one embedding model,
# so they are built in a single pass (same construction order as before).
raw_memory, enrich_memory, longterm_memory = (
    DuckVectorStore(db_name=DB_NAME, table=table_name, embedding=baai_em)
    for table_name in ("raw_memory", "enrich_memory", "longterm_memory")
)
# Jargon entries live in their own table of the same database.
jargon_memory = JargonStore(db_name=DB_NAME, table="jargon_memory")

  raw_memory = DuckVectorStore(db_name=DB_NAME, table="raw_memory", embedding=baai_em)
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  enrich_memory = DuckVectorStore(db_name=DB_NAME, table="enrich_memory", embedding=baai_em)
  longterm_memory = DuckVectorStore(db_name=DB_NAME, table="longterm_memory", embedding=baai_em)


# Agent Flows
- JargonDetector
- JargonEditor
- QueryDecomposer
- ContextCompressor
- Oracle

In [8]:
from agents.jargon_detector import JargonDetector, InputMessage
from agents.jargon_editor import JargonEditor, InputEditMessage
from agents.query_decomposer import QueryDecomposer, InputMessage
from agents.context_compressor import ContextCompressor, InputContextCompressor
from agents.oracle import Oracle, InputOracle

In [16]:
def batch_conversation(
    original_message,
    model_name="us.meta.llama3-2-11b-instruct-v1:0",
    jargon_confidence=0.5,
    retrieval_limit=10,
    top_n=20,
):
    """Run the whole question-answering flow for one message.

    Steps:
      1. ``JargonDetector`` proposes jargon terms; those with confidence above
         ``jargon_confidence`` are kept.
      2. Each kept term is looked up in ``jargon_memory``; if anything is found,
         ``JargonEditor`` rewrites the message using that knowledge.
      3. ``QueryDecomposer`` splits the (possibly rewritten) message into
         sub-queries, each of which is searched in ``longterm_memory``.
      4. Contexts are de-duplicated by ``id`` (first occurrence wins), re-ranked
         with ``rr`` and joined into the prior knowledge given to ``Oracle``.

    Args:
        original_message: The user's question as plain text.
        model_name: Model identifier assigned to ``Oracle.model.model_name``.
        jargon_confidence: Detected jargon must score strictly above this value.
        retrieval_limit: ``limit`` passed to each ``longterm_memory.vector_search``.
        top_n: ``top_n`` passed to the re-ranker.

    Returns:
        ``(answer, reranked_contexts)`` - whatever ``Oracle.run`` returns, and
        the contexts returned by the re-ranker.

    Side effects:
        ``Oracle.model.model_name`` is overwritten and NOT restored afterwards.

    NOTE(review): ``InputMessage`` is imported from both ``agents.jargon_detector``
    and ``agents.query_decomposer`` in this notebook; the later import wins, so
    both agents receive the same class here - confirm they are interchangeable.
    """
    potential_jargons = JargonDetector.run(request=InputMessage(message=original_message))
    detected_jargons = [j for j in potential_jargons.jargons if j.confidence > jargon_confidence]

    # Look up each detected term itself (this used to be hard-coded to "STORM").
    jargon_knowledges = []
    for j in detected_jargons:
        jargon_knowledges.extend(jargon_memory.fulltext_search(search_query=j.jargon))

    # Only rewrite the message when the jargon store actually returned something,
    # matching the step-by-step cells further down in this notebook.
    proxy_message = original_message
    if jargon_knowledges:
        jargon_knowledges_str = "\n\n".join(
            f"{position}: {k.jargon}\nEvidence: {k.evidence}\nExplanation: {k.explanation}"
            for position, k in enumerate(jargon_knowledges, start=1)
        )
        edited_message = JargonEditor.run(
            InputEditMessage(knowledge=jargon_knowledges_str, message=original_message)
        )
        proxy_message = edited_message.edited_message

    sub_queries = QueryDecomposer.run(InputMessage(message=proxy_message))

    # Retrieve per sub-query and keep the first occurrence of every context id.
    # assumes context ids are hashable - TODO confirm
    seen_ids = set()
    deduplicated_contexts = []
    for sq in sub_queries.sub_queries:
        for c in longterm_memory.vector_search(search_query=sq, limit=retrieval_limit):
            if c.id not in seen_ids:
                seen_ids.add(c.id)
                deduplicated_contexts.append(c)

    reranked_contexts, scores = rr.run(
        search_query=proxy_message, contexts=deduplicated_contexts, top_n=top_n
    )
    knowledge_contexts = reranked_contexts

    Oracle.model.model_name = model_name
    # Skip empty contexts and the literal "error" placeholder. The previous
    # `not in "error"` was a substring test and also dropped texts like "or".
    prior_knowledge = "\n\n".join(
        c.context
        for c in knowledge_contexts
        if c.context.strip().lower() not in ("", "error")
    )
    answer = Oracle.run(InputOracle(prior_knowledge=prior_knowledge, message=proxy_message))
    return answer, reranked_contexts

In [10]:
# Alternative prompt kept for quick switching:
# original_message = "What does STORM do in the research study?"
original_message = "Explain how STORM works in the study to me like I'm a five years old."

# Ask the detector for jargon candidates and keep those scoring above 0.5.
detector_request = InputMessage(message=original_message)
potential_jargons = JargonDetector.run(request=detector_request)
detected_jargons = [
    candidate for candidate in potential_jargons.jargons if candidate.confidence > 0.5
]
detected_jargons

[PotentialJargon(jargon='STORM', confidence=0.8)]

In [23]:
# Default to the unmodified message; a later cell swaps in the jargon-expanded rewrite.
proxy_message = original_message

In [11]:
# Collect stored knowledge for every detected jargon term.
# The query is the detected term itself (it used to be hard-coded to "STORM",
# which only worked for messages about STORM).
jargon_knowledges = []
for j in detected_jargons:
    jk = jargon_memory.fulltext_search(search_query=j.jargon)
    jargon_knowledges.extend(jk)

In [12]:
# Render the jargon hits as a numbered list (1-based), blank line between entries.
jargon_knowledges_str = "\n\n".join(
    f"{position}: {entry.jargon}\nEvidence: {entry.evidence}\nExplanation: {entry.explanation}"
    for position, entry in enumerate(jargon_knowledges, start=1)
)
print(jargon_knowledges_str)

1: STORM
Evidence: We present STORM to automate the pre-writing stage
Explanation: STORM is a system that automates the pre-writing stage

2: STORM
Evidence: we propose the STORM paradigm for the Synthesis of Topic Outlines through Retrieval and Multi-perspective Question Asking
Explanation: STORM is a paradigm for the Synthesis of Topic Outlines through Retrieval and Multi-perspective Question Asking

3: STORM
Evidence: STORM simulates a conversation between a Wikipedia writer and a topic expert
Explanation: STORM is a system that simulates conversations

4: STORM
Evidence: STORM discovers different perspectives by surveying existing articles from similar topics
Explanation: STORM is a system or method for discovering perspectives and controlling question asking process

5: STORM
Evidence: We propose STORM, a writing system for the Synthesis of Topic Outlines through Retrieval and Multi-perspective Question Asking
Explanation: STORM is a writing system for the Synthesis of Topic Outli

In [13]:
# Have JargonEditor rewrite the original message using the formatted jargon knowledge.
edited_message = JargonEditor.run(InputEditMessage(knowledge=jargon_knowledges_str, message=original_message))

In [24]:
# Use the rewritten message only when jargon knowledge was actually found.
if jargon_knowledges:
    proxy_message = edited_message.edited_message

In [25]:
# Show the original and the jargon-expanded message, separated by a rule.
print(original_message, "=" * 10, proxy_message, sep="\n")

Explain how STORM works in the study to me like I'm a five years old.
Explain how STORM, a system that automates the pre-writing stage, works in the study, considering it is a paradigm for the Synthesis of Topic Outlines through Retrieval and Multi-perspective Question Asking, and it simulates conversations between a writer and an expert, discovers different perspectives, and controls the question asking process, in a way that a five-year-old can understand.


In [15]:
# Baseline: decompose the ORIGINAL message, for comparison with the jargon-expanded one below.
QueryDecomposer.run(InputMessage(message=original_message))

DecomposedQueries(sub_queries=['How STORM works in a study', 'Explaining STORM to a 5-year-old'])

In [16]:
# Decompose the jargon-expanded message; these sub-queries drive retrieval in the next cells.
sub_queries = QueryDecomposer.run(InputMessage(message=proxy_message))
sub_queries.sub_queries

['How STORM automates the pre-writing stage',
 'How STORM simulates conversations between a writer and an expert',
 'How STORM discovers different perspectives',
 'How STORM controls the question asking process',
 'Explain STORM in simple terms for a five-year-old']

In [17]:
# Search long-term memory once per sub-query (10 hits each) and flatten the
# results in sub-query order; duplicates are removed in a later cell.
retreived_contexts = [
    hit
    for sub_query in sub_queries.sub_queries
    for hit in longterm_memory.vector_search(search_query=sub_query, limit=10)
]

In [18]:
# Total hits across all sub-queries, duplicates included.
len(retreived_contexts)

50

In [19]:
# Drop repeated contexts, keeping the first occurrence of each id.
# Membership is checked against a set so the pass stays linear;
# id_list still records the ids in first-seen order.
# assumes context ids are hashable - TODO confirm
seen_ids = set()
id_list = []
deduplicated_contexts = []
for c in retreived_contexts:
    if c.id not in seen_ids:
        seen_ids.add(c.id)
        id_list.append(c.id)
        deduplicated_contexts.append(c)

In [20]:
# Hit counts before and after de-duplication.
len(retreived_contexts), len(deduplicated_contexts)

(50, 15)

In [26]:
# Re-rank the de-duplicated contexts against the jargon-expanded message.
# NOTE(review): top_n is 10 here but 20 inside batch_conversation - confirm which is intended.
reranked_contexts, scores = rr.run(search_query=proxy_message, contexts=deduplicated_contexts, top_n=10)

In [28]:
# Inspect the text of the first re-ranked context.
reranked_contexts[0].context

'STORM is a system that automates the pre-writing stage by researching a topic, creating an outline, and simulating conversations between a writer and an expert to generate a full-length article.\n\n<span id="page-2-8"></span>3 Method\n\nWe present STORM to automate the pre-writing stage by researching a given topic via effective question asking ([§3.1,](#page-3-0) [§3.2\\)](#page-3-1) and creating an outline ([§3.3\\)](#page-4-0). The outline will be extended to a fulllength article grounded on the collected references\n\n<span id="page-2-2"></span><sup>2</sup> In practice, S also includes organizational elements such as section and subsection titles, which do not require citations.\n\n<span id="page-2-3"></span><sup>3</sup>Obtained from [https://wikimedia.](https://wikimedia.org/api/rest_v1/metrics/edited-pages/top-by-edits/en.wikipedia/all-editor-types/content/) [org/api/rest\\\\_v1/metrics/edited-pages/](https://wikimedia.org/api/rest_v1/metrics/edited-pages/top-by-edits/en.wikiped

In [49]:
# Select the model used by ContextCompressor (mutates the shared agent object).
ContextCompressor.model.model_name = "us.meta.llama3-2-11b-instruct-v1:0"

In [50]:
# Compress every re-ranked context against the combined sub-queries and wrap
# each extracted passage in a Context that keeps the source id and metadata.
# The outputs below show failed extractions surfacing as the literal string
# "error"; those are filtered when prior_knowledge is built.
compressed_contexts = []
knowledge_contexts = []
compression_query = ", ".join(sub_queries.sub_queries)  # same for every context
for c in reranked_contexts:
    cc = ContextCompressor.run(InputContextCompressor(context=c.context, query=compression_query))
    compressed_contexts.append(cc)
    for i in cc.extracted_contexts:
        knowledge_contexts.append(
            Context(id=c.id, context=i, metadata=c.metadata.copy())
        )

[91mBoth parse_structured_output and content_extractor failed:
1 validation error for ExtractedContext
  Invalid JSON: invalid escape at line 7 column 81 [type=json_invalid, input_value='\n{\n    "extracted_cont...le domain."\n    ]\n}\n', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
BroAgent.parse_structured_output() got multiple values for argument 'text'[0m
[91mBoth parse_structured_output and content_extractor failed:
1 validation error for ExtractedContext
  Invalid JSON: trailing characters at line 13 column 1 [type=json_invalid, input_value='\n{\n    "extracted_cont...n outline."\n    ]\n}\n', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
BroAgent.parse_structured_output() got multiple values for argument 'text'[0m


In [51]:
# Number of compressor results vs. number of extracted passages.
len(compressed_contexts), len(knowledge_contexts)

(10, 60)

In [62]:
# Inspect one compressor result; note the "error" placeholder entries in its output.
compressed_contexts[1].extracted_contexts

['STORM, an LLM-based writing system, automates pre-writing stage for creating Wikipedia-like articles, improving outline and article quality.',
 'Experimental results demonstrate that the question asking mechanism in STORM improves both the outline and article quality.',
 'error',
 'error',
 'error',
 'We propose STORM, an LLM-based writing system that automates the pre-writing stage for creating Wikipedia-like articles from scratch.']

In [59]:
# Spot-check a single extracted passage (this one is an "error" placeholder).
knowledge_contexts[12].context

'error'

In [63]:
# Join the usable passages into the prior knowledge for the Oracle.
# Empty passages and the literal "error" placeholder are skipped. The earlier
# `not in "error"` was a substring test, so it also dropped any passage that
# happened to be a fragment of that word (e.g. "or").
prior_knowledge = "\n\n".join(
    c.context
    for c in knowledge_contexts
    if c.context.strip().lower() not in ("", "error")
)
print(prior_knowledge)

STORM is a system that automates the pre-writing stage by researching a topic, creating an outline, and simulating conversations between a writer and an expert to generate a full-length article.

We present STORM to automate the pre-writing stage by researching a given topic via effective question asking and creating an outline.

The outline will be extended to a fulllength article grounded on the collected references

S also includes organizational elements such as section and subsection titles, which do not require citations.

Obtained from https://wikimedia.org/api/rest_v1/metrics/edited-pages/top-by-edits/en.wikipedia/all-editor-types/content/

Since language models process and produce sequences, we can linearize O by adding "#" to indicate section titles, "##" to indicate subsection titles, etc.

It then simulates conversations between a Wikipedia writer who asks questions guided by the given perspective and an expert grounded on trustworthy online sources

The final outline is cu

In [64]:
# Ask the Oracle with the assembled prior knowledge. The model override below is
# commented out, so whichever model name is currently set on Oracle.model is used.
# Oracle.model.model_name = "us.meta.llama3-2-3b-instruct-v1:0"
answer = Oracle.run(InputOracle(prior_knowledge=prior_knowledge, message=proxy_message))
print(answer)

Let me explain how STORM works in a way that's easy to understand.

Imagine you want to write a big article about a topic, like a book. But, you don't know where to start. That's where STORM comes in. STORM is like a super smart assistant that helps you write a great article.

Here's how it works:

1. **STORM finds different ideas**: STORM looks at other articles that are similar to the one you want to write. It reads those articles and finds different ideas and perspectives on the topic.
2. **STORM asks questions**: STORM then asks questions to an expert (like a super smart computer) to get more information about the topic. It asks questions like "What's the most important thing to know about this topic?" or "Can you give me an example of this?"
3. **STORM talks to the expert**: STORM has a conversation with the expert, and the expert answers the questions. STORM writes down the answers and uses them to create an outline for the article.
4. **STORM creates an outline**: The outline is

In [40]:
# Same question answered by the 11B model.
# NOTE(review): passes edited_message.edited_message directly instead of proxy_message,
# and prints answer.answer where the cell above prints answer - confirm Oracle's return type.
Oracle.model.model_name = "us.meta.llama3-2-11b-instruct-v1:0"
answer = Oracle.run(InputOracle(prior_knowledge=prior_knowledge, message=edited_message.edited_message))
print(answer.answer)

STORM simulates conversations between a Wikipedia writer and a topic expert, discovers different perspectives by surveying articles, and uses these perspectives to control question asking, prompting an LLM to generate a list of related topics and extract tables of contents to identify N perspectives that contribute to a comprehensive article.


In [41]:
# Same question answered by the 70B model, for comparison with the 11B answer.
Oracle.model.model_name = "us.meta.llama3-3-70b-instruct-v1:0"
answer = Oracle.run(InputOracle(prior_knowledge=prior_knowledge, message=edited_message.edited_message))
print(answer.answer)

STORM simulates conversations between a Wikipedia writer and a topic expert, discovers different perspectives by surveying articles, and creates an outline for an article by generating a draft outline from a topic and refining it with simulated conversations and LLM knowledge.


# Test model

In [19]:
# Model identifiers compared in the cells below; the 3B model is currently disabled.
model_list = [
    # "us.meta.llama3-2-3b-instruct-v1:0",
    "us.meta.llama3-2-11b-instruct-v1:0",
    "us.meta.llama3-3-70b-instruct-v1:0"
]

In [25]:
# Run the full pipeline once per model and print each answer under its model name.
original_message = "What does STORM do in the research study?"
for m in model_list:
    answer, rr_c = batch_conversation(original_message=original_message, model_name=m)
    print(f"model: {m}", answer, "=" * 20, sep="\n")

model: us.meta.llama3-2-11b-instruct-v1:0
STORM is a writing system that automates the pre-writing stage for creating Wikipedia-like articles from scratch. It proposes a novel multi-stage approach to research a given topic, create an outline, and extend it to a full-length article grounded on collected references. STORM uses perspectives to guide question asking in the writing process, discovering different viewpoints by surveying articles and controlling the question asking process to create a comprehensive article.

STORM's main contributions include:

1. Evaluating the capacity of LLM systems at generating long-form grounded articles from scratch, and the pre-writing challenge in particular.
2. Proposing STORM, a novel system that automates the pre-writing stage by researching the topic and creating an outline using LLMs to ask incisive questions and retrieving trusted information from the Internet.
3. Demonstrating the effectiveness of STORM through both automatic and human evaluat

In [26]:
# Run the full pipeline once per model and print each answer under its model name.
original_message = "What does the dataset used in the study?"
for m in model_list:
    answer, rr_c = batch_conversation(original_message=original_message, model_name=m)
    print(f"model: {m}", answer, "=" * 20, sep="\n")

model: us.meta.llama3-2-11b-instruct-v1:0
The dataset used in the study of STORM, a writing system for the Synthesis of Topic Outlines through Retrieval and Multi-perspective Question Asking, is called FreshWiki. It is a curated dataset of recent high-quality Wikipedia articles, which is used to evaluate the effectiveness of STORM in generating long-form articles from scratch.
model: us.meta.llama3-3-70b-instruct-v1:0
The dataset used in the study of STORM is called FreshWiki, which consists of recent, high-quality Wikipedia articles. It was curated to evaluate the performance of STORM in generating long-form articles from scratch, with comparable breadth and depth to Wikipedia pages. The dataset is used to assess the pre-writing stage and the quality of the generated outlines and articles.


In [27]:
# Run the full pipeline once per model and print each answer under its model name.
original_message = "How does STORM work in the study?"
for m in model_list:
    answer, rr_c = batch_conversation(original_message=original_message, model_name=m)
    print(f"model: {m}", answer, "=" * 20, sep="\n")

model: us.meta.llama3-2-11b-instruct-v1:0
STORM is a writing system that automates the pre-writing stage by researching a given topic, creating an outline, and extending it to a full-length article grounded on collected references. It uses a novel multi-stage approach to discover diverse perspectives, simulate multi-turn conversations, and create a comprehensive outline. The system is based on two hypotheses: (1) diverse perspectives lead to varied questions, and (2) formulating in-depth questions requires iterative research.

STORM works by first discovering diverse perspectives by retrieving and analyzing Wikipedia articles from similar topics. It then personifies the LLM with specific perspectives for question asking. Next, it simulates multi-turn conversations where the answers to the generated questions are grounded on the Internet. Finally, based on the LLM's internal knowledge and the collected information, STORM creates an outline that can be expanded section by section to deve

In [28]:
# Run the full pipeline once per model and print each answer under its model name.
original_message = "Explain how STORM works in the study."
for m in model_list:
    answer, rr_c = batch_conversation(original_message=original_message, model_name=m)
    print(f"model: {m}", answer, "=" * 20, sep="\n")

model: us.meta.llama3-2-11b-instruct-v1:0
STORM is a writing system that automates the pre-writing stage for creating Wikipedia-like articles from scratch. It uses a novel multi-stage approach to research a given topic, create an outline, and extend it to a full-length article grounded on collected references. The system is based on two hypotheses: (1) diverse perspectives lead to varied questions, and (2) formulating in-depth questions requires iterative research.

STORM employs a multi-stage approach to research the topic, which includes:

1. Discovering diverse perspectives by retrieving and analyzing Wikipedia articles from similar topics.
2. Simulating conversations where writers carrying different perspectives pose questions to a topic expert grounded on trusted Internet sources.
3. Curating the collected information to create an outline.

The system uses a novel multi-perspective question asking mechanism to generate questions that are grounded on the Internet and simulate multi

In [29]:
# Run the full pipeline once per model and print each answer under its model name.
original_message = "Explain how STORM works in the study in plain English."
for m in model_list:
    answer, rr_c = batch_conversation(original_message=original_message, model_name=m)
    print(f"model: {m}", answer, "=" * 20, sep="\n")

model: us.meta.llama3-2-11b-instruct-v1:0
STORM is a writing system that helps generate long-form articles from scratch. It's like a research assistant that helps you find the right information and organize it in a way that makes sense. Here's how it works:

1. **Discovering perspectives**: STORM starts by finding different perspectives on a topic. It looks at related Wikipedia articles and extracts the tables of contents to get a sense of what's already been written about the topic.
2. **Simulating conversations**: STORM then simulates conversations between a Wikipedia writer and a topic expert. The writer asks questions, and the expert provides answers based on trusted sources from the Internet.
3. **Creating an outline**: After the conversations, STORM creates an outline for the article. It uses the information gathered to create a draft outline, and then refines it based on the conversations.
4. **Writing the article**: Finally, STORM uses the outline and the references collected t

In [30]:
# Run the full pipeline once per model and print each answer under its model name.
original_message = "Explain how STORM works in the study to me like I'm a five years old."
for m in model_list:
    answer, rr_c = batch_conversation(original_message=original_message, model_name=m)
    print(f"model: {m}", answer, "=" * 20, sep="\n")

model: us.meta.llama3-2-11b-instruct-v1:0
So, you know how sometimes we need to write a big report or an article about something, and it can be really hard to know where to start? That's where STORM comes in. STORM is a special tool that helps us write those big reports and articles by breaking it down into smaller, more manageable pieces.

First, STORM helps us figure out what we want to write about. It looks at lots of different sources, like books and websites, to get a good understanding of the topic. Then, it asks us questions to help us think about what we want to say. It's like having a conversation with a friend, but instead of talking, we're writing!

Next, STORM helps us organize our thoughts into a plan. It creates an outline, which is like a map of what we want to say. This helps us make sure we cover all the important points and that our writing makes sense.

Finally, STORM helps us write the actual article. It uses the outline we created to guide us, and it even suggests 

In [31]:
# Run the full pipeline once per model and print each answer under its model name.
original_message = "Explain how STORM works in the study to me in detail."
for m in model_list:
    answer, rr_c = batch_conversation(original_message=original_message, model_name=m)
    print(f"model: {m}", answer, "=" * 20, sep="\n")

model: us.meta.llama3-2-11b-instruct-v1:0
STORM, the Synthesis of Topic Outlines through Retrieval and Multi-perspective Question Asking, is a writing system that automates the pre-writing stage for creating Wikipedia-like articles. The system works in the following steps:

1. **Discovering different perspectives**: STORM discovers different perspectives on a given topic by surveying related Wikipedia articles. It extracts the tables of contents from these articles and concatenates them to create a context to prompt the LLM to identify N perspectives that can collectively contribute to a comprehensive article on the topic.

2. **Simulating conversations**: STORM simulates conversations between a Wikipedia writer and a topic expert. In each round of the conversation, the LLM-powered Wikipedia writer generates a single question based on the topic, its assigned perspective, and the conversation history. The conversation history enables the LLM to update its understanding of the topic and 

In [32]:
# Run the full pipeline once per model and print each answer under its model name.
original_message = "What are dataset used in the study and how they are created?"
for m in model_list:
    answer, rr_c = batch_conversation(original_message=original_message, model_name=m)
    print(f"model: {m}", answer, "=" * 20, sep="\n")

model: us.meta.llama3-2-11b-instruct-v1:0
The study uses the FreshWiki dataset, which is a curated dataset of recent high-quality Wikipedia articles. The dataset is created to avoid data leakage during pretraining and is used to evaluate the effectiveness of the STORM system. The dataset is also used to establish evaluation criteria for both outline and final article quality.

In addition to the FreshWiki dataset, the study also uses a dataset of 100 samples from the FreshWiki dataset with human-written articles under 3000 words for comparison. The dataset is randomly selected and used to evaluate the performance of the STORM system.

The study also mentions the use of other datasets and resources, such as the Wikipedia API, to retrieve and analyze Wikipedia articles and to generate questions and answers. However, the primary dataset used in the study is the FreshWiki dataset.

The FreshWiki dataset is created by curating recent high-quality Wikipedia articles to avoid data leakage dur

In [33]:
%%time
m = "us.meta.llama3-2-1b-instruct-v1:0"
original_message = "What are dataset used in the study and how they are created?"
answer, rr_c = batch_conversation(original_message=original_message, model_name=m)
print(answer)

The study of STORM, a writing system for the Synthesis of Topic Outlines through Retrieval and Multi-perspective Question Asking, utilizes the following datasets:

1.  FreshWiki: A dataset of recent high-quality Wikipedia articles, curated to avoid data leakage during pretraining. This dataset is used to evaluate the effectiveness of STORM in generating grounded long-form articles.
2.  Wikipedia articles: The study also uses a set of recent Wikipedia articles to evaluate the performance of STORM in generating long-form articles.

The FreshWiki dataset is created by selecting 100 samples from the dataset with human-written articles not exceeding 3000 words. These samples are then used to train and evaluate the STORM system.

The study also mentions that the Wikipedia articles used in the dataset are curated to avoid data leakage during pretraining. This means that the dataset is not used to train the STORM system, but rather to evaluate its performance on a separate dataset.

The FreshW

In [36]:
# Number of re-ranked contexts returned by the last batch_conversation call.
len(rr_c)

20

In [37]:
# List the metadata of each re-ranked context, in the order the re-ranker returned them.
for c in rr_c:
    print(c.metadata)

{'section': 'Abstract', 'source': '.docs/test1/storm.md', 'type': 'document', 'sequence': 1}
{'section': '1 Introduction', 'source': '.docs/test1/storm.md', 'type': 'document', 'sequence': 3}
{'section': '5 Results and Analysis', 'source': '.docs/test1/storm.md', 'type': 'document', 'sequence': 20}
{'section': '8 Conclusion', 'source': '.docs/test1/storm.md', 'type': 'document', 'sequence': 27}
{'section': '3.1 Perspective-Guided Question Asking', 'source': '.docs/test1/storm.md', 'type': 'document', 'sequence': 9}
{'section': '4 Experiments', 'source': '.docs/test1/storm.md', 'type': 'document', 'sequence': 13}
{'section': '3 Method', 'source': '.docs/test1/storm.md', 'type': 'document', 'sequence': 8}
{'section': '3.2 Simulating Conversations', 'source': '.docs/test1/storm.md', 'type': 'document', 'sequence': 10}
{'section': 'Algorithm 1: STORM', 'source': '.docs/test1/storm.md', 'type': 'document', 'sequence': 46}
{'section': '3.3 Creating the Article Outline', 'source': '.docs/test