# STORM with local documents

This notebook downloads documentation from the US Federal Emergency Management Agency (FEMA) to use as part of STORM analysis on local documents.

The notebook ...

1. Downloads some FEMA documents
2. Parses and chunks them
3. Embeds with local "BAAI/bge-m3"
4. Creates a local filesystem Qdrant vector store
5. Runs STORM using this store

# Setup

1. See [README](./README) to set up a conda environment and `.env` file

In [1]:
import os
import openai
from dotenv import load_dotenv
import os
import pandas as pd
import requests
from uuid import uuid4
import json

from langchain_community.document_loaders import PyPDFLoader
from langchain.vectorstores.chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

from knowledge_storm import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
from knowledge_storm.rm import VectorRM
from knowledge_storm.lm import OpenAIModel, AzureOpenAIModel
from knowledge_storm.utils import load_api_key, QdrantVectorStoreManager


pd.set_option("display.max_colwidth", None)

# Load environment variables from .env file
load_dotenv()

# Initialize the OpenAI API client
openai.api_key = os.getenv("OPENAI_API_KEY")

DATA_DIR = "./data"
DB_DIR = f"{DATA_DIR}/db"
PDF_DIR = f"{DATA_DIR}/pdfs"
STORM_OUTPUT_DIR=f"{DATA_DIR}/storm_output"
DB_COLLECTION_NAME="fema_docs_demo"
EMBEDDING_MODEL="BAAI/bge-m3"

for dir in [DATA_DIR, PDF_DIR, DB_DIR, STORM_OUTPUT_DIR]:
    os.makedirs(dir, exist_ok=True)

model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=EMBEDDING_MODEL, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)


  from .autonotebook import tqdm as notebook_tqdm
sentence_transformers.SentenceTransformer : INFO     : Load pretrained SentenceTransformer: BAAI/bge-m3


In [2]:
vectors = embeddings.embed_query("Bagels are the best!")
num_vectors = len(vectors)

print(f"Number of vectors: {num_vectors}")

Number of vectors: 1024


# Analysis

## Indexing FEMA Disaster preparedness documents

### Get FEMA PDF documents

In [3]:
df = pd.read_csv(f"{DATA_DIR}/fema_docs.csv")
display(df)

Unnamed: 0,Source,URL,Extra instructions,Document
0,FEMA,https://www.fema.gov/emergency-managers/risk-management/hazard-mitigation-planning/risk-reduction-activities,"Selected ""Protect my home from natural hazards""",https://www.fema.gov/sites/default/files/2020-10/fema_scenario_1-active_shooter-01102020.pdf
1,FEMA,https://www.fema.gov/emergency-managers/risk-management/hazard-mitigation-planning/risk-reduction-activities,"Selected ""Protect my home from natural hazards""",https://www.fema.gov/sites/default/files/2020-10/fema_scenario_1_active_shooter_TTX_answer_key-01102020.pdf
2,FEMA,https://www.fema.gov/emergency-managers/risk-management/hazard-mitigation-planning/risk-reduction-activities,"Selected ""Protect my home from natural hazards""",https://www.fema.gov/sites/default/files/2020-11/fema_protect-your-property_coastal-erosion.pdf
3,FEMA,https://www.fema.gov/emergency-managers/risk-management/hazard-mitigation-planning/risk-reduction-activities,"Selected ""Protect my home from natural hazards""",https://www.fema.gov/sites/default/files/2020-11/fema_protect-your-property_earthquakes.pdf
4,FEMA,https://www.fema.gov/emergency-managers/risk-management/hazard-mitigation-planning/risk-reduction-activities,"Selected ""Protect my home from natural hazards""",https://www.fema.gov/sites/default/files/2020-11/fema_protect-your-home_flooding.pdf
5,FEMA,https://www.fema.gov/emergency-managers/risk-management/hazard-mitigation-planning/risk-reduction-activities,"Selected ""Protect my home from natural hazards""",https://www.fema.gov/sites/default/files/2020-11/fema_protect-your-property_severe-wind.pdf
6,FEMA,https://www.fema.gov/emergency-managers/risk-management/hazard-mitigation-planning/risk-reduction-activities,"Selected ""Protect my home from natural hazards""",https://www.fema.gov/sites/default/files/documents/fema_protect-your-property-storm-surge.pdf
7,FEMA,https://www.fema.gov/emergency-managers/risk-management/hazard-mitigation-planning/risk-reduction-activities,"Selected ""Protect my home from natural hazards""",https://www.fema.gov/sites/default/files/2020-11/fema_protect-your-property_wildfire.pdf
8,FEMA,https://www.fema.gov/emergency-managers/individuals-communities/what-would-you-do-scenarios,,https://www.fema.gov/sites/default/files/2020-10/fema_scenario_2_tornado-01102020.pdf
9,FEMA,https://www.fema.gov/emergency-managers/individuals-communities/what-would-you-do-scenarios,,https://www.fema.gov/sites/default/files/2020-10/fema_scenario_2-tornado_TTX_answer_key-01102020.pdf


### Build Vector Database

First we will build a FEMA RAG chain for asnwering questions about preparing for disasters, using FEMA PDFs.

In [6]:
# Download all documents as defined in 'Documents' column
for doc_url in df["Document"]:
    print(f"Downloading {doc_url}")
    response = requests.get(doc_url)
    with open(f"{PDF_DIR}/{doc_url.split('/')[-1]}", "wb") as f:
        f.write(response.content)


Downloading https://www.fema.gov/sites/default/files/2020-10/fema_scenario_1-active_shooter-01102020.pdf
Downloading https://www.fema.gov/sites/default/files/2020-10/fema_scenario_1_active_shooter_TTX_answer_key-01102020.pdf
Downloading https://www.fema.gov/sites/default/files/2020-11/fema_protect-your-property_coastal-erosion.pdf
Downloading https://www.fema.gov/sites/default/files/2020-11/fema_protect-your-property_earthquakes.pdf
Downloading https://www.fema.gov/sites/default/files/2020-11/fema_protect-your-home_flooding.pdf
Downloading https://www.fema.gov/sites/default/files/2020-11/fema_protect-your-property_severe-wind.pdf
Downloading https://www.fema.gov/sites/default/files/documents/fema_protect-your-property-storm-surge.pdf
Downloading https://www.fema.gov/sites/default/files/2020-11/fema_protect-your-property_wildfire.pdf
Downloading https://www.fema.gov/sites/default/files/2020-10/fema_scenario_2_tornado-01102020.pdf
Downloading https://www.fema.gov/sites/default/files/2020

### Index documents

We will use a very simple parser and chunking methodology to ingest documents for this demo.

In [69]:
# Load the PDFs
docs = []
for pdf_file in os.listdir(PDF_DIR):
    if not pdf_file.endswith(".pdf"):
        continue
    print(f"Loading PDF: {pdf_file}")
    file_path = f"{PDF_DIR}/{pdf_file}"
    loader = PyPDFLoader(file_path)
    docs = docs + loader.load()
    print(f"Loaded {len(docs)} documents")

print(len(docs))

Loading PDF: fema_scenario_10_power_outage_answer_key_01102020.pdf
Loaded 2 documents
Loading PDF: fema_scenario_7-shelter_in_place_TTX_answer_key_01102020.pdf
Loaded 5 documents
Loading PDF: ready_12-ways-to-prepare_postcard.pdf
Loaded 7 documents
Loading PDF: fema_safeguard-critical-documents-and-valuables.pdf
Loaded 10 documents
Loading PDF: ready_document-and-insure-your-property.pdf
Loaded 16 documents
Loading PDF: fema_scenario_1-active_shooter-01102020.pdf
Loaded 18 documents
Loading PDF: fema_protect-your-property_wildfire.pdf
Loaded 26 documents
Loading PDF: fema_scenario_4-hurricane-01102020.pdf
Loaded 27 documents
Loading PDF: fema_scenario_10_power_outage_01102020.pdf
Loaded 28 documents
Loading PDF: fema_scenario_4_hurricane_flood_TTX_answer_key-01102020.pdf
Loaded 30 documents
Loading PDF: fema_scenario_11_winter_storm_01102020.pdf
Loaded 32 documents
Loading PDF: fema_protect-your-property_severe-wind.pdf
Loaded 44 documents
Loading PDF: fema_protect-your-property-storm-

In [70]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

This is a very basic population of metadata, for real-world use-cases it would be more comprehensive. 

In [71]:
new_splits = []
for doc in splits:

    # pdf name is last part of doc.metadata['source']
    pdf_name = doc.metadata['source'].split('/')[-1]

    # Find row in df where pdf_name is in URL
    row = df[df['Document'].str.contains(pdf_name)]
    page = doc.metadata["page"] + 1
    url = f"{row['Document'].values[0]}?id={str(uuid4())}#page={page}"

    doc.metadata['description'] = ""
    doc.metadata['title'] = ""
    doc.metadata['url'] = url
    doc.metadata['content'] = doc.page_content

    #print(json.dumps(doc.metadata, indent=2))
    new_splits.append(doc)

splits = new_splits

In [73]:
client = QdrantClient(path=DB_DIR)

client.create_collection(
    collection_name=DB_COLLECTION_NAME,
    vectors_config=VectorParams(size=num_vectors, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
    client=client,
    collection_name=DB_COLLECTION_NAME,
    embedding=embeddings,
)

In [74]:
uuids = [str(uuid4()) for _ in range(len(splits))]

vector_store.add_documents(documents=splits, ids=uuids)

['917cb1aa-d44a-4a8a-ac5f-99d439d896da',
 'dea15b43-2dd3-4953-94e5-539e0bb791ab',
 'd7e2c917-aa4d-4975-a102-8373d9fa25db',
 'b8f26716-e213-4bb3-b49c-3a08072123da',
 'ce96a0b4-c2fb-4a8d-848d-f90a1e15d6d8',
 '11aea13c-4689-4351-b010-213ef90f7607',
 '30eb11ec-4332-412b-b366-9c81342417b4',
 'd3600767-86a8-40a1-b5bb-556ab4a69c55',
 'd919a2e0-0293-4851-8315-2e8a8c290b0b',
 '301f7174-e7f4-4c62-9580-057908f2b93f',
 '3228f475-e687-47d2-a512-503aa90fcb5d',
 '944b1d8b-3fed-4370-9a6d-a4923d9f6352',
 'b84c9912-dcc7-4c1e-b508-82e90dac8479',
 '9e36d1da-21c1-49ec-af04-4a796c564a57',
 '32608a9e-2b71-4bf1-8a20-1b3df7c98503',
 'b57f5594-4ceb-4401-9b8c-4060135b54b4',
 '43804a3d-aa57-4964-a017-341cb5658dec',
 '1618ef06-dedb-4570-8bc7-377a30698d73',
 'ad112634-3ae5-42d6-84a3-091aa0f6f6bb',
 '16179053-7f0e-4027-b1c9-70ca7c4e5de0',
 '78c9abca-6c43-435c-8027-88d146fa7e42',
 '521a2a2c-988d-4f42-8e71-d547e9053b11',
 '57d83745-05da-4344-90b0-8adf81aaf628',
 'fccb3348-31b1-43e3-a96a-3824fb01a5ff',
 '336b6b6f-cfca-

### Base retriever

Let's do a quick check, also read vectors from disk.

In [4]:
# Remove DB_DIR/.lock
if os.path.exists(f"{DB_DIR}/.lock"):
    os.remove(f"{DB_DIR}/.lock")

print("Loading vector_store from disk")
client = QdrantClient(path=DB_DIR)
vector_store = QdrantVectorStore(
    client=client,
    collection_name=DB_COLLECTION_NAME,
    embedding=embeddings,
)

retriever = vector_store.as_retriever(search_kwargs={"k": 15})

Loading vector_store from disk


In [5]:
results = retriever.invoke("How can I prepare my house for a flood?")
for doc in results[0:3]:
    print("=====================================")
    print(json.dumps(doc.metadata))
    print(doc.page_content)

for Alerts
Plan with
NeighborsMake a Plan
Make Your 
Home
SaferDocument and
Insure PropertySafeguard
Documents
Know 
Evacuation
Routes Practice 
Emergency 
DrillsEXIT
Save for a
Rainy DayTest Family
Communication
Plan12 WAYS TO PREPARE
Make a Plan Save for a 
Rainy Day 
Plan with 
Neighbors Document and 
Insure PropertySafeguard 
Documents Sign up 
for Alerts 
Communication 
Plan EXIT
Practice 
Emergency 
Drills 
Get Involved in 
Your Community Assemble or 
Update 
Supplies Know 
Evacuation 
Routes Make Your 
Home 
Safer 12 WAYS TO PREPARE
{"source": "./data/pdfs/fema_protect-your-property-storm-surge.pdf", "page": 3, "description": "", "title": "", "url": "https://www.fema.gov/sites/default/files/documents/fema_protect-your-property-storm-surge.pdf?id=518702cf-ce48-4452-92fd-c11a0fe54f17#page=4", "content": "your agent to  \nget coverage.\n \n PREPARE OR  \nUPDATE A LIST OF \n YOUR HOME\u2019S \nCONTENTSDocument your belongings. This will give you peace of mind \nand help with the ins

## Run STORM Using our local document vectors

From the STORM [examples](https://github.com/stanford-oval/storm/blob/main/examples/storm_examples/README.md) ...

In [6]:
def run_storm(topic):

    # Clear lock so can be read
    if os.path.exists(f"{DB_DIR}/.lock"):
        os.remove(f"{DB_DIR}/.lock")

    # Initialize the language model configurations
    engine_lm_configs = STORMWikiLMConfigs()
    openai_kwargs = {
        'api_key': os.getenv("OPENAI_API_KEY"),
        'temperature': 1.0,
        'top_p': 0.9,
    }

    ModelClass = OpenAIModel if os.getenv('OPENAI_API_TYPE') == 'openai' else AzureOpenAIModel
    # If you are using Azure service, make sure the model name matches your own deployed model name.
    # The default name here is only used for demonstration and may not match your case.
    gpt_35_model_name = 'gpt-4o-mini' if os.getenv('OPENAI_API_TYPE') == 'openai' else 'gpt-35-turbo'
    gpt_4_model_name = 'gpt-4o'
    if os.getenv('OPENAI_API_TYPE') == 'azure':
        openai_kwargs['api_base'] = os.getenv('AZURE_API_BASE')
        openai_kwargs['api_version'] = os.getenv('AZURE_API_VERSION')

    # STORM is a LM system so different components can be powered by different models.
    # For a good balance between cost and quality, you can choose a cheaper/faster model for conv_simulator_lm 
    # which is used to split queries, synthesize answers in the conversation. We recommend using stronger models
    # for outline_gen_lm which is responsible for organizing the collected information, and article_gen_lm
    # which is responsible for generating sections with citations.
    conv_simulator_lm = ModelClass(model=gpt_35_model_name, max_tokens=500, **openai_kwargs)
    question_asker_lm = ModelClass(model=gpt_35_model_name, max_tokens=500, **openai_kwargs)
    outline_gen_lm = ModelClass(model=gpt_4_model_name, max_tokens=400, **openai_kwargs)
    article_gen_lm = ModelClass(model=gpt_4_model_name, max_tokens=700, **openai_kwargs)
    article_polish_lm = ModelClass(model=gpt_4_model_name, max_tokens=4000, **openai_kwargs)

    engine_lm_configs.set_conv_simulator_lm(conv_simulator_lm)
    engine_lm_configs.set_question_asker_lm(question_asker_lm)
    engine_lm_configs.set_outline_gen_lm(outline_gen_lm)
    engine_lm_configs.set_article_gen_lm(article_gen_lm)
    engine_lm_configs.set_article_polish_lm(article_polish_lm)

    max_conv_turn=3
    max_perspective=3
    search_top_k=5
    max_thread_num=1
    device='cpu'
    vector_db_mode='offline'

    do_research=True
    do_generate_outline=True
    do_generate_article=True
    do_polish_article=True

    # Initialize the engine arguments
    engine_args = STORMWikiRunnerArguments(
        output_dir=STORM_OUTPUT_DIR,
        max_conv_turn=max_conv_turn,
        max_perspective=max_perspective,
        search_top_k=search_top_k,
        max_thread_num=max_thread_num,
    )

    # Setup VectorRM to retrieve information from your own data
    rm = VectorRM(collection_name=DB_COLLECTION_NAME, \
                    embedding_model=EMBEDDING_MODEL, \
                    device=device, \
                    k=search_top_k)

    # initialize the vector store, either online (store the db on Qdrant server) or offline (store the db locally):
    if vector_db_mode == 'offline':
        rm.init_offline_vector_db(vector_store_path=DB_DIR)

    # Initialize the STORM Wiki Runner
    runner = STORMWikiRunner(engine_args, engine_lm_configs, rm)

    # run the pipeline
    runner.run(
        topic=topic,
        do_research=do_research,
        do_generate_outline=do_generate_outline,
        do_generate_article=do_generate_article,
        do_polish_article=do_polish_article,
    )
    runner.post_run()
    runner.summary()




In [78]:
run_storm("Write a detailed and comprehensive report on how should people prepare their homes and respond in the event of extreme flood events?")

sentence_transformers.SentenceTransformer : INFO     : Load pretrained SentenceTransformer: BAAI/bge-m3
  self.qdrant = Qdrant(


Collection fema_docs_demo exists. Loading the collection...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# https://github.com/stanford-oval/storm/issues/117 Citations