In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pymupdf4llm
import uuid
from arai.utils.utils import Document
from arai.utils.llm_utils import llm_wrapper

In [3]:
llm_wrapper("Hello!")

"Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?"

In [4]:
path = './IS_example/STORM.pdf'

In [5]:
pages = pymupdf4llm.to_markdown(path, page_chunks=True)

Processing ./IS_example/STORM.pdf...


In [6]:
# Wrap each page chunk returned by pymupdf4llm in a Document: a fresh random
# UUID as id, the page's markdown text as content, and the page number and
# source file path carried over as metadata.
documents = [
    Document(
        id=str(uuid.uuid4()),
        page_content=page_chunk['text'],
        metadata={
            'page': page_chunk['metadata']['page'],
            'source': page_chunk['metadata']['file_path'],
        },
        type="document",
    )
    for page_chunk in pages
]

In [7]:
len(documents)

27

In [250]:
# Take the first parsed page as context (in the sample PDF this page holds the
# paper's title, author list and abstract).
page_1 = documents[0]

# Ask the LLM to summarize that page; the instruction is appended after the page text.
background = llm_wrapper(prompt=f"{page_1.page_content}\n\nRead the text above and summarize it.")

In [5]:
from arai.agents.online.inputagent import InputAgent
from arai.agents.online.organizer import OrganizerAgent
from arai.agents.online.jargon_detector import JargonDetector
from arai.memories.events import EventBus

# NOTE(review): event_bus is never passed to any agent, yet later cells read the
# agents' events through it -- presumably EventBus shares state across
# instances; confirm against arai.memories.events.
event_bus = EventBus()
input_agent = InputAgent()
organizer_agent = OrganizerAgent()
# NOTE(review): the name says "translator" but the object is a JargonDetector.
jargon_translator = JargonDetector()

In [6]:
input_agent.start_conversation(user_input="S.O.S. is an abbreviation for Save Our Souls.")

InputAgent: Processing input: S.O.S. is an abbreviation for Save Our Souls.
OrganizerAgent: Routing request...
OrganizerAgent: parsing
JargonDetector: Extracting jargons, abbreviations, acronyms from INPUT...
JargonDetector: parsing


In [7]:
event_bus.get_latest_event()

{'user_input': 'S.O.S. is an abbreviation for Save Our Souls.',
 'jargons': {'jargons': [{'jargon': 'S.O.S.'}]},
 'source': 'JargonDetector',
 'timestamp': 1734835593,
 'event_type': 'editing_needed'}

In [8]:
event_bus.get_events()

[{'user_input': 'S.O.S. is an abbreviation for Save Our Souls.',
  'source': 'InputAgent',
  'timestamp': 1734835592,
  'event_type': 'input_received'},
 {'user_input': 'S.O.S. is an abbreviation for Save Our Souls.',
  'reason': 'INPUT contains jargons, abbreviations, or acronyms',
  'source': 'OrganizerAgent',
  'timestamp': 1734835592,
  'event_type': 'jargon_needed'},
 {'user_input': 'S.O.S. is an abbreviation for Save Our Souls.',
  'jargons': {'jargons': [{'jargon': 'S.O.S.'}]},
  'source': 'JargonDetector',
  'timestamp': 1734835593,
  'event_type': 'editing_needed'}]

In [9]:
from arai.memories.base import MemoryManagement
mm = MemoryManagement()

In [12]:
mm.list_memories()

Unnamed: 0,database,schema,name,column_names,column_types,temporary


In [18]:
mm.drop_memory()

In [13]:
mm.list_memories()

Unnamed: 0,database,schema,name,column_names,column_types,temporary


In [14]:
mm.create_memory()

In [15]:
mm.list_memories()

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,memory,main,mockmemory,"[id, page_content, metadata, embedding, create...","[VARCHAR, VARCHAR, VARCHAR, FLOAT[100], INTEGE...",False


In [139]:
from arai.memories.long_term import LongTermMemory

ltm = LongTermMemory()

In [140]:
ltm.list_memories()

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,memory,main,longtermschemas,"[id, page_content, metadata, embedding, create...","[VARCHAR, VARCHAR, VARCHAR, FLOAT[2], INTEGER,...",False


In [141]:
ltm.list_all()

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,memory,main,longtermschemas,"[id, page_content, metadata, embedding, create...","[VARCHAR, VARCHAR, VARCHAR, FLOAT[2], INTEGER,...",False


In [142]:
# Insert one hand-made row for the search experiments.
# NOTE(review): the positional values appear to line up with the table columns
# (id, page_content, metadata, embedding, created_at, updated_at); the
# two-element embedding matches the FLOAT[2] column -- confirm against
# LongTermMemory.add_document.
ltm.add_document(
    document=["1", "page_content", "metadata", [1.1,1.2], 12345, 12345]
)

In [143]:
ltm.create_fts_index()

In [144]:
ltm.list_all()

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,memory,fts_main_longtermschemas,dict,"[termid, term, df]","[BIGINT, VARCHAR, BIGINT]",False
1,memory,fts_main_longtermschemas,docs,"[docid, name, len]","[BIGINT, VARCHAR, BIGINT]",False
2,memory,fts_main_longtermschemas,fields,"[fieldid, field]","[BIGINT, VARCHAR]",False
3,memory,fts_main_longtermschemas,stats,"[num_docs, avgdl]","[BIGINT, DOUBLE]",False
4,memory,fts_main_longtermschemas,stopwords,[sw],[VARCHAR],False
5,memory,fts_main_longtermschemas,terms,"[docid, fieldid, termid]","[BIGINT, BIGINT, BIGINT]",False
6,memory,main,longtermschemas,"[id, page_content, metadata, embedding, create...","[VARCHAR, VARCHAR, VARCHAR, FLOAT[2], INTEGER,...",False


In [132]:
ltm.sample()

Unnamed: 0,id,page_content,metadata,embedding,created_at,updated_at
0,1,page_content,metadata,"[1.1, 1.2]",12345,12345


In [133]:
ltm.fts_search(query="page")

Unnamed: 0,id,page_content,metadata,score
0,1,page_content,metadata,0.124939


In [135]:
ltm.vector_search(query=[1.1,1.5])

Unnamed: 0,id,page_content,metadata,embedding,created_at,updated_at,score
0,1,page_content,metadata,"[1.1, 1.2]",12345,12345,0.994044


In [136]:
ltm.drop_all()

In [145]:
# Toy data: the integers 1..33 cut into consecutive runs of at most ten, so the
# last run is deliberately shorter than the others.
chunks = [
    list(range(first, min(first + 10, 34)))
    for first in range(1, 34, 10)
]

In [146]:
chunks[0][:6]

[1, 2, 3, 4, 5, 6]

In [203]:
# Two toy "documents" of different lengths: the integers 0..39 and 0..29.
document1 = list(range(40))
document2 = list(range(30))

In [31]:
def chunk_documents(documents, chunk_size=10, overlap=1):
    """Cut a sequence into fixed-size windows that share ``overlap`` items.

    Args:
        documents: Any sliceable sequence with a length (list, tuple, str, ...).
        chunk_size: Number of items per window.
        overlap: Number of trailing items of one window that are repeated at
            the start of the next one.

    Returns:
        A tuple ``(total, indexes, response)``:
          * ``total``    -- ``len(documents)``;
          * ``indexes``  -- the ``(start, end)`` slice bounds of every window,
            the first one always being ``(0, chunk_size)``;
          * ``response`` -- ``documents[start:end]`` for each entry of
            ``indexes``, in the same order (so both lists have equal length).
        The ``end`` of the last window may exceed ``total``; slicing clamps it.

    Raises:
        ValueError: if more than one window is needed but
            ``overlap >= chunk_size``, because the window could never advance.
    """
    total = len(documents)  # not named `max`, to keep the builtin usable
    start_idx = 0
    end_idx = chunk_size
    indexes = [(start_idx, end_idx)]
    # The first window belongs in the result as well, otherwise `response`
    # silently loses documents[0:chunk_size] and no longer lines up with
    # `indexes`.
    response = [documents[start_idx:end_idx]]
    while end_idx < total:
        if chunk_size - overlap <= 0:
            # Without this guard the loop below would spin forever.
            raise ValueError(
                f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
            )
        start_idx = end_idx - overlap
        end_idx = start_idx + chunk_size
        indexes.append((start_idx, end_idx))
        response.append(documents[start_idx:end_idx])
    return total, indexes, response

In [32]:
chunks = documents[0:3]

In [33]:
chunks[0].__dict__

{'id': '7f2145d3-16e5-4788-a21a-d14937750006',
 'page_content': '## Assisting in Writing Wikipedia-like Articles From Scratch with Large Language Models\n\n### Yijia Shao Yucheng Jiang Theodore A. Kanell Peter Xu Omar Khattab Monica S. Lam Stanford University {shaoyj, yuchengj, tkanell, peterxu, okhattab}@stanford.edu lam@cs.stanford.edu\n\n\n### Abstract\n\n\nWe study how to apply large language models\nto write grounded and organized long-form articles from scratch, with comparable breadth\nand depth to Wikipedia pages. This underexplored problem poses new challenges at the\n_pre-writing stage, including how to research_\nthe topic and prepare an outline prior to writing. We propose STORM, a writing system\nfor the Synthesis of Topic Outlines through\n**Retrieval and Multi-perspective Question Ask-**\ning. STORM models the pre-writing stage by\n(1) discovering diverse perspectives in researching the given topic, (2) simulating conversations where writers carrying different perspectiv

In [33]:
from typing import List, Tuple
# Chunking parameters for this experiment cell.
# The helpers defined in this cell measure chunk length by splitting text on single spaces.
chunk_size = 100
# Overlap between consecutive chunks: 30% of chunk_size, truncated to an int.
overlap = int(chunk_size * 0.3)

# def validate_chunk_size(chunks:List[str], chunk_size:int)->int | None:
#     """validate chunk size and overlap
    
#     Steps:
#         - check if: chunks > 0
#         - create gap = chunk_size - chunks[-1]
#         - check if: gap > 0
#         - return gap
#     """
#     if len(chunks) > 0:
#         gap = chunk_size - len(chunks[-1].split(" "))
#         if gap > 0:
#             return gap
#     return None

# def chunk_documents(documents:List[Document], chunk_size:int=100, overlap:int=30)->List[Document]:
#     chunks:List[str] = []
#     for doc in documents:
#         contents = doc.page_content.replace("-\n","").replace("\n\n", "\n").split(" ")
#         gap = validate_chunk_size(chunks, chunk_size)
#         if gap is not None:
#             print(gap)
#             last_chunk = chunks[-1].split(" ")
#             chunks[-1] = " ".join(last_chunk + contents[:gap])
#         max_length = len(contents)
#         start_idx = 0
#         end_idx = chunk_size
#         response = []
#         while True:
#             response.append(" ".join(contents[start_idx:end_idx]))
#             if end_idx > max_length:
#                 break
#             start_idx = end_idx - overlap
#             end_idx = start_idx + chunk_size
#         chunks.extend(response)
#     return chunks

def validate_chunk_size(chunks:List[str], chunk_size:int)->bool:
    """Tell whether the most recent chunk still has room for more words.

    Args:
        chunks (List[str]): chunks produced so far
        chunk_size (int): how many words a full chunk holds

    Returns:
        bool: True only when ``chunks`` is non-empty and its last element,
        split on single spaces, has fewer than ``chunk_size`` pieces
    """
    if not chunks:
        return False
    words_in_last = len(chunks[-1].split(" "))
    return words_in_last < chunk_size

def chunk_normal(documents:List[Document], chunks:List[str], chunk_size:int, overlap:int, cnt:int)->Tuple[List[str], int]:
    """Append the overlapping word-chunks of page ``documents[cnt]`` to ``chunks``.

    The page text is split on single spaces and walked with a window of
    ``chunk_size`` words; each new window starts ``overlap`` words before the
    end of the previous one.

    Args:
        documents (List[Document]): pages; only ``page_content`` is read
        chunks (List[str]): chunks collected so far; extended IN PLACE
        chunk_size (int): how many words in a chunk
        overlap (int): how many words consecutive chunks share
        cnt (int): index of the page to chunk

    Returns:
        Tuple[List[str], int]: the same ``chunks`` list and ``cnt + 1``

    Raises:
        ValueError: if the page needs more than one window but
            ``overlap >= chunk_size`` (the window could never advance);
            raised before ``chunks`` is touched
        IndexError: if ``cnt`` is not a valid index into ``documents``
    """
    words = documents[cnt].page_content.split(" ")
    max_length = len(words)
    # A second window is needed exactly when the first one does not overshoot
    # the page. With a non-positive stride that loop would never terminate and
    # would keep growing `chunks`, so refuse up front.
    if chunk_size <= max_length and chunk_size - overlap <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    start_idx = 0
    end_idx = chunk_size
    while True:
        chunks.append(" ".join(words[start_idx:end_idx]))
        # Strict '>' is kept deliberately: when a window ends exactly on the
        # last word, one more (short) chunk holding only the overlap tail is
        # emitted.
        if end_idx > max_length:
            break
        start_idx = end_idx - overlap
        end_idx = start_idx + chunk_size
    return chunks, cnt+1

def chunk_forward(documents:List[Document], chunks:List[str], chunk_size:int, threshold:int, cnt:int)->Tuple[str, int, int]:
    """Top up the last chunk with leading words of the following pages.

    Starting at page ``cnt``, the first words of each page are appended to the
    words of ``chunks[-1]`` until the chunk holds ``chunk_size`` words or page
    index ``threshold`` is reached. ``chunks`` itself is not modified.

    Args:
        documents (List[Document]): documents parsed by pymupdf as pages
        chunks (List[str]): chunks of documents; must be non-empty
        chunk_size (int): how many words in a chunk
        threshold (int): exclusive upper bound for the page index
            (``chunk_documents`` passes ``len(documents)``)
        cnt (int): index of the first page to borrow words from

    Returns:
        Tuple[str, int, int]: ``(chunk, dy_cnt, offset)`` where ``chunk`` is
        the topped-up text meant to replace ``chunks[-1]``, ``dy_cnt`` is the
        index of the first page that was not consulted, and ``offset`` is the
        number of words requested from the last consulted page (the page may
        have supplied fewer); ``offset`` is 0 when no page was consulted.

    Raises:
        IndexError: if ``chunks`` is empty
    """
    dy_cnt = cnt
    # Defined up front: previously `offset` was unbound (UnboundLocalError)
    # whenever the chunk was already full or cnt >= threshold.
    offset = 0
    last_chunk = chunks[-1].split(" ")
    while dy_cnt < threshold:
        gap = chunk_size - len(last_chunk)
        # '<=' also stops on an over-full chunk; a negative gap would otherwise
        # become a negative slice bound and pull in almost the whole page.
        if gap <= 0:
            break
        offset = gap
        page_words = documents[dy_cnt].page_content.split(" ")
        last_chunk = last_chunk + page_words[:offset]
        dy_cnt += 1
    chunk = " ".join(last_chunk)
    return chunk, dy_cnt, offset

def chunk_backward(documents:List[Document], chunk_size:int, overlap:int, offset:int, cnt:int)->Tuple[str, int, int]:
    """Build a chunk from the head of page ``cnt``, padded with tails of earlier pages.

    The chunk starts as the first ``(chunk_size - overlap) + offset`` words of
    page ``cnt``; trailing words of pages ``cnt-1``, ``cnt-2``, ... are then
    prepended until the chunk holds ``chunk_size`` words.

    Args:
        documents (List[Document]): pages; only ``page_content`` is read
        chunk_size (int): how many words in a chunk
        overlap (int): how many words consecutive chunks share
        offset (int): extra words to take from page ``cnt`` on top of
            ``chunk_size - overlap``
        cnt (int): index of the page the chunk starts from

    Returns:
        Tuple[str, int, int]: the chunk text, the index of the first earlier
        page that was not consulted, and a constant 0

    NOTE(review): the loop stops at ``dy_cnt > 0``, so page 0 is never used
    for padding -- confirm whether that is intended. The only call site in
    ``chunk_documents`` is commented out as "not working".
    """
    dy_cnt = cnt-1
    end_idx = (chunk_size-overlap) + offset
    chunk = documents[cnt].page_content.split(" ")[:end_idx]
    while dy_cnt > 0:
        gap = chunk_size - len(chunk)
        # '<=' instead of '==': when the head is already longer than
        # chunk_size the gap is negative, and `[-gap:]` would then be a
        # positive-start slice that prepends nearly the whole previous page.
        if gap <= 0:
            break
        previous_words = documents[dy_cnt].page_content.split(" ")
        chunk = previous_words[-gap:] + chunk
        dy_cnt -= 1
    chunk = " ".join(chunk)
    return chunk, dy_cnt, 0

def validate_offset(offset:int, overlap:int)->bool:
    """Report whether a non-zero ``offset`` is strictly smaller than ``overlap``.

    Args:
        offset (int): number of words borrowed from the next page
        overlap (int): how many words consecutive chunks share

    Returns:
        bool: False when ``offset`` is 0 or at least ``overlap``, True otherwise
    """
    return offset != 0 and offset < overlap

def chunk_documents(documents:List[Document], chunk_size:int=100, overlap:int=30)->List[str]:
    """Chunk every page into overlapping word-chunks, topping up short tails.

    Pages are processed in order. Before a page is chunked, a short last chunk
    (fewer than ``chunk_size`` words) left by the previous page is filled with
    leading words of the upcoming pages via ``chunk_forward``. The page is then
    chunked from its first word by ``chunk_normal``.

    Args:
        documents (List[Document]): documents parsed by pymupdf as pages
        chunk_size (int): how many words in a chunk
        overlap (int): how many words consecutive chunks of one page share

    Returns:
        List[str]: the chunk texts (plain strings, not Document objects);
        empty when ``documents`` is empty
    """
    threshold = len(documents)
    cnt = 0
    chunks:List[str] = []
    while cnt < threshold:
        if validate_chunk_size(chunks, chunk_size):
            # Only the topped-up text is kept. The page cursor is deliberately
            # not advanced, so the borrowed words are chunked again below.
            chunks[-1], _, _ = chunk_forward(documents, chunks, chunk_size, threshold, cnt)
        # TODO: the backward pass (validate_offset + chunk_backward) is
        # disabled because it is not working yet.
        chunks, cnt = chunk_normal(documents, chunks, chunk_size, overlap, cnt)
    return chunks

In [9]:
len(documents)

27

In [194]:
from arai.utils.chunking import CustomChunker

# Chunk size handed to CustomChunker; overlap is 30% of it, truncated to an int.
chunk_size = 700
overlap = int(chunk_size*0.3)
chunker = CustomChunker(chunk_size=chunk_size, overlap=overlap)
# The result is bound to the throwaway name `_`, so this cell displays nothing.
_ = chunker.run(documents)

In [193]:
# Rebind `_` to the chunks stored on the chunker; the print cells that follow index into it.
_ = chunker.chunks
# NOTE(review): the saved output shows a single line, "chunk 0 with size 12423";
# what "size" measures and which chunks get reported is not visible here --
# confirm in arai.utils.chunking.
chunker.check_chunks()

chunk 0 with size 12423


In [157]:
print(_[0])

## Assisting in Writing Wikipedia-like Articles From Scratch with Large


In [158]:
print(" ".join(_[0].split(" ")[-overlap:]))

Scratch with Large


In [159]:
print(" ".join(_[1].split(" ")[:overlap]))

Scratch with Large


In [160]:
print(_[1])

Scratch with Large Language Models

### Yijia Shao Yucheng Jiang Theodore


In [142]:
print(_[2])

## Assisting in Writing Wikipedia-like Articles From Scratch with Large Language Models

### Yijia Shao Yucheng Jiang Theodore A. Kanell Peter Xu Omar Khattab Monica S. Lam Stanford University {shaoyj, yuchengj, tkanell, peterxu, okhattab}@stanford.edu lam@cs.stanford.edu


### Abstract


We study how to apply large language models
to write grounded and organized long-form articles from scratch, with comparable breadth
and depth to Wikipedia pages. This underexplored problem poses new challenges at the
_pre-writing stage, including how to research_
the topic and prepare an outline prior to writing. We propose STORM, a writing system
for the Synthesis of Topic Outlines through
**Retrieval and Multi-perspective Question Ask-**
ing. STORM models the pre-writing stage by
(1) discovering diverse perspectives in researching the given topic, (2) simulating conversations where writers carrying different perspectives pose questions to a topic expert grounded
on trusted Internet sources, (3) cur

In [143]:
print(_[3])

## Assisting in Writing Wikipedia-like Articles From Scratch with Large Language Models

### Yijia Shao Yucheng Jiang Theodore A. Kanell Peter Xu Omar Khattab Monica S. Lam Stanford University {shaoyj, yuchengj, tkanell, peterxu, okhattab}@stanford.edu lam@cs.stanford.edu


### Abstract


We study how to apply large language models
to write grounded and organized long-form articles from scratch, with comparable breadth
and depth to Wikipedia pages. This underexplored problem poses new challenges at the
_pre-writing stage, including how to research_
the topic and prepare an outline prior to writing. We propose STORM, a writing system
for the Synthesis of Topic Outlines through
**Retrieval and Multi-perspective Question Ask-**
ing. STORM models the pre-writing stage by
(1) discovering diverse perspectives in researching the given topic, (2) simulating conversations where writers carrying different perspectives pose questions to a topic expert grounded
on trusted Internet sources, (3) cur

In [144]:
print(_[4])

## Assisting in Writing Wikipedia-like Articles From Scratch with Large Language Models

### Yijia Shao Yucheng Jiang Theodore A. Kanell Peter Xu Omar Khattab Monica S. Lam Stanford University {shaoyj, yuchengj, tkanell, peterxu, okhattab}@stanford.edu lam@cs.stanford.edu


### Abstract


We study how to apply large language models
to write grounded and organized long-form articles from scratch, with comparable breadth
and depth to Wikipedia pages. This underexplored problem poses new challenges at the
_pre-writing stage, including how to research_
the topic and prepare an outline prior to writing. We propose STORM, a writing system
for the Synthesis of Topic Outlines through
**Retrieval and Multi-perspective Question Ask-**
ing. STORM models the pre-writing stage by
(1) discovering diverse perspectives in researching the given topic, (2) simulating conversations where writers carrying different perspectives pose questions to a topic expert grounded
on trusted Internet sources, (3) cur