In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
# Call the project's `initialize_memories` routine before any other cell uses the database layer.
# NOTE(review): presumably creates the tables/stores the later cells rely on — confirm in
# package.databases.initialize.
from package.databases.initialize import initialize_memories
initialize_memories()

In [4]:
from package.utils.data_loder import PDFLoader
from package.interface import SourceOptions
from package.flows.offline import OfflineFlow
from package.databases.management.longterm import LongTermManagement
from package.databases.management.user import UserManagement
from package.databases.management.document import DocumentManagement
from package.databases.management.project import ProjectManagement
from package.databases.management.jargon import JargonManagement
from package.databases.session import Session, get_session, Depends
from package.databases.models.user import User
from package.databases.models.document import Document
from package.databases.models.project import Project
from package.databases.models.jargon import Jargon
from package.databases.models.longterm import LongTerm

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Build a demo account and hand it to the user manager.
# NOTE(review): the credentials below are hard-coded for this demo only — do not reuse them.
new_user = User(username="bank", password="555", email="bank@bank.com")

um = UserManagement()
# `user` is bound to whatever create_user returns; the next cell reads `user.id` from it.
user = um.create_user(new_user, session=Depends(get_session))

In [6]:
# Display the id of the user object returned by create_user.
user.id

'ef460369-7219-4325-8239-0f89310fe6ec'

In [7]:
# Describe the PDF that will be ingested, then hand it to the document manager.
pdf_document = Document(source="./sources/storm.pdf", type="pdf")

dm = DocumentManagement()
# `document1` is whatever create_document returns; later cells use its `id` and `source`.
document1 = dm.create_document(pdf_document, session=Depends(get_session))

In [16]:
# Display the id of the document object returned by create_document.
document1.id

'fe8d0d2b-5e8f-49ae-922c-026eabedd8d3'

In [9]:
# List the long-term records currently linked to this document.
# Expected to be empty here: the records are only created in the later create_raws cell.
dm.read_document_longterms(document_id=document1.id, session=Depends(get_session))

[]

In [10]:
# Point the loader at the same PDF and run it; `contexts` is what the
# offline flow consumes further down.
source_ops = SourceOptions(path="./sources/storm.pdf", type="pdf")
loader = PDFLoader(source=source_ops)
contexts = loader.run()

Markdown headings: max(2)


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


In [11]:
# Instantiate the offline flow; it is run over the loaded contexts in the next cell.
olf = OfflineFlow()

In [12]:
# Run the offline flow over the loaded contexts for this document.
# NOTE(review): presumably returns LongTerm records (they are passed to create_raws next) — confirm.
longterms = olf.run(document_id=document1.id, contexts=contexts)

In [13]:
# Hand the flow's output to the long-term manager for storage.
# NOTE(review): assumes create_raws inserts new rows, so re-running this cell may
# duplicate them — confirm.
ltm = LongTermManagement()
ltm.create_raws(longterms, session=Depends(get_session))

In [14]:
# Create the embedder used below for both the stored records and the query.
# NOTE(review): this import sits mid-notebook; consider moving it to the main import cell.
from package.embedding.baai import BAAIEmbedding
embedding = BAAIEmbedding()

🔍 Loading model from: BAAI/bge-m3


In [15]:
# Re-display the document id before reading its records back.
document1.id

'fe8d0d2b-5e8f-49ae-922c-026eabedd8d3'

In [17]:
# Read this document's records back through the long-term manager.
# Note: this rebinds `longterms`, replacing the objects returned by the offline flow.
longterms = ltm.read_longterms_by_document(document_id=document1.id, session=Depends(get_session))

In [17]:
# Embed the raw text of every record read back above and store the vectors on them.
sentences = [longterm.raw for longterm in longterms]
vectors = embedding.run(sentences=sentences)

# zip() silently stops at the shorter input, so a short embedding batch would
# leave some records without a vector; fail loudly instead of updating partially.
assert len(vectors) == len(longterms), (
    f"embedding returned {len(vectors)} vectors for {len(longterms)} records"
)

for longterm, vector in zip(longterms, vectors):
    longterm.raw_embedding = vector
ltm.update_longterms(longterms=longterms, session=Depends(get_session))

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [18]:
# Embed a question and retrieve the 15 closest records, restricted to this document's source.
# Alternative query to try: "What does ROUGE Score do?"
query = "How was FreshWiki created?"
vector = embedding.run(sentences=[query])[0]
results = ltm.read_similar_text(
    vector,
    limit=15,
    embed_method="raw",
    sources=[document1.source],
    session=Depends(get_session),
)

In [19]:
# Print each hit's metadata, one hit per line.
# NOTE(review): the order presumably reflects similarity ranking — confirm in read_similar_text.
for result in results:
    print(result.meta)

{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '2 FreshWiki', 'sequence': 5}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '2.1 The FreshWiki Dataset', 'sequence': 7}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': 'A Dataset Details', 'sequence': 40}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '8 Conclusion', 'sequence': 29}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': 'Abstract', 'sequence': 2}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': 'Limitations', 'sequence': 30}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '4 Experiments', 'sequence': 14}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '## Assisting in Writing Wikipedia-like Articles From Scratch with Large Language Models', 'sequence': 0}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': 'Ethics Statement', 'sequence': 32}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': 'D Human Evaluation Deta

In [20]:
# Display the id of the first hit.
results[0].id

'1ad51d70-b53c-4ffc-b7b5-d523ead048d6'

In [21]:
# Fetch the full record for the first hit by id so its raw text can be inspected.
ltm.read_longterm(longterm_id=results[0].id, session=Depends(get_session))

LongTerm(raw="2 FreshWiki\n\nWe study generating Wikipedia-like articles from scratch, placing emphasis on the pre-writing stage (Rohman, 1965), which involves the demanding sub-tasks of gathering and curating relevant information ('research'). This models the human\n\n1 Our resources and code are released at https://github. com/stanford-oval/storm .\n\nTable 1: Comparison of different Wikipedia generation setups in existing literature. Generating one paragraph does not need an article outline.\n\n|                            | Domain   | Scope        | Given Outline?   | Given Refs?   |\n|----------------------------|----------|--------------|------------------|---------------|\n| Balepur et al. (2023)      | One      | One para.    | /                | Yes           |\n| Qian et al. (2023)         | All      | One para.    | /                | No            |\n| Fan and Gardent (2022)     | One      | Full article | Yes              | No            |\n| Liu et al. (2018)          | A

In [1]:
# Teardown: drops all tables (prints "All tables dropped." on success). Destructive.
# NOTE(review): the execution count shows this cell was run out of notebook order (In [1]);
# run it only when the stored data is no longer needed.
from package.databases.destroy import drop_all_tables

drop_all_tables()

✅ All tables dropped.
