In [1]:
import sys
print(sys.version) # broai supports python3.11

3.11.12 (main, Apr  9 2025, 04:04:00) [Clang 20.1.0 ]


# Start

In [2]:
%load_ext autoreload
%autoreload 2

# Setup for the Test

In [3]:
from broai.prompt_management.core import Persona, Instructions, Example, Examples
from broai.llm_management.ollama import BedrockOllamaChat
from broai.prompt_management.core import PromptGenerator
from broai.agent_management.core import BroAgent
from pydantic import BaseModel, Field
from typing import List, Any

In [4]:
bedrock_model = BedrockOllamaChat(model_name='us.meta.llama3-2-11b-instruct-v1:0')

In [5]:
class Joke(BaseModel):
    setup:str = Field(description="this is a setup for the joke")
    punchline:str = Field(description="this is a punchline of the joke")

class Jokes(BaseModel):
    jokes:List[Joke]

In [6]:
class InputFormat(BaseModel):
    message:str = Field(description="The user message")

# BroAgent with full Framework: Happy Path

In [7]:
%%time
pg = PromptGenerator(
    persona="You are the good bro Andy.",
    instructions=Instructions(
        instructions=[
            "tell some jokes based on message",
        ],
    ),
    structured_output=Jokes,
    examples=Examples(examples=[
        Example(
            setting="Funny Andy",
            input=InputFormat(message="Gimme three jokes"),
            output=Jokes(jokes=[
                Joke(setup="the setup of the joke to build curiosity", punchline="the punchline is to complete the joke")
            ]),
        )
    ]),
    fallback=Jokes(jokes=[Joke(setup="error", punchline="error")])
)

bro = BroAgent(
    prompt_generator=pg,
    model=bedrock_model
)

response = bro.run(request=InputFormat(message="Tell me ten jokes."))
print(response)

jokes=[Joke(setup="Why don't scientists trust atoms?", punchline='Because they make up everything'), Joke(setup="Why don't eggs tell jokes?", punchline="They'd crack each other up"), Joke(setup='Why did the tomato turn red?', punchline='Because it saw the salad dressing'), Joke(setup='What do you call a fake noodle?', punchline='An impasta'), Joke(setup='Why did the scarecrow win an award?', punchline='Because he was outstanding in his field'), Joke(setup="Why don't lobsters share?", punchline="Because they're shellfish"), Joke(setup="What do you call a can opener that doesn't work?", punchline="A can't opener"), Joke(setup='I told my wife she was drawing her eyebrows too high.', punchline='She looked surprised'), Joke(setup="Why don't some couples go to the gym?", punchline="Because some relationships don't work out"), Joke(setup='Why did the bicycle fall over?', punchline='Because it was two-tired')]
CPU times: user 115 ms, sys: 12.5 ms, total: 128 ms
Wall time: 3.03 s


# BroAgent Full Framework: Pydantic Fallback

In [8]:
class Decoy(BaseModel):
    a:str
    b:str
    c:str
    e:int
    f:float

In [9]:
%%time
pg = PromptGenerator(
    persona="You are the good bro Andy.",
    instructions=Instructions(
        instructions=[
            "tell some jokes based on message",
        ],
    ),
    structured_output=Decoy,
    examples=Examples(examples=[
        Example(
            setting="Funny Andy",
            input=InputFormat(message="Gimme three jokes"),
            output=Jokes(jokes=[
                Joke(setup="the setup of the joke to build curiosity", punchline="the punchline is to complete the joke")
            ]),
        )
    ]),
    fallback=Jokes(jokes=[Joke(setup="error", punchline="error")])
)

bro = BroAgent(
    prompt_generator=pg,
    model=bedrock_model
)

response = bro.run(request="Tell me ten jokes.")
print(response)

jokes=[Joke(setup='error', punchline='error')]
CPU times: user 117 ms, sys: 4.93 ms, total: 122 ms
Wall time: 12.7 s


# BroAgent Full Framework: Default Fallback

In [10]:
%%time
pg = PromptGenerator(
    persona="You are the good bro Andy.",
    instructions=Instructions(
        instructions=[
            "tell some jokes based on message",
        ],
    ),
    structured_output=Decoy,
    examples=Examples(examples=[
        Example(
            setting="Funny Andy",
            input=InputFormat(message="Gimme three jokes"),
            output=Jokes(jokes=[
                Joke(setup="the setup of the joke to build curiosity", punchline="the punchline is to complete the joke")
            ]),
        )
    ]),
)

bro = BroAgent(
    prompt_generator=pg,
    model=bedrock_model
)

response = bro.run(request="Tell me ten jokes.")
print(response)

unknown error
CPU times: user 120 ms, sys: 29 μs, total: 120 ms
Wall time: 12.6 s


# BroAgent Full Framework: Custom Fallback in string

In [11]:
%%time
pg = PromptGenerator(
    persona="You are the good bro Andy.",
    instructions=Instructions(
        instructions=[
            "tell some jokes based on message",
        ],
    ),
    structured_output=Decoy,
    examples=Examples(examples=[
        Example(
            setting="Funny Andy",
            input=InputFormat(message="Gimme three jokes"),
            output=Jokes(jokes=[
                Joke(setup="the setup of the joke to build curiosity", punchline="the punchline is to complete the joke")
            ]),
        )
    ]),
    fallback="This is a custom fallback in string"
)

bro = BroAgent(
    prompt_generator=pg,
    model=bedrock_model
)

response = bro.run(request="Tell me ten jokes.")
print(response)

This is a custom fallback in string
CPU times: user 120 ms, sys: 599 μs, total: 121 ms
Wall time: 12.6 s


# BroAgent Full Framework: Custom Fallback not string

In [12]:
%%time
pg = PromptGenerator(
    persona="You are the good bro Andy.",
    instructions=Instructions(
        instructions=[
            "tell some jokes based on message",
        ],
    ),
    structured_output=Decoy,
    examples=Examples(examples=[
        Example(
            setting="Funny Andy",
            input=InputFormat(message="Gimme three jokes"),
            output=Jokes(jokes=[
                Joke(setup="the setup of the joke to build curiosity", punchline="the punchline is to complete the joke")
            ]),
        )
    ]),
    fallback={"error": "This is another custom fallback"}
)

bro = BroAgent(
    prompt_generator=pg,
    model=bedrock_model
)

response = bro.run(request="Tell me ten jokes.")
print(response)

{'error': 'This is another custom fallback'}
CPU times: user 120 ms, sys: 420 μs, total: 120 ms
Wall time: 12.5 s


# BroAgent with String: Happy Path

In [13]:
%%time
pg = PromptGenerator(
    persona="You are the good bro Andy.",
    instructions="tell some jokes based on message",
    structured_output="SETUP: \nthe setup of the joke to build curiosity\n\nPUNCHLINE: \nthe punchline is to complete the joke"
)

bro = BroAgent(
    prompt_generator=pg,
    model=bedrock_model
)

response = bro.run(request="Tell me ten jokes.")
print(response)

I gotcha! Here are ten jokes for ya:

1. **Setup:** I told my wife she was drawing her eyebrows too high.
**Punchline:** She looked surprised.

2. **Setup:** Why don't scientists trust atoms?
**Punchline:** Because they make up everything.

3. **Setup:** Why don't eggs tell jokes?
**Punchline:** They'd crack each other up.

4. **Setup:** What do you call a fake noodle?
**Punchline:** An impasta.

5. **Setup:** Why did the scarecrow win an award?
**Punchline:** Because he was outstanding in his field.

6. **Setup:** What do you call a can opener that doesn't work?
**Punchline:** A can't opener.

7. **Setup:** I'm reading a book about anti-gravity.
**Punchline:** It's impossible to put down.

8. **Setup:** Why did the bicycle fall over?
**Punchline:** Because it was two-tired.

9. **Setup:** What do you call a bear with no socks on?
**Punchline:** Barefoot.

10. **Setup:** Why did the banana go to the doctor?
**Punchline:** Because he wasn't peeling well.

Hope these jokes made you laugh

In [14]:
joke_in_string = response

# Bonus: Extract from normal string

In [15]:
# %%time

# class InputFormat(BaseModel):
#     content:str

# pg = PromptGenerator(
#     persona="You are a content extractor.",
#     instructions=Instructions(
#         instructions=[
#             "Extract the content into the sepcified JSON formant.",
#         ],
#     ),
#     structured_output=Jokes,
#     # examples=Examples(examples=[
#     #     Example(
#     #         setting="Joke Extraction",
#     #         input=InputFormat(content=joke_in_string),
#     #         output=Jokes(jokes=[
#     #             Joke(setup="the setup of the joke to build curiosity", punchline="the punchline is to complete the joke")
#     #         ]),
#     #     )
#     # ]),
#     fallback=Jokes(jokes=[Joke(setup="error", punchline="error")])
# )

# bro = BroAgent(
#     prompt_generator=pg,
#     model=bedrock_model
# )

# response = bro.run(request=InputFormat(content=joke_in_string))
# print(response)

# DuckStore

## Relational

In [16]:
from broai.duckdb_management.utils import get_create_table_query, get_insert_query, get_batch_update_query
from broai.duckdb_management.interface import DuckStoreInterface
from broai.duckdb_management.utils import DataTypeConversion
from broai.experiments.huggingface_embedding import EmbeddingDimension

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
converted_text = DataTypeConversion.convert_single_quote(text="My name's Jake.")
converted_text

'My name<|single_quote|>s Jake.'

In [18]:
reversed_text = DataTypeConversion.reverse_single_quote(text=converted_text)
reversed_text

"My name's Jake."

In [19]:
schemas = {
    "doc_id": "VARCHAR",
    "content": "VARCHAR",
    "data": "JSON"
}

sm = DuckStoreInterface(db="./duckmemory.db", table="sessionmemory", schemas=schemas)

In [20]:
sm.create_table()

In [21]:
sm.sql_df(query="SELECT * FROM sessionmemory;")

Unnamed: 0,doc_id,content,data


In [22]:
sm.show_schemas()

{'doc_id': 'VARCHAR', 'content': 'VARCHAR', 'data': 'JSON'}

In [23]:
_data = [
    ["0", "a"],
    ["1", "b"]
]
data = ", ".join([f"('{d[0]}', '{d[1]}')" for d in _data])
sm.add(fields=["doc_id", "content"], data=data)
sm.read(fields=["*"])

Unnamed: 0,doc_id,content,data
0,0,a,
1,1,b,


In [24]:
_data = [
    ["0", "aa"],
    ["1", "bb"]
]
data = ", ".join([f"('{d[0]}', '{d[1]}')" for d in _data])
sm.update(schemas={"doc_id": "VARCHAR", "content": "VARCHAR"}, data=data, ref_keys=["doc_id"])
sm.read()

Unnamed: 0,doc_id,content,data
0,0,aa,
1,1,bb,


In [25]:
sm.delete(where_condition="WHERE doc_id IN ('1', '2')")
sm.read()

Unnamed: 0,doc_id,content,data
0,0,aa,


In [26]:
sm.delete_table()

In [27]:
sm.drop_table()

In [28]:
sm.remove_database(confirm="remove database")

## Vector Search: not implement yet

In [29]:
schemas = {
    "id": "VARCHAR",
    "vectors": "FLOAT[3]"
}
vector_db = DuckStoreInterface(db="./duckmemory.db", table="vectors", schemas=schemas)
vector_db.create_table()

In [30]:
vector_db.read()

Unnamed: 0,id,vectors


In [31]:
vectors = [
    [0.0001,0.0001,0.0001],
    [0,.1,0],
    [.15,0,.10],
]
data = ", ".join(f"('{i}', {v})" for i, v in zip(["a", "b", "c"], vectors))
vector_db.add(fields=["id", "vectors"], data=data)
vector_db.read()

Unnamed: 0,id,vectors
0,a,"[1e-04, 1e-04, 1e-04]"
1,b,"[0.0, 0.1, 0.0]"
2,c,"[0.15, 0.0, 0.1]"


In [32]:
from typing import List
import duckdb

def vector_search(db:DuckStoreInterface, field:str, vector:List[float], embedding_size:int, limit=2):
    if len(vector) != embedding_size:
        raise Exception(f"vector must be of size, {embedding_size}. Instead got {len(vector)}")
    db_name = db.db
    table = db.table
    query = f"""SELECT *, array_cosine_similarity({field}, $searchVector::FLOAT[3]) AS score from {table} ORDER BY score DESC LIMIT {limit};"""
    with duckdb.connect(db_name) as con:
        df = con.sql(query, params=dict(searchVector=vector)).to_df()
    return df

In [33]:
vector = [0.0001,0.0001,0.0001]

vector_search(vector_db, field="vectors", vector=vector, embedding_size=3)

Unnamed: 0,id,vectors,score
0,a,"[1e-04, 1e-04, 1e-04]",1.0
1,c,"[0.15, 0.0, 0.1]",0.800641


In [34]:
vector_db.delete_table()

## Fulltext Search: not implement yet

In [35]:
schemas = {
    "id": "VARCHAR",
    "target_field": "VARCHAR"
}

fulltext_db = DuckStoreInterface(db="./duckmemory.db", table="fulltext", schemas=schemas)
fulltext_db.create_table()

In [36]:
ids = ["a", "b", "c", "d"]
targets = ["pandas", "pandee", "pandos", "PANDAS"]
data = ", ".join([f"('{i}', '{t}')" for i, t in zip(ids, targets)])

fulltext_db.add(fields=['id', 'target_field'], data=data)
fulltext_db.read()

Unnamed: 0,id,target_field
0,a,pandas
1,b,pandee
2,c,pandos
3,d,PANDAS


In [37]:
def create_fts_index(db:DuckStoreInterface, fields:List[str]):
    db_name = db.db
    table = db.table
    params = ", ".join([f"'{f}'" for f in fields])
    query=f"""
    INSTALL fts;
    LOAD fts;
    PRAGMA create_fts_index(
        '{table}', {params}, overwrite=1
    );
    """.strip()
    with duckdb.connect(db_name) as con:
        con.sql(query)

In [38]:
create_fts_index(db=fulltext_db, fields=["id", "target_field"])

In [39]:
fulltext_db.read()

Unnamed: 0,id,target_field
0,a,pandas
1,b,pandee
2,c,pandos
3,d,PANDAS


In [40]:
def fts_bm25(db:DuckStoreInterface, search_term:str, id_field, search_field:str):
    db_name = db.db
    table = db.table
    query = f"""\
    SELECT *
    FROM (
        SELECT *, fts_main_{table}.match_bm25(
            {id_field},
            '{search_term}',
            fields := '{search_field}'
        ) AS score
        FROM {table}
    ) sq
    ORDER BY score DESC;
    """
    with duckdb.connect(db_name) as con:
        df = con.sql(query).to_df()
    return df

In [41]:
fts_bm25(db=fulltext_db, search_term="pand", id_field="id", search_field="target_field")

Unnamed: 0,id,target_field,score
0,a,pandas,
1,b,pandee,
2,c,pandos,
3,d,PANDAS,


In [42]:
fulltext_db.delete_table()

# Utility

In [43]:
from broai.interface import Context, Contexts, TaskStatus

In [44]:
Context(context="Test")

Context(id='200f6b08-c96c-42fa-a00f-f86bfeff6879', context='Test', metadata=None, type='document', created_at='2025-04-25 00:40:44.616396')

In [45]:
contexts = Contexts(contexts=[Context(context=c, metadata={"source": s}) for c, s in zip(["test1", "test2"], ["source1", "source2"])])

In [46]:
print(contexts.as_knowledge())

Knowledge: 

Source: source1
Context: 
test1

Source: source2
Context: 
test2


# Experiment

## pdf_to_markdown

In [47]:
from broai.experiments.pdf_to_markdown import pdf_to_markdown

markdown_text, images = pdf_to_markdown("./docs/test1/storm.pdf")

  markdown_text, images = pdf_to_markdown("./docs/test1/storm.pdf")


Loaded layout model s3://layout/2025_02_18 on device cuda with dtype torch.float16
Loaded texify model s3://texify/2025_02_18 on device cuda with dtype torch.float16
Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded table recognition model s3://table_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device cuda with dtype torch.float16
Loaded detection model s3://inline_math_detection/2025_02_24 on device cuda with dtype torch.float16


Recognizing layout: 100%|██████████| 5/5 [00:04<00:00,  1.19it/s]
Running OCR Error Detection: 100%|██████████| 7/7 [00:00<00:00, 60.58it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Texify inference: 100%|██████████| 1/1 [00:01<00:00,  1.45s/it]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 2/2 [00:01<00:00,  1.45it/s]


In [48]:
with open("./docs/test1/storm.md", "w") as f:
    f.write(markdown_text)

## chunk

In [49]:
with open("./docs/test1/storm.md", "r") as f:
    markdown_text = f.read()

In [50]:
from broai.experiments.chunk import split_markdown, consolidate_markdown, get_markdown_sections, split_overlap, chunk_chunks
from broai.interface import Context, Contexts

In [51]:
chunks = split_markdown(markdown_text)

Markdown headings: max(4)


  chunks = split_markdown(markdown_text)


In [52]:
len(chunks)

54

In [53]:
consolidated_chunks = consolidate_markdown(chunks)
len(consolidated_chunks)

  consolidated_chunks = consolidate_markdown(chunks)


50

In [54]:
sections = get_markdown_sections(consolidated_chunks)
len(sections)

  sections = get_markdown_sections(consolidated_chunks)


50

In [55]:
contexts = Contexts()
source = ".docs/test1/storm.md"
for section, chunk in zip(sections, consolidated_chunks):
    contexts.add_context(Context(context=chunk, metadata={"section": section, "source": source, "type": "document"}))
len(contexts.contexts)

50

In [56]:
new_contexts = split_overlap(contexts.contexts)
new_contexts = Contexts(contexts=new_contexts)
len(new_contexts.contexts)

  new_contexts = split_overlap(contexts.contexts)


85

In [57]:
new_contexts.contexts[:3]

[Context(id='a3ddb7f2-dc10-4e22-aa0b-6870c43a6c27', context='# arXiv:2402.14207v2 [cs.CL] 8 Apr 2024\n\nAssisting in Writing Wikipedia-like Articles From Scratch with Large Language Models\n\nYijia Shao Yucheng Jiang Theodore A. Kanell Peter Xu Omar Khattab Monica S. Lam\n\nStanford University\n\n{shaoyj, yuchengj, tkanell, peterxu, okhattab}@stanford.edu lam@cs.stanford.edu\n', metadata={'section': '# arXiv:2402.14207v2 [cs.CL] 8 Apr 2024', 'source': '.docs/test1/storm.md', 'type': 'document', 'sequence': 0}, type='document', created_at='2025-04-25 00:41:11.064237'),
 Context(id='92072e95-0a56-4ea4-8a1c-bcf50951a0df', context="Abstract\n\nWe study how to apply large language models to write grounded and organized long-form articles from scratch, with comparable breadth and depth to Wikipedia pages. This underexplored problem poses new challenges at the *pre-writing* stage, including how to research the topic and prepare an outline prior to writing. We propose STORM, a writing system f

In [58]:
chunk_chunks([c.context for c in new_contexts.contexts])

[0] | tokens: 35 | chars: 309
[1] | tokens: 189 | chars: 1349
[2] | tokens: 500 | chars: 4170
[3] | tokens: 462 | chars: 3550
[4] | tokens: 500 | chars: 2030
[5] | tokens: 211 | chars: 1257
[6] | tokens: 162 | chars: 1096
[7] | tokens: 238 | chars: 1830
[8] | tokens: 196 | chars: 2327
[9] | tokens: 256 | chars: 1831
[10] | tokens: 226 | chars: 1627
[11] | tokens: 114 | chars: 759
[12] | tokens: 150 | chars: 1002
[13] | tokens: 57 | chars: 422
[14] | tokens: 166 | chars: 1305
[15] | tokens: 105 | chars: 681
[16] | tokens: 500 | chars: 1605
[17] | tokens: 500 | chars: 1131
[18] | tokens: 500 | chars: 868
[19] | tokens: 480 | chars: 1431
[20] | tokens: 500 | chars: 3018
[21] | tokens: 320 | chars: 924
[22] | tokens: 196 | chars: 1381
[23] | tokens: 500 | chars: 2242
[24] | tokens: 500 | chars: 3699
[25] | tokens: 260 | chars: 1890
[26] | tokens: 441 | chars: 3970
[27] | tokens: 88 | chars: 637
[28] | tokens: 159 | chars: 1194
[29] | tokens: 62 | chars: 413
[30] | tokens: 230 | chars: 1487

## Enmedding: BAAI/bge-m3

In [59]:
from broai.experiments.huggingface_embedding import BAAIEmbedding, EmbeddingDimension

In [60]:
EmbeddingDimension.BAAI_BGE_M3.value

1024

In [61]:
baai_em = BAAIEmbedding()

  baai_em = BAAIEmbedding()
Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 188649.36it/s]


In [62]:
test_vector = baai_em.run(["test", "tost"])
test_vector.shape[1]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1024

In [63]:
sentences_1 = ["What is BGE M3?", "Defination of BM25"]
sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.", 
               "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]

In [64]:
baai_em.run(sentences_1)

array([[-0.03412  , -0.047    , -0.0009174, ...,  0.04828  ,  0.00756  ,
        -0.0296   ],
       [-0.010376 , -0.04483  , -0.02428  , ..., -0.00822  ,  0.01502  ,
         0.011086 ]], shape=(2, 1024), dtype=float16)

## CrossEncoder: cross-encoder/ms-marco-MiniLM-L6-v2

In [65]:
from broai.experiments.cross_encoder import ReRanker
from broai.interface import Context

In [66]:
rr = ReRanker()

  rr = ReRanker()


In [67]:
query = "pandas is good"
contexts = [Context(context=con, metadata={"source":"test"}) for con in ["pandas is goose", "pandas is good", "pandas is great", "pandas is goat", "pandas is gang"]]

In [68]:
reranked_contexts, scores = rr.run(query, contexts, top_n=3)
reranked_contexts

[Context(id='8a888faa-f629-4c3c-8d53-d40a22585bf7', context='pandas is good', metadata={'source': 'test'}, type='document', created_at='2025-04-25 00:41:16.031791'),
 Context(id='8989ec56-1d60-47fb-bda0-a091ffa33ad4', context='pandas is great', metadata={'source': 'test'}, type='document', created_at='2025-04-25 00:41:16.031811'),
 Context(id='e810ba0c-65af-4698-b1a0-76aba4ef03af', context='pandas is goat', metadata={'source': 'test'}, type='document', created_at='2025-04-25 00:41:16.031828')]

In [69]:
scores

[8.630863189697266, 7.362998962402344, 0.6360796689987183]

## ExpBroAgent

In [70]:
from broai.experiments.bro_agent import BroAgent

In [71]:
%%time
pg = PromptGenerator(
    persona=Persona(name="Bro Andy", description="You are the best bro who's cool and supportive."),
    instructions=Instructions(
        instructions=[
            "tell some jokes based on message",
        ],
    ),
    structured_output=Jokes,
    examples=Examples(examples=[
        Example(
            setting="Funny Andy",
            input=InputFormat(message="Gimme three jokes"),
            output=Jokes(jokes=[
                Joke(setup="the setup of the joke to build curiosity", punchline="the punchline is to complete the joke")
            ]),
        )
    ]),
    fallback=Jokes(jokes=[Joke(setup="error", punchline="error")])
)

bro = BroAgent(
    prompt_generator=pg,
    model=bedrock_model
)

response = bro.run(request=InputFormat(message="Tell me twenty jokes."))
print(response)

jokes=[Joke(setup="Why don't scientists trust atoms?", punchline='Because they make up everything'), Joke(setup="Why don't eggs tell jokes?", punchline="They'd crack each other up"), Joke(setup='Why did the tomato turn red?', punchline='Because it saw the salad dressing'), Joke(setup='What do you call a fake noodle?', punchline='An impasta'), Joke(setup='Why did the scarecrow win an award?', punchline='Because he was outstanding in his field'), Joke(setup="Why don't lobsters share?", punchline="Because they're shellfish"), Joke(setup="What do you call a can opener that doesn't work?", punchline="A can't opener"), Joke(setup='I told my wife she was drawing her eyebrows too high.', punchline='She looked surprised'), Joke(setup="Why don't some couples go to the gym?", punchline="Because some relationships don't work out"), Joke(setup='Why did the bicycle fall over?', punchline='Because it was two-tired'), Joke(setup='What do you call a bear with no socks on?', punchline='Barefoot'), Joke(

In [72]:
for enum, j in enumerate(response.jokes):
    print("Joke:", enum+1)
    print("setup:", j.setup)
    print("punchline:", j.punchline)
    print("="*20)

Joke: 1
setup: Why don't scientists trust atoms?
punchline: Because they make up everything
Joke: 2
setup: Why don't eggs tell jokes?
punchline: They'd crack each other up
Joke: 3
setup: Why did the tomato turn red?
punchline: Because it saw the salad dressing
Joke: 4
setup: What do you call a fake noodle?
punchline: An impasta
Joke: 5
setup: Why did the scarecrow win an award?
punchline: Because he was outstanding in his field
Joke: 6
setup: Why don't lobsters share?
punchline: Because they're shellfish
Joke: 7
setup: What do you call a can opener that doesn't work?
punchline: A can't opener
Joke: 8
setup: I told my wife she was drawing her eyebrows too high.
punchline: She looked surprised
Joke: 9
setup: Why don't some couples go to the gym?
punchline: Because some relationships don't work out
Joke: 10
setup: Why did the bicycle fall over?
punchline: Because it was two-tired
Joke: 11
setup: What do you call a bear with no socks on?
punchline: Barefoot
Joke: 12
setup: Why did the bana

## VectorStore

In [73]:
from broai.experiments.vector_store import DuckVectorStore

In [74]:
import json
from broai.interface import Context
from broai.experiments.huggingface_embedding import BAAIEmbedding, EmbeddingDimension
baai_em = BAAIEmbedding()

  baai_em = BAAIEmbedding()
Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 210769.05it/s]


In [75]:
vector_store = DuckVectorStore(db_name="./duckmemory.db", table="raw", embedding=baai_em)

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [76]:
contexts = [
    Context(context="test's", metadata={"source":"source", "section": "section", "sequence":0}),
    Context(context="tost", metadata={"source":"source", "section": "section", "sequence":1}),
    Context(context="tast", metadata={"source":"source", "section": "section", "sequence":2}),
]
vector_store.add_contexts(contexts=contexts)

In [77]:
vector_store.read()

Unnamed: 0,id,context,metadata,embedding
0,94c7ae40-74b4-4a45-9ed2-bdfaf8824f87,test's,"{""source"":""source"",""section"":""section"",""sequen...","[-0.015838623, 0.0026397705, -0.061553955, -0...."
1,07c9d822-f4dc-479d-8764-dd5b8dbd73c7,tost,"{""source"":""source"",""section"":""section"",""sequen...","[0.03491211, 0.040252686, 0.010864258, 0.02061..."
2,7cb27154-caa5-4f80-9936-9329c4f4c99e,tast,"{""source"":""source"",""section"":""section"",""sequen...","[-0.027191162, 0.010307312, 0.0028152466, -0.0..."


In [78]:
records = vector_store.vector_search(search_term="tost", context=False)
records

Unnamed: 0,id,context,metadata,embedding,score
0,07c9d822-f4dc-479d-8764-dd5b8dbd73c7,tost,"{""source"":""source"",""section"":""section"",""sequen...","[0.03491211, 0.040252686, 0.010864258, 0.02061...",1.0
1,7cb27154-caa5-4f80-9936-9329c4f4c99e,tast,"{""source"":""source"",""section"":""section"",""sequen...","[-0.027191162, 0.010307312, 0.0028152466, -0.0...",0.635021
2,94c7ae40-74b4-4a45-9ed2-bdfaf8824f87,test's,"{""source"":""source"",""section"":""section"",""sequen...","[-0.015838623, 0.0026397705, -0.061553955, -0....",0.601116


In [79]:
records = vector_store.vector_search(search_term="tost", context=True)
records

[Context(id='07c9d822-f4dc-479d-8764-dd5b8dbd73c7', context='tost', metadata={'source': 'source', 'section': 'section', 'sequence': 1}, type='document', created_at='2025-04-25 00:41:26.902988'),
 Context(id='7cb27154-caa5-4f80-9936-9329c4f4c99e', context='tast', metadata={'source': 'source', 'section': 'section', 'sequence': 2}, type='document', created_at='2025-04-25 00:41:26.903027'),
 Context(id='94c7ae40-74b4-4a45-9ed2-bdfaf8824f87', context="test's", metadata={'source': 'source', 'section': 'section', 'sequence': 0}, type='document', created_at='2025-04-25 00:41:26.903039')]

In [80]:
records = vector_store.fulltext_search(search_term="tost", context=False)
records

Unnamed: 0,id,context,metadata,embedding,score
0,07c9d822-f4dc-479d-8764-dd5b8dbd73c7,tost,"{""source"":""source"",""section"":""section"",""sequen...","[0.03491211, 0.040252686, 0.010864258, 0.02061...",0.425969
1,94c7ae40-74b4-4a45-9ed2-bdfaf8824f87,test's,"{""source"":""source"",""section"":""section"",""sequen...","[-0.015838623, 0.0026397705, -0.061553955, -0....",
2,7cb27154-caa5-4f80-9936-9329c4f4c99e,tast,"{""source"":""source"",""section"":""section"",""sequen...","[-0.027191162, 0.010307312, 0.0028152466, -0.0...",


In [81]:
records = vector_store.fulltext_search(search_term="tost", context=True)
records

[Context(id='07c9d822-f4dc-479d-8764-dd5b8dbd73c7', context='tost', metadata={'source': 'source', 'section': 'section', 'sequence': 1}, type='document', created_at='2025-04-25 00:41:27.106760')]

In [82]:
vector_store.delete_table()

In [83]:
vector_store.drop_table()

In [84]:
vector_store.remove_database(confirm="remove database")