In [11]:
import pandas as pd
from search import search
from baml_client import b
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer

In [12]:
# Load the QA test set and pick the third-from-last row as the working example.
questions_df = pd.read_parquet("qa_testset.parquet")
question = questions_df.iloc[-3]
# Bare last expression so the notebook displays the question text.
question.question

'Compare the longevity of Tetris (Game Boy) to Dr. Mario as puzzle games.'

In [13]:
# Query the project's `search` helper (BM25 method, k=5, reranking on, entity filter off).
# NOTE(review): the whole test-set row is passed here, while later cells pass
# `question.question` (the text) — confirm which one `search` expects.
docs = search(
    question,
    method='bm25',
    k=5,
    filter_by_entity=False,
    do_rerank=True,
)

In [24]:
# Show the text payload of the fourth retrieved document.
print(docs[3].payload["text"])

# Dr. Mario

Dr. Mario is a 1990 puzzle video game developed and published by Nintendo for the Nintendo Entertainment System and Game Boy. It was produced by Gunpei Yokoi and designed by Takahiro Harada. The soundtrack was composed by Hirokazu Tanaka.
It is a falling block puzzle game, in which the player's objective is to destroy the viruses populating the on-screen playing field by using colored vitamin capsules that are automatically tossed into the field by Dr. Mario. The player manipulates the falling capsules, to align the same colors, which destroys viruses. The player progresses through the game by eliminating all the viruses on the screen in each level.
Dr. Mario was a commercial success, with more than 10 million copies sold worldwide across all platforms. It received generally positive reviews, appearing on several lists of "Best Nintendo Games of All Time". It has been ported, remade, or had a sequel on every Nintendo home console since the NES, and on most portable console

In [33]:
import numpy as np
from sentence_transformers import CrossEncoder
from sentence_splitter import SentenceSplitter, split_text_into_sentences
import torch

# Cross-encoder used to score (query, passage) pairs.
# Use the GPU when one is available and fall back to CPU otherwise, so the
# notebook also runs on machines without CUDA.
crossencoder = CrossEncoder(
    "cross-encoder/ms-marco-MiniLM-L6-v2",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

def conditioned_extractive_summarization(query, documents, top_n = 20, verbose=True):
    """Rank the non-empty lines of a text by cross-encoder relevance to a query.

    The text is split on newlines (so each unit is a line/paragraph, not a
    grammatical sentence) and every unit is scored against ``query`` with the
    module-level ``crossencoder`` using a sigmoid activation.

    Args:
        query: Query string the passages are scored against.
        documents: A single text blob; units are separated by ``"\\n"``.
        top_n: Maximum number of passages to return. ``None`` keeps all of them.
        verbose: When true, print each returned passage with its score followed
            by the sum of all scores (the original debugging output).

    Returns:
        A ``(passages, scores)`` tuple. ``passages`` is a NumPy array holding at
        most ``top_n`` passages ordered from highest to lowest score.
        ``scores`` holds one score per non-empty line, in the original line
        order (it is NOT reordered or truncated).
    """
    passages = np.array(
        [line for line in documents.split("\n") if len(line) > 0],
        dtype=str,
    )

    # Nothing to score: return empty results instead of calling the model.
    if passages.size == 0:
        return passages, np.array([], dtype=np.float32)

    scores = crossencoder.predict(
        [(query, passage) for passage in passages],
        activation_fn=torch.nn.Sigmoid(),
    )

    # Indices sorted by descending score, then truncated to the requested size.
    ranking = np.argsort(scores).tolist()[::-1]
    if top_n is not None:
        ranking = ranking[:top_n]

    if verbose:
        for i in ranking:
            print(f"{scores[i]:.9f}: {passages[i]}")
        print(np.sum(scores))

    return passages[ranking], scores

# Rank the lines of the fourth retrieved document against the question text.
conditioned_extractive_summarization(question.question, docs[3].payload["text"], top_n=20)

0.861404717: Dr. Mario is a 1990 puzzle video game developed and published by Nintendo for the Nintendo Entertainment System and Game Boy. It was produced by Gunpei Yokoi and designed by Takahiro Harada. The soundtrack was composed by Hirokazu Tanaka.
0.293372869: Dr. Mario was a commercial success, with more than 10 million copies sold worldwide across all platforms. It received generally positive reviews, appearing on several lists of "Best Nintendo Games of All Time". It has been ported, remade, or had a sequel on every Nintendo home console since the NES, and on most portable consoles, including a re-release in 2004 on the Game Boy Advance in the Classic NES Series. It was modified into minigames in WarioWare, Inc.: Mega Microgames!, Brain Age 2: More Training in Minutes a Day!, and Brain Age: Concentration Training. Dr. Luigi is a spin-off for Wii U, released on December 31, 2013, as part of the Year of Luigi celebration.
0.135608926: It is a falling block puzzle game, in which th

(array(['Dr. Mario is a 1990 puzzle video game developed and published by Nintendo for the Nintendo Entertainment System and Game Boy. It was produced by Gunpei Yokoi and designed by Takahiro Harada. The soundtrack was composed by Hirokazu Tanaka.',
        'Dr. Mario was a commercial success, with more than 10 million copies sold worldwide across all platforms. It received generally positive reviews, appearing on several lists of "Best Nintendo Games of All Time". It has been ported, remade, or had a sequel on every Nintendo home console since the NES, and on most portable consoles, including a re-release in 2004 on the Game Boy Advance in the Classic NES Series. It was modified into minigames in WarioWare, Inc.: Mega Microgames!, Brain Age 2: More Training in Minutes a Day!, and Brain Age: Concentration Training. Dr. Luigi is a spin-off for Wii U, released on December 31, 2013, as part of the Year of Luigi celebration.',
        "It is a falling block puzzle game, in which the player

In [4]:

# Sentence-embedding model; the graph-expansion cells below call
# `model.encode` and `model.similarity` on it.
model = SentenceTransformer("all-MiniLM-L6-v2")

#triples_flat_embeddings = model.encode(triples_flatten, show_progress_bar=True)

In [5]:
import duckdb

def most_similar_triple(term):
    """Return the stored triple whose flattened text is closest to ``term``.

    Similarity is ``1 - levenshtein / max(len)`` between the lower-cased
    ``term`` and the ``flatten`` column of the ``triples`` table.

    Args:
        term: Flattened triple text to look up.

    Returns:
        The best-matching row as a ``(head, relation, tail)`` tuple.

    Raises:
        IndexError: If the ``triples`` table contains no rows.
    """
    term = term.lower()

    # The value is bound as a query parameter rather than spliced into the SQL
    # text, so triple text containing "$$" or quotes cannot break the query.
    # read_only: this function only SELECTs, and DuckDB only lets several
    # processes open the same file concurrently when they open it read-only.
    with duckdb.connect("triples.db", read_only=True) as con:
        row = con.execute(
            """
            SELECT
                head,
                relation,
                tail,
                1 - levenshtein(CAST(? AS VARCHAR), flatten)
                    / GREATEST(LENGTH(CAST(? AS VARCHAR)), LENGTH(flatten)) AS similarity
            FROM triples
            ORDER BY similarity DESC
            LIMIT 1
            """,
            [term, term],
        ).fetchone()

    if row is None:
        raise IndexError("the triples table is empty; no triple to match against")

    # Drop the trailing similarity column.
    return row[:-1]


# Columns of the `triples` table that get_neighbours is allowed to match on.
# `side` is spliced into the SQL text as an identifier (identifiers cannot be
# bound as parameters), so it is validated against this whitelist first.
_NEIGHBOUR_COLUMNS = ("head", "relation", "tail")


def get_neighbours(triple, side, min_similarity=None):
    """Fetch the stored triples whose ``side`` column matches a value.

    This replaces two same-named definitions, the first of which was silently
    shadowed by the second. Exact matching (the definition that was actually
    in effect) is the default; the fuzzy variant is available through
    ``min_similarity``.

    Args:
        triple: Value to look for in the ``side`` column.
        side: Column to match on; one of ``"head"``, ``"relation"``, ``"tail"``.
        min_similarity: ``None`` for exact equality. Otherwise a threshold on
            ``1 - levenshtein / max(len)``; only rows strictly above it are
            returned, most similar first (the shadowed definition used 0.95).

    Returns:
        A list of row tuples: ``(head, relation, tail)`` for exact matching,
        ``(head, relation, tail, similarity)`` for fuzzy matching.

    Raises:
        ValueError: If ``side`` is not an allowed column name.
    """
    if side not in _NEIGHBOUR_COLUMNS:
        raise ValueError(
            f"side must be one of {_NEIGHBOUR_COLUMNS}, got {side!r}"
        )

    if min_similarity is None:
        query = f"""
        SELECT
            head,
            relation,
            tail
        FROM triples
        WHERE {side} = ?
        """
        params = [triple]
    else:
        query = f"""
        SELECT
            head,
            relation,
            tail,
            1 - levenshtein(CAST($1 AS VARCHAR), {side})
                / GREATEST(LENGTH(CAST($1 AS VARCHAR)), LENGTH({side})) AS similarity
        FROM triples
        WHERE similarity > $2
        ORDER BY similarity DESC
        """
        params = [triple, min_similarity]

    # Values are bound as parameters; the connection is read-only because this
    # function only SELECTs.
    with duckdb.connect("triples.db", read_only=True) as con:
        return con.execute(query, params).fetchall()

def find_neighbours(t):
    """Collect the triples that share an endpoint with triple ``t``.

    Looks up every stored triple whose head or tail equals the first or the
    last element of ``t``. A triple reachable through several of these lookups
    (or stored more than once) is returned only once, in first-seen order, so
    duplicates do not take up slots in the beam.

    Args:
        t: A triple; ``t[0]`` and ``t[-1]`` are used as the endpoints.

    Returns:
        A list of ``[[triple]]`` items, where ``triple`` is a 3-tuple. Each
        item is a fresh list, so callers may append a score to it.
    """
    rows = [
        tuple(row)[:3]
        for value in (t[0], t[-1])
        for side in ("head", "tail")
        for row in get_neighbours(value, side)
    ]
    # dict.fromkeys removes duplicates while keeping the original order.
    return [[[triple]] for triple in dict.fromkeys(rows)]

In [6]:
import numpy as np

def expand_beam(beam, query_embedding, gamma=4, n_neighbours=100):
    """Score the one-step extensions of a beam.

    Args:
        beam: A ``[path, score]`` pair, where ``path`` is a list of triples.
        query_embedding: Embedding of the query, compared against the
            embedding of each candidate triple via ``model.similarity``.
        gamma: Controls the rank penalty: after sorting, the candidate at
            rank ``i`` has its score multiplied by ``exp(-min(i, gamma) / gamma)``.
        n_neighbours: Maximum number of candidates to return.

    Returns:
        A list of ``[[triple], score]`` items sorted by descending score, where
        ``score`` is the beam score plus the candidate's similarity to the
        query, after the rank penalty. Empty when the last triple of the path
        has no unvisited neighbour.
    """
    path, path_score = beam

    # Each candidate is `[[triple]]`, so the triple itself is `cand[0][0]`.
    # Comparing `cand[0]` (a list) against the tuples in `path` never matches,
    # which would let already-visited triples back into the beam.
    candidates = [
        cand for cand in find_neighbours(path[-1])
        if cand[0][0] not in path
    ]
    if not candidates:
        return []

    candidate_texts = [
        ", ".join(" ".join(triple) for triple in cand[0])
        for cand in candidates
    ]

    candidate_emb = model.encode(candidate_texts)
    similarities = model.similarity(query_embedding, candidate_emb).numpy()[0]

    # Append the accumulated score as the last element of each candidate.
    for cand, sim in zip(candidates, similarities):
        cand.append(path_score + sim)

    # Penalise by rank, then re-sort because the penalty can change the order
    # (e.g. for negative scores).
    candidates.sort(key=lambda cand: cand[-1], reverse=True)
    for rank, cand in enumerate(candidates):
        cand[-1] = cand[-1] * np.exp(-min(rank, gamma) / gamma)

    candidates.sort(key=lambda cand: cand[-1], reverse=True)
    return candidates[:n_neighbours]

In [7]:
def beam_search(beams, query_embedding, beam_size=10, length=2, gamma=2):
    """Grow beams over the triple graph, keeping the best ones at each step.

    Args:
        beams: List of ``[path, score]`` pairs to start from.
        query_embedding: Query embedding forwarded to ``expand_beam``.
        beam_size: Number of beams kept after each expansion round.
        length: Number of additional rounds after the first one, i.e.
            ``length + 1`` expansion rounds are performed in total.
        gamma: Rank-penalty parameter forwarded to ``expand_beam`` on every
            round (previously only the first round received it; deeper rounds
            silently fell back to the default).

    Returns:
        The ``beam_size`` best ``[path, score]`` pairs after the last round,
        sorted by descending score. A beam with no extensions is dropped.
    """
    for _ in range(length + 1):
        candidates = []
        for beam in beams:
            extensions = expand_beam(beam, query_embedding, gamma=gamma)
            for extension in extensions:
                # Prefix the extension with the path that led to it.
                extension[0] = beam[0] + extension[0]
            candidates += extensions

        beams = sorted(candidates, key=lambda x: x[-1], reverse=True)[:beam_size]

    return beams

In [8]:
def retrieve_docs_from_triples(triples):
    """Fetch the Qdrant records of the documents that contain the given triples.

    Args:
        triples: Flattened triple strings, matched against the ``flatten``
            column of the ``triples`` table.

    Returns:
        The records returned by Qdrant for the matching document ids from the
        ``dev_articles`` collection; an empty list when ``triples`` is empty or
        nothing matches.
    """
    triples = list(triples)
    # An empty list would render as `IN ()`, which is a SQL syntax error.
    if not triples:
        return []

    # One bound parameter per triple: the text never becomes part of the SQL,
    # so quotes or "$$" inside a triple cannot break the query.
    placeholders = ", ".join("?" for _ in triples)
    with duckdb.connect("triples.db", read_only=True) as con:
        rows = con.execute(
            f"""
            SELECT DISTINCT uuid
            FROM triples
            WHERE flatten IN ({placeholders})
            """,
            triples,
        ).fetchall()

    uuids = [row[0] for row in rows]
    if not uuids:
        return []

    client = QdrantClient("http://localhost:6333")
    return client.retrieve(
        collection_name="dev_articles",
        ids=uuids
    )

In [9]:
def graph_expansion(question, documents):
    """Select documents supported by a beam search over the triple graph.

    Steps: extract triples from the retrieved documents with the BAML client,
    map each one to its closest stored triple, run a beam search from those
    triples, then fetch the documents that contain the triples on the best
    paths.

    Args:
        question: Question text.
        documents: Retrieved records; each must expose ``.id`` and
            ``.payload["text"]``.

    Returns:
        The records fetched for the expanded triples whose id is also among
        ``documents``. Empty when no triples are extracted or no beam survives.
    """
    # Extract triples from retrieved documents
    # (single quotes inside the f-string keep this valid before Python 3.12).
    documents_str = "\n\n".join(
        f"Document {i}:\n{d.payload['text']}" for i, d in enumerate(documents)
    )
    triples = b.ExtractTriples(question, documents_str)
    triples_flatten = [" ".join(triple) for triple in triples]
    if not triples_flatten:
        # Nothing to anchor the search on.
        return []

    q_emb = model.encode(question)

    # Match triples to triples in the DB
    ts = [most_similar_triple(flat) for flat in triples_flatten]

    # Create beams (score triples)
    t_emb = model.encode([" ".join(t) for t in ts])
    t_scores = model.similarity(q_emb, t_emb).numpy()[0]
    beams = [[[t], s] for t, s in zip(ts, t_scores)]

    # Expand graph
    best_beams = beam_search(beams, q_emb, length=1)

    # Format triples from graph expansion: flatten the paths and drop
    # duplicates while keeping a deterministic order.
    final_triples_flat = list(dict.fromkeys(
        " ".join(triple)
        for beam in best_beams
        for triple in beam[0]
    ))
    if not final_triples_flat:
        return []

    # Retrieve corresponding documents
    docs_uuid = {d.id for d in documents}
    retrieved_docs = retrieve_docs_from_triples(final_triples_flat)
    # NOTE(review): this keeps only documents that were ALREADY in `documents`,
    # so the result is a subset of the input. If the goal is to add new
    # documents, the condition should be `not in` — confirm the intent.
    retrieved_docs = [d for d in retrieved_docs if d.id in docs_uuid]

    return retrieved_docs

In [10]:
# Run graph expansion for the selected question over the retrieved documents.
# NOTE: the recorded run below failed with a DuckDB IOException because
# another process (see the PID in the message) held a lock on triples.db.
graph_expansion(
    question.question,
    docs
)

2025-05-12T12:50:53.345 [BAML [92mINFO[0m] [35mFunction ExtractTriples[0m:
    [33mClient: GeminiFlash (gemini-2.0-flash) - 1016ms. StopReason: STOP. Tokens(in/out): 1514/83[0m
    [34m---PROMPT---[0m
    [2m[43muser: [0m[2mYour task is to find facts that help answer an input question.
    You should present these facts as knowlege triples, which are structured as ("subject", "predicate", "object").
    Example:
    Question: When was Neville A. Stanton’s employer founded?
    Facts: ("Neville A. Stanton", "employer", "University of Southampton"), ("University of Southampton", "founded in",
    "1862")
    Now you are given some documents:
    Document 0:
    # Tetris & Dr. Mario
    
    Tetris & Dr. Mario is a 1994 puzzle video game compilation published by Nintendo for the Super Nintendo Entertainment System. It contains enhanced remakes of Tetris (1989) and Dr. Mario (1990), which were originally released for both the Nintendo Entertainment System and Game Boy in North 

IOException: IO Error: Could not set lock on file "/home/felix/Documents/lexum-test/tmp/triples.db": Conflicting lock is held in /home/felix/.local/share/uv/python/cpython-3.12.9-linux-x86_64-gnu/bin/python3.12 (PID 314583) by user felix. See also https://duckdb.org/docs/connect/concurrency