In [None]:
import duckdb
import pandas as pd
from sentence_transformers import SentenceTransformer

In [None]:
# Load the article table.
df = pd.read_parquet(path='../../data/articles.parquet')
# Load the companion metadata table; it is merged into `df` on `_id` in the next cell.
# NOTE(review): this path has no file extension -- presumably a parquet dataset directory; confirm.
metadata = pd.read_parquet(path='../../data/metadata')

In [31]:
# Attach the `triples` column from `metadata` to the articles, matching rows on `_id`.
# A left join keeps every article; articles without a metadata match get a missing value.
# Any `triples` column left behind by a previous run of this cell is dropped first:
# otherwise a re-run would yield `triples_x` / `triples_y` and the next cell's
# df[['uuid', 'triples']] selection would fail.
df = df.drop(columns=["triples"], errors="ignore").merge(
    metadata[["_id", "triples"]],
    how="left",
    on="_id",
)

In [32]:
# One row per (article uuid, triple): explode the per-article collection of triples,
# then drop rows where either column is missing.
triples = df[["uuid", "triples"]].explode("triples").dropna()
# Keep only entries with exactly three parts (head, relation, tail).
# .copy() makes the filtered result an independent frame, so the column assignments
# in the following cells do not raise SettingWithCopyWarning.
triples = triples[triples["triples"].map(len) == 3].copy()

In [None]:
# Split each 3-part triple into its own lower-cased column:
# position 0 -> head, position 1 -> relation, position 2 -> tail.
for position, column in enumerate(("head", "relation", "tail")):
    triples[column] = (
        triples["triples"]
        .map(lambda triple, i=position: triple[i])
        .str.lower()
    )

In [None]:
# Build the text form of every triple: "<head> <relation> <tail>", lower-cased.
# Iterating the three columns with zip avoids DataFrame.apply(axis=1), which
# constructs a Series object per row and is slow on large frames; it also
# works on an empty frame (yields an empty column).
triples["flatten"] = [
    f"{head} {relation} {tail}".lower()
    for head, relation, tail in zip(
        triples["head"], triples["relation"], triples["tail"]
    )
]

In [None]:
# The raw triple objects are no longer needed now that head / relation / tail /
# flatten have been derived from them.
triples = triples.drop('triples', axis='columns')

In [None]:
# Persist the triples frame as table `triples` in the local DuckDB file `triples.db`.
#
# The DataFrame is registered under a distinct view name instead of being referenced
# as `triples` in SQL: once a `triples` table exists in the database, the bare name
# would resolve to that table rather than to the in-memory DataFrame.
# CREATE OR REPLACE makes the cell safe to re-run (plain CREATE TABLE fails when the
# table already exists), and the `with` block closes the connection even on error.
with duckdb.connect("triples.db") as con:
    con.register("triples_df", triples)
    con.execute("CREATE OR REPLACE TABLE triples AS SELECT * FROM triples_df")

In [14]:
# Dense sentence embeddings for the flattened triple texts.
model = SentenceTransformer("all-MiniLM-L6-v2")

# No explicit `device=` here: encode() then runs on the device the model was loaded
# on, which SentenceTransformer picks automatically (GPU when available, else CPU).
# Hard-coding "cuda" made this cell fail on machines without a GPU.
embeddings = model.encode(
    triples["flatten"].tolist(),
    show_progress_bar=True,
)

Batches:   0%|          | 0/6422 [00:00<?, ?it/s]

In [36]:
from fastembed import SparseTextEmbedding

bm25_model = SparseTextEmbedding(model_name="Qdrant/bm25")

# One sparse embedding per flattened triple, in the same order as the rows of `triples`.
sparse_embeddings = list(bm25_model.embed(triples['flatten'].tolist()))

# Keep only plain Python floats / ints per row; the embedding objects themselves
# are not stored on the frame.
triples['sparse_values'] = [
    [float(value) for value in embedding.values]
    for embedding in sparse_embeddings
]
triples['sparse_indices'] = [
    [int(index) for index in embedding.indices]
    for embedding in sparse_embeddings
]

In [38]:
from qdrant_client import QdrantClient, models

client = QdrantClient("http://localhost:6333")

# Start from a clean slate: remove any existing "dev_triples" collection before
# creating it again.
client.delete_collection(collection_name="dev_triples")

# Sparse-only collection: no dense vectors, a single sparse vector named "text"
# configured with on_disk=False and the IDF modifier.
text_index_params = models.SparseIndexParams(on_disk=False)
text_vector_params = models.SparseVectorParams(
    index=text_index_params,
    modifier=models.Modifier.IDF,
)
client.create_collection(
    collection_name="dev_triples",
    vectors_config={},
    sparse_vectors_config={"text": text_vector_params},
)

True

In [39]:
import uuid

from tqdm import tqdm

batch_size = 1000

# Columns copied verbatim into each point's payload.
payload_fields = ["uuid", "head", "relation", "tail", "flatten"]

# Upload the triples to the "dev_triples" collection in batches of `batch_size`.
#
# `triples` holds one row per triple, so an article's `uuid` repeats on every triple
# extracted from that article. Using it directly as the point id made later triples
# overwrite earlier ones (upsert replaces a point with the same id). Each point
# therefore gets its own deterministic UUID derived from the article uuid plus the
# triple text: re-running the cell rewrites the same points, and an exact duplicate
# triple within one article maps to a single point. The article uuid stays available
# in the payload.
for i in tqdm(range(0, len(triples), batch_size)):
    batch = triples.iloc[i:i + batch_size]
    points = []
    # to_dict("records") yields plain dicts, so columns whose names clash with
    # Series attributes (e.g. "head") are read the same way as every other column.
    for row in batch.to_dict("records"):
        point_id = str(
            uuid.uuid5(uuid.NAMESPACE_URL, f"{row['uuid']}|{row['flatten']}")
        )
        points.append(
            models.PointStruct(
                id=point_id,
                payload={field: row[field] for field in payload_fields},
                vector={
                    "text": models.SparseVector(
                        indices=row["sparse_indices"],
                        values=row["sparse_values"],
                    )
                },
            )
        )

    client.upsert(
        collection_name="dev_triples",
        points=points,
        wait=True,
    )

100%|██████████| 206/206 [00:12<00:00, 16.93it/s]
