In [None]:
from uuid import uuid4
import pandas as pd
import duckdb

from qdrant_client import QdrantClient, models
from fastembed import SparseTextEmbedding

In [None]:
import sys
sys.path.append('../../src')

from utils import read_raw_dataset
from embeddings import generate_embeddings

In [None]:
# Location of the raw Wikipedia JSON dump, relative to the notebook's working directory.
# NOTE(review): this path climbs three levels ("../../../data") while the other
# data paths in this notebook climb two ("../../data") — confirm this is intended.
kaggle_dataset_path = "../../../data/Wikipedia.json"
# Load through the project helper `read_raw_dataset`. Later cells use `df` as a
# pandas DataFrame exposing at least the `_id` and `article` columns.
df = read_raw_dataset(kaggle_dataset_path)

Ajout des entités extraites

In [None]:
# Extracted entities: every row exposes an `entities` collection whose items
# carry a 'name' and a 'type' key.
metadata = pd.read_parquet('../../data/metadata')

# Entity categories; each one becomes its own list-valued column on `metadata`.
types = ['Game', 'Console', 'Publisher']
for t in types:
    # Keep only the entities of category `t`, reduced to their lower-cased name.
    metadata[t] = metadata['entities'].apply(
        lambda entities: [
            entity['name'].lower() for entity in entities if entity['type'] == t
        ]
    )

In [23]:
# Attach the per-type entity lists to the articles (left join on `_id`, so
# every article row is kept even when no metadata row matches).
#
# Entity columns left over from a previous run of this cell are dropped first:
# without that, re-running the cell makes the merge emit suffixed columns
# (`Game_x` / `Game_y`, ...) and `df[t]` below raises KeyError.
df = df.drop(columns=types, errors='ignore').merge(
    metadata[['_id'] + types],
    on='_id',
    how='left'
)

# Rows without a metadata match come out of the left join as NaN rather than a
# list; normalise anything that is not a list to an empty list.
for t in types:
    df[t] = df[t].apply(lambda x: x if isinstance(x, list) else [])

Génération des vecteurs pour le BM25 (sans le terme IDF, qui est géré par Qdrant)

In [None]:
# Sparse text encoder loaded from fastembed under the "Qdrant/bm25" model name.
bm25_model = SparseTextEmbedding(model_name="Qdrant/bm25")

# Materialise the embed() output into a list and park it in a temporary column.
df['sparse'] = list(bm25_model.embed(df.article.tolist()))

# Unpack each sparse vector into two plain-Python lists (floats for the
# weights, ints for the term indices).
df['sparse_values'] = df['sparse'].apply(
    lambda vec: list(map(float, vec.values))
)
df['sparse_indices'] = df['sparse'].apply(
    lambda vec: list(map(int, vec.indices))
)

# The temporary column has been fully unpacked; remove it again.
df = df.drop(columns=['sparse'])

Génération des vecteurs pour la recherche sémantique

In [None]:
# Dense vectors for semantic search, computed from the article texts by the
# project helper `generate_embeddings` (imported from ../../src, not shown here).
# NOTE(review): the Qdrant collection created further down declares size=768
# for this vector, so the helper presumably returns 768-dimensional vectors — confirm.
df['embedding'] = generate_embeddings(df.article.tolist())

Génération d'un UUID pour Qdrant

In [None]:
# One random (version 4) UUID string per article, used further down as the
# Qdrant point id.
# `Series.apply` calls the function with each element, so the lambda has to
# accept one argument (a zero-argument lambda raises TypeError); the `_id`
# value itself is not used to build the UUID.
# NOTE: the ids are random, so re-running this cell assigns new ids to every row.
df['uuid'] = df['_id'].apply(
    lambda _id: str(uuid4())
)

Persistance des articles

In [None]:
# Persist the enriched articles (entity lists, sparse BM25 values/indices,
# dense embedding and uuid columns added above) to parquet; the DataFrame
# index is not written.
df.to_parquet('../../data/articles.parquet', index=False)

Ingestion des données dans Qdrant

In [None]:
# Client for the Qdrant instance reachable on localhost:6333.
client = QdrantClient("http://localhost:6333")

# client.delete_collection("dev_articles")

# Named dense vector "embedding": 768 dimensions, compared with cosine distance.
dense_vectors_config = {
    "embedding": models.VectorParams(
        distance=models.Distance.COSINE,
        size=768,
    ),
}

# Named sparse vector "text": index not stored on disk, with the IDF modifier
# enabled on the Qdrant side.
sparse_vectors_config = {
    "text": models.SparseVectorParams(
        modifier=models.Modifier.IDF,
        index=models.SparseIndexParams(on_disk=False),
    ),
}

# Kept as the last bare expression so the cell still displays the call's result.
client.create_collection(
    collection_name="dev_articles",
    vectors_config=dense_vectors_config,
    sparse_vectors_config=sparse_vectors_config,
)

True

In [26]:
# Number of points sent to Qdrant per upsert request.
batch_size = 100

for start in range(0, len(df), batch_size):
    chunk = df.iloc[start:start + batch_size]

    # One point per article: uuid as id, article text and entity lists as
    # payload, plus the dense ("embedding") and sparse ("text") named vectors.
    points = [
        models.PointStruct(
            id=row.uuid,
            payload={
                "text": row.article,
                "game": row.Game,
                "console": row.Console,
                "publisher": row.Publisher,
            },
            vector={
                "embedding": list(row.embedding),
                "text": models.SparseVector(
                    indices=row.sparse_indices,
                    values=row.sparse_values,
                ),
            },
        )
        for _, row in chunk.iterrows()
    ]

    # wait=True: the call is made with the blocking flag set for each batch.
    client.upsert(
        collection_name="dev_articles",
        points=points,
        wait=True,
    )

Persistance des entités (Games, Consoles, Publishers) avec DuckDB

In [None]:
# Connect to the file-backed DuckDB database 'entities.db'; the relative path
# is resolved against the notebook's working directory.
conn = duckdb.connect('entities.db')

In [None]:
# One single-column table per entity category; IF NOT EXISTS keeps the cell
# safe to re-run against an existing database file.
ddl_statements = (
    "CREATE TABLE IF NOT EXISTS games (name VARCHAR)",
    "CREATE TABLE IF NOT EXISTS consoles (name VARCHAR)",
    "CREATE TABLE IF NOT EXISTS publishers (name VARCHAR)",
)
for ddl in ddl_statements:
    conn.sql(ddl)

In [None]:
def insert_entities(df, entity_type):
    """Insert the distinct entity names of one category into its DuckDB table.

    Args:
        df: DataFrame holding a list-valued column named ``entity_type``.
            It is not passed to DuckDB explicitly: the SQL text refers to it
            by the name ``df``, so the parameter must keep that name.
        entity_type: Column name in ``df`` (e.g. ``'Game'``). The target table
            name is derived from it as ``entity_type.lower() + 's'``.

    Uses the module-level ``conn`` connection. The statement is a plain
    INSERT: names already present in the target table are not filtered out.
    """
    statement = f"""
        INSERT INTO {entity_type.lower()}s
        SELECT DISTINCT UNNEST({entity_type})
        FROM df
        WHERE {entity_type} IS NOT NULL
    """
    conn.sql(statement)

# Fill the games / consoles / publishers tables from the matching `df` columns.
# NOTE(review): insert_entities issues a plain INSERT, so running this cell
# again against the same entities.db appends the same names a second time.
for t in types:
    insert_entities(df, t)

In [35]:
# Close the DuckDB connection opened on 'entities.db' above.
conn.close()