## 1. LOAD NECESSARY LIBRARIES

In [1]:
from dotenv import load_dotenv
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from qdrant_client import QdrantClient, models

from qdrant_client.models import Distance, VectorParams, PointStruct
from qdrant_client.http.models import VectorParams, Distance, SparseVectorParams, SparseIndexParams
from qdrant_client.http.models import PointStruct, SparseVector

from collections import defaultdict
# from qdrant_client.http import models
from langchain_community.embeddings import GPT4AllEmbeddings
import hashlib
import json
load_dotenv()



True

## 2. CONSTRUCT LOCAL QDRANT DATABASE

Get embedding model

In [4]:
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
EMBEDDING_MODEL_NAME = os.getenv("EMBEDDING_MODEL_NAME")
COLLECTION_NAME = os.getenv("QDRANT_COLLECTION")
embedding_model = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL_NAME, api_key = GOOGLE_API_KEY)
sub_embedding_model = GPT4AllEmbeddings()

Define qdrant database

In [3]:
qdrant = QdrantClient(path="qdarnt_anime_db")
if not qdrant.collection_exists(collection_name=COLLECTION_NAME):
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config={
            "dense": VectorParams(size=384, distance=Distance.COSINE)
        },
        sparse_vectors_config={
            "sparse": SparseVectorParams(
                index=SparseIndexParams(on_disk=True)
            )
        }
    )

## 3. INSERT ANIME DATA INTO DATABASE

Create unique id for each data point

In [4]:
def make_id(text: str) -> int:
    return int(hashlib.md5(text.encode()).hexdigest(), 16) % (10**12)

Load each anime data into Qdrant database

In [6]:
def dedup_sparse(indices, values):
    agg = defaultdict(float)
    for i, v in zip(indices, values):
        agg[i] += v
    new_indices, new_values = zip(*agg.items())
    return list(new_indices), list(new_values)

# ✅ Hàm tạo vector thưa từ text (cực đơn giản: hash token -> chỉ số)
def get_sparse_vector(text: str, vocab_size: int = 1000) -> SparseVector:
    tokens = text.lower().split()
    indices = [abs(hash(t)) % vocab_size for t in tokens]
    values = [1.0] * len(tokens)

    indices, values = dedup_sparse(indices, values)

    return SparseVector(indices=indices, values=values)

# ✅ Tạo ID ổn định dựa trên title
def make_id(text: str):
    return int(hashlib.sha1(text.encode("utf-8")).hexdigest(), 16) % (10**8)

# ✅ Hàm load 1 anime vào Qdrant
def load_anime(embedding_model, qdrant, collection_name, anime):
    title = anime.get("title", "")
    synopsis = anime.get("sypnosis", "")
    genres = anime.get("genres", [])

    if not synopsis:
        return  # bỏ qua nếu không có synopsis

    # Dense vector cho semantic search
    synopsis_embedding = embedding_model.embed_query(synopsis)

    # Sparse vector cho BM25-like search
    synopsis_bm25_vector = get_sparse_vector(synopsis)

    # Upsert vào Qdrant
    qdrant.upsert(
        collection_name=collection_name,
        points=[
            PointStruct(
                id=make_id(title),
                vector={
                    "dense": synopsis_embedding,   # vector dày đặc
                    "sparse": synopsis_bm25_vector # vector thưa
                },
                payload={
                    "title": title,
                    "synopsis": synopsis,
                    "genres": genres
                }
            )
        ]
    )

Load each anime data into Qdrant database

In [19]:
def load_anime(embedding_model, qdrant, collection_name, anime):
    title = anime.get("title", "")
    synopsis = anime.get("sypnosis", "")
    genres = anime.get("genres", [])


    synopsis_embedding = embedding_model.embed_query(synopsis)
    qdrant.upsert(
        collection_name=collection_name,
        points=[
            PointStruct(
                id=make_id(title),
                vector=synopsis_embedding,
                payload={
                    "title": title,
                    "synopsis": synopsis,
                    "genres": genres
                }
            )
        ]
    ) 


Execution

In [10]:
with open("short_file.json", "r", encoding="utf-8") as f:
    anime_data = json.load(f)

for anime in anime_data:
    load_anime(sub_embedding_model, qdrant, COLLECTION_NAME, anime)

## 3. TESTING WITH EXAMPLE

In [2]:
qdrant = QdrantClient(path="qdarnt_anime_db")

In [None]:
query_text = "animes about ninjas"

query_dense = sub_embedding_model.embed_query(query_text)

query_sparse = get_sparse_vector(query_text)  

indices, values = query_sparse.indices, query_sparse.values 


results = qdrant.query_points(
    collection_name=COLLECTION_NAME,
    prefetch=[
        models.Prefetch(
            query=models.SparseVector(indices=indices, values=values),
            using="sparse",
            limit=20,
        ),
        models.Prefetch(
            query=query_dense,   # list / array floats
            using="dense",
            limit=20,
        ),
    ],
    query=models.FusionQuery(fusion=models.Fusion.RRF),
    limit=5
)

for r in results.points:   # thay vì results
    print(r.payload["title"], r.score)


Kamonohashi Ron no Kindan Suiri 0.5
Fairy Tail OVA 0.5
Ai no Utagoe wo Kikasete 0.3333333333333333
Eromanga-sensei OVA 0.3333333333333333
Ballroom e Youkoso 0.25


In [10]:
query_text = "magic adventure with elves"

# Dense embedding từ model semantic
query_dense = sub_embedding_model.embed_query(query_text)

results = qdrant.query_points(
    collection_name="anime",
    query=query_dense,   # chỉ cần đưa dense vector
    using="dense",       # tên vector
    limit=5
)

for r in results.points:
    print(r.payload["title"], r.score)


Fairy Tail OVA 0.45333303130284963
Eromanga-sensei OVA 0.4325153296221808
Magi: The Labyrinth of Magic 0.4325131148627007
Princess Connect! Re:Dive 0.43043671949057566
Black Clover: Mahou Tei no Ken 0.42623522033830785
