## 1. LOAD NECESSARY LIBRARIES

In [3]:
from dotenv import load_dotenv
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from qdrant_client import QdrantClient, models

from qdrant_client.models import Distance, VectorParams, PointStruct
from qdrant_client.http.models import VectorParams, Distance, SparseVectorParams, SparseIndexParams
from qdrant_client.http.models import PointStruct, SparseVector

from collections import defaultdict
# from qdrant_client.http import models
from langchain_community.embeddings import GPT4AllEmbeddings
import hashlib
import json
load_dotenv()



True

## 2. CONSTRUCT LOCAL QDRANT DATABASE

Get embedding model

In [4]:
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
EMBEDDING_MODEL_NAME = os.getenv("EMBEDDING_MODEL_NAME")
COLLECTION_NAME = os.getenv("QDRANT_COLLECTION")
embedding_model = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL_NAME, api_key = GOOGLE_API_KEY)
sub_embedding_model = GPT4AllEmbeddings()

Define qdrant database

In [3]:
qdrant = QdrantClient(path="qdarnt_anime_db")
if not qdrant.collection_exists(collection_name=COLLECTION_NAME):
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config={
            "dense": VectorParams(size=384, distance=Distance.COSINE)
        },
        sparse_vectors_config={
            "sparse": SparseVectorParams(
                index=SparseIndexParams(on_disk=True)
            )
        }
    )

## 3. INSERT ANIME DATA INTO DATABASE

Create unique id for each data point

In [5]:
def make_id(text: str):
    return int(hashlib.sha1(text.encode("utf-8")).hexdigest(), 16) % (10**8)

Load each anime data into Qdrant database

In [None]:
def dedup_sparse(indices, values):
    agg = defaultdict(float)
    for i, v in zip(indices, values):
        agg[i] += v
    new_indices, new_values = zip(*agg.items())
    return list(new_indices), list(new_values)

def get_sparse_vector(text: str, vocab_size: int = 1000) -> SparseVector:
    tokens = text.lower().split()
    indices = [abs(hash(t)) % vocab_size for t in tokens]
    values = [1.0] * len(tokens)

    indices, values = dedup_sparse(indices, values)

    return SparseVector(indices=indices, values=values)


def load_anime(embedding_model, qdrant, collection_name, anime):
    name = anime.get("Name", "")
    synopsis = anime.get("Sypnosis", "")
    genres = anime.get("Genres", [])
    id = anime.get("ID", "")
    score = anime.get("Scores", "")
    type = anime.get("Type", "")
    episode = anime.get("Episodes", "")
    aired = anime.get("Aired", "")

    if not synopsis:
        return  

    synopsis_embedding = embedding_model.embed_query(synopsis)

    synopsis_bm25_vector = get_sparse_vector(synopsis)

    qdrant.upsert(
        collection_name=collection_name,
        points=[
            PointStruct(
                id=make_id(name),
                vector={
                    "dense": synopsis_embedding,   # vector dày đặc
                    "sparse": synopsis_bm25_vector # vector thưa
                },
                payload={
                    "name": name,
                    "synopsis": synopsis,
                    "genres": genres,
                    "id": id,
                    "score": score,
                    "type": type,
                    "episode": episode,
                    "aired": aired
                }
            )
        ]
    )

Load each anime data into Qdrant database

Execution

In [8]:
with open("output.json", "r", encoding="utf-8") as f:
    anime_data = json.load(f)

for anime in anime_data:
    load_anime(sub_embedding_model, qdrant, COLLECTION_NAME, anime)

## 3. TESTING WITH EXAMPLE

In [7]:
import os

print(os.getcwd())

d:\Python\Anime Recommender system\Backend\database


In [13]:
qdrant.close()

In [8]:
qdrant = QdrantClient(path="qdrant_anime_db")
collections = qdrant.get_collections()
print(collections)

collections=[CollectionDescription(name='anime')]


In [10]:
query_text = "football"

query_dense = sub_embedding_model.embed_query(query_text)

query_sparse = get_sparse_vector(query_text)  

indices, values = query_sparse.indices, query_sparse.values 


results = qdrant.query_points(
    collection_name=COLLECTION_NAME,
    prefetch=[
        models.Prefetch(
            query=models.SparseVector(indices=indices, values=values),
            using="sparse",
            limit=50,
        ),
        models.Prefetch(
            query=query_dense,   # list / array floats
            using="dense",
            limit=20,
        ),
    ],
    query=models.FusionQuery(fusion=models.Fusion.RRF),
    limit=15
)

for r in results.points:   # thay vì results
    print(r.payload["name"], r.score)


Shoujo Kakumei Utena 0.5
All Out!! 0.5
Initial D Fifth Stage 0.3333333333333333
High Score Girl II 0.3333333333333333
Tensei shitara Slime Datta Ken 2nd Season 0.25
Boku no Hero Academia 5th Season 0.25
Vampire Knight 0.2
Blue Lock 0.2
Mahouka Koukou no Rettousei 0.16666666666666666
Ao Ashi 0.16666666666666666
Initial D Fourth Stage 0.14285714285714285
Death Billiards 0.14285714285714285
Love Live! School Idol Project 2nd Season 0.125
Quanzhi Gaoshou Specials 0.125
Hitoribocchi no Marumaru Seikatsu 0.1111111111111111


In [10]:
query_text = "magic adventure with elves"

query_dense = sub_embedding_model.embed_query(query_text)

results = qdrant.query_points(
    collection_name="anime",
    query=query_dense,   # chỉ cần đưa dense vector
    using="dense",       # tên vector
    limit=5
)

for r in results.points:
    print(r.payload["name"], r.score)


Fairy Tail OVA 0.4533330314884486
Eromanga-sensei OVA 0.4325153662413365
Magi: The Labyrinth of Magic 0.43251311532249836
Princess Connect! Re:Dive 0.43043671920795784
Black Clover: Mahou Tei no Ken 0.4262352205974336


In [11]:
qdrant.close()