In [1]:
import json

from typing import List
from tokenizers import Tokenizer

from FlagEmbedding import BGEM3FlagModel


### Load model (BGE-M3)

In [2]:

# use_fp16=True speeds up computation with a slight performance degradation.
model = BGEM3FlagModel(
    'BAAI/bge-m3',
    use_fp16=True,
)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

### Embed data

In [3]:
# Movie descriptions dataset: 20 short plot summaries.
# These are the texts that get embedded below; each one is later stored as the
# "description" payload of its Qdrant point.
descriptions = ["In 1431, Jeanne d'Arc is placed on trial on charges of heresy. The ecclesiastical jurists attempt to force Jeanne to recant her claims of holy visions.",
 "A film projectionist longs to be a detective, and puts his meagre skills to work when he is framed by a rival for stealing his girlfriend's father's pocketwatch.",
 "A group of high-end professional thieves start to feel the heat from the LAPD when they unknowingly leave a clue at their latest heist.",
 "A petty thief with an utter resemblance to a samurai warlord is hired as the lord's double. When the warlord later dies the thief is forced to take up arms in his place.",
 "A young boy named Kubo must locate a magical suit of armour worn by his late father in order to defeat a vengeful spirit from the past.",
 "A biopic detailing the 2 decades that Punjabi Sikh revolutionary Udham Singh spent planning the assassination of the man responsible for the Jallianwala Bagh massacre.",
 "When a machine that allows therapists to enter their patients' dreams is stolen, all hell breaks loose. Only a young female therapist, Paprika, can stop it.",
 "An ordinary word processor has the worst night of his life after he agrees to visit a girl in Soho whom he met that evening at a coffee shop.",
 "A story that revolves around drug abuse in the affluent north Indian State of Punjab and how the youth there have succumbed to it en-masse resulting in a socio-economic decline.",
 "A world-weary political journalist picks up the story of a woman's search for her son, who was taken away from her decades ago after she became pregnant and was forced to live in a convent.",
 "Concurrent theatrical ending of the TV series Neon Genesis Evangelion (1995).",
 "During World War II, a rebellious U.S. Army Major is assigned a dozen convicted murderers to train and lead them into a mass assassination mission of German officers.",
 "The toys are mistakenly delivered to a day-care center instead of the attic right before Andy leaves for college, and it's up to Woody to convince the other toys that they weren't abandoned and to return home.",
 "A soldier fighting aliens gets to relive the same day over and over again, the day restarting every time he dies.",
 "After two male musicians witness a mob hit, they flee the state in an all-female band disguised as women, but further complications set in.",
 "Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household.",
 "A renegade reporter trailing a young runaway heiress for a big story joins her on a bus heading from Florida to New York, and they end up stuck with each other when the bus leaves them behind at one of the stops.",
 "Story of 40-man Turkish task force who must defend a relay station.",
 "Spinal Tap, one of England's loudest bands, is chronicled by film director Marty DiBergi on what proves to be a fateful tour.",
 "Oskar, an overlooked and bullied boy, finds love and revenge through Eli, a beautiful but peculiar girl."]

In [4]:
# Ask for all three output types in a single encode call:
# dense vectors, sparse lexical weights, and ColBERT multi-vectors.
embeddings = model.encode(
    descriptions,
    return_dense=True,
    return_sparse=True,
    return_colbert_vecs=True,
)

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [5]:
# Materialise each embedding collection as a plain list so that individual
# items can be picked by position later on.
dense_embeddings, sparse_embeddings, colbert_embeddings = (
    list(embeddings[output_key])
    for output_key in ('dense_vecs', 'lexical_weights', 'colbert_vecs')
)

### Check dimensions

In [6]:
# Compare the shapes of the first two ColBERT embeddings
# (the first dimension can differ between texts, as the output shows).
colbert_embeddings[0].shape, colbert_embeddings[1].shape

((42, 1024), (44, 1024))

In [7]:
# Shape of the first dense embedding (a single flat vector per text).
dense_embeddings[0].shape

(1024,)

In [12]:
# Inspect the sparse (lexical-weight) embedding of the first description:
# a mapping from string keys to float weights.
# NOTE(review): the keys look like tokenizer vocabulary ids — confirm.
sparse_embeddings[0]

defaultdict(int,
            {'360': 0.0576,
             '616': 0.1642,
             '5016': 0.293,
             '16162': 0.2402,
             '86': 0.2198,
             '104': 0.189,
             '25': 0.06125,
             '7614': 0.2036,
             '238': 0.1659,
             '83': 0.006912,
             '158012': 0.1361,
             '98': 0.1133,
             '110324': 0.2754,
             '124666': 0.1661,
             '111': 0.0667,
             '3688': 0.2006,
             '4861': 0.164,
             '230907': 0.11053,
             '141': 0.01884,
             '112811': 0.1666,
             '7': 0.007507,
             '81887': 0.1313,
             '37772': 0.1365,
             '456': 0.07477,
             '36520': 0.1991,
             '140526': 0.1456,
             '739': 0.0752,
             '538': 0.0563,
             '37831': 0.2269})

### Connect to Qdrant

In [8]:
import os

from dotenv import load_dotenv
# `models` provides the Qdrant request/config types used in the cells below.
from qdrant_client import QdrantClient, models

True

In [9]:
# Load environment variables (QDRANT_URL, QDRANT_API_KEY) from the local .env file
load_dotenv('.env')

# Initialize Qdrant client; the URL and API key are read from the environment
# so that no credentials are hard-coded in the notebook.
qdrant_client = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY"),
    timeout=60
)

# List the existing collections as a quick connectivity check
print(qdrant_client.get_collections())

collections=[CollectionDescription(name='test_collection')]


### Create Collection

In [None]:
# Create the "movies" collection with one named vector space per embedding type.
# The existence check makes this cell safe to re-run: calling create_collection
# for a collection that already exists would fail.
if not qdrant_client.collection_exists(collection_name="movies"):
    qdrant_client.create_collection(
        collection_name="movies",
        vectors_config={
            # One 1024-dimensional vector per point.
            "dense": models.VectorParams(
                size=1024,
                distance=models.Distance.COSINE,
            ),
            # Multivector: several 1024-dimensional vectors per point,
            # compared with the MAX_SIM comparator.
            "colbert": models.VectorParams(
                size=1024,
                distance=models.Distance.COSINE,
                multivector_config=models.MultiVectorConfig(
                    comparator=models.MultiVectorComparator.MAX_SIM
                )
            ),
        },
        # Sparse vectors are configured separately from the dense ones.
        sparse_vectors_config={
            "sparse": models.SparseVectorParams()
        },
    )

True

In [11]:
print(qdrant_client.get_collections())

collections=[CollectionDescription(name='movies'), CollectionDescription(name='test_collection')]


In [13]:
# Convert the list of lexical-weight dicts into a list of SparseVector objects.
# Keys are strings, so they are cast to integers to serve as indices; the
# values are taken in the same order as the keys.
converted_sparse_embeddings = [
    models.SparseVector(
        indices=[int(key) for key in weights],
        values=list(weights.values()),
    )
    for weights in sparse_embeddings
]

# Print the result to check it
for sparse_vec in converted_sparse_embeddings:
    print(sparse_vec)

indices=[360, 616, 5016, 16162, 86, 104, 25, 7614, 238, 83, 158012, 98, 110324, 124666, 111, 3688, 4861, 230907, 141, 112811, 7, 81887, 37772, 456, 36520, 140526, 739, 538, 37831] values=[0.057586669921875, 0.1641845703125, 0.29296875, 0.240234375, 0.2198486328125, 0.18896484375, 0.061248779296875, 0.20361328125, 0.1658935546875, 0.0069122314453125, 0.1361083984375, 0.11328125, 0.275390625, 0.1661376953125, 0.06671142578125, 0.2005615234375, 0.1639404296875, 0.11053466796875, 0.0188446044921875, 0.1666259765625, 0.00750732421875, 0.13134765625, 0.136474609375, 0.07476806640625, 0.1990966796875, 0.1456298828125, 0.0751953125, 0.056304931640625, 0.2269287109375]
indices=[62, 1346, 13452, 87108, 18, 4989, 7, 47, 186, 10, 149, 100034, 4, 3884, 1919, 10548, 7134, 59376, 4488, 3229, 764, 2674, 297, 43876, 100, 67788, 2069, 169713, 67373, 151002, 35414, 5] values=[0.03570556640625, 0.186767578125, 0.188720703125, 0.180908203125, 0.133544921875, 0.2021484375, 0.0477294921875, 0.08880615234375,

### Upload to Qdrant

In [14]:
# Build one PointStruct per description:
#   vector["dense"]   - a single 1024-dimensional vector
#   vector["colbert"] - a list of vectors (multivector)
#   vector["sparse"]  - a sparse vector (SparseVector)
# The position in `descriptions` doubles as the point id (a UUID or any other
# id would work as well); further metadata can be added to the payload.
points = [
    models.PointStruct(
        id=position,
        vector={
            "dense": dense_embeddings[position],
            "colbert": colbert_embeddings[position],
            "sparse": converted_sparse_embeddings[position],
        },
        payload={"description": text},
    )
    for position, text in enumerate(descriptions)
]

# Send the whole batch to Qdrant
qdrant_client.upsert(collection_name="movies", points=points)

print("Done uploading points to Qdrant!")

Done uploading points to Qdrant!
