In [1]:
import json
import tqdm

from typing import List
from tokenizers import Tokenizer
import pandas as pd

from FlagEmbedding import BGEM3FlagModel



## Load model (BGE-M3)

In [2]:

model = BGEM3FlagModel('BAAI/bge-m3',  
                       use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

## Embed data

In [3]:
from datasets import load_dataset

dataset = load_dataset("BeIR/scifact", "corpus", split="corpus")
dataset[0]

{'_id': '4983',
 'title': 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.',
 'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, t

In [4]:
dataset = dataset[:2000]

In [5]:
embeddings = model.encode(dataset["text"], return_dense=True, return_sparse=True, return_colbert_vecs=True)

pre tokenize: 100%|██████████| 8/8 [00:00<00:00, 25.73it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 8/8 [06:01<00:00, 45.23s/it]


In [6]:
dense_embeddings = list(embeddings['dense_vecs'])
sparse_embeddings = list(embeddings['lexical_weights'])
colbert_embeddings = list(embeddings['colbert_vecs'])

## Connect to qdrant

In [7]:
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient, models 

In [8]:
# Load environment variables
load_dotenv('.env')

# Initialize Qdrant client
qdrant_client = QdrantClient(
    url=os.getenv("QDRANT_URL"), 
    api_key=os.getenv("QDRANT_API_KEY"),
    timeout=60
)

print(qdrant_client.get_collections())

collections=[CollectionDescription(name='movies'), CollectionDescription(name='test_collection')]


In [9]:
# Chuyển đổi danh sách các dict thành danh sách các SparseVector
converted_sparse_embeddings = []

for sparse_dict in sparse_embeddings:
    # Chuyển đổi các khóa từ chuỗi thành số nguyên
    indices = [int(k) for k in sparse_dict.keys()]
    # Lấy các giá trị tương ứng
    values = [sparse_dict[k] for k in sparse_dict.keys()]
    
    # Tạo đối tượng SparseVector
    sparse_vector = models.SparseVector(
        indices=indices,
        values=values
    )
    
    converted_sparse_embeddings.append(sparse_vector)

# In kết quả để kiểm tra
for vec in converted_sparse_embeddings:
    print(vec)

indices=[39176, 21094, 159958, 119856, 35011, 26866, 70, 168698, 14135, 78574, 831, 52490, 8231, 70760, 34754, 136, 16750, 23, 123309, 164462, 13315, 44954, 45755, 92105, 9, 165598, 297, 214706, 3332, 191, 7154, 86898, 177, 594, 16625, 16, 944, 3956, 1492, 4970, 114137, 190659, 72350, 173676, 552, 13, 24500, 45964, 4, 74481, 67, 35845, 1866, 991, 29813, 53, 2256, 2182, 17262, 157955, 109197, 479, 32166, 15, 19, 2203, 729, 4393, 145048, 49413, 202120, 93425, 111, 170176, 2481, 39395, 700, 41311, 209, 3542, 22282, 71, 10, 17932, 1733, 99, 13579, 9879, 29459, 1372, 148, 92, 509, 11192, 79875, 11948, 39, 45792, 4432, 227204, 154732, 47, 39225, 400, 6492, 70796, 150143, 4240, 11044, 35066, 15044, 20028, 21373, 119475, 231839, 77546, 20903, 42, 127319, 678, 117396, 89931, 3501, 1914, 91977, 617, 615, 1837, 194692, 89678, 1126, 915, 60978, 92319, 58555, 154186, 148477, 6, 8892, 17596, 29094, 6746, 74, 151152, 1398, 12465, 97109, 757, 5, 110156, 3775, 1176, 37755, 27686, 7, 88591, 11782, 232, 

### Create Collection

In [10]:
# Create a collection
qdrant_client.create_collection(
    collection_name="movies1_batch",
    vectors_config={
        "dense": models.VectorParams(
            size=1024,
            distance=models.Distance.COSINE,
        ),
        "colbert": models.VectorParams(
            size=1024,
            distance=models.Distance.COSINE,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM
            )
        ),
    },
    sparse_vectors_config={
        "sparse": models.SparseVectorParams()
    },
)

True

In [11]:
print(qdrant_client.get_collections())

collections=[CollectionDescription(name='movies'), CollectionDescription(name='movies1_batch'), CollectionDescription(name='test_collection')]


In [12]:
len(dataset)

3

## Batch Upload

In [13]:
len(dataset)

3

In [14]:
len(dataset["_id"])

2000

In [16]:
batch_size = 3
n = len(dataset["_id"])
for start_idx in tqdm.trange(0, n, batch_size):
    end_idx = min(start_idx + batch_size, n)
    
    # Lấy "sub-batch" (phần dữ liệu con) từ dataset
    sub_ids = dataset["_id"][start_idx:end_idx]
    sub_titles = dataset["title"][start_idx:end_idx]
    sub_texts = dataset["text"][start_idx:end_idx]

    # Lấy phần embedding tương ứng
    dense_sub = dense_embeddings[start_idx:end_idx]
    colbert_sub = colbert_embeddings[start_idx:end_idx]
    sparse_sub = converted_sparse_embeddings[start_idx:end_idx]

    qdrant_client.upload_points(
        collection_name="movies1_batch",
        points=[
            models.PointStruct(
                id=int(sub_ids[i]),
                vector={
                    "dense": dense_sub[i],
                    "colbert": colbert_sub[i],
                    "sparse": sparse_sub[i]
                },
                payload={
                    "_id": sub_ids[i],
                    "title": sub_titles[i],
                    "text": sub_texts[i],
                }
            )
            for i in range(len(sub_ids))
        ],
        batch_size=batch_size
    )


  0%|          | 0/667 [00:01<?, ?it/s]


KeyboardInterrupt: 