In [1]:
import time

import numpy as np
from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)

# Output templates shared by the demo cells.
fmt = "\n=== {:30} ===\n"
search_latency_fmt = "search latency = {:.4f}s"

# Demo-wide constants.
# NOTE(review): neither value is referenced by the cells visible in this
# notebook, and the schema cell hard-codes dim=768 -- confirm they are needed.
num_entities = 3000
dim = 8

#################################################################################
# 1. Connect to Milvus
# Register the connection alias `default` for the Milvus server listening on
# `localhost:19530`. "default" is a built-in alias in PyMilvus, so when the
# server address is exactly `localhost:19530` every argument can be omitted
# and the call reduces to `connections.connect()`.
#
# Note: the `using` parameter of the methods called later defaults to "default".
print(fmt.format("start connecting to Milvus"))
connections.connect(alias="default", host="localhost", port="19530")


=== start connecting to Milvus     ===



In [2]:
# Start from a clean slate: remove any leftover `test` collection from a
# previous run. The existence check keeps this cell safe to run against a
# server that has never seen the collection (Restart & Run All friendly).
if utility.has_collection("test"):
    utility.drop_collection("test")

In [3]:
# Confirm the drop took effect before the collection is recreated.
has = utility.has_collection(collection_name="test")
print(f"Does collection test exist in Milvus: {has}")

Does collection test exist in Milvus: False


In [4]:
# Collection layout: one row per text chunk.
#   pmid              - caller-supplied INT64 primary key (auto_id disabled)
#   title             - title string, up to 500 characters
#   chunk             - chunk text, up to 2048 characters
#   abstract_embedded - 768-dimensional float vector for the chunk
fields = [
    # `max_length` is a VARCHAR-only setting, so it is not passed for the
    # INT64 primary key.
    FieldSchema(name="pmid", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="chunk", dtype=DataType.VARCHAR, max_length=2048),
    FieldSchema(name="abstract_embedded", dtype=DataType.FLOAT_VECTOR, dim=768)
]

schema = CollectionSchema(fields, "test db")

print(fmt.format("Create collection `test`"))
test = Collection("test", schema, consistency_level="Strong")


=== Create collection `test`       ===



In [6]:
import pandas as pd
# Read the chunk table (with embeddings stored as text) and preview the first rows.
df = pd.read_csv(filepath_or_buffer='data/New Master/master_with_embeddings.csv')
df.head(5)

Unnamed: 0,id,title,chunk,chunk_embedded
0,379829911,Efficacy and Safety of Methylphenidate and Ato...,BACKGROUND AND OBJECTIVE: Methylphenidate (MPH...,"[-0.21812883019447327, 0.00931312795728445, 0...."
1,379829912,Efficacy and Safety of Methylphenidate and Ato...,"analyze correlations associated with age, sex,...","[-0.24057511985301971, 0.13817532360553741, 0...."
2,379829913,Efficacy and Safety of Methylphenidate and Ato...,"and AEs associated with age, sex, and differen...","[-0.4178210496902466, -0.026887519285082817, 0..."
3,379829914,Efficacy and Safety of Methylphenidate and Ato...,response rates were 84.6% in the MPH-treated g...,"[-0.5015849471092224, -0.08108298480510712, 0...."
4,379829915,Efficacy and Safety of Methylphenidate and Ato...,"children experienced AEs during MPH treatment,...","[-0.6629533171653748, -0.0032602110877633095, ..."


In [17]:
# Tokenise each chunk on single spaces; the word lists go into a new column.
df['splitted_chunk'] = df['chunk'].str.split(pat=' ')

In [52]:
# Display the per-row word lists produced by the space split.
df['splitted_chunk']

0      [BACKGROUND, AND, OBJECTIVE:, Methylphenidate,...
1      [Study, participants, were, treated, with, eit...
2      [RESULTS:, A, total, of, 1050, children, were,...
3      [05)., The, response, rates, were, 84., 6%, in...
4      [3%, in, the, ATX-treated, group., Subgroup, a...
                             ...                        
501    [The, rapid, development, of, social, reform, ...
502    [This, paper, discusses, the, principle, of, A...
503    [This, paper, takes, foreign, and, non-foreign...
504    [Psychological, health, problems, include, lea...
505    [This, paper, hopes, to, provide, data, refere...
Name: splitted_chunk, Length: 506, dtype: object

In [7]:
import ast
# The CSV stores every embedding as the text of a Python list; parse each one
# back into a real list with the safe literal evaluator.
Embedings = list(map(ast.literal_eval, df.chunk_embedded))

# Column-oriented payload for the insert: one list per schema field, in the
# order id, title, chunk, embedding.
entities = [list(df[column]) for column in ("id", "title", "chunk")]
entities.append(Embedings)

In [8]:
# Write the column lists into the collection, flush, and report the
# resulting entity count.
insert_result = test.insert(data=entities)
test.flush()
print(f"Number of entities in Milvus: {test.num_entities}")

Number of entities in Milvus: 412


In [9]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load pre-trained BERT model and tokenizer
bert_model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name)

def embed_text_with_bert(text, tokenizer=tokenizer, bert_model=bert_model):
    """Embed text with BERT using attention-mask-aware mean pooling.

    Args:
        text: A string, or a list of strings, to embed.
        tokenizer: Tokenizer used to encode `text` (defaults to the
            module-level `tokenizer`).
        bert_model: Encoder whose `last_hidden_state` is pooled (defaults to
            the module-level `bert_model`).

    Returns:
        For a single string, a flat list of floats (one value per hidden
        unit). For a list of several strings, a list of such lists.
    """
    # Tokenize; inputs longer than the model limit are truncated, and batched
    # inputs are padded to a common length.
    tokenized_text = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        model_output = bert_model(**tokenized_text)

    hidden_states = model_output.last_hidden_state
    # Average over real tokens only. A plain `.mean(dim=1)` would also average
    # the vectors of padding tokens whenever a batch contains texts of
    # different lengths. For a single text the mask is all ones, so this is
    # the ordinary token mean.
    token_mask = tokenized_text['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    embedding = (hidden_states * token_mask).sum(dim=1) / token_counts

    return embedding.squeeze().tolist()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# IVF_FLAT index over the vector field, scored by inner product (IP),
# built with nlist=128.
index = dict(
    index_type="IVF_FLAT",
    metric_type="IP",
    params=dict(nlist=128),
)

test.create_index(field_name="abstract_embedded", index_params=index)

Status(code=0, message=)

In [11]:
# Load the collection before searching it.
# To rebuild the index, call test.release() and test.drop_index() first.
test.load()

In [12]:
# Embed the free-text query into the same vector space as the stored chunks.
# NOTE(review): assumes the CSV embeddings came from the same encoder -- confirm.
vector_to_search = embed_text_with_bert('Tell me something about autism disease')

# The search-param dict carries only metric/index settings. A boolean filter
# placed in this dict (the previous `"expr"` key) is silently ignored; filters
# go through the `expr` argument of `search()` instead.
# Add `"params": {"nprobe": N}` here to tune how many IVF clusters are probed.
search_params = {
    "metric_type": "IP",
}

# Optional scalar filter. It must reference a field that exists in the schema
# (the primary key is `pmid`, not `id`), e.g. "pmid == 379751973".
# None means no filtering.
filter_expr = None

result = test.search(
    [vector_to_search],
    "abstract_embedded",
    search_params,
    limit=10,
    expr=filter_expr,
    output_fields=["title"],
)

In [13]:

# `result` holds one hit list per query vector; walk them all in order and
# print each hit together with its stored title.
for hit in (match for hits in result for match in hits):
    print(f"hit: {hit}, title field: {hit.entity.get('title')}")

hit: id: 379795015, distance: 45.9149169921875, entity: {'title': 'Development and feasibility testing of an artificially intelligent chatbot to answer immunization-related queries of caregivers in Pakistan: A mixed-methods study.'}, title field: Development and feasibility testing of an artificially intelligent chatbot to answer immunization-related queries of caregivers in Pakistan: A mixed-methods study.
hit: id: 379809283, distance: 44.752960205078125, entity: {'title': 'Automatic machine learning versus human knowledge-based models, property-based models and the fatigue problem.'}, title field: Automatic machine learning versus human knowledge-based models, property-based models and the fatigue problem.
hit: id: 379782666, distance: 44.15021514892578, entity: {'title': 'Prediction of therapeutic intensity level from automatic multiclass segmentation of traumatic brain injury lesions on CT-scans.'}, title field: Prediction of therapeutic intensity level from automatic multiclass se

In [14]:
# Sanity check: the query embedding length should equal the dim of the
# `abstract_embedded` field in the schema (768).
len(vector_to_search)

768