In [1]:
import time

import numpy as np
from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)

# Output templates: a fixed-width section banner and a search-latency line.
fmt = "\n=== {:30} ===\n"
search_latency_fmt = "search latency = {:.4f}s"

# Sizing constants. NOTE(review): the cells visible in this notebook do not
# reference them, and `dim` is 8 here while the collection schema below uses
# dim=768 -- confirm whether they are still needed.
num_entities = 3000
dim = 8

#################################################################################
# 1. Connect to Milvus
# Register the connection alias `default` for the Milvus server at
# `localhost:19530`. "default" is the built-in alias in PyMilvus, so when the
# server runs at `localhost:19530` every argument can be omitted and the call
# becomes simply `connections.connect()`.
#
# Note: the `using` parameter of the methods used below defaults to "default".
print(fmt.format("start connecting to Milvus"))
connections.connect(alias="default", host="localhost", port="19530")


=== start connecting to Milvus     ===



In [2]:
# Ask the server whether a collection named "test" already exists and report it.
has = utility.has_collection(collection_name="test")
print(f"Does collection test exist in Milvus: {has}")

Does collection test exist in Milvus: False


In [36]:
# Schema of the `test` collection:
#   pmid              - INT64 primary key supplied by the caller (auto_id=False)
#   title             - VARCHAR with max_length=500
#   abstract_embedded - float vector with dim=768
# `max_length` is a VARCHAR option, so it is not passed for the INT64 key.
fields = [
    FieldSchema(name="pmid", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="abstract_embedded", dtype=DataType.FLOAT_VECTOR, dim=768),
]

# The second positional argument is the schema description.
schema = CollectionSchema(fields, "test db")

print(fmt.format("Create collection `test`"))
# Bind the name `test` to the collection, created with strong consistency.
test = Collection("test", schema, consistency_level="Strong")


=== Create collection `test`       ===



In [9]:
import pandas as pd
# Read the table of PMIDs, titles, abstracts and stringified embedding vectors,
# then preview its first rows.
embeddings_csv = 'data/Master_embedded/master_with_embeddings.csv'
df = pd.read_csv(embeddings_csv)
df.head()

Unnamed: 0,PMID,Title,Abstract,Abstract_Embedded
0,37982991,Efficacy and Safety of Methylphenidate and Ato...,BACKGROUND AND OBJECTIVE: Methylphenidate (MPH...,"[-0.5406845808029175, -0.013531085103750229, 0..."
1,37982957,Trait emotional intelligence and resilience: g...,BACKGROUND: Previous studies have reported str...,"[-0.4673023521900177, -0.03475865721702576, 0...."
2,37982736,Exploring the feasibility of an artificial int...,Objective: Skin examination to detect cutaneou...,"[-0.23864585161209106, -0.05839263275265694, 0..."
3,37982681,Deep learning-based NT-proBNP prediction from ...,OBJECTIVES: The biomarker N-terminal pro B-typ...,"[-0.37724465131759644, -0.21561074256896973, 0..."
4,37982677,Artificial Intelligence Versus Expert Plastic ...,"Introduction: Large language models, such as C...","[-0.26065611839294434, -0.06109241023659706, 0..."


In [29]:
import ast
# Abstract_Embedded stores each vector as text; parse every cell back into a
# Python list.
Embedings = list(map(ast.literal_eval, df["Abstract_Embedded"]))

# Column-oriented payload, one list per schema field in declaration order:
# pmid, title, abstract_embedded.
entities = [list(df[column]) for column in ("PMID", "Title")] + [Embedings]

In [43]:
# Send the column lists to the collection.
# NOTE(review): re-running this cell issues the same insert again.
insert_result = test.insert(data=entities)

# Flush before reading the entity count reported below.
test.flush()
print(f"Number of entities in Milvus: {test.num_entities}")

Number of entities in Milvus: 100


In [40]:
import torch
from transformers import AutoTokenizer, AutoModel

# Fetch the pre-trained BERT checkpoint and its matching tokenizer by name.
bert_model_name = "bert-base-uncased"
bert_model = AutoModel.from_pretrained(bert_model_name)
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

def embed_text_with_bert(text, tokenizer=tokenizer, bert_model=bert_model):
    """Embed text by mean-pooling the model's last hidden state.

    Padding positions are excluded from the average by weighting every token
    with the tokenizer's attention mask. For a single string (no padding) this
    is the plain mean over tokens; for a batch of strings of different lengths
    it keeps the padding tokens from skewing the shorter texts' embeddings.

    Args:
        text: A string, or a list of strings, to embed.
        tokenizer: Callable tokenizer returning PyTorch tensors; defaults to
            the module-level ``tokenizer``.
        bert_model: Model whose output exposes ``last_hidden_state``; defaults
            to the module-level ``bert_model``.

    Returns:
        The pooled embedding converted with ``.squeeze().tolist()``: a flat
        list of floats for a single text, a list of such lists for a batch of
        more than one text.
    """
    # Tokenize (truncating over-long input, padding batches to equal length).
    tokenized_text = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = bert_model(**tokenized_text)
        hidden_states = model_output.last_hidden_state

        attention_mask = tokenized_text.get('attention_mask')
        if attention_mask is None:
            # No mask available: fall back to the plain mean over tokens.
            embedding = hidden_states.mean(dim=1)
        else:
            # Zero out padded positions, then divide by the real token count.
            weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            token_counts = weights.sum(dim=1).clamp(min=1)
            embedding = (hidden_states * weights).sum(dim=1) / token_counts

    return embedding.squeeze().tolist()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [105]:
# Index definition for the embedding field: IVF_FLAT, inner-product metric,
# nlist=128.
index = {"index_type": "IVF_FLAT", "metric_type": "IP", "params": {"nlist": 128}}

test.create_index(field_name="abstract_embedded", index_params=index)

Status(code=0, message=)

In [106]:
# Disabled calls kept for reference: release the collection and drop its
# index, presumably for when the index has to be rebuilt -- TODO confirm.
#test.release()
#test.drop_index()
# Load the collection before the search cell below runs.
test.load()

In [121]:
# Embed the free-text query with embed_text_with_bert.
vector_to_search = embed_text_with_bert('Tell me something about autism disease')

# Only the metric is specified; no "params" entry (e.g. {"nprobe": 10}) is passed.
search_params = {"metric_type": "IP"}

# Single-query search over the embedding field, returning the ten best hits
# together with their `title` field.
result = test.search(
    data=[vector_to_search],
    anns_field="abstract_embedded",
    param=search_params,
    limit=10,
    output_fields=["title"],
)

In [122]:
# Walk the hits of every query in order and print each hit with its title.
for hit in (single_hit for hits in result for single_hit in hits):
    print(f"hit: {hit}, title field: {hit.entity.get('title')}")

hit: id: 37980928, distance: 35.89857864379883, entity: {'title': 'Automatic machine learning versus human knowledge-based models, property-based models and the fatigue problem.'}, title field: Automatic machine learning versus human knowledge-based models, property-based models and the fatigue problem.
hit: id: 37973680, distance: 35.85193634033203, entity: {'title': "Students' Foreign Language Learning Adaptability and Mental Health Supported by Artificial Intelligence."}, title field: Students' Foreign Language Learning Adaptability and Mental Health Supported by Artificial Intelligence.
hit: id: 37974108, distance: 34.998130798339844, entity: {'title': 'The influence of emotional intelligence on academic stress among medical students in Neyshabur, Iran.'}, title field: The influence of emotional intelligence on academic stress among medical students in Neyshabur, Iran.
hit: id: 37978257, distance: 34.92961120605469, entity: {'title': 'Sensory sensitivity and intelligence are correl

In [79]:
# Sanity check: length of the query embedding, to compare against the dim=768
# declared for the `abstract_embedded` field.
len(vector_to_search)

768