In [1]:
import time
import pandas as pd
import numpy as np
from IPython.display import display, HTML
from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)

fmt = "\n=== {:30} ===\n"
search_latency_fmt = "search latency = {:.4f}s"
num_entities, dim = 3000, 8

#################################################################################
# 1. connect to Milvus
# Add a new connection alias `default` for Milvus server in `localhost:19530`
# Actually the "default" alias is a buildin in PyMilvus.
# If the address of Milvus is the same as `localhost:19530`, you can omit all
# parameters and call the method as: `connections.connect()`.
#
# Note: the `using` parameter of the following methods is default to "default".
print(fmt.format("start connecting to Milvus"))
connections.connect("default", host="localhost", port="19530")

test = Collection("test",consistency_level="Strong")
test.load()


=== start connecting to Milvus     ===



In [2]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load pre-trained BERT model and tokenizer
bert_model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name)

def embed_text_with_bert(text, tokenizer=tokenizer, bert_model=bert_model):
    # Tokenize and embed text using BERT
    tokenized_text = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        model_output = bert_model(**tokenized_text)
        embedding = model_output.last_hidden_state.mean(dim=1)

    return embedding.squeeze().tolist()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def retrieve_relevant_chunks(query, limit=10, collection=test, tokenizer=tokenizer, bert_model=bert_model):
    return_df = pd.DataFrame(columns=['id', 'metric', 'title'])#, 'chunk'])
    vector_to_search = embed_text_with_bert(query)
    search_params = {
        "metric_type": "IP"
    }
    result = test.search([vector_to_search], "abstract_embedded", search_params, limit=limit, output_fields=["title"])#, 'chunk'])
    for hits in result:
        for hit in hits:
            return_df = return_df.append({'id':hit.id, 'metric':hit.distance, 'title':hit.entity.get('title')}, ignore_index=True) # 'chunk':hit.entity.get('chunk')},
    return return_df


In [4]:
result = retrieve_relevant_chunks('How does emotional intelligence affect student lives?', limit=5)
display(HTML(result.to_html()))

  return_df = return_df.append({'id':hit.id, 'metric':hit.distance, 'title':hit.entity.get('title')}, ignore_index=True) # 'chunk':hit.entity.get('chunk')},
  return_df = return_df.append({'id':hit.id, 'metric':hit.distance, 'title':hit.entity.get('title')}, ignore_index=True) # 'chunk':hit.entity.get('chunk')},
  return_df = return_df.append({'id':hit.id, 'metric':hit.distance, 'title':hit.entity.get('title')}, ignore_index=True) # 'chunk':hit.entity.get('chunk')},
  return_df = return_df.append({'id':hit.id, 'metric':hit.distance, 'title':hit.entity.get('title')}, ignore_index=True) # 'chunk':hit.entity.get('chunk')},
  return_df = return_df.append({'id':hit.id, 'metric':hit.distance, 'title':hit.entity.get('title')}, ignore_index=True) # 'chunk':hit.entity.get('chunk')},


Unnamed: 0,id,metric,title
0,379816725,66.044441,"Nursing students' emotional empathy, emotional intelligence and higher education-related stress: a cross-sectional study."
1,379829575,62.934387,Trait emotional intelligence and resilience: gender differences among university students.
2,379829571,62.330231,Trait emotional intelligence and resilience: gender differences among university students.
3,379741081,62.036442,"The influence of emotional intelligence on academic stress among medical students in Neyshabur, Iran."
4,379794648,61.873623,An analysis of physiological responses as indicators of driver takeover readiness in conditionally automated driving.


In [5]:
result = retrieve_relevant_chunks('What is used in brain cancer imaging?', limit=5)
display(HTML(result.to_html()))

  return_df = return_df.append({'id':hit.id, 'metric':hit.distance, 'title':hit.entity.get('title')}, ignore_index=True) # 'chunk':hit.entity.get('chunk')},
  return_df = return_df.append({'id':hit.id, 'metric':hit.distance, 'title':hit.entity.get('title')}, ignore_index=True) # 'chunk':hit.entity.get('chunk')},
  return_df = return_df.append({'id':hit.id, 'metric':hit.distance, 'title':hit.entity.get('title')}, ignore_index=True) # 'chunk':hit.entity.get('chunk')},
  return_df = return_df.append({'id':hit.id, 'metric':hit.distance, 'title':hit.entity.get('title')}, ignore_index=True) # 'chunk':hit.entity.get('chunk')},
  return_df = return_df.append({'id':hit.id, 'metric':hit.distance, 'title':hit.entity.get('title')}, ignore_index=True) # 'chunk':hit.entity.get('chunk')},


Unnamed: 0,id,metric,title
0,379796535,66.093437,Uncertainty-based Active Learning by Bayesian U-Net for Multi-label Cone-Beam CT Segmentation.
1,379759206,64.716415,Impact of real-life use of artificial intelligence as support for human reading in a population-based breast cancer screening program with mammography and tomosynthesis.
2,379738554,64.699219,Association of AI-determined Kellgren-Lawrence grade with medial meniscus extrusion and cartilage thickness by AI-based 3D MRI analysis in early knee osteoarthritis.
3,379760866,64.134018,Artificial Intelligence-Based Methods for Integrating Local and Global Features for Brain Cancer Imaging: Scoping Review.
4,379785666,64.035873,Extracellular vesicle-based liquid biopsy biomarkers and their application in precision immuno-oncology.
