IMPORTANT
1. Start the Milvus Docker container first; this notebook connects to it at localhost:19530.

Imports

In [1]:
import json
import os
import re

import numpy as np
import openai
import pandas as pd
from openai.embeddings_utils import get_embedding
from pymilvus import connections, DataType, CollectionSchema, FieldSchema, Collection, Partition, utility

Constants

In [2]:
# The OpenAI key is read from the environment so that it never lives in the notebook.
# SECURITY: a real key used to be hard-coded on this line; treat that key as leaked
# and revoke it in the OpenAI dashboard.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    # Fail here, with a clear message, rather than later inside the first API call.
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

embedding_model = "text-embedding-ada-002"  # model name passed to the embedding API
embedding_encoding = "cl100k_base"
max_tokens = 8000
dimensions = 1536  # used as the `dim` of the "embeddings" vector field in the schema
openai.api_key = OPENAI_API_KEY

Mutable variables

In [3]:
# Per-dataset settings: edit these to point the notebook at different data.
collection_name = "rmrj"            # name of the Milvus collection to (re)build
partition_name = "rmrj_articles"    # partition that receives the inserted rows
json_path = "rmrj/rmrj.json"        # JSON file holding the pre-computed records
description = "description"         # description text attached to the collection schema

Function definitions:

In [4]:
def get_embedding(text, model=embedding_model):
    """Return the embedding of ``text`` produced by the OpenAI embedding endpoint.

    Newlines in ``text`` are replaced by single spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model; defaults to ``embedding_model``.

    Returns:
        The ``'embedding'`` entry of the first item in the response's ``'data'`` list.

    Note:
        This definition shadows the ``get_embedding`` imported from
        ``openai.embeddings_utils`` at the top of the notebook.
    """
    single_line = " ".join(text.split("\n"))
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']

Connection

In [5]:
# Drop a stale 'default' connection (e.g. one left over from an earlier run of this cell)
# so that the connect call below always starts from a clean alias.
if connections.has_connection('default'):
    connections.remove_connection('default')

# Open the 'default' connection to the Milvus server on this machine.
connections.connect(host='localhost', port='19530', alias='default')

Collection schema definition

In [20]:
# Layout of one stored article: a string primary key, bounded VARCHAR metadata columns,
# the embedding vector (width taken from `dimensions`), and the summary text.
collection_schema = CollectionSchema(
    description=description,
    fields=[
        # Primary key; VARCHAR fields must declare a max_length.
        FieldSchema(name="uuid", dtype=DataType.VARCHAR, max_length=36, is_primary=True),
        FieldSchema(name="author", dtype=DataType.VARCHAR, max_length=200),
        FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=500),
        FieldSchema(name="published_date", dtype=DataType.VARCHAR, max_length=50),
        FieldSchema(name="doi", dtype=DataType.VARCHAR, max_length=200),
        FieldSchema(name="link", dtype=DataType.VARCHAR, max_length=500),
        FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dimensions),
        FieldSchema(name="summary", dtype=DataType.VARCHAR, max_length=5000),
    ],
)


Collection reset (drop any existing collection before it is re-created)

In [23]:
# Start from a clean slate: remove a previous copy of the collection if one exists.
# WARNING: this deletes all data stored in that collection.
# The has_collection guard keeps the cell runnable against a fresh Milvus instance.
# NOTE(review): some Milvus versions appear to reject dropping a missing collection - confirm.
if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)

List collections

In [24]:
# Display the names of the collections currently on the server, to confirm that
# `collection_name` is no longer among them after the drop.
utility.list_collections()

['LangChainCollection']

In [25]:
# Bind `collection` to the collection called `collection_name`, using `collection_schema`.
# NOTE(review): presumably this creates the collection on the server when it does not
# exist yet - confirm against the pymilvus Collection documentation.
collection = Collection(name=collection_name, schema=collection_schema)

Partition creation

In [26]:
# Bind `partition` to the partition called `partition_name` inside `collection`.
# The partition listing displayed by the next cell includes it after this call.
partition = Partition(collection, partition_name)

List partitions

In [27]:
# Display the partitions of `collection` to verify that `partition_name` is present.
collection.partitions

[{"name": "_default", "collection_name": "rmrj", "description": ""},
 {"name": "rmrj_articles", "collection_name": "rmrj", "description": ""}]

Index definition

In [28]:
# Index configuration for the "embeddings" vector field.
index_params = dict(
    metric_type="L2",   # rank vectors by Euclidean distance
    index_type="FLAT",  # FLAT index type
    params={},          # FLAT takes no extra build parameters
)

Index creation

In [29]:
# Build the index described by `index_params` on the "embeddings" vector field.
# The returned status object is displayed as the cell output.
collection.create_index("embeddings", index_params)

Status(code=0, message=)

Data loading

In [30]:
# Read the pre-computed records from disk.
# The encoding is given explicitly: JSON text is UTF-8, while open() without an
# encoding uses the platform's locale codec, which can fail or garble non-ASCII
# characters (e.g. typographic dashes and quotes in titles and abstracts).
with open(json_path, encoding='utf-8') as f:
    data = json.load(f)

Data processing

In [31]:
# Names of the fields of one flat record, in the order used when building it.
_RECORD_FIELDS = ('uuid', 'author', 'title', 'published_date',
                  'doi', 'link', 'embeddings', 'summary')


def _to_record(item):
    """Convert one entry of ``data`` into the flat dict layout that gets inserted.

    A raw entry is read as ``item[0]`` (uuid), ``item[1]`` (metadata mapping with the
    keys 'Author', 'Title', 'Published Date', 'DOI', 'Link', 'text') and ``item[2]``
    (embedding vector). An entry that is already a flat dict - which is what ``data``
    holds once this cell has run - is copied through field by field, so re-running
    the cell does not empty ``data``.

    Raises:
        KeyError: a required key is missing.
        IndexError, TypeError: the entry does not have the expected structure.
    """
    if isinstance(item, dict):
        return {field: item[field] for field in _RECORD_FIELDS}

    uuid = item[0]
    meta = item[1]
    author = meta['Author']
    title = meta['Title']
    published_date = meta['Published Date']
    doi = meta['DOI']
    link = meta['Link']
    summary = meta['text']
    embeddings = item[2]
    return {
        'uuid': uuid,
        'author': author,
        'title': title,
        'published_date': published_date,
        'doi': doi,
        'link': link,
        'embeddings': embeddings,
        'summary': summary,
    }


def _preview(item, limit=200):
    """Return a shortened text form of ``item`` so error output stays readable."""
    text = str(item)
    return text if len(text) <= limit else text[:limit] + '...'


refactored_data = []
for item in data:
    try:
        refactored_data.append(_to_record(item))
    except KeyError as e:
        # Best effort: report the bad entry and keep going with the rest.
        print(f"Missing key {e} in item: {_preview(item)}")
    except (IndexError, TypeError):
        # TypeError covers entries whose parts are not indexable as expected.
        print(f"Item has unexpected structure: {_preview(item)}")

# Later cells insert `data`, so it is rebound to the flattened records.
data = refactored_data

Insertion

In [32]:
# Insert the flattened records into the partition configured in `partition_name`
# (the name is taken from the variable, not repeated as a literal, so that changing
# the setting in one place is enough) and print the resulting mutation summary.
print(collection.insert(data, partition_name=partition_name))

(insert count: 151, delete count: 0, upsert count: 0, timestamp: 442799135247564802, success count: 151, err count: 0)


In [33]:
# Flush the collection after the insert.
# NOTE(review): presumably this seals the inserted rows so they are persisted and
# visible to subsequent searches - confirm against the pymilvus documentation.
collection.flush()

Querying

In [34]:
# Re-open the existing collection by its configured name and load it so that it
# can be searched and queried. `collection_name` is used instead of a repeated
# literal so this cell follows the setting made at the top of the notebook.
collection = Collection(collection_name)
collection.load()

Vectorization and reshaping

In [49]:
# Embed the query text and hand it to the search as a 2-D array (one row per query).
query_vectors = np.array(get_embedding("religious embeddings"))
if query_vectors.ndim == 1:
    # A flat (1-D) embedding becomes a single-row matrix.
    query_vectors = query_vectors[np.newaxis, :]




Search params

In [50]:
# Parameters for the vector search.
search_params = dict(
    metric_type="L2",      # distance metric; matches the metric of the index
    # NOTE(review): the index on "embeddings" is FLAT; nprobe is presumably only
    # meaningful for IVF-style indexes - confirm whether it can be dropped.
    params={"nprobe": 10},
    offset=0,
)

Searching

In [51]:
# Run the vector search against the "embeddings" field, restricted to the configured
# partition, asking for at most 10 hits and the listed scalar fields for each hit.
results = collection.search(
    data=query_vectors,
    anns_field="embeddings",
    param=search_params,
    limit=10,
    output_fields=['author', 'title', 'summary'],
    partition_names=[partition_name],
    consistency_level="Strong",
)

Printing hits

In [52]:
# Print title, author and summary of every hit for the first (only) query vector,
# each under its own heading, followed by the hit's distance.
for hit in results[0]:
    for heading, field in (('TITLE', 'title'), ('AUTHOR', 'author'), ('SUMMARY', 'summary')):
        print('\n' + heading)
        print(hit.entity.get(field))
    print(hit.distance)


TITLE
School – Parental Engagement of Filipino Women Married to Koreans: Inputs for Policy Formulation

AUTHOR
Inero Ancho, Sae-Hoon Park

SUMMARY
title: School – Parental Engagement of Filipino Women Married to Koreans: Inputs for Policy Formulation, keywords: school engagement , migration, multiculturalism, Filipinos abroad, policy, author: Inero Ancho, Sae-Hoon Park, doi: https://doi.org/10.32871/rmrj2311.01.01, abstract: With the increasing number of multicultural families in Korea, opportunities and challenges arise, particularly in terms of children’s welfare and education. The present study investigates the experiences of Filipino women married to Koreans and their school parental engagement. Through unstructured interviews, 13 Filipino women participated in the study regarding their involvement in school affairs regarding their child’s attendance at Korean schools. The challenges faced by the participants were also explored. Findings show Filipino mothers’ eagerness to attend 

In [None]:
# Scalar query: articles whose author string starts with "Joje", sorted by author.
# The trailing % is the wildcard of Milvus' `like` operator; without it the pattern
# only matches an author value that is exactly "Joje".
res = collection.query(
  expr = 'author like "Joje%"',
  output_fields = ["author", "title"],
  consistency_level="Strong"
)
sorted_res = sorted(res, key=lambda k: k['author'])
sorted_res


In [None]:
# Print one line per hit of the first query: position, primary key, metadata, distance.
# entity.get() needs a field name, so the metadata is collected field by field;
# the names must be among the output_fields requested in the search call.
for i, hit in enumerate(results[0]):
    metadata = {field: hit.entity.get(field) for field in ('author', 'title', 'summary')}
    print("Result #{0}: ID={1}, Metadata={2}, Distance={3}\n".format(i, hit.id, metadata, hit.distance))