IMPORTANT
1. Run Milvus Docker first

Imports

In [2]:
from pymilvus import connections, DataType, CollectionSchema, FieldSchema, Collection, Partition, utility
import openai
import pandas as pd
import numpy as np
import re

import json
from openai.embeddings_utils import get_embedding
import time
from tqdm import tqdm

Constant

In [3]:
OPENAI_API_KEY = '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'
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"
max_tokens = 8000
dimensions =1536
openai.api_key = OPENAI_API_KEY

Mutable variables

In [4]:
collection_names = ['author', 'title','published_date', 'text']
# collection_names =['text']
partition_name = 'rmrj_articles'
json_path = 'rmrj/rmrj.json'
description = 'description'

Function definitions:

In [5]:
def get_embedding(text, model=embedding_model):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

Connection

In [7]:
# Check if the connection already exists
if connections.has_connection('default'):
    connections.remove_connection('default')  # Disconnect if it exists

# Now, reconnect with your new configuration
connections.connect(alias='default', host='localhost', port='19530')

Collection schema definition

In [78]:
collections = {}  # To store the created collections

for name in collection_names:
    fields = [
        FieldSchema(name="uuid", dtype=DataType.VARCHAR, is_primary=True, max_length=36),
        FieldSchema(name=name, dtype=DataType.VARCHAR, max_length=2500),
        FieldSchema(name="data_lists_embeds", dtype=DataType.FLOAT_VECTOR, dim=dimensions)
    ]

    schema = CollectionSchema(fields=fields, description=f"Collection for {name}")

    # Create the collection and store it in the dictionary
    collections[name] = Collection(name=f"{name}_collection", schema=schema)


collection_schema = CollectionSchema(fields=[
    FieldSchema(name="uuid", dtype=DataType.VARCHAR, is_primary=True, max_length=36),  # Define primary key here with max_length
    FieldSchema(name="author", dtype=DataType.OBJECt, max_length=200),
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="published_date", dtype=DataType.VARCHAR, max_length=50),
    FieldSchema(name="doi", dtype=DataType.VARCHAR, max_length=200),
    FieldSchema(name="link", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dimensions),
    FieldSchema(name="summary", dtype=DataType.VARCHAR, max_length=5000)
], description=description)


Collection creation

In [76]:
for name in collection_names:
    utility.drop_collection(f"{name}_collection")

List collections

In [90]:
utility.list_collections()

['search_article_in_medium',
 'published_date_collection',
 'LangChainCollection',
 'title_collection',
 'text_collection',
 'author_collection']

collection = Collection(name=collection_name, schema=collection_schema)

Partition creation

In [81]:
for collection in collections.values():
    partition = Partition(collection, partition_name)

List partitions

In [82]:
for collection in collections.values():
    display(collection.partitions)

[{"name": "_default", "collection_name": "author_collection", "description": ""},
 {"name": "rmrj_articles", "collection_name": "author_collection", "description": ""}]

[{"name": "_default", "collection_name": "title_collection", "description": ""},
 {"name": "rmrj_articles", "collection_name": "title_collection", "description": ""}]

[{"name": "_default", "collection_name": "published_date_collection", "description": ""},
 {"name": "rmrj_articles", "collection_name": "published_date_collection", "description": ""}]

[{"name": "_default", "collection_name": "text_collection", "description": ""},
 {"name": "rmrj_articles", "collection_name": "text_collection", "description": ""}]

Index definition

In [83]:
index_params = {
  "metric_type": "L2", # Euclidean distance
  "index_type": "FLAT", # FLAT index type
  "params": {} # No additional parameters needed for FLAT
}

Index creation

In [84]:
for collection in collections.values():
    collection.create_index("data_lists_embeds", index_params)

Data loading

In [85]:
with open(json_path) as f:
    data = json.load(f)

In [16]:
for row in data:
    # Check if the second element of the row is a dictionary
    if isinstance(row[1], dict):
        # Create a new dictionary with keys in lowercase
        new_dict = {k.lower(): v for k, v in row[1].items()}
        
        # Check if 'published date' is a key in the new dictionary
        if 'published date' in new_dict:
            # If it is, rename it to 'published_date'
            new_dict['published_date'] = new_dict.pop('published date')
        
        # Replace the old dictionary with the new one
        row[1] = new_dict


In [17]:
data[0]

['111553fe-23fc-45e4-ad46-0c56b61aee0e',
 {'chunk': 0,
  'text': 'title: Timeless Existence and Principle of Creation: Notions Embedded in John 1:1, "In the Beginning Was the Word", keywords: John 1:1, Word, beginning, timeless existence, principle of creation, intentionality, author: Emiliano C. De Catalina, doi: https://doi.org/10.32871/rmrj2210.01.01, abstract: St. John\'s Gospel begins with a prologue, serving as an overture to the whole Gospel. This paper investigates the philosophical notions embedded in the first three lines of John 1:1. The inquiry focuses on whether or not the accepted meaning of this line as "indicating timeless existence" can be deduced from John 1:1 and whether or not John 1:1 also indicates the meaning of the "principle of creation." This paper proceeds to make this inquiry in the following order: Introduction; The questions arising in John 1:1; Word as God is eternal, outside time; "In the beginning" as predicate; "The Word was in the beginning"; Timeless

Data processing

In [18]:
data_lists = {f"{name}_obj": [] for name in collection_names}

for record in data:
    for name in collection_names:
        data_lists[f"{name}_obj"].append(record[1][name])
        


In [19]:
uuid_list = []
for item in data:
    uuid_list.append(item[0])

In [21]:
uuid_list

['111553fe-23fc-45e4-ad46-0c56b61aee0e',
 '737dc86a-c28d-4002-ab11-4e1ae1ae946d',
 'c4d3893e-977e-42ab-904a-80a94a67b9b1',
 '449a1ec7-c2d8-4674-b714-a039305e9602',
 'a9e24e6e-cfd0-4bc5-a30a-8786816b040e',
 '59e0c628-187d-4efe-9956-6672cd2fe2cb',
 '665d5cea-d64e-4443-9445-cf4b00414ce5',
 '95943fe3-cdf5-47de-a590-f6ceab81c892',
 'f611a855-bf25-43ef-86ab-a797d78905ba',
 '6281ce43-1b00-40d4-b9d5-8fc8892c09ef',
 'f739c250-da63-4de3-9cca-2326d50b3fda',
 '5c6fd354-221a-43ff-9937-60a8a4e139f9',
 'e3fdbe48-ff94-489e-82ea-a6fad5397342',
 '02c03d3a-836c-4aba-a9a7-4f8df3b46464',
 '25478b72-54dc-4728-b62a-d034c27ff4b8',
 'abd02174-6c24-44a8-9ef5-a65a32c10163',
 '96b40848-dead-4a06-bb67-befb5071711f',
 '1e00059c-75ab-4709-aa9a-258e4febabed',
 '9e91744f-a7b7-4ea7-8e3f-00b557300496',
 '8f4c5c04-f0ce-44c7-ba08-510697fb62b7',
 '07e8ff92-e1f9-40fa-85fa-928efda9af19',
 '1b92bbb3-d5bb-4ee5-a863-29e4f9913bbb',
 '66f00877-7b43-433c-9d6c-8988b90985bd',
 'f0c37cbb-fdd8-47bb-a97c-deff69eea0e7',
 '36d25628-6f2c-

In [22]:
for name in collection_names:
    print(name)
    display(data_lists[f'{name}_obj'][0])

author


'Emiliano C. De Catalina'

title


'Timeless Existence and Principle of Creation: Notions Embedded in John 1:1, "In the Beginning Was the Word"'

published_date


'2022-05-25'

text


'title: Timeless Existence and Principle of Creation: Notions Embedded in John 1:1, "In the Beginning Was the Word", keywords: John 1:1, Word, beginning, timeless existence, principle of creation, intentionality, author: Emiliano C. De Catalina, doi: https://doi.org/10.32871/rmrj2210.01.01, abstract: St. John\'s Gospel begins with a prologue, serving as an overture to the whole Gospel. This paper investigates the philosophical notions embedded in the first three lines of John 1:1. The inquiry focuses on whether or not the accepted meaning of this line as "indicating timeless existence" can be deduced from John 1:1 and whether or not John 1:1 also indicates the meaning of the "principle of creation." This paper proceeds to make this inquiry in the following order: Introduction; The questions arising in John 1:1; Word as God is eternal, outside time; "In the beginning" as predicate; "The Word was in the beginning"; Timeless existence and the verb was; The Word as God is a principle of cr

In [23]:
data_lists['author_obj'][1]

'Dionesio M. BaÅˆoc, Victor B. Asio'

In [24]:
data[0]

['111553fe-23fc-45e4-ad46-0c56b61aee0e',
 {'chunk': 0,
  'text': 'title: Timeless Existence and Principle of Creation: Notions Embedded in John 1:1, "In the Beginning Was the Word", keywords: John 1:1, Word, beginning, timeless existence, principle of creation, intentionality, author: Emiliano C. De Catalina, doi: https://doi.org/10.32871/rmrj2210.01.01, abstract: St. John\'s Gospel begins with a prologue, serving as an overture to the whole Gospel. This paper investigates the philosophical notions embedded in the first three lines of John 1:1. The inquiry focuses on whether or not the accepted meaning of this line as "indicating timeless existence" can be deduced from John 1:1 and whether or not John 1:1 also indicates the meaning of the "principle of creation." This paper proceeds to make this inquiry in the following order: Introduction; The questions arising in John 1:1; Word as God is eternal, outside time; "In the beginning" as predicate; "The Word was in the beginning"; Timeless

In [51]:
for name in collection_names:
    obj_data = []
    for item, uuid in zip(tqdm(data_lists[f'{name}_obj'], desc=f'Processing {name}'), uuid_list):
        embedding = get_embedding(item)
        
        # Print the dimensions of the embeddings for debugging
        print(f"Embedding dimensions for {uuid}: {len(embedding)}")
        
        data_lists_embeds[f'{name}_obj'].append(embedding)
        # Add a time break of 1.2 seconds (adjust as needed)
        time.sleep(1.2)
        obj_data.append({
            'uuid': uuid,
            f'{name}': item,
            'data_lists_embeds': embedding
        })
    obj_list[name] = obj_data
    with open(f'{name}.json', 'w') as file:
        json.dump(obj_data, file)


Processing author:   0%|                                                                                                                                                        | 0/151 [00:00<?, ?it/s]

Embedding dimensions for 111553fe-23fc-45e4-ad46-0c56b61aee0e: 1536


Processing author:   1%|▉                                                                                                                                               | 1/151 [00:02<05:55,  2.37s/it]

Embedding dimensions for 737dc86a-c28d-4002-ab11-4e1ae1ae946d: 1536


Processing author:   1%|█▉                                                                                                                                              | 2/151 [00:03<04:39,  1.88s/it]

Embedding dimensions for c4d3893e-977e-42ab-904a-80a94a67b9b1: 1536


Processing author:   1%|█▉                                                                                                                                              | 2/151 [00:04<05:55,  2.39s/it]


KeyboardInterrupt: 

In [26]:
json_path = "json_per_collection/"

In [25]:
import time
from tqdm import tqdm
import json

data_lists_embeds = {f"{name}_obj": [] for name in collection_names}
obj_list = {}

for name in collection_names:
    obj_data = []
    for item, id_uuid in zip(tqdm(data_lists[f'{name}_obj'], desc=f'Processing {name}'), uuid_list):
        embedding = get_embedding(item)
        data_lists_embeds[f'{name}_obj'].append(embedding)
        # Add a time break of 1 second (adjust as needed)
        time.sleep(1)
        obj_data.append({
            'uuid': id_uuid,
            f'{name}': item,
            'data_lists_embeds': embedding
        })
    obj_list[name] = obj_data
    with open(f'{name}.json', 'w') as file:
        json.dump(obj_data, file)


Processing author: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [03:38<00:00,  1.45s/it]
Processing title: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [03:44<00:00,  1.48s/it]
Processing published_date: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [03:44<00:00,  1.49s/it]
Processing text: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [03:54<00:00,  1.55s/it]


In [111]:
obj_list['text'][0]

{'uuid': '111553fe-23fc-45e4-ad46-0c56b61aee0e',
 'text': 'title: Timeless Existence and Principle of Creation: Notions Embedded in John 1:1, "In the Beginning Was the Word", keywords: John 1:1, Word, beginning, timeless existence, principle of creation, intentionality, author: Emiliano C. De Catalina, doi: https://doi.org/10.32871/rmrj2210.01.01, abstract: St. John\'s Gospel begins with a prologue, serving as an overture to the whole Gospel. This paper investigates the philosophical notions embedded in the first three lines of John 1:1. The inquiry focuses on whether or not the accepted meaning of this line as "indicating timeless existence" can be deduced from John 1:1 and whether or not John 1:1 also indicates the meaning of the "principle of creation." This paper proceeds to make this inquiry in the following order: Introduction; The questions arising in John 1:1; Word as God is eternal, outside time; "In the beginning" as predicate; "The Word was in the beginning"; Timeless existe

In [86]:
print(collections['author'].insert(obj_list['author'], partition_name="rmrj_articles"))
print(collections['title'].insert(obj_list['title'], partition_name="rmrj_articles"))
print(collections['published_date'].insert(obj_list['published_date'], partition_name="rmrj_articles"))
print(collections['text'].insert(obj_list['text'], partition_name="rmrj_articles"))


(insert count: 151, delete count: 0, upsert count: 0, timestamp: 442821374219124737, success count: 151, err count: 0)
(insert count: 151, delete count: 0, upsert count: 0, timestamp: 442821374231969794, success count: 151, err count: 0)
(insert count: 151, delete count: 0, upsert count: 0, timestamp: 442821374258446338, success count: 151, err count: 0)
(insert count: 151, delete count: 0, upsert count: 0, timestamp: 442821374284398594, success count: 151, err count: 0)


In [56]:
for name in collection_names:
    data_to_insert = {
        "uuid": [item['uuid'] for item in obj_list[name]],
        name: [item[name] for item in obj_list[name]],
        "data_lists_embeds": [item['data_lists_embeds'] for item in obj_list[name]]
    }

    # Check the dimensions of all embeddings
    for i, embedding in enumerate(data_to_insert['data_lists_embeds']):
        if len(embedding) != 1536:
            print(f"Embedding at index {i} has incorrect dimension: {len(embedding)}")

    print(collections[name].insert(data_to_insert, partition_name="rmrj_articles"))


RPC error: [insert_rows], <ParamError: (code=1, message=Collection field dim is 1536, but entities field dim is 151)>, <Time:{'RPC start': '2023-07-13 14:18:38.463359', 'RPC error': '2023-07-13 14:18:38.463490'}>


ParamError: <ParamError: (code=1, message=Collection field dim is 1536, but entities field dim is 151)>

Insertion

In [None]:
print(collection.insert(data, partition_name="rmrj_articles"))

In [None]:
collection.flush()

Querying

In [8]:
query_vectors = get_embedding("Embeddings in religious text")  
query_vectors = np.array(query_vectors)
if len(query_vectors.shape) == 1:
    query_vectors = query_vectors.reshape(1, -1)

search_params = {
    "metric_type": "L2",  # Distance metric, can be L2, IP (Inner Product), etc.
    "offset": 0,
}
    
results = []
for i, name in enumerate(collection_names):
    collection = Collection(f"{name}_collection")
    collection.load()
    result = collection.search(
    data=query_vectors, 
    anns_field="data_lists_embeds",
    param=search_params, 
    limit=5,
    partition_names=[partition_name],
    output_fields=[name, 'uuid'],
    consistency_level="Strong")
    results.append(result)

In [122]:
collection_names = ['author', 'title', 'published_date', 'text']
results = []
ids = []
for name, result_set in zip(collection_names, results):
    for result in result_set[0]:
        result_id = result.entity.get('uuid')
        if result_id not in ids:
            ids.append(result_id)
        ids[result_id]['ave distance'] = if 0, add result.distance else add result.distance and get average of both

author
Binoy Mathew K V, Maryelizabeth Tidiya Walarine
441dfda9-2870-46d0-b063-9307b15b5173
0.44759461283683777
author
Shella B. Cacatian
e869ef0c-9ed1-4f81-81e4-97578cfd5de4
0.45023611187934875
author
Levitah C. Mapatac
c5973031-19bc-44bf-82ed-02048381e726
0.452766478061676
author
Levitah C. Mapatac
88308d34-8df3-4423-8a99-8ac2fe9113cb
0.452766478061676
author
Levitah C. Mapatac
fc8ebb3c-cfd8-479f-b1d2-6558bf60c723
0.452766478061676
title
Timeless Existence and Principle of Creation: Notions Embedded in John 1:1, "In the Beginning Was the Word"
111553fe-23fc-45e4-ad46-0c56b61aee0e
0.3897976279258728
title
Paul Ricoeur's Mystery of Hermeneutics and Identity
62cb29f3-2434-44d3-8f25-383a59cc9d32
0.39707720279693604
title
Paul Ricoeur's Mystery of Hermeneutics and Identity
821a3b07-0f20-4c77-9081-c98ed7864059
0.39707720279693604
title
Morality and Religiosity: A Filipino Experience
77546a54-7149-4e63-946e-e1cc15331d48
0.4029955267906189
title
Reality of Human Experience and the Search for

In [None]:
collection = Collection("rmrj")      # Get an existing collection.
collection.load()

Vectorization and reshaping

In [None]:
query_vectors = get_embedding("Emiliano")  # Your query vectors here
og = query_vectors
# Ensure query_vectors is a 2-D array
query_vectors = np.array(query_vectors)
if len(query_vectors.shape) == 1:
    query_vectors = query_vectors.reshape(1, -1)

# vectors should be a 2-D array and limit is the maximum number of total returned results




In [None]:
og 

In [None]:
file_path = "text.txt"
with open(file_path, "w") as file:
    file.write(str(og))

Search params

In [None]:
search_params = {
    "metric_type": "L2",  # Distance metric, can be L2, IP (Inner Product), etc.
    "offset": 0,
}

Searching

In [None]:
results = collection.search(
    data=query_vectors, 
    anns_field={"embeddings", "author"},
    param=search_params, 
    limit=10,
    partition_names=[partition_name],
    output_fields=['author', 'title', 'summary'],
    consistency_level="Strong"
)

Printing hits

In [None]:
for result in results[0]:
    print('\nTITLE')
    print(result.entity.get('title'))
    print('\nAUTHOR')
    print(result.entity.get('author'))
    print('\nSUMMARY')
    print(result.entity.get('summary'))
    print(result.distance)

In [None]:
data[0]