## IMPORTANT
1. Start the Milvus Docker container before running this notebook; the connection cell expects it at `localhost:19530`.

## Schema Preparation

Imports

In [1]:
import json
import os
import re
import time

import numpy as np
import openai
import pandas as pd
from openai.embeddings_utils import get_embedding
from pymilvus import connections, DataType, CollectionSchema, FieldSchema, Collection, Partition, utility
from tqdm import tqdm

Constants

In [2]:
# Secrets must never live in the notebook: read the OpenAI key from the
# environment so it is not committed or shared with the file. Any key that was
# previously hard-coded here should be treated as leaked and revoked.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

# Embedding model settings; `dimensions` is the vector size used for the
# FLOAT_VECTOR field of every collection schema below.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"
max_tokens = 8000
dimensions = 1536

openai.api_key = OPENAI_API_KEY

Mutable variables

In [3]:
# Attribute names extracted from every record; one Milvus collection
# ("<name>_collection") is created per entry. All of them belong to the
# `rmrj_articles` source, which is used as the partition name below.
# (A second source, facebook_posts, was sketched here but never defined.)
collection_names = ['author', 'title', 'published_date', 'text']
partition_name = 'rmrj_articles'
json_path = 'raw_jsons/'
description = 'description'

Function definitions:

In [4]:
def get_embedding(text, model=embedding_model):
    """Return the embedding vector for `text` from the OpenAI embeddings endpoint.

    Newlines in `text` are replaced with spaces before the request is sent.
    The result is the 'embedding' entry of the first item in the response's
    'data' list.

    Note: this definition shadows the `get_embedding` imported from
    `openai.embeddings_utils` at the top of the notebook.
    """
    single_line = " ".join(text.split("\n"))
    response = openai.Embedding.create(input=[single_line], model=model)
    first_result = response['data'][0]
    return first_result['embedding']

Connection

In [5]:
# Start from a clean slate: if an earlier run of this cell already registered
# the alias, remove it first so the connect call below applies its settings.
milvus_alias = 'default'
if connections.has_connection(milvus_alias):
    connections.remove_connection(milvus_alias)

# (Re)connect to the local Milvus instance.
connections.connect(alias=milvus_alias, host='localhost', port='19530')

Drop collection

In [6]:
# Drop the per-attribute collections so the schema cell below can recreate
# them, then display what remains on the server.
stale_collections = [f"{name}_collection" for name in collection_names]
for stale in stale_collections:
    utility.drop_collection(stale)
utility.list_collections()

['LangChainCollection', 'search_article_in_medium']

Collection schema definition

In [7]:
# Maps attribute name -> Collection handle for every collection created here.
collections = {}

for name in collection_names:
    # Every collection has the same three-field layout: the record uuid as
    # primary key, the attribute value itself, and its embedding vector.
    primary_key = FieldSchema(name="uuid", dtype=DataType.VARCHAR, is_primary=True, max_length=36)
    value_field = FieldSchema(name=name, dtype=DataType.VARCHAR, max_length=2500)
    vector_field = FieldSchema(name="embeds", dtype=DataType.FLOAT_VECTOR, dim=dimensions)

    schema = CollectionSchema(
        fields=[primary_key, value_field, vector_field],
        description=f"Collection for {name}",
    )

    collections[name] = Collection(name=f"{name}_collection", schema=schema)

List collections

In [8]:
# Show the collections now present on the connected Milvus server.
utility.list_collections()

['title_collection',
 'search_article_in_medium',
 'text_collection',
 'author_collection',
 'published_date_collection',
 'LangChainCollection']

Partition creation

In [9]:
# Build a Partition object named `partition_name` for every collection created above.
# NOTE(review): the partition listing in the next cell shows the partition on
# each collection, so constructing Partition appears to create it server-side —
# confirm against the pymilvus version in use.
for collection in collections.values():
    partition = Partition(collection, partition_name)

List partitions

In [10]:
# Display the partitions of every collection to verify the new partition is listed.
for collection in collections.values():
    display(collection.partitions)

[{"name": "_default", "collection_name": "author_collection", "description": ""},
 {"name": "rmrj_articles", "collection_name": "author_collection", "description": ""}]

[{"name": "_default", "collection_name": "title_collection", "description": ""},
 {"name": "rmrj_articles", "collection_name": "title_collection", "description": ""}]

[{"name": "_default", "collection_name": "published_date_collection", "description": ""},
 {"name": "rmrj_articles", "collection_name": "published_date_collection", "description": ""}]

[{"name": "_default", "collection_name": "text_collection", "description": ""},
 {"name": "rmrj_articles", "collection_name": "text_collection", "description": ""}]

Index definition

In [11]:
# Index settings shared by every collection: exact (FLAT) search using
# Euclidean (L2) distance. FLAT takes no extra build parameters.
index_params = dict(
    metric_type="L2",
    index_type="FLAT",
    params={},
)

Index creation

In [12]:
# Create an index on the "embeds" vector field of every collection using index_params.
for collection in collections.values():
    collection.create_index("embeds", index_params)

## Data Processing

Data loading

In [13]:
# Parse the raw JSON export into `data`.
# NOTE(review): json_path is assigned 'raw_jsons/' in the variables cell, which
# reads like a directory; open() needs a file path — confirm the intended file.
with open(json_path) as f:
    data = json.load(f)

Lowercasing dictionary

In [14]:
# Normalise the attribute dictionary (second element) of every record in place:
# keys are lowercased and 'published date' is renamed to 'published_date'.
for row in data:
    attributes = row[1]
    if not isinstance(attributes, dict):
        continue  # leave non-dict payloads untouched

    normalised = {}
    for key, value in attributes.items():
        normalised[key.lower()] = value

    if 'published date' in normalised:
        normalised['published_date'] = normalised.pop('published date')

    row[1] = normalised

Time refactoring

In [15]:
from datetime import datetime

def change_date_format(date_string):
    """Expand an ISO date into '<YYYY-MM-DD> <Month DD, YYYY>'.

    Example: '2022-05-25' -> '2022-05-25 May 25, 2022'.

    Raises:
        ValueError: if `date_string` is not in '%Y-%m-%d' format.
    """
    parsed = datetime.strptime(date_string, '%Y-%m-%d')
    iso_part = parsed.strftime('%Y-%m-%d')
    long_part = parsed.strftime('%B %d, %Y')
    return iso_part + ' ' + long_part

In [16]:
# Rewrite every record's 'published_date' in place using change_date_format.
# NOTE: this cell is not idempotent — after one run the values are no longer in
# '%Y-%m-%d' form, so running it again raises ValueError; reload `data` first.
for item in data:
    article = item[1]
    article['published_date'] = change_date_format(article['published_date'])

In [17]:
# Peek at the first record: a [uuid, attribute-dict] pair.
data[0]

['111553fe-23fc-45e4-ad46-0c56b61aee0e',
 {'chunk': 0,
  'text': 'title: Timeless Existence and Principle of Creation: Notions Embedded in John 1:1, "In the Beginning Was the Word", keywords: John 1:1, Word, beginning, timeless existence, principle of creation, intentionality, author: Emiliano C. De Catalina, doi: https://doi.org/10.32871/rmrj2210.01.01, abstract: St. John\'s Gospel begins with a prologue, serving as an overture to the whole Gospel. This paper investigates the philosophical notions embedded in the first three lines of John 1:1. The inquiry focuses on whether or not the accepted meaning of this line as "indicating timeless existence" can be deduced from John 1:1 and whether or not John 1:1 also indicates the meaning of the "principle of creation." This paper proceeds to make this inquiry in the following order: Introduction; The questions arising in John 1:1; Word as God is eternal, outside time; "In the beginning" as predicate; "The Word was in the beginning"; Timeless

Dividing attributes into their corresponding collections (based on collection_names above)

In [18]:
# One value list per attribute, keyed "<name>_obj", filled in record order.
data_lists = {}
for name in collection_names:
    data_lists[f"{name}_obj"] = []

# NOTE(review): a record that lacks an attribute is only reported, not padded,
# so that attribute's list becomes shorter than `data` and no longer lines up
# by position with the uuid list it is zipped against later.
for record in data:
    attributes = record[1]
    for name in collection_names:
        if name not in attributes:
            print(f"The key '{name}' is not in the record.")
            continue
        data_lists[f"{name}_obj"].append(attributes[name])

In [19]:
# Sanity check: print the first extracted value of every attribute.
for name in collection_names:
    first_value = data_lists[f'{name}_obj'][0]
    print(name, " - ", first_value)

author  -  Emiliano C. De Catalina
title  -  Timeless Existence and Principle of Creation: Notions Embedded in John 1:1, "In the Beginning Was the Word"
published_date  -  2022-05-25 May 25, 2022
text  -  title: Timeless Existence and Principle of Creation: Notions Embedded in John 1:1, "In the Beginning Was the Word", keywords: John 1:1, Word, beginning, timeless existence, principle of creation, intentionality, author: Emiliano C. De Catalina, doi: https://doi.org/10.32871/rmrj2210.01.01, abstract: St. John's Gospel begins with a prologue, serving as an overture to the whole Gospel. This paper investigates the philosophical notions embedded in the first three lines of John 1:1. The inquiry focuses on whether or not the accepted meaning of this line as "indicating timeless existence" can be deduced from John 1:1 and whether or not John 1:1 also indicates the meaning of the "principle of creation." This paper proceeds to make this inquiry in the following order: Introduction; The quest

Save uuids as list

In [20]:
# Collect the uuid (first element) of every record, in record order.
uuid_list = [item[0] for item in data]
uuid_list

['111553fe-23fc-45e4-ad46-0c56b61aee0e',
 '737dc86a-c28d-4002-ab11-4e1ae1ae946d',
 'c4d3893e-977e-42ab-904a-80a94a67b9b1',
 '449a1ec7-c2d8-4674-b714-a039305e9602',
 'a9e24e6e-cfd0-4bc5-a30a-8786816b040e',
 '59e0c628-187d-4efe-9956-6672cd2fe2cb',
 '665d5cea-d64e-4443-9445-cf4b00414ce5',
 '95943fe3-cdf5-47de-a590-f6ceab81c892',
 'f611a855-bf25-43ef-86ab-a797d78905ba',
 '6281ce43-1b00-40d4-b9d5-8fc8892c09ef',
 'f739c250-da63-4de3-9cca-2326d50b3fda',
 '5c6fd354-221a-43ff-9937-60a8a4e139f9',
 'e3fdbe48-ff94-489e-82ea-a6fad5397342',
 '02c03d3a-836c-4aba-a9a7-4f8df3b46464',
 '25478b72-54dc-4728-b62a-d034c27ff4b8',
 'abd02174-6c24-44a8-9ef5-a65a32c10163',
 '96b40848-dead-4a06-bb67-befb5071711f',
 '1e00059c-75ab-4709-aa9a-258e4febabed',
 '9e91744f-a7b7-4ea7-8e3f-00b557300496',
 '8f4c5c04-f0ce-44c7-ba08-510697fb62b7',
 '07e8ff92-e1f9-40fa-85fa-928efda9af19',
 '1b92bbb3-d5bb-4ee5-a863-29e4f9913bbb',
 '66f00877-7b43-433c-9d6c-8988b90985bd',
 'f0c37cbb-fdd8-47bb-a97c-deff69eea0e7',
 '36d25628-6f2c-

Accessing data_lists

In [21]:
# Spot-check the reformatted published date of the second record.
data_lists['published_date_obj'][1]

'2022-06-14 June 14, 2022'

Embeddings

In [22]:
# From here on json_path points at the output folder for the per-collection
# JSON files (it previously held the raw-input location).
json_path = 'json_per_collection/'
def get_data_embeds(collection_names, data_lists, uuid_list):
    """Embed every value of every attribute list.

    Args:
        collection_names: attribute names; each selects `data_lists["<name>_obj"]`.
        data_lists: dict of "<name>_obj" -> list of values to embed.
        uuid_list: zipped with the values, so iteration stops at the shorter of
            the two; the uuids themselves are not used here.

    Returns:
        dict of "<name>_obj" -> list of embeddings, in the same order as the values.
    """
    embeds_by_key = dict((f"{name}_obj", []) for name in collection_names)
    for name in collection_names:
        key = f'{name}_obj'
        progress = tqdm(data_lists[key], desc=f'Processing {name}')
        for item, _unused_uuid in zip(progress, uuid_list):
            embeds_by_key[key].append(get_embedding(item))
            time.sleep(1)  # pause one second between requests (adjust as needed)
    return embeds_by_key

def create_obj_data(collection_names, data_lists, uuid_list):
    """Build the rows to insert into each collection.

    Embeddings are computed via get_data_embeds, then values, uuids and
    embeddings are combined by position.

    Returns:
        dict of attribute name -> list of row dicts with keys 'uuid', the
        attribute name itself, and 'embeds'.
    """
    data_lists_embeds = get_data_embeds(collection_names, data_lists, uuid_list)
    obj_list = {}
    for name in collection_names:
        key = f'{name}_obj'
        rows = []
        for item, id_uuid, embedding in zip(data_lists[key], uuid_list, data_lists_embeds[key]):
            rows.append({'uuid': id_uuid, f'{name}': item, 'embeds': embedding})
        obj_list[name] = rows
    return obj_list

def save_obj_data_to_json(obj_list):
    """Write each entry of `obj_list` to '<json_path><name>.json'.

    `json_path` is read from the notebook's global scope at call time.
    """
    for name in obj_list:
        target = f'{json_path}{name}.json'
        with open(target, 'w') as out_file:
            json.dump(obj_list[name], out_file)

In [23]:
# Embed every attribute list and persist one JSON file per collection.
# This is the slow step: one embedding request plus a one-second pause per item.
obj_list = create_obj_data(collection_names, data_lists, uuid_list)
save_obj_data_to_json(obj_list)

Processing author: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [04:15<00:00,  1.69s/it]
Processing title: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [04:07<00:00,  1.64s/it]
Processing published_date: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [04:16<00:00,  1.70s/it]
Processing text: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [04:27<00:00,  1.77s/it]


##  Upserting

Loading

In [24]:
# Folder holding the per-collection JSON files written by the embedding step.
json_path = 'json_per_collection/'
def open_json(filename):
    """Load and return the parsed contents of '<filename>.json'.

    `filename` is the path without the '.json' extension.
    """
    full_path = filename + ".json"
    with open(full_path) as handle:
        parsed = json.load(handle)
    return parsed

# Reload the saved rows for every collection, keyed by attribute name.
obj_list = {name: open_json(json_path + name) for name in collection_names}


Inserting (the cell below calls `insert`, not `upsert`)

In [25]:
# Insert the saved rows of each attribute into its collection, inside the
# partition named by `partition_name`, and print the returned result.
for name in collection_names:
    collection = Collection(f"{name}_collection")
    insert_result = collection.insert(obj_list[name], partition_name=partition_name)
    print(insert_result)


(insert count: 151, delete count: 0, upsert count: 0, timestamp: 442825707936022530, success count: 151, err count: 0)
(insert count: 151, delete count: 0, upsert count: 0, timestamp: 442825707948867590, success count: 151, err count: 0)
(insert count: 151, delete count: 0, upsert count: 0, timestamp: 442825707961974795, success count: 151, err count: 0)
(insert count: 151, delete count: 0, upsert count: 0, timestamp: 442825707988451331, success count: 151, err count: 0)


In [27]:
# Flush every collection that received inserts. Flushing only `collection`
# would cover just the last collection left in the loop variable by the
# previous cell, and would depend on that leftover kernel state.
for name in collection_names:
    Collection(f"{name}_collection").flush()

None
