## IMPORTANT
1. Start the Milvus Docker container before running any cell; the Connection cell expects Milvus at `localhost:19530`.

## Schema Preparation

Imports

In [196]:
import json
import os
import re
import time

import numpy as np
import openai
import pandas as pd
from openai.embeddings_utils import get_embedding
from pymilvus import connections, DataType, CollectionSchema, FieldSchema, Collection, Partition, utility
from tqdm import tqdm

Constants

In [197]:
# The OpenAI key is read from the OPENAI_API_KEY environment variable so the
# secret never lives in the notebook; a missing variable fails fast with KeyError.
# NOTE: the key that used to be hardcoded in this cell must be treated as leaked
# and revoked.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Model passed to the OpenAI embedding endpoint by get_embedding().
embedding_model = "text-embedding-ada-002"

# Not referenced by any other cell visible in this notebook.
embedding_encoding = "cl100k_base"
max_tokens = 8000

# Dimension of the "embeds" FLOAT_VECTOR field declared in the collection schema.
dimensions = 1536

openai.api_key = OPENAI_API_KEY

Mutable variables

In [198]:
# Dataset handled by this run; also used as the Milvus partition name.
partition_name = "usjr_about"

# Field names available per dataset. Each field is stored in its own
# "<field>_collection" collection.
bundled_schema = dict(
    rmrj_articles=["author", "title", "published_date", "text"],
    facebook_posts=["text", "time", "link"],
    usjr_about=["text", "content_id"],
    all=["author", "title", "published_date", "text", "time", "post", "link", "content_id"],
)

# Fields of the selected dataset; the rest of the notebook iterates over these.
collection_names = bundled_schema[partition_name]

# Raw input file for the selected dataset.
json_path = "raw_jsons/usjr_about.json"

# Not referenced by any other cell visible in this notebook.
description = "description"

Function definitions:

In [199]:
def get_embedding(text, model=embedding_model):
    """Request an embedding for ``text`` from the OpenAI Embedding endpoint.

    Newlines in ``text`` are replaced with spaces before the request is sent.
    Returns the ``embedding`` entry of the first item in the response's
    ``data`` list.

    Note: this definition replaces the ``get_embedding`` imported from
    ``openai.embeddings_utils`` in the imports cell.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_result = response['data'][0]
    return first_result['embedding']

Connection

In [200]:
# Start from a clean slate: if a connection is already registered under the
# 'default' alias (e.g. from an earlier run of this cell), remove it first.
if connections.has_connection('default'):
    connections.remove_connection('default')  # drop the stale connection

# (Re)connect the 'default' alias to the Milvus instance on localhost:19530.
connections.connect(alias='default', host='localhost', port='19530')

Drop collection

In [125]:
# DESTRUCTIVE: drops the whole "<name>_collection" collection for every field of
# the selected dataset, then displays the collections that remain.
# NOTE(review): the partition listing further down shows these collections also
# holding partitions of other datasets, so dropping them presumably removes that
# data as well -- confirm before running.
for name in collection_names:
    utility.drop_collection(f"{name}_collection")
utility.list_collections()

['LangChainCollection']

Collection schema definition

In [201]:
collections = {}  # field name -> Collection handle, consumed by the partition/index cells

# Collections are stored under "<field>_collection", so the existence check must
# use that full name; comparing the bare field name never matches anything.
# Queried once up front because the answer does not change per field.
existing_collections = set(utility.list_collections())

for name in collection_names:
    collection_name = f"{name}_collection"

    if collection_name in existing_collections:
        # Already on the server: reuse it (no schema needed) so the partition
        # and index steps below still operate on it.
        collections[name] = Collection(name=collection_name)
        continue

    fields = [
        # Primary key: the record's UUID string (36 characters).
        FieldSchema(name="uuid", dtype=DataType.VARCHAR, is_primary=True, max_length=36),
        # The raw field value; the scalar field is named after the field itself.
        FieldSchema(name=name, dtype=DataType.VARCHAR, max_length=5000),
        # Embedding vector of the field value.
        FieldSchema(name="embeds", dtype=DataType.FLOAT_VECTOR, dim=dimensions)
    ]

    schema = CollectionSchema(fields=fields, description=f"Collection for {name}")

    # Create the collection and store it in the dictionary
    collections[name] = Collection(name=collection_name, schema=schema)

List collections

In [202]:
# Display the names of all collections that currently exist.
utility.list_collections()

['author_collection',
 'published_date_collection',
 'LangChainCollection',
 'time_collection',
 'text_collection',
 'content_id_collection',
 'title_collection',
 'link_collection']

Partition creation

In [203]:
# Build a Partition object named after the selected dataset on every collection.
# NOTE(review): presumably the constructor creates the partition when it does
# not exist yet -- confirm against the installed pymilvus version.
# `partition` only keeps the handle from the last iteration and is not used by
# any later cell visible in this notebook.
for collection in collections.values():
    partition = Partition(collection, partition_name)

List partitions

In [204]:
# Show the partitions of each collection handled in this run.
for collection in collections.values():
    display(collection.partitions)

[{"name": "_default", "collection_name": "text_collection", "description": ""},
 {"name": "rmrj_articles", "collection_name": "text_collection", "description": ""},
 {"name": "facebook_posts", "collection_name": "text_collection", "description": ""},
 {"name": "usjr_about", "collection_name": "text_collection", "description": ""}]

[{"name": "_default", "collection_name": "content_id_collection", "description": ""},
 {"name": "usjr_about", "collection_name": "content_id_collection", "description": ""}]

Index definition

In [205]:
# Index settings applied to the "embeds" vector field of every collection.
index_params = dict(
    metric_type="L2",   # Euclidean distance
    index_type="FLAT",  # FLAT index type
    params={},          # FLAT takes no additional parameters
)

Index creation

In [206]:
# Create the index described by `index_params` on the "embeds" field of every
# collection handled in this run.
for collection in collections.values():
    collection.create_index("embeds", index_params)

## Data Processing

Data loading

In [213]:
# Load the raw records. The file is decoded explicitly as UTF-8 (the standard
# JSON encoding) instead of the platform's locale default, so non-ASCII text is
# read the same way on every machine.
# NOTE(review): the loaded text shows mojibake such as "publisherâ€™s"; confirm
# whether that comes from decoding here or is already present in the file.
with open(json_path, encoding="utf-8") as f:
    data = json.load(f)

Lowercasing dictionary

In [214]:
# Normalise the attribute dictionary (second element) of every record in place:
# keys become lowercase and 'published date' is renamed to 'published_date'.
for row in data:
    attributes = row[1]

    # Records whose second element is not a dictionary are left untouched.
    if not isinstance(attributes, dict):
        continue

    lowered = {key.lower(): value for key, value in attributes.items()}

    # Use the underscore spelling that the schema field names rely on.
    if 'published date' in lowered:
        lowered['published_date'] = lowered.pop('published date')

    row[1] = lowered

In [215]:
# Inspect the attribute dictionary of the first record after key normalisation.
data[0][1]

{'content_id': 'eb87b395-1abc-4520-bcae-5da4d1054aeb',
 'text': 'RMRJ offers open access to its contents on the principle that it supports a greater global exchange of knowledge. Hence, it does not charge its readers any subscription fee to access full text of all its articles.  Permission to read, download, and print from the publisher or author is not necessary. Also, the journal accepts articles for publication at no cost on the part of the author.\n\nRMRJ is licensed under a Creative Commons Attribution-Noncommercial 4.0 International (CC BY-NC 4.0).\n\nAuthors grant the publisher an exclusive publication right but retain copyright in their article. In this case, the author/s have the right to (a) share their article in the same ways permitted to third parties under the relevant user license so long as it contains the publisherâ€™s logo, and a link to the version of record on Recoletos Multidisciplinary Research Journal; (b) retain patent, trademark and other intellectual property 

Time refactoring

In [95]:
from datetime import datetime


def change_date_format(date_string):
    """Expand a ``YYYY-MM-DD`` date into ``YYYY-MM-DD <Month name> DD, YYYY``.

    The result keeps the numeric date and appends the spelled-out form, e.g.
    ``'2023-01-05'`` -> ``'2023-01-05 January 05, 2023'``.

    Raises:
        ValueError: if ``date_string`` is not in ``%Y-%m-%d`` format.
    """
    return datetime.strptime(date_string, '%Y-%m-%d').strftime('%Y-%m-%d %B %d, %Y')

In [208]:
from datetime import datetime


def change_time_format(time_string):
    """Expand a ``YYYY-MM-DDTHH:MM:SS`` timestamp into a longer readable form.

    The result is ``YYYY-MM-DD <Month name> DD, YYYY HH:MM:SS``, e.g.
    ``'2023-07-04T09:05:06'`` -> ``'2023-07-04 July 04, 2023 09:05:06'``.

    Raises:
        ValueError: if ``time_string`` is not in ``%Y-%m-%dT%H:%M:%S`` format.
    """
    source_layout = "%Y-%m-%dT%H:%M:%S"
    target_layout = "%Y-%m-%d %B %d, %Y %H:%M:%S"
    return datetime.strptime(time_string, source_layout).strftime(target_layout)

# Reformat the 'time' value of every record that has one. Records without a
# 'time' key (datasets whose schema has no such field, e.g. 'usjr_about') are
# skipped instead of raising KeyError.
# Not idempotent: a second run on the same `data` fails in strptime because the
# value is no longer in '%Y-%m-%dT%H:%M:%S' form.
for item in data:
    if 'time' in item[1]:
        item[1]['time'] = change_time_format(item[1]['time'])

In [96]:
# Reformat the 'published_date' value of every record that has one. Records
# without that key (datasets whose schema has no such field, e.g. 'usjr_about')
# are skipped instead of raising KeyError.
# Not idempotent: a second run on the same `data` fails in strptime because the
# value is no longer in '%Y-%m-%d' form.
for item in data:
    if 'published_date' in item[1]:
        item[1]['published_date'] = change_date_format(item[1]['published_date'])

In [216]:
# Display all records after preprocessing (full dump; the output can be long).
data

[['34f773a3-9cf1-4276-9965-1016d005258c',
  {'content_id': 'eb87b395-1abc-4520-bcae-5da4d1054aeb',
   'text': 'RMRJ offers open access to its contents on the principle that it supports a greater global exchange of knowledge. Hence, it does not charge its readers any subscription fee to access full text of all its articles.  Permission to read, download, and print from the publisher or author is not necessary. Also, the journal accepts articles for publication at no cost on the part of the author.\n\nRMRJ is licensed under a Creative Commons Attribution-Noncommercial 4.0 International (CC BY-NC 4.0).\n\nAuthors grant the publisher an exclusive publication right but retain copyright in their article. In this case, the author/s have the right to (a) share their article in the same ways permitted to third parties under the relevant user license so long as it contains the publisherâ€™s logo, and a link to the version of record on Recoletos Multidisciplinary Research Journal; (b) retain pate

Dividing attributes to their corresponding collection (based on collection_names above)

In [217]:
# One value list per field, keyed "<field>_obj".
data_lists = {}
for name in collection_names:
    data_lists[f"{name}_obj"] = []

# Distribute every record's attribute values over the per-field lists.
# NOTE(review): a record that lacks a field is only reported, nothing is appended
# for it, so that field's list ends up shorter than the UUID list it is later
# zipped with -- verify the inputs always carry every field.
for record in data:
    attributes = record[1]
    for name in collection_names:
        if name not in attributes:
            print(f"The key '{name}' is not in the record.")
            continue
        data_lists[f"{name}_obj"].append(attributes[name])

In [218]:
# Sanity check: print the first collected value of every field.
for name in collection_names:
    print(name, " - ", data_lists[f'{name}_obj'][0])

text  -  RMRJ offers open access to its contents on the principle that it supports a greater global exchange of knowledge. Hence, it does not charge its readers any subscription fee to access full text of all its articles.  Permission to read, download, and print from the publisher or author is not necessary. Also, the journal accepts articles for publication at no cost on the part of the author.

RMRJ is licensed under a Creative Commons Attribution-Noncommercial 4.0 International (CC BY-NC 4.0).

Authors grant the publisher an exclusive publication right but retain copyright in their article. In this case, the author/s have the right to (a) share their article in the same ways permitted to third parties under the relevant user license so long as it contains the publisherâ€™s logo, and a link to the version of record on Recoletos Multidisciplinary Research Journal; (b) retain patent, trademark and other intellectual property rights (including research data); and (c) proper attribution

Save uuids as list

In [219]:
# The first element of every record is its UUID; keep them in record order.
uuid_list = [record[0] for record in data]
uuid_list

['34f773a3-9cf1-4276-9965-1016d005258c',
 '9d67d88e-8fa9-40d7-b99f-d524d665825e',
 'ca294847-da13-405c-ab34-78828480d9d0',
 '1943d181-4f6f-434f-9fae-cd9fb404f0ca',
 'eaeb718b-c86a-4eb1-9f42-d9eed0d32a78',
 'b0c99c2b-03b2-42f8-9a76-5ffc483b8e45',
 '33464ac0-cfd8-468f-8164-d20f91489bee',
 '0d76a721-0772-4951-8d7e-8bec4dc6f46c',
 '768c2724-dcfb-44e2-88a4-495a96b0eb71',
 'c298635c-6e18-4149-99e0-1aea72a0eb48',
 'e4ce7220-177d-4f27-90be-caa08d1d0fe9',
 '3048aa54-88b5-43cb-80be-152a014794d0',
 '45fb753d-50cc-47d8-a5a1-ff8f398c5325',
 '5ccb78a2-632e-436e-93b4-f12d67c63a62',
 '22825fc4-caff-4e89-82fe-41396090fdb5',
 '0df153b4-b7b4-4e42-a334-7b07765bd823',
 '22579de5-8b56-4c6b-aa78-fa7f64cf5a6c',
 '1eedbb34-a099-405c-9c30-d6c4737c7ab6',
 '01569648-dbe5-4872-96d1-f069992a7a8e',
 'b39680f8-6e94-4b92-8601-fc049b81125b',
 'f7334cc5-692f-4526-bd1a-602485b674ca',
 'c55c1aa9-9880-4ac7-b82c-25032c3ae5c1',
 '35f5a888-7a70-4ea4-92bb-9c7af06d0308',
 '8849f666-7948-49fb-a4f7-20545408880b',
 '7b222372-d89d-

Accessing data_lists

Embeddings

In [220]:
# Output directory prefix for the per-collection JSON files.
# NOTE: this rebinds `json_path`, which the data-loading cell reads as the raw
# input file; re-running that cell after this one would try to open this
# directory instead of the raw JSON.
json_path = "json_per_collection/"
def get_data_embeds(collection_names, data_lists, uuid_list):
    """Embed every value of every field.

    Args:
        collection_names: field names to process.
        data_lists: mapping ``"<field>_obj"`` -> list of values to embed.
        uuid_list: record UUIDs; only used to bound the iteration (values are
            zipped with it, so processing stops at the shorter of the two).

    Returns:
        Mapping ``"<field>_obj"`` -> list of embeddings, in value order.
    """
    data_lists_embeds = {}
    for name in collection_names:
        data_lists_embeds[f"{name}_obj"] = []

    for name in collection_names:
        key = f'{name}_obj'
        progress = tqdm(data_lists[key], desc=f'Processing {name}')
        for item, _ in zip(progress, uuid_list):
            data_lists_embeds[key].append(get_embedding(item))
            time.sleep(1)  # pause one second between embedding requests
    return data_lists_embeds

def create_obj_data(collection_names, data_lists, uuid_list):
    """Build the insertable records for every field.

    Embeddings are computed with ``get_data_embeds``; each value is then
    combined with its UUID and embedding into a dictionary with the keys
    ``'uuid'``, ``<field name>`` and ``'embeds'``.

    Returns:
        Mapping field name -> list of record dictionaries.
    """
    data_lists_embeds = get_data_embeds(collection_names, data_lists, uuid_list)

    obj_list = {}
    for name in collection_names:
        key = f'{name}_obj'
        records = []
        # Values, UUIDs and embeddings are paired by position.
        for item, id_uuid, embedding in zip(data_lists[key], uuid_list, data_lists_embeds[key]):
            records.append({
                'uuid': id_uuid,
                f'{name}': item,
                'embeds': embedding
            })
        obj_list[name] = records
    return obj_list

def save_obj_data_to_json(obj_list):
    """Write each field's records to ``<json_path><partition_name>_<field>.json``.

    ``json_path`` and ``partition_name`` are read from the notebook's globals.
    """
    for name in obj_list:
        target = f'{json_path}{partition_name}_{name}.json'
        with open(target, 'w') as file:
            json.dump(obj_list[name], file)

In [221]:
# Compute the embeddings for every field (one request per value, with a one
# second pause between requests) and persist the resulting records as JSON.
obj_list = create_obj_data(collection_names, data_lists, uuid_list)
save_obj_data_to_json(obj_list)

Processing text: 100%|██████████████████████████| 31/31 [00:51<00:00,  1.65s/it]
Processing content_id: 100%|████████████████████| 31/31 [00:46<00:00,  1.52s/it]


##  Upserting

Loading

In [222]:
# Directory prefix of the per-collection JSON files written by the embedding step.
json_path = "json_per_collection/"


def open_json(filename):
    """Parse and return the contents of ``<filename>.json``.

    ``filename`` is given without the ``.json`` extension.
    """
    path = filename + ".json"
    with open(path) as handle:
        parsed = json.load(handle)
    return parsed

# Reload the saved records of every field: field name -> list of record dicts.
obj_list = {
    name: open_json(json_path + f"{partition_name}_{name}")
    for name in collection_names
}

# Shorten field values longer than 5000 characters (the max_length declared for
# the VARCHAR field) to their first 2480 characters.
# NOTE(review): the trigger (5000) and the cut (2480) differ, and both count
# characters -- confirm this is the intended limit for non-ASCII text.
for name in collection_names:
    for obj in obj_list[name]:
        value = obj[name]
        if len(value) > 5000:
            obj[name] = value[:2480]


Inserting (the cell below calls `insert`, not `upsert`)

In [223]:
# Insert every field's records into the selected partition of its
# "<field>_collection" collection and print the result returned by insert().
# NOTE(review): this is a plain insert; presumably re-running the cell adds the
# same UUIDs again -- confirm how duplicates are handled.
for name in collection_names:
    collection = Collection(f"{name}_collection")
    print(collection.insert(obj_list[name], partition_name=partition_name))


(insert count: 31, delete count: 0, upsert count: 0, timestamp: 442910031377596419, success count: 31, err count: 0)
(insert count: 31, delete count: 0, upsert count: 0, timestamp: 442910031390703623, success count: 31, err count: 0)


In [224]:
# Flush every collection that received inserts. Relying on the `collection`
# variable left over from the insert loop would flush only the collection that
# loop handled last.
for name in collection_names:
    Collection(f"{name}_collection").flush()

None
