## IMPORTANT
1. Run Milvus Docker first

## Scheme Preparation

Imports

In [26]:
from pymilvus import connections, DataType, CollectionSchema, FieldSchema, Collection, Partition, utility
import openai
import pandas as pd
import numpy as np
import re
import json
from openai.embeddings_utils import get_embedding
import time
from tqdm import tqdm
import fasttext


Constants

In [27]:
OPENAI_API_KEY = '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'
max_tokens = 8000
dimensions = {'openai' : 1536,
            'fasttext' : 300}
openai.api_key = OPENAI_API_KEY

Mutable variables

In [30]:
# Change partition_name based on kind of data
partition_name = 'scs_about'
# Change embedder to either fasttext openai 

embedder = 'openai'
dimension = dimensions[embedder]
bundled_schema = {'rmrj_articles': ['author', 'title', 'published_date', 'text'],
                  'facebook_posts': ['text', 'time', 'link'],
                  'usjr_about': ['text', 'content_id'],
                  'contacts': ['text', 'contact', 'department'],
                  'scs_about': ['text', 'link', 'title'],
                  'religious_admin': ['text', 'name', 'position', 'media'],
                  'all': ['author', 'title', 'published_date', 'text', 'time', 'post', 'link', 'content_id']}
collection_names = bundled_schema[partition_name]
json_path = f'raw_jsons/{partition_name}.json'
description = 'description'
if embedder == 'fasttext':
    fasttext_model = fasttext.load_model('/Users/garfieldgreglim/Library/Mobile Documents/com~apple~CloudDocs/Josenian-Query/Final Outputs/Jupyter Notebooks/Embedder/crawl-300d-2M-subword.bin')

Function definitions:

In [32]:
def get_embedding(text, embedding_type):
    text = text.replace("\n", " ")
    model = "text-embedding-ada-002"
    if embedding_type == 'openai':
        return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']
    elif embedding_type == 'fasttext':
        return fasttext_model.get_sentence_vector(text)
    else:
        raise ValueError("Invalid embedding_type. Expected 'openai' or 'fasttext'.")


Connection

In [33]:
# Check if the connection already exists
if connections.has_connection('default'):
    connections.remove_connection('default')  # Disconnect if it exists

# Now, reconnect with your new configuration
connections.connect(alias='default', host='localhost', port='19530')

Drop collection

In [34]:
# for name in collection_names:
#     utility.drop_collection(f"{name}_collection")
# utility.list_collections()

Collection schema definition

In [35]:
collections = {}  # To store the created collections

for name in collection_names:
    if name not in utility.list_collections():
        fields = [
            FieldSchema(name="uuid", dtype=DataType.VARCHAR, is_primary=True, max_length=36),
            FieldSchema(name=name, dtype=DataType.VARCHAR, max_length=5000),
            FieldSchema(name="embeds", dtype=DataType.FLOAT_VECTOR, dim=dimension)
        ]

        schema = CollectionSchema(fields=fields, description=f"Collection for {name}")

        # Create the collection and store it in the dictionary
        collections[name] = Collection(name=f"{name}_collection", schema=schema)

List collections

In [36]:
utility.list_collections()

['published_date_collection',
 'department_collection',
 'media_collection',
 'LangChainCollection',
 'text_collection',
 'content_id_collection',
 'link_collection',
 'title_collection',
 'time_collection',
 'author_collection',
 'contact_collection',
 'name_collection',
 'position_collection']

Partition creation

In [37]:
for collection in collections.values():
    partition = Partition(collection, partition_name)

List partitions

In [38]:
for collection in collections.values():
    display(collection.partitions)

[{"name": "_default", "collection_name": "text_collection", "description": ""},
 {"name": "usjr_about", "collection_name": "text_collection", "description": ""},
 {"name": "facebook_posts", "collection_name": "text_collection", "description": ""},
 {"name": "rmrj_articles", "collection_name": "text_collection", "description": ""},
 {"name": "contacts", "collection_name": "text_collection", "description": ""},
 {"name": "religious_admin", "collection_name": "text_collection", "description": ""},
 {"name": "scs_about", "collection_name": "text_collection", "description": ""}]

[{"name": "_default", "collection_name": "link_collection", "description": ""},
 {"name": "facebook_posts", "collection_name": "link_collection", "description": ""},
 {"name": "scs_about", "collection_name": "link_collection", "description": ""}]

[{"name": "_default", "collection_name": "title_collection", "description": ""},
 {"name": "rmrj_articles", "collection_name": "title_collection", "description": ""},
 {"name": "scs_about", "collection_name": "title_collection", "description": ""}]

Index definition

In [39]:
index_params = {
  "metric_type": "L2", # Euclidean distance
  "index_type": "FLAT", # FLAT index type
  "params": {} # No additional parameters needed for FLAT
}

Index creation

In [40]:
for collection in collections.values():
    collection.create_index("embeds", index_params)

## Data Processing

Data loading

In [41]:
with open(json_path) as f:
    data = json.load(f)

Dividing attributes to their corresponding collection (based on collection_names above)

In [42]:
data_lists = {f"{name}_obj": [] for name in collection_names}

for record in data:
    for name in collection_names:
        if name in record[1]:
            data_lists[f"{name}_obj"].append(record[1][name])
        else:
            print(f"The key '{name}' is not in the record.")

Checking

In [43]:
for name in collection_names:
    print(name, " - ", data_lists[f'{name}_obj'][0])

text  -  	
Inspiring the Computer Scientists of Tomorrow
The School of Computer Studies offers dynamic Information technology and computing degree programs whose graduates are responsive to the rapid changing demands of the local and global community.
Center of Excellence
Certified by the Commission on Higher Education or CHED. SCS has been awarded Center of Excellence in the field of Information Technology.
What does this mean to you?
It means you’ll have access to a broad range of degrees and programs taught by highly accomplished faculty. You’ll be surrounded by a student body that can motivate and inspire you. Most of all, it means you’re going to get a great education at the University of San Jose – Recoletos.
5
DEGREE PROGRAMS
2003
YEAR FOUNDED
100%
DB2 PASSING RATE
Lv.3
PAASCU CERTIFIED
title: About School of Computer Sciences (SCS), link: https://usjr.edu.ph/scs/about/
link  -  https://usjr.edu.ph/scs/about/
title  -  About School of Computer Sciences (SCS)


Save uuids as list

In [44]:
uuid_list = []
for item in data:
    uuid_list.append(item[0])
uuid_list

['797fe1d9-b6e4-47a0-8014-5f2891502bd8',
 'd358d64e-bb16-4ba7-ab6c-5c7f2a3c5ab4',
 '37c38470-38fa-4a2d-a372-6643b56b45c2',
 '22fa4973-98db-4201-af93-35749dc93073',
 '646dd8b8-bd61-49eb-98a3-8375d047df02',
 'd268877c-4ff9-475d-b9cd-8f65d1b6350d',
 'b66ecef1-37e4-45a7-a5a3-ecaa6b55b060',
 '875f456d-5ef3-4846-9d8b-8c72db4e0498',
 'f2e87c60-5b9a-4ffd-816d-9a8fe727d6d1',
 '8595cc5d-3c06-4ead-af0e-444d0cd96f04',
 '0eb23a7b-93fb-4b7e-bb40-2dd3915e47bd']

Embeddings

In [45]:
import string
json_path = "json_per_collection/"
def get_data_embeds(collection_names, data_lists, uuid_list):
    data_lists_embeds = {f"{name}_obj": [] for name in collection_names}
    for name in collection_names:
        if name == 'link' or name == 'media':
            continue
        for item, id_uuid in zip(tqdm(data_lists[f'{name}_obj'], desc=f'Processing {name}'), uuid_list):
            item_lower = item.lower()
            print(item_lower)
            embedding = get_embedding(item_lower, embedding_type = embedder)
            data_lists_embeds[f'{name}_obj'].append(embedding)
            time.sleep(1)  # Add a time break of 1 second (adjust as needed)
    return data_lists_embeds

def create_obj_data(collection_names, data_lists, uuid_list):
    data_lists_embeds = get_data_embeds(collection_names, data_lists, uuid_list)
    obj_list = {}
    for name in collection_names:
        if name == 'link' or name == 'media':  # Handle 'link' differently, use 'text' embeddings
            obj_data = [
                {
                    'uuid': id_uuid,
                    f'{name}': item,
                    'embeds': text_embedding  # Use the corresponding 'text' embedding
                } 
                for item, id_uuid, text_embedding in zip(data_lists[f'{name}_obj'], uuid_list, data_lists_embeds['text_obj'])
            ]
        else:
            obj_data = [
                {
                    'uuid': id_uuid,
                    f'{name}': item,
                    'embeds': embedding
                } 
                for item, id_uuid, embedding in zip(data_lists[f'{name}_obj'], uuid_list, data_lists_embeds[f'{name}_obj'])
            ]
        obj_list[name] = obj_data
    return obj_list


def save_obj_data_to_json(obj_list):
    for name, obj_data in obj_list.items():
        with open(f'{json_path}{partition_name}_{name}.json', 'w') as file:
            json.dump(obj_data, file)

In [46]:
obj_list = create_obj_data(collection_names, data_lists, uuid_list)

Processing text:   0%|                                   | 0/11 [00:00<?, ?it/s]

	
inspiring the computer scientists of tomorrow
the school of computer studies offers dynamic information technology and computing degree programs whose graduates are responsive to the rapid changing demands of the local and global community.
center of excellence
certified by the commission on higher education or ched. scs has been awarded center of excellence in the field of information technology.
what does this mean to you?
it means you’ll have access to a broad range of degrees and programs taught by highly accomplished faculty. you’ll be surrounded by a student body that can motivate and inspire you. most of all, it means you’re going to get a great education at the university of san jose – recoletos.
5
degree programs
2003
year founded
100%
db2 passing rate
lv.3
paascu certified
title: about school of computer sciences (scs), link: https://usjr.edu.ph/scs/about/


Processing text:   9%|██▍                        | 1/11 [00:01<00:15,  1.57s/it]

undergraduate degree programs
the university of san jose-recoletos is open to students who meet its academic standards and who are personally qualified to acquire a formal education, and willing to abide by the rules and ideas of the institution.

computer science
it aims to produce it professionals who will specialize in research, design and development of different computer and information systems

computer science, bachelor science [bscs]
information technology
this program is responsive to the nations thrust to promote the development of information technology to spur productivity, growth and global competitiveness; and to produce a base of it professionals and experts to meet a large workforce demand potential.
information technology, bachelor science [bsit]
also available: [mit] [masters degree]
information systems
this program is responsive to the nations thrust to promote the development of information technology to spur productivity, growth and global competitiveness; and to p

Processing text:  18%|████▉                      | 2/11 [00:03<00:13,  1.52s/it]

graduate programs

information technology
this course introduces students the advanced theoretical and practical aspects of information and communications technology. there is an in-depth approach to programming, a good balance in the development of competencies in both open source based and proprietary technologies.
information technology, masters [mit]
title: school of computer studies (scs) graduate programs, link: https://usjr.edu.ph/scs/degrees/graduate/


Processing text:  27%|███████▎                   | 3/11 [00:04<00:11,  1.43s/it]

bachelor of science in computer science

about this course
the bachelor of science in computer science (bscs) program is responsive to the nations thrust to become a source and developer of technology instead of just being a consumer of technology. it aims to produce it professionals who will specialize in research, design and development of different computer and information systems. the curriculum is laden with subjects that deal with comprehension of mathematical and profound computer science concepts, analysis and design of computer systems. the bscs graduate will have a good foundation and competence on the science on computers and information that will enable him/her to endeavor into research, design and development on various computer and it specialty areas.
careers
systems and applications programmer
systems analyst/designer
database designer/administrator
software engineer
algorithm analyst
software specialist
admission requirements
applicants are expected to have:
completed s

Processing text:  36%|█████████▊                 | 4/11 [00:05<00:09,  1.40s/it]

university of san jose-recoletos
college of information, computer and communications technology
basak campus, cebu city
bachelor of science in computer science
(effective 2015-2016)
(ladder type curriculum-first two years)
associate in computer technology (act)
id no.
name
adviser
batch no.
contact no./ e-mail ad
first year first semester
fg course
no.
description units prerequisites
comp 1 introduction to computing 3 none
prog1 computer programming 1 3 none
math 1 college algebra 3 none
math 2 plane trigonometry 3 none
english 1 study and thinking skills 3 none
reed 1 revelation and faith 3 none
pe 1 physical fitness 2 none
rotc/cwts 11 reserved officer training corps/civic welfare training service 3 none
guidance 1 adjustment to college life phase 1 0 none
23
first year second semester
fg course
no.
description units prerequisites
discrete 1 discrete structures 1 3 math 1
prog2 computer programming 2 3 prog1
web web development 3 comp 1
math 4 analytic geometry 3 math1, math2
english

Processing text:  45%|████████████▎              | 5/11 [00:07<00:08,  1.39s/it]

bachelor of science in
entertainment and multimedia computing

about this course
entertainment and multimedia computing is the study and use of concepts, principles, and techniques of computing in design and development of multimedia products and solutions it includes various applications such as in science, entertainment, education, simulations and advertising.

the program enables the students to be knowledgeable of the whole pipeline of game development. the students will acquire the independence and creative competencies to articulate project design and requirements of new projects, not necessarily based on standard templates.
careers
primary career targets for emc graduates:
design and development of multimedia products and solutions
game developer
admission requirements
applicants are expected to have:
completed senior highschool
duration
4 years
campus
university of san jose – recoletos, basak campus
units and electives
to attain the award of bachelor of science in computer scie

Processing text:  55%|██████████████▋            | 6/11 [00:09<00:08,  1.68s/it]

bachelor of science in
information systems

about this course
the bs information systems program includes the study of application and effect of information technology to organizations. graduates of the program should be able to implement an information system, which considers complex technological and organizational factors affecting it. these include components, tools, techniques, strategies, methodologies, etc. graduates are able to help an organization determine how information and technology-enabled business processes can be used as strategic tool to achieve a competitive advantage. as a result, is professionals require a sound understanding of organizational principles and practices so that they can serve as an effective bridge between the technical and management/users communities within an organization. this enables them to ensure that the organization has the information and the systems it needs to support its operations.
careers
systems and applications programmer
systems ana

Processing text:  64%|█████████████████▏         | 7/11 [00:10<00:06,  1.65s/it]

university of san jose-recoletos
college of information, computer and communications technology
basak campus, cebu city
bachelor of science in information systems
(effective 2015-2016)
id no.
name
adviser
batch no.
contact no./ e-mail ad
first year first semester
fg course
no.
description units prerequisites
comp 1 introduction to computing 3 none
prog1 computer programming 1 3 none
math 1 college algebra 3 none
math 2 plane trigonometry 3 none
english 1 study and thinking skills 3 none
reed 1 revelation and faith 3 none
pe 1 physical fitness 2 none
rotc/cwts 11 reserved officer training corps/civic welfare training service 3 none
guidance 1 adjustment to college life phase 1 0 none
23
first year second semester
fg course
no.
description units prerequisites
is fundamentals of information systems 3 comp
prog2 computer programming 2 3 prog1
web web development 3 comp1
math 4 analytic geometry 3 math1, math2
english 2 writing in the discipline 3 english 1
reed 2 christology 3 reed 1
pe 2 

Processing text:  73%|███████████████████▋       | 8/11 [00:12<00:04,  1.56s/it]

bachelor of science in
information technology

about this course
the bsit program is responsive to the nations thrust to promote the development of information technology to spur productivity, growth and global competitiveness; and to produce a base of it professionals and experts to meet a large workforce demand potential. the curriculum is composed of subjects that deal with the applications and practical knowledge.

competent training will be provided in the very useful areas like programming, database information systems development, installation and administration, computer networks and system resource management.

the bsit graduate will be primarily equipped with practical knowledge on how information systems are designed, installed, managed and administered. the graduates competence prepares him/her for the challenges of the it profession brought about by increasing and changing demands of businesses and industries.
careers
application programmer
database administrator
network a

Processing text:  82%|██████████████████████     | 9/11 [00:13<00:02,  1.49s/it]

university of san jose-recoletos
college of information, computer and communications technology
basak campus, cebu city
bachelor of science in information technology
(effective 2015-2016)
(ladder type curriculum-first two years)
associate in computer technology (act)
id no.
name
adviser
batch no.
contact no./ e-mail ad
first year first semester
fg course
no. description units prerequisites
comp 1 introduction to computing 3 none
prog1 computer programming 1 3 none
math 1 college algebra 3 none
math 2 plane trigonometry 3 none
english 1 study and thinking skills 3 none
reed 1 revelation and faith 3 none
pe 1 physical fitness 2 none
rotc/cwts 11 reserved officer training corps/civic welfare training service 3 none
guidance 1 adjustment to college life phase 1 0 none
23
first year second semester
fg course
no. description units prerequisites
discrete 1 discrete structures 1 3 math 1
prog2 computer programming 2 3 prog1
web web development 3 comp 1
math 4 analytic geometry 3 math1, math2
e

Processing text:  91%|███████████████████████▋  | 10/11 [00:15<00:01,  1.52s/it]

masters degree in
information technology

about this course
master of science in information technology (msit) and master of information technology (mit). the master of science in information technology program offered at the university of san jose – recoletos introduces students the advanced theoretical and practical aspects of information and communications technology. there is an in-depth approach to programming, a good balance in the development of competencies in both open source based and proprietary technologies.

the msit program culminates with the completion of a research and development (rnd) thesis project. a research and development (rnd) thesis project, as operationally defined, is an undertaking that exhibits concepts and theories translated to technology. a thesis project involves the analysis of an information technology case/problem and development of a solution/solutions to that problem. it should be innovative, or an improvement of an existing technology. with this 

Processing text: 100%|██████████████████████████| 11/11 [00:16<00:00,  1.54s/it]
Processing title:   0%|                                  | 0/11 [00:00<?, ?it/s]

about school of computer sciences (scs)


Processing title:   9%|██▎                       | 1/11 [00:01<00:19,  1.91s/it]

school of computer studies (scs) undergraduate degree programs


Processing title:  18%|████▋                     | 2/11 [00:03<00:15,  1.69s/it]

school of computer studies (scs) graduate programs


Processing title:  27%|███████                   | 3/11 [00:04<00:12,  1.52s/it]

about bachelor of science in
computer science
(cs or bscs)


Processing title:  36%|█████████▍                | 4/11 [00:06<00:11,  1.60s/it]

prospectus of bachelor of science in computer science (cs or bscs)


Processing title:  45%|███████████▊              | 5/11 [00:07<00:09,  1.50s/it]

about bachelor of science in entertainment and multimedia computing (bsemc or emc)


Processing title:  55%|██████████████▏           | 6/11 [00:09<00:07,  1.44s/it]

about bachelor of science in information system (bsis or is)


Processing title:  64%|████████████████▌         | 7/11 [00:10<00:06,  1.54s/it]

prospectus of bachelor of science in information system (bsis or is)


Processing title:  73%|██████████████████▉       | 8/11 [00:12<00:04,  1.47s/it]

about bachelor of science in information technology (bsit or it)


Processing title:  82%|█████████████████████▎    | 9/11 [00:13<00:02,  1.44s/it]

prospectus of bachelor of science in information technology (bsit or it)


Processing title:  91%|██████████████████████▋  | 10/11 [00:14<00:01,  1.41s/it]

about master of science in information technology (msit) and master of information technology (mit)


Processing title: 100%|█████████████████████████| 11/11 [00:16<00:00,  1.48s/it]


Saving

In [47]:
if embedder == 'fasttext':
    for name in collection_names:
        for data in obj_list[name]:
            data['embeds']= data['embeds'].tolist()

In [48]:
save_obj_data_to_json(obj_list)

In [49]:
obj_list['text'][0]

{'uuid': '797fe1d9-b6e4-47a0-8014-5f2891502bd8',
 'text': '\t\nInspiring the Computer Scientists of Tomorrow\nThe School of Computer Studies offers dynamic Information technology and computing degree programs whose graduates are responsive to the rapid changing demands of the local and global community.\nCenter of Excellence\nCertified by the Commission on Higher Education or CHED. SCS has been awarded Center of Excellence in the field of Information Technology.\nWhat does this mean to you?\nIt means you’ll have access to a broad range of degrees and programs taught by highly accomplished faculty. You’ll be surrounded by a student body that can motivate and inspire you. Most of all, it means you’re going to get a great education at the University of San Jose – Recoletos.\n5\nDEGREE PROGRAMS\n2003\nYEAR FOUNDED\n100%\nDB2 PASSING RATE\nLv.3\nPAASCU CERTIFIED\ntitle: About School of Computer Sciences (SCS), link: https://usjr.edu.ph/scs/about/',
 'embeds': [-0.0002958411641884595,
  0.02