In [8]:
from pymilvus import connections, DataType, CollectionSchema, FieldSchema, Collection, Partition, utility
from pymilvus import Milvus, DataType, Collection, MilvusException
import openai
import pandas as pd
import numpy as np
import re
import json
from openai.embeddings_utils import get_embedding
import time
from tqdm import tqdm
import fasttext

Api

In [9]:
openai.api_key = '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'

Dictionaries

In [10]:
collections_list = [
    'text_collection',
    'author_collection',
    'title_collection',
    'contact_collection',
    'name_collection',
    'position_collection',
    'department_collection',
    'date_collection',
]
fields_list = [
    'text',
    'author',
    'title',
    'contact',
    'name',
    'position',
    'department',
    'date',
]
collections_dict = {
    "text_collection": ["uuid", "text_id", "text", "embeds", "media", "link", "partition_name"],
    "author_collection": ["uuid", "author", "embeds", "partition_name"],
    "title_collection": ["uuid", "title", "embeds", "partition_name"],
    "date_collection": ["uuid", "date", "embeds", "partition_name"],
    "contact_collection": ["uuid", "contact", "embeds", "partition_name"],
    "department_collection": ["uuid", "department", "embeds", "partition_name"],
    "name_collection": ["uuid", "name", "embeds", "partition_name"],
    "position_collection": ["uuid", "position", "embeds", "partition_name"]
}

partitions = {
    "documents_partition": ["text_collection", "author_collection", "title_collection", "date_collection"],
    "social_posts_partition": ["text_collection", "date_collection"],
    "contacts_partition": ["name_collection", "text_collection", "contact_collection", "department_collection"],
    "people_partition": ["text_collection","name_collection","position_collection","department_collection"],
    "usjr_documents_partition": ["text_collection", "title_collection"],
    "scs_documents_partition" : ["text_collection"],
    "religious_admin_people_partition": ["text_collection","name_collection","position_collection"],
}

Connection

In [38]:
# Check if the connection already exists
if connections.has_connection('default'):
    connections.remove_connection('default')  # Disconnect if it exists

# Now, reconnect with your new configuration
connections.connect(alias='default', host='localhost', port='19530')

Embedder

In [11]:
fasttext_model = fasttext.load_model('/Users/garfieldgreglim/Library/Mobile Documents/com~apple~CloudDocs/Josenian-Query/Embedder/crawl-300d-2M-subword.bin')
def get_embedding(text, embedding_type):
    text = text.replace("\n", " ")
    model = "text-embedding-ada-002"
    if embedding_type == 'openai':
        return openai.Embedding.create(input=[text.lower()], model=model)['data'][0]['embedding']
    elif embedding_type == 'fasttext':
        return fasttext_model.get_sentence_vector(text.lower())
    else:
        raise ValueError("Invalid embedding_type. Expected 'openai' or 'fasttext'.")



Symbol remover

In [12]:
def remove_non_alphanumeric(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

Vectorizer

In [92]:
def vectorize_query(query):
    return {'question1536': get_embedding(query.lower(), 'openai'),'question300': get_embedding(query.lower(), 'fasttext').tolist()}

In [93]:
vectors = vectorize_query("Authored by Emilian Catalina")

In [94]:
vectors

{'question1536': [0.0009016267722472548,
  -0.0004782541946042329,
  0.002301543951034546,
  -0.04398893192410469,
  -0.0030298142228275537,
  0.01506718434393406,
  -0.028629044070839882,
  -0.018412351608276367,
  -0.0027249164413660765,
  -0.01735304854810238,
  0.011596575379371643,
  0.01602892018854618,
  -0.009749764576554298,
  0.012272577732801437,
  -0.011443255469202995,
  0.02046126499772072,
  0.01945771463215351,
  -0.00914345309138298,
  -0.007240890525281429,
  0.0029758037999272346,
  -0.023165274411439896,
  0.023806430399417877,
  0.003510681912302971,
  0.011854431591928005,
  -0.018621424213051796,
  -0.005704205017536879,
  0.00882287509739399,
  -0.043682292103767395,
  0.02277500368654728,
  -0.016962779685854912,
  -0.006815775763243437,
  -0.0026046994607895613,
  -0.013269158080220222,
  -0.03320077061653137,
  -0.026161985471844673,
  0.0020001304801553488,
  0.009728857316076756,
  -0.025980789214372635,
  0.022356858476996422,
  0.009896115399897099,
  0.0

Search collection

In [95]:
def search_collections(vectors, partition_name):
    question1536=vectors['question1536']
    question300=vectors['question300']
    results_dict = {}
    search_params = {
    "metric_type": "L2",  # Distance metric, can be L2, IP (Inner Product), etc.
    "offset": 0,}
    for name in fields_list:
        try:
            if name == 'text':
                collection = Collection(f"{name}_collection")
                collection.load()
                result = collection.search(
                    data=[question1536],
                    anns_field="embeds",
                    param=search_params,
                    limit=10,
                    partition_names=[partition_name],
                    output_fields=['uuid', 'text_id'],
                    consistency_level="Strong"
                )
                results_dict[name] = result
            else:
                collection = Collection(f"{name}_collection")
                collection.load()
                result = collection.search(
                    data=[question300],
                    anns_field="embeds",
                    param=search_params,
                    limit=10,
                    partition_names=[partition_name],
                    output_fields=['uuid'],
                    consistency_level="Strong"
                )
                results_dict[name] = result
        except MilvusException as e:
            if 'partition name' in str(e) and 'not found' in str(e):
                print(f"Partition '{partition_name}' not found in collection '{name}', skipping...")
                continue
            else:
                raise e  # if it's a different kind of MilvusException, we still want to raise it
    
    return results_dict

Check dimensions

In [96]:
def check_collection_dimension(collection):
    collection_params = collection.schema
    vector_field = [field for field in collection_params.fields if field.dtype == DataType.FLOAT_VECTOR][0]
    print(f"Dimension of vectors in collection '{collection.name}': {vector_field.params['dim']}")

In [97]:
results_dict = search_collections(vectors, 'documents_partition')

RPC error: [search], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:33:01.307340', 'RPC error': '2023-07-26 03:33:01.311487'}>
RPC error: [search], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:33:01.339215', 'RPC error': '2023-07-26 03:33:01.343686'}>
RPC error: [search], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:33:01.368903', 'RPC error': '2023-07-26 03:33:01.372781'}>
RPC error: [search], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:33:01.392287', 'RPC error': '2023-07-26 03:33:01.395685'}>


Partition 'documents_partition' not found in collection 'contact', skipping...
Partition 'documents_partition' not found in collection 'name', skipping...
Partition 'documents_partition' not found in collection 'position', skipping...
Partition 'documents_partition' not found in collection 'department', skipping...


Process results

In [98]:
def process_results(results_dict):
    json_results = {}

    for collection_name, result in results_dict.items():
        for query_hits in result:
            for hit in query_hits:
                if collection_name == 'text':
                    id_field = 'entity_id'
                    id_value = hit.entity.get('text_id')
                else:
                    id_field = 'entity_id'
                    id_value = hit.id
                
                # Create the result dictionary
                result_dict = {
                    id_field: id_value,
                    "distance": hit.distance,
                    "collection": collection_name
                }

                # If the id_value is already in the results and the new distance is greater, skip
                if id_value in json_results and json_results[id_value]["distance"] < hit.distance:
                    continue

                # Otherwise, update/insert the result
                json_results[id_value] = result_dict
                
            json_results_list = list(json_results.values())
            json_results_sorted = sorted(json_results_list, key=lambda x: x['distance'])
    
    return json_results_sorted


In [99]:
json_results_sorted = process_results(results_dict)

In [117]:
def populate_results(json_results_sorted):
    # Load all collections beforehand
    collections = {name: Collection(f"{name}_collection") for name in fields_list}

    # Create a list of entity IDs for the query
    entity_ids = [result["entity_id"] for result in json_results_sorted]

    # Preparing an empty dictionary for each field in the results
    for result in json_results_sorted:
        for name in fields_list:
            result[name] = []

    # Query for all relevant records at once
    for name, collection in collections.items():
        try:
            # Prepare the query
            output_fields = []
            if name == 'text':
                query_field = "text_id"
                output_fields = [name, 'text_id']
            else:
                query_field = "uuid"
                output_fields = [name]

            query = f"{query_field} in {entity_ids}"

            query_results = collection.query(
                expr=query, 
                offset=0, 
                limit=len(entity_ids), 
                partition_names=['documents_partition'], 
                output_fields=output_fields, 
                consistency_level="Strong"
            )

            # Append the results to the relevant fields in the results dictionary
            for query_result in query_results:
                for result in json_results_sorted:
                    if (name == 'text' and result["entity_id"] == query_result["text_id"]) or (name != 'text' and result["entity_id"] == query_result["uuid"]):
                        result[name].append(query_result[name])
            final_results = []
            for result in json_results_sorted:
                obj = {}
                for item in result:
                    # If item is not 'entity_id' or 'distance' and the item's value is not empty
                    if item not in ['entity_id', 'collection'] and result[item]:
                        obj[item] = result[item]
                final_results.append(obj)
        except Exception as e:
            print(f"Error with collection {name}: {str(e)}")
    return final_results[:10]


In [101]:
final_results = populate_results(json_results_sorted)

RPC error: [query], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:33:03.088350', 'RPC error': '2023-07-26 03:33:03.097856'}>
RPC error: [query], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:33:03.099120', 'RPC error': '2023-07-26 03:33:03.114345'}>
RPC error: [query], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:33:03.115060', 'RPC error': '2023-07-26 03:33:03.128520'}>
RPC error: [query], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:33:03.129148', 'RPC error': '2023-07-26 03:33:03.136173'}>


Error with collection contact: <MilvusException: (code=1, message=partition name documents_partition not found)>
Error with collection name: <MilvusException: (code=1, message=partition name documents_partition not found)>
Error with collection position: <MilvusException: (code=1, message=partition name documents_partition not found)>
Error with collection department: <MilvusException: (code=1, message=partition name documents_partition not found)>


In [102]:
final_results

[{'text': ['This study aims to identify whether the Senior High School students are Linguistic, Discourse, Sociolinguistic or Strategic Competent in English. Quantitative and qualitative research methods were used to collate a survey that would sure mend the study. With comprehensive computation of the random datasets, almost every Senior High School student achieved a satisfactory rating in both the Perceptions and Abilities Categories. A Chi-Square Statistic was also used and yielded maximum values which consequently, resulted to low p-values, suggesting the negation of the studyâ€™s null hypothesis. The result is further supported using the correlation statistic which corresponds to a high correlation between the studentsâ€™ Perceptions and Abilities in their speaking competence. The outcome of the summary of all competencies show that the Linguistic Competence contributed the most to the overall language intents of the students with sociolinguistic competence giving the least, whic

In [120]:
def generate_response(prompt, string_json):
    # Format the input as per the desired conversation format
    conversation = [
        {'role': 'system', 'content': """You are Josenian Quiri. University of San Jose- Recoletos' general knowledge base assistant. Refer to yourself as JQ. If there are links, give the link as well."""},
        {'role': 'user', 'content': prompt},
        {'role': 'system', 'content': f'Here is the database JSON from your knowledge base (note: select only the correct answer): \n{string_json}'},
        {'role': 'user', 'content': ''}
    ]
    
    # Convert the conversation to a string
    conversation_str = ''.join([f'{item["role"]}: {item["content"]}\n' for item in conversation])

    response = openai.ChatCompletion.create(
      model="gpt-4",
      messages=conversation,
      temperature=1,
      max_tokens=250,
      top_p=1,
      frequency_penalty=0,
      presence_penalty=0
    )
    
    # Extract the generated response from the API's response
    generated_text = response['choices'][0]['message']['content']


    # Return the response
    return generated_text


In [121]:
def question_answer():
    prompt = input("You: ")
    vectors = vectorize_query(prompt)
    results_dict = search_collections(vectors, 'documents_partition')
    json_results_sorted = process_results(results_dict)
    final_results = populate_results(json_results_sorted)
    display
    string_json = json.dumps(final_results)
    display(string_json)
    generated_text = generate_response(prompt, string_json)
    print(f"JQ: {generated_text}")

In [123]:
question_answer()

You: Emiliano Catalina


RPC error: [search], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:45:00.894331', 'RPC error': '2023-07-26 03:45:00.897707'}>
RPC error: [search], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:45:00.916277', 'RPC error': '2023-07-26 03:45:00.919555'}>
RPC error: [search], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:45:00.938368', 'RPC error': '2023-07-26 03:45:00.941589'}>
RPC error: [search], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:45:00.959539', 'RPC error': '2023-07-26 03:45:00.962719'}>


Partition 'documents_partition' not found in collection 'contact', skipping...
Partition 'documents_partition' not found in collection 'name', skipping...
Partition 'documents_partition' not found in collection 'position', skipping...
Partition 'documents_partition' not found in collection 'department', skipping...


RPC error: [query], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:45:02.497726', 'RPC error': '2023-07-26 03:45:02.502636'}>
RPC error: [query], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:45:02.503259', 'RPC error': '2023-07-26 03:45:02.507452'}>
RPC error: [query], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:45:02.508018', 'RPC error': '2023-07-26 03:45:02.516438'}>
RPC error: [query], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 03:45:02.516954', 'RPC error': '2023-07-26 03:45:02.521365'}>


Error with collection contact: <MilvusException: (code=1, message=partition name documents_partition not found)>
Error with collection name: <MilvusException: (code=1, message=partition name documents_partition not found)>
Error with collection position: <MilvusException: (code=1, message=partition name documents_partition not found)>
Error with collection department: <MilvusException: (code=1, message=partition name documents_partition not found)>


'[{"distance": 0.23630306124687195, "text": ["This paper deals with 1) angle trisection, 2) Bhaskara\\u00e2\\u20ac\\u2122s first proof, and 3) Pythagorean theorem. The purpose of this paper is threefold. First, to show a new, direct method of trisecting the 900 angle using unmarked straight edge and compass; secondly, to show Bhaskara\\u00e2\\u20ac\\u2122s first proof of the Pythagorean theorem (c2 = a2 + b2) as embedded in this new, direct trisection of the 900 angle; lastly, to show the derivation of the Pythagorean theorem from this trisection of the 900 angle. This paper employs the direct dissection method. It concludes by presenting four points: a) the concept of trisectability as distinct from concept of constructability; b) the trisection of the 900 angle as really a new, different method; c) Bhaskara\\u00e2\\u20ac\\u2122s first proof of the Pythagorean theorem as truly embedded in this trisection of the 900 angle and; d) another way of deriving Pythagorean theorem from this tr

JQ: Emiliano Catalina or Emiliano C. De Catalina is the author of several papers including:

1. "Angle Trisection, Bhaskara’s Proof, and Pythagorean Theorem" published on May 28, 2021. You can view more about it [here](https://rmrj.usjr.edu.ph/rmrj/index.php/RMRJ/article/view/987).

2. "Timeless Existence and Principle of Creation: Notions Embedded in John 1:1, 'In the Beginning Was the Word'" published on May 25, 2022. It can be viewed [here](https://rmrj.usjr.edu.ph/rmrj/index.php/RMRJ/article/view/986).

3. He is also a co-author of "The Yapian Classification of the Vocabulary of the Austronesian Visayan-Cebuano Language" published on June 29, 2022. View this [here](https://rmrj.usjr.edu.ph/rmrj/index.php/RMRJ/article/view/1221).
