In [1]:
from pymilvus import connections, DataType, CollectionSchema, FieldSchema, Collection, Partition, utility
from pymilvus import Milvus, DataType, Collection, MilvusException
import openai
import pandas as pd
import numpy as np
import re
import json
from openai.embeddings_utils import get_embedding
import time
from tqdm import tqdm
import fasttext

Api

In [2]:
openai.api_key = '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'

Dictionaries

In [3]:
collections_list = [
    'text_collection',
    'author_collection',
    'title_collection',
    'contact_collection',
    'name_collection',
    'position_collection',
    'department_collection',
    'date_collection',
]
fields_list = [
    'text',
    'author',
    'title',
    'contact',
    'name',
    'position',
    'department',
    'date',
]
collections_dict = {
    "text_collection": ["uuid", "text_id", "text", "embeds", "media", "link", "partition_name"],
    "author_collection": ["uuid", "author", "embeds", "partition_name"],
    "title_collection": ["uuid", "title", "embeds", "partition_name"],
    "date_collection": ["uuid", "date", "embeds", "partition_name"],
    "contact_collection": ["uuid", "contact", "embeds", "partition_name"],
    "department_collection": ["uuid", "department", "embeds", "partition_name"],
    "name_collection": ["uuid", "name", "embeds", "partition_name"],
    "position_collection": ["uuid", "position", "embeds", "partition_name"]
}

partitions = {
    "documents_partition": ["text_collection", "author_collection", "title_collection", "date_collection"],
    "social_posts_partition": ["text_collection", "date_collection"],
    "contacts_partition": ["name_collection", "text_collection", "contact_collection", "department_collection"],
    "people_partition": ["text_collection","name_collection","position_collection","department_collection"],
    "usjr_documents_partition": ["text_collection", "title_collection"],
    "scs_documents_partition" : ["text_collection"],
    "religious_admin_people_partition": ["text_collection","name_collection","position_collection"],
}

Connection

In [4]:
# Check if the connection already exists
if connections.has_connection('default'):
    connections.remove_connection('default')  # Disconnect if it exists

# Now, reconnect with your new configuration
connections.connect(alias='default', host='localhost', port='19530')

Embedder

In [5]:
fasttext_model = fasttext.load_model('/Users/garfieldgreglim/Library/Mobile Documents/com~apple~CloudDocs/Josenian-Query/Embedder/crawl-300d-2M-subword.bin')
def get_embedding(text, embedding_type):
    text = text.replace("\n", " ")
    model = "text-embedding-ada-002"
    if embedding_type == 'openai':
        return openai.Embedding.create(input=[text.lower()], model=model)['data'][0]['embedding']
    elif embedding_type == 'fasttext':
        return fasttext_model.get_sentence_vector(text.lower())
    else:
        raise ValueError("Invalid embedding_type. Expected 'openai' or 'fasttext'.")



Symbol remover

In [6]:
def remove_non_alphanumeric(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

Vectorizer

In [7]:
def vectorize_query(query):
    return {'question1536': get_embedding(query.lower(), 'openai'),'question300': get_embedding(query.lower(), 'fasttext').tolist()}

Search collection

In [10]:
def search_collections(vectors, partition_names):
    question1536=vectors['question1536']
    question300=vectors['question300']
    results_dict = {}
    search_params = {
    "metric_type": "L2",  # Distance metric, can be L2, IP (Inner Product), etc.
    "offset": 0,}
    for name in fields_list:
        try:
            if name == 'text':
                collection = Collection(f"{name}_collection")
                collection.load()
                result = collection.search(
                    data=[question1536],
                    anns_field="embeds",
                    param=search_params,
                    limit=10,
                    partition_names=partition_names,
                    output_fields=['uuid', 'text_id'],
                    consistency_level="Strong"
                )
                results_dict[name] = result
            else:
                collection = Collection(f"{name}_collection")
                collection.load()
                result = collection.search(
                    data=[question300],
                    anns_field="embeds",
                    param=search_params,
                    limit=10,
                    partition_names=partition_names,
                    output_fields=['uuid'],
                    consistency_level="Strong"
                )
                results_dict[name] = result
        except MilvusException as e:
            if 'partition name' in str(e) and 'not found' in str(e):
                print(f"Partition '{partition_names}' not found in collection '{name}', skipping...")
                continue
            else:
                raise e  # if it's a different kind of MilvusException, we still want to raise it
    
    return results_dict

Check dimensions

In [11]:
def check_collection_dimension(collection):
    collection_params = collection.schema
    vector_field = [field for field in collection_params.fields if field.dtype == DataType.FLOAT_VECTOR][0]
    print(f"Dimension of vectors in collection '{collection.name}': {vector_field.params['dim']}")

Process results

In [13]:
def process_results(results_dict):
    json_results = {}

    for collection_name, result in results_dict.items():
        for query_hits in result:
            for hit in query_hits:
                if collection_name == 'text':
                    id_field = 'entity_id'
                    id_value = hit.entity.get('text_id')
                else:
                    id_field = 'entity_id'
                    id_value = hit.id
                
                # Create the result dictionary
                result_dict = {
                    id_field: id_value,
                    "distance": hit.distance,
                    "collection": collection_name
                }

                # If the id_value is already in the results and the new distance is greater, skip
                if id_value in json_results and json_results[id_value]["distance"] < hit.distance:
                    continue

                # Otherwise, update/insert the result
                json_results[id_value] = result_dict
                
            json_results_list = list(json_results.values())
            json_results_sorted = sorted(json_results_list, key=lambda x: x['distance'])
    
    return json_results_sorted


In [15]:
def populate_results(json_results_sorted, partition_names):
    # Load all collections beforehand
    collections = {name: Collection(f"{name}_collection") for name in fields_list}

    # Create a list of entity IDs for the query
    entity_ids = [result["entity_id"] for result in json_results_sorted]

    # Preparing an empty dictionary for each field in the results
    for result in json_results_sorted:
        for name in fields_list:
            result[name] = []

    # Query for all relevant records at once
    for name, collection in collections.items():
        try:
            # Prepare the query
            output_fields = []
            if name == 'text':
                query_field = "text_id"
                output_fields = [name, 'text_id']
            else:
                query_field = "uuid"
                output_fields = [name]

            query = f"{query_field} in {entity_ids}"

            query_results = collection.query(
                expr=query, 
                offset=0, 
                limit=len(entity_ids), 
                partition_names=[partition_names], 
                output_fields=output_fields, 
                consistency_level="Strong"
            )

            # Append the results to the relevant fields in the results dictionary
            for query_result in query_results:
                for result in json_results_sorted:
                    if (name == 'text' and result["entity_id"] == query_result["text_id"]) or (name != 'text' and result["entity_id"] == query_result["uuid"]):
                        result[name].append(query_result[name])
            final_results = []
            for result in json_results_sorted:
                obj = {}
                for item in result:
                    # If item is not 'entity_id' or 'distance' and the item's value is not empty
                    if item not in ['entity_id', 'collection'] and result[item]:
                        obj[item] = result[item]
                final_results.append(obj)
        except Exception as e:
            print(f"Error with collection {name}: {str(e)}")
    return final_results[:10]


In [19]:
def generate_response(prompt, string_json):
    # Format the input as per the desired conversation format
    conversation = [
        {'role': 'system', 'content': """You are Josenian Quiri. University of San Jose- Recoletos' general knowledge base assistant. Refer to yourself as JQ. If there are links, give the link as well."""},
        {'role': 'user', 'content': prompt},
        {'role': 'system', 'content': f'Here is the database JSON from your knowledge base (note: select only the correct answer): \n{string_json}'},
        {'role': 'user', 'content': ''}
    ]
    
    # Convert the conversation to a string
    conversation_str = ''.join([f'{item["role"]}: {item["content"]}\n' for item in conversation])

    response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=conversation,
      temperature=1,
      max_tokens=250,
      top_p=1,
      frequency_penalty=0,
      presence_penalty=0
    )
    
    # Extract the generated response from the API's response
    generated_text = response['choices'][0]['message']['content']


    # Return the response
    return generated_text


In [20]:
def ranking_partitions(vectors):
    return ['social_posts_partition', 'documents_partition', 'people_partition', "contacts_partition"]

In [25]:
def question_answer():
    while True:
        try:
            prompt = input("You: ")
            if not prompt:
                print("No input provided. Try again.")
                continue
            vectors = vectorize_query(prompt)
            if vectors is None:
                print("No vectors returned. Check your vectorize_query function.")
                continue
            ranked_partitions = ranking_partitions(vectors)
            if ranked_partitions is None:
                print("No ranked_partitions returned. Check your ranking_partitions function.")
                continue
            partition = 0
            correct = 0
            display(ranked_partitions[partition])
            while correct != 1:
                results_dict = search_collections(vectors, [ranked_partitions[partition]])
                if results_dict is None:
                    print("No results returned. Check your search_collections function.")
                    break
                json_results_sorted = process_results(results_dict)
                if json_results_sorted is None:
                    print("No sorted results returned. Check your process_results function.")
                    break
                final_results = populate_results(json_results_sorted, ranked_partitions[partition])
                if final_results is None:
                    print("No final results returned. Check your populate_results function.")
                    break
                string_json = json.dumps(final_results)
                display(string_json)
                generated_text = generate_response(prompt, string_json)
                if generated_text is None:
                    print("No response generated. Check your generate_response function.")
                    break
                print(f"JQ: {generated_text}")
                correct = input("Is the answer correct? 1-Y, 0-N: ")
                if correct not in ['0', '1']:
                    print("Invalid input. Try again.")
                elif partition <= 3:
                    partition = partition + 1
                else:
                    partition = 0
        except Exception as e:
            print(f"An error occurred: {e}")


In [None]:
question_answer()

You: 
No input provided. Try again.
You: What is the meaning OAR?


'social_posts_partition'

RPC error: [search], <MilvusException: (code=1, message=partition name social_posts_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:23.031982', 'RPC error': '2023-07-26 14:48:23.035613'}>
RPC error: [search], <MilvusException: (code=1, message=partition name social_posts_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:23.053414', 'RPC error': '2023-07-26 14:48:23.056460'}>
RPC error: [search], <MilvusException: (code=1, message=partition name social_posts_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:23.074648', 'RPC error': '2023-07-26 14:48:23.077821'}>
RPC error: [search], <MilvusException: (code=1, message=partition name social_posts_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:23.096976', 'RPC error': '2023-07-26 14:48:23.099911'}>
RPC error: [search], <MilvusException: (code=1, message=partition name social_posts_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:23.118153', 'RPC error': '2023-07-26 14:48:23.12090

Partition '['social_posts_partition']' not found in collection 'author', skipping...
Partition '['social_posts_partition']' not found in collection 'title', skipping...
Partition '['social_posts_partition']' not found in collection 'contact', skipping...
Partition '['social_posts_partition']' not found in collection 'name', skipping...
Partition '['social_posts_partition']' not found in collection 'position', skipping...
Partition '['social_posts_partition']' not found in collection 'department', skipping...


RPC error: [query], <MilvusException: (code=1, message=partition name social_posts_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:23.824001', 'RPC error': '2023-07-26 14:48:23.834952'}>
RPC error: [query], <MilvusException: (code=1, message=partition name social_posts_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:23.835625', 'RPC error': '2023-07-26 14:48:23.844069'}>
RPC error: [query], <MilvusException: (code=1, message=partition name social_posts_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:23.844539', 'RPC error': '2023-07-26 14:48:23.849616'}>
RPC error: [query], <MilvusException: (code=1, message=partition name social_posts_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:23.850057', 'RPC error': '2023-07-26 14:48:23.854185'}>
RPC error: [query], <MilvusException: (code=1, message=partition name social_posts_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:23.854632', 'RPC error': '2023-07-26 14:48:23.858617'}>


Error with collection author: <MilvusException: (code=1, message=partition name social_posts_partition not found)>
Error with collection title: <MilvusException: (code=1, message=partition name social_posts_partition not found)>
Error with collection contact: <MilvusException: (code=1, message=partition name social_posts_partition not found)>
Error with collection name: <MilvusException: (code=1, message=partition name social_posts_partition not found)>
Error with collection position: <MilvusException: (code=1, message=partition name social_posts_partition not found)>
Error with collection department: <MilvusException: (code=1, message=partition name social_posts_partition not found)>


'[{"distance": 0.44889354705810547, "text": ["ucators (JOED) Organization Executive Committee: Franceska Therese K. Fanilag (President) Maika Marie F. Masibay (Vice President \\u00e2\\u0080\\u0093 Internal) Shemiah C. Georsua (Vice President \\u00e2\\u0080\\u0093 External) Mary Cristine C. Ymbong (Vice President \\u00e2\\u0080\\u0093 Communication) Mary Rose A. Tuburan (Vice President Communication) Bethany Faith D. Bataluna (Secretary) Ray Gilbert P. Guiterrez (Finance Officer) Christine Franco (Auditor) Zephanie Marie M. Carmona (Presidential Secretary)  Leadership Awards for Josenian Educators (JOED) Organization Social Media Managers: Thea Mikhaila B. Casta\\u00c3\\u00b1ares Stephanie Marie Lim  Leadership Awards for Active Josenian Educators (JOED) Organization Commissioners: Iah Mari M. Baguio and Pretzilyn Laika T. Ferrolino (Ways and Means) Cristian V. Albarando and Frethel Kyce L. Almonte (Community Outreach Program) Anna Bianca Isabella L. Vidal (Religious Affairs) Ana Loren 

JQ: I apologize, but I couldn't find any information about the meaning of "OAR" in the provided knowledge base. Could you please provide more context or specify what "OAR" refers to?
Is the answer correct? 1-Y, 0-N: 0


RPC error: [search], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:32.459772', 'RPC error': '2023-07-26 14:48:32.463745'}>
RPC error: [search], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:32.486940', 'RPC error': '2023-07-26 14:48:32.489943'}>
RPC error: [search], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:32.507088', 'RPC error': '2023-07-26 14:48:32.510670'}>
RPC error: [search], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:32.526827', 'RPC error': '2023-07-26 14:48:32.530180'}>


Partition '['documents_partition']' not found in collection 'contact', skipping...
Partition '['documents_partition']' not found in collection 'name', skipping...
Partition '['documents_partition']' not found in collection 'position', skipping...
Partition '['documents_partition']' not found in collection 'department', skipping...


RPC error: [query], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:34.008603', 'RPC error': '2023-07-26 14:48:34.017977'}>
RPC error: [query], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:34.018621', 'RPC error': '2023-07-26 14:48:34.023347'}>
RPC error: [query], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:34.023728', 'RPC error': '2023-07-26 14:48:34.028471'}>
RPC error: [query], <MilvusException: (code=1, message=partition name documents_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:34.028936', 'RPC error': '2023-07-26 14:48:34.033616'}>


Error with collection contact: <MilvusException: (code=1, message=partition name documents_partition not found)>
Error with collection name: <MilvusException: (code=1, message=partition name documents_partition not found)>
Error with collection position: <MilvusException: (code=1, message=partition name documents_partition not found)>
Error with collection department: <MilvusException: (code=1, message=partition name documents_partition not found)>




JQ: OAR stands for Order of Augustinian Recollects. The Order of Augustinian Recollects is a Catholic religious order of friars and nuns. They follow the teachings of St. Augustine and focus on the spiritual and pastoral care of the faithful. The University of San Jose-Recoletos, where I am based, is a university run by the Order of Augustinian Recollects.
Is the answer correct? 1-Y, 0-N: 0


RPC error: [search], <MilvusException: (code=1, message=partition name people_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:45.661836', 'RPC error': '2023-07-26 14:48:45.666189'}>
RPC error: [search], <MilvusException: (code=1, message=partition name people_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:45.691054', 'RPC error': '2023-07-26 14:48:45.694824'}>
RPC error: [search], <MilvusException: (code=1, message=partition name people_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:45.716089', 'RPC error': '2023-07-26 14:48:45.719585'}>


Partition '['people_partition']' not found in collection 'author', skipping...
Partition '['people_partition']' not found in collection 'title', skipping...
Partition '['people_partition']' not found in collection 'contact', skipping...


RPC error: [search], <MilvusException: (code=1, message=partition name people_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:46.845851', 'RPC error': '2023-07-26 14:48:46.849172'}>


Partition '['people_partition']' not found in collection 'date', skipping...


RPC error: [query], <MilvusException: (code=1, message=partition name people_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:47.223062', 'RPC error': '2023-07-26 14:48:47.228003'}>
RPC error: [query], <MilvusException: (code=1, message=partition name people_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:47.228727', 'RPC error': '2023-07-26 14:48:47.233318'}>
RPC error: [query], <MilvusException: (code=1, message=partition name people_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:47.233813', 'RPC error': '2023-07-26 14:48:47.237842'}>


Error with collection author: <MilvusException: (code=1, message=partition name people_partition not found)>
Error with collection title: <MilvusException: (code=1, message=partition name people_partition not found)>
Error with collection contact: <MilvusException: (code=1, message=partition name people_partition not found)>


RPC error: [query], <MilvusException: (code=1, message=partition name people_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:48.420925', 'RPC error': '2023-07-26 14:48:48.429659'}>


Error with collection date: <MilvusException: (code=1, message=partition name people_partition not found)>


'[{"distance": 0.362814724445343, "text": ["TALAVERA HOUSE OF PRAYER\\n\\nRev. Fr. Domingo E. Saladaga, also known as \\u201cFr. Sandy,\\u201d is a Cebuano who hails from Capitol Site, Cebu City.\\n\\nHe earned his college degree in 1983 at Seminario Mayor-Recoletos in Baguio City. Then, he had his masters degree in Psychology at the University of San Jose-Recoletos.\\n\\nHe spent his novitiate and \\u2018simple profession\\u2019 at Seminario Mayor-Recoletos from 1983 to 1984. Later on, he had his \\u2018solemn profession\\u2019 in 1987 at the Agustinos Recoletos in Navarra, Spain.\\n\\nFr. Saladaga was ordained as a Deacon on November 13, 1987 in Spain and had his Sacerdotal Ordination on October 16, 1988 at Our Lady of Mount Carmel Parish-Recoletos in Cebu. Due to his exposure in the West, Fr. Saladaga knows how to speak in English, Spanish, and Portuguese.\\n\\nIn 1988, he was first assigned as an Assistant to the Vicar Provincial of Bahay Toro in Quezon City. Six years after,  he b

JQ: The OAR stands for Order of Augustinian Recollects. They are a Catholic religious order founded in 16th-century Spain and are part of the wider Augustinian family. The Order of Augustinian Recollects plays an important role in the field of education and missionary work. They are known for their commitment to spirituality, learning, and service to the community.
Is the answer correct? 1-Y, 0-N: 0


RPC error: [search], <MilvusException: (code=1, message=partition name contacts_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:56.858024', 'RPC error': '2023-07-26 14:48:56.863139'}>
RPC error: [search], <MilvusException: (code=1, message=partition name contacts_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:56.892738', 'RPC error': '2023-07-26 14:48:56.896903'}>


Partition '['contacts_partition']' not found in collection 'author', skipping...
Partition '['contacts_partition']' not found in collection 'title', skipping...


RPC error: [search], <MilvusException: (code=1, message=partition name contacts_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:57.645307', 'RPC error': '2023-07-26 14:48:57.648960'}>


Partition '['contacts_partition']' not found in collection 'position', skipping...


RPC error: [search], <MilvusException: (code=1, message=partition name contacts_partition not found)>, <Time:{'RPC start': '2023-07-26 14:48:58.033239', 'RPC error': '2023-07-26 14:48:58.037068'}>
RPC error: [query], <MilvusException: (code=1, message=invalid max query result window, limit [0] is invalid, should be greater than 0)>, <Time:{'RPC start': '2023-07-26 14:48:58.087189', 'RPC error': '2023-07-26 14:48:58.089444'}>
RPC error: [query], <MilvusException: (code=1, message=invalid max query result window, limit [0] is invalid, should be greater than 0)>, <Time:{'RPC start': '2023-07-26 14:48:58.090023', 'RPC error': '2023-07-26 14:48:58.092280'}>
RPC error: [query], <MilvusException: (code=1, message=invalid max query result window, limit [0] is invalid, should be greater than 0)>, <Time:{'RPC start': '2023-07-26 14:48:58.093092', 'RPC error': '2023-07-26 14:48:58.095389'}>
RPC error: [query], <MilvusException: (code=1, message=invalid max query result window, limit [0] is invali

Partition '['contacts_partition']' not found in collection 'date', skipping...
Error with collection text: <MilvusException: (code=1, message=invalid max query result window, limit [0] is invalid, should be greater than 0)>
Error with collection author: <MilvusException: (code=1, message=invalid max query result window, limit [0] is invalid, should be greater than 0)>
Error with collection title: <MilvusException: (code=1, message=invalid max query result window, limit [0] is invalid, should be greater than 0)>
Error with collection contact: <MilvusException: (code=1, message=invalid max query result window, limit [0] is invalid, should be greater than 0)>
Error with collection name: <MilvusException: (code=1, message=invalid max query result window, limit [0] is invalid, should be greater than 0)>
Error with collection position: <MilvusException: (code=1, message=invalid max query result window, limit [0] is invalid, should be greater than 0)>
Error with collection department: <Milvus