Querying

Imports

In [22]:
import json
import os
import re
import time

import numpy as np
import openai
import pandas as pd
from openai.embeddings_utils import get_embedding
from pymilvus import connections, DataType, CollectionSchema, FieldSchema, Collection, Partition, utility
from tqdm import tqdm

Constants

In [23]:
# SECURITY: never commit an API key with the notebook. The key is read from the
# OPENAI_API_KEY environment variable instead; any key that was previously
# hardcoded in this cell should be treated as leaked and revoked.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    # Fail here with a clear message rather than later inside an OpenAI call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before starting the kernel."
    )

# Model used by get_embedding() to embed the query text.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"
max_tokens = 8000
# NOTE(review): presumably the length of the embedding vectors stored in Milvus - confirm against the collection schema.
dimensions = 1536

# Register the key with the openai client used throughout the notebook.
openai.api_key = OPENAI_API_KEY

Definitions

In [24]:
# Partition that the searches and queries below are restricted to.
partition_name = 'facebook_posts'

# Field names grouped by partition. Each field name is used further down both as
# an output field and as the prefix of a "<field>_collection" collection name.
bundled_schema = {
    'rmrj_articles': ['author', 'title', 'published_date', 'text'],
    'facebook_posts': ['text', 'time', 'link'],
    'usjr_about': ['text', 'content_id'],
    'all': [
        'author', 'title', 'published_date', 'text',
        'time', 'post', 'link', 'content_id',
    ],
}

# Despite the name, this is the list of *field* names for the chosen partition.
collection_names = bundled_schema[partition_name]

json_path = 'raw_jsons/posts.json'
description = 'description'

Embedder

In [25]:
def get_embedding(text, model=embedding_model):
    """Return the OpenAI embedding vector for ``text``.

    Newlines in ``text`` are replaced with spaces before the request is made.

    Note: this definition shadows the ``get_embedding`` imported from
    ``openai.embeddings_utils`` at the top of the notebook.

    Args:
        text: The string to embed.
        model: Name of the embedding model; defaults to ``embedding_model``.

    Returns:
        The ``embedding`` entry of the first item in the API response's ``data``.
    """
    single_line = text.replace("\n", " ")
    api_response = openai.Embedding.create(input=[single_line], model=model)
    first_item = api_response['data'][0]
    return first_item['embedding']

Connection

In [26]:
# Make the cell safe to re-run: if a 'default' connection is already
# registered, drop it before connecting again.
if connections.has_connection('default'):
    connections.remove_connection('default')

# Open a fresh 'default' connection to the local Milvus server.
connections.connect(host='localhost', port='19530', alias='default')

Input and vectorization

In [58]:
from datetime import datetime

# Timestamp of this run, formatted as 'YYYY-MM-DD HH:MM:SS'.
now = datetime.now()
current_datetime = now.strftime('%Y-%m-%d %H:%M:%S')

# The natural-language question to look up in the knowledge base.
question = "Tell me anything about dost and usjr?"

# Embed the question; a single flat vector is promoted to a one-row 2-D array
# so it is passed to the search as a batch of one query.
query_vectors = np.array(get_embedding(question))
if query_vectors.ndim == 1:
    query_vectors = query_vectors[np.newaxis, :]

# Parameters handed to every collection.search() call below.
search_params = dict(
    metric_type="L2",  # Distance metric, can be L2, IP (Inner Product), etc.
    offset=0,
)

Searching

In [59]:
# One search per field: each field lives in its own "<field>_collection".
# results[k] ends up holding the search output for collection_names[k].
results = []
for field_name in collection_names:
    field_collection = Collection(f"{field_name}_collection")
    field_collection.load()
    hits_for_field = field_collection.search(
        data=query_vectors,
        anns_field="embeds",
        param=search_params,
        limit=5,
        partition_names=[partition_name],
        output_fields=[field_name, 'uuid'],
        consistency_level="Strong",
    )
    results.append(hits_for_field)

Results sorting by distance and removal of duplicates (smaller distance is kept)

In [60]:
# Best hit seen so far for each uuid, across all per-field searches.
unique_results = {}

for idx, field_name in enumerate(collection_names):
    for hits in results[idx]:
        for hit in hits:
            hit_uuid = hit.entity.get('uuid')
            candidate = {
                'uuid': hit_uuid,
                field_name: hit.entity.get(field_name),
                'distance': hit.distance,
            }

            # Keep the candidate when the uuid is new or when it is strictly
            # closer than the stored entry (ties keep the earlier entry).
            best_so_far = unique_results.get(hit_uuid)
            if best_so_far is None or hit.distance < best_so_far['distance']:
                unique_results[hit_uuid] = candidate

# Flatten the uuid -> entry mapping into a plain list of entries.
results_object = [*unique_results.values()]

# Order the entries from smallest to largest distance.
sorted_results = sorted(results_object, key=lambda entry: entry['distance'])


Top 5 results

In [61]:
# Keep at most the first five entries of the distance-sorted list (the closest matches overall).
final_results = sorted_results[:5]

Field completion:

In [62]:
# Each kept entry only carries the field of the collection it was found in.
# Fetch every other field for the same uuid from that field's own collection.
for entry in final_results:
    for field_name in collection_names:
        if field_name in entry:
            continue  # already present on this entry

        field_collection = Collection(f"{field_name}_collection")
        rows = field_collection.query(
            expr=f'uuid == "{entry["uuid"]}"',
            offset=0,
            limit=1,
            partition_names=[partition_name],
            output_fields=[field_name],
            consistency_level="Strong",
        )
        # If nothing matches, the field is simply left off the entry.
        if rows:
            entry[field_name] = rows[0][field_name]


Printing

In [63]:
# Print every final entry with its rank, each followed by a blank line.
for rank, entry in enumerate(final_results):
    print(f"Result {rank}: ", entry, "\n")

Result 0:  {'uuid': '628ee15f-8639-41a4-bf56-5a41405bff42', 'link': 'https://facebook.com/usjr.official/posts/662638619241213', 'distance': 0.3846327066421509, 'text': 'The image of Our Lady of Mt. Carmel from the Santuario Arquidiocesano de Nuestra SeÃ±ora del Carmen, La Limpia was showcased to the public in a solemn foot procession this afternoon on the streets of Barangay Ermita.  Flowers drop as the Image passed by the University of San Jose - Recoletos.  Her feast day is tomorrow, July 16. The pontifical mass will be held at 4:00PM.', 'time': '2023-07-15 July 15, 2023 18:20:38'} 

Result 1:  {'uuid': 'ec59e44f-dccb-4d35-8592-92329ac3abd9', 'link': 'https://facebook.com/usjr.official/posts/642813197890422', 'distance': 0.3846858739852905, 'text': '#JosenianPride | Watch and listen to the Cebuano translation music video of Samot-Saring iisa, the Official Theme Song of the 125th Anniversary of Philippine Independence and Nationhood sung by USJ-R Liturgical Yodelers Recoletos(LYRe).  

In [64]:
# Show the completed entries as the cell's output.
final_results

[{'uuid': '628ee15f-8639-41a4-bf56-5a41405bff42',
  'link': 'https://facebook.com/usjr.official/posts/662638619241213',
  'distance': 0.3846327066421509,
  'text': 'The image of Our Lady of Mt. Carmel from the Santuario Arquidiocesano de Nuestra SeÃ±ora del Carmen, La Limpia was showcased to the public in a solemn foot procession this afternoon on the streets of Barangay Ermita.  Flowers drop as the Image passed by the University of San Jose - Recoletos.  Her feast day is tomorrow, July 16. The pontifical mass will be held at 4:00PM.',
  'time': '2023-07-15 July 15, 2023 18:20:38'},
 {'uuid': 'ec59e44f-dccb-4d35-8592-92329ac3abd9',
  'link': 'https://facebook.com/usjr.official/posts/642813197890422',
  'distance': 0.3846858739852905,
  'text': '#JosenianPride | Watch and listen to the Cebuano translation music video of Samot-Saring iisa, the Official Theme Song of the 125th Anniversary of Philippine Independence and Nationhood sung by USJ-R Liturgical Yodelers Recoletos(LYRe).  The 125

In [65]:
# Serialize the final entries to a JSON string (json.dumps escapes non-ASCII characters by default).
string_json = json.dumps(final_results)

In [66]:
# Show the serialized JSON string as the cell's output.
string_json

'[{"uuid": "628ee15f-8639-41a4-bf56-5a41405bff42", "link": "https://facebook.com/usjr.official/posts/662638619241213", "distance": 0.3846327066421509, "text": "The image of Our Lady of Mt. Carmel from the Santuario Arquidiocesano de Nuestra Se\\u00c3\\u00b1ora del Carmen, La Limpia was showcased to the public in a solemn foot procession this afternoon on the streets of Barangay Ermita.  Flowers drop as the Image passed by the University of San Jose - Recoletos.  Her feast day is tomorrow, July 16. The pontifical mass will be held at 4:00PM.", "time": "2023-07-15 July 15, 2023 18:20:38"}, {"uuid": "ec59e44f-dccb-4d35-8592-92329ac3abd9", "link": "https://facebook.com/usjr.official/posts/642813197890422", "distance": 0.3846858739852905, "text": "#JosenianPride | Watch and listen to the Cebuano translation music video of Samot-Saring iisa, the Official Theme Song of the 125th Anniversary of Philippine Independence and Nationhood sung by USJ-R Liturgical Yodelers Recoletos(LYRe).  The 125th

In [67]:
import openai
import json

# Set up your OpenAI API credentials
# openai.api_key = 'your-api-key'

def generate_response(prompt, database_json):
    """Answer ``prompt`` with GPT-4, grounded on the retrieved knowledge-base entries.

    The conversation sent to the model is: the assistant persona (system), the
    retrieved entries serialized as JSON (system), then the user's question.
    The question is deliberately the last message. Previously the conversation
    ended with an empty user message placed after the context, so the turn the
    model was asked to answer was blank.

    Args:
        prompt: The user's question.
        database_json: JSON-serializable search results to use as context.

    Returns:
        The text content of the first choice in the chat completion response.
    """
    # ensure_ascii=False keeps non-ASCII text as-is instead of \uXXXX escapes,
    # which is easier for the model to read and shorter to send.
    context_json = json.dumps(database_json, ensure_ascii=False)

    conversation = [
        {'role': 'system', 'content': """You are Josenian Quiri. University of San Jose- Recoletos' general knowledge base assistant. Refer to yourself as JQ."""},
        {'role': 'system', 'content': f'Here is the database JSON from your knowledge base: \n{context_json}'},
        {'role': 'user', 'content': prompt},
    ]

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=conversation,
        temperature=1,
        max_tokens=500,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    # The generated answer is the message content of the first choice.
    return response['choices'][0]['message']['content']

# Example usage: answer the question embedded earlier, using the completed
# top search results as context, and print the model's reply.
prompt = question
response = generate_response(prompt=prompt, database_json=final_results)

print(response)


Apologies for the confusion. The Department of Science and Technology, or DOST, is a branch of the Philippine government responsible for delivering scientific and technological services to the country. It also aids in the advancement and development of science and technology in the Philippines. 

On the other hand, the University of San Jose-Recoletos (USJ-R) is a private Catholic research and coeducational institution run by the Order of Augustinian Recollects. It is located in Cebu City, Philippines and offers various programs from basic education to higher education.

The University of San Jose-Recoletos has DOST-accredited programs. According to their post last July 12, 2023, the DOST-accredited programs include Bachelor of Science in Psychology, Bachelor of Science in Biology, Bachelor of Science in Information Technology, Bachelor of Science in Computer Science, Bachelor of Science in Industrial Engineering, Bachelor of Secondary Education Major in Math and Bachelor of Secondary 