Querying

Imports

In [1]:
import json
import os
import re
import time

import numpy as np
import openai
import pandas as pd
from openai.embeddings_utils import get_embedding
from pymilvus import connections, DataType, CollectionSchema, FieldSchema, Collection, Partition, utility
from tqdm import tqdm

Constants

In [2]:
# Read the OpenAI key from the environment so the secret is never stored in the
# notebook. Any key that was previously committed in this cell should be treated
# as leaked and rotated.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

# Default model passed to the embedding request in get_embedding().
embedding_model = "text-embedding-ada-002"
# The three settings below are not referenced by the cells visible in this notebook.
embedding_encoding = "cl100k_base"
max_tokens = 8000
dimensions = 1536

# Configure the openai client module with the key.
openai.api_key = OPENAI_API_KEY

Definitions

In [3]:
# Partition that the search and query cells restrict themselves to.
partition_name = 'facebook_posts'

# Field names grouped by partition. Each field name is later used both as an
# output field and as the prefix of a "<field>_collection" collection name.
bundled_schema = dict(
    rmrj_articles=['author', 'title', 'published_date', 'text'],
    facebook_posts=['text', 'time', 'link'],
    usjr_about=['text', 'content_id'],
    all=['author', 'title', 'published_date', 'text', 'time', 'post', 'link', 'content_id'],
)

# Fields (and therefore collections) to search for the selected partition.
collection_names = bundled_schema[partition_name]

# Not referenced by the cells visible in this notebook.
json_path = 'raw_jsons/posts.json'
description = 'description'

Embedder

In [4]:
def get_embedding(text, model=embedding_model):
    """Return the embedding of ``text`` produced by the OpenAI embedding endpoint.

    Newlines in ``text`` are replaced with spaces before the request is sent.
    Note that this definition shadows the ``get_embedding`` helper imported
    from ``openai.embeddings_utils`` at the top of the notebook.

    Args:
        text: The string to embed.
        model: Name of the embedding model; defaults to ``embedding_model``.

    Returns:
        The ``'embedding'`` entry of the first item in the response's ``'data'``.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']

Connection

In [5]:
# Start from a clean slate: drop any existing 'default' connection so that
# re-running this cell always applies the settings below.
if connections.has_connection('default'):
    connections.remove_connection('default')

# Open the 'default' connection to the server at localhost:19530.
connections.connect(
    alias='default',
    host='localhost',
    port='19530',
)

Input and vectorization

In [58]:
from datetime import datetime

# Timestamp of this run as 'YYYY-MM-DD HH:MM:SS'. It is computed here but is
# currently not appended to the question text.
now = datetime.now()
current_datetime = now.strftime('%Y-%m-%d %H:%M:%S')

question = "What are the DOST USJR privileges?"

# Embed the question; a flat (1-D) vector is promoted to a single-row 2-D array
# before it is handed to the search cell.
query_vectors = np.array(get_embedding(question))
if query_vectors.ndim == 1:
    query_vectors = query_vectors[np.newaxis, :]

# Parameters forwarded as `param` to every collection search.
search_params = dict(
    metric_type="L2",  # Distance metric, can be L2, IP (Inner Product), etc.
    offset=0,
)

Searching function

In [59]:
# One search result per field name, stored in the same order as collection_names.
results = []
for name in collection_names:
    # Each field has its own "<field>_collection"; it is loaded before searching.
    collection = Collection(f"{name}_collection")
    collection.load()
    results.append(
        collection.search(
            data=query_vectors,
            anns_field="embeds",
            param=search_params,
            limit=5,
            partition_names=[partition_name],
            # Return the field's own value plus the uuid used later for de-duplication.
            output_fields=[name, 'uuid'],
            consistency_level="Strong",
        )
    )

Results sorting by distance and removal of duplicates (smaller distance is kept)

In [60]:
# Best (smallest-distance) hit seen so far, keyed by UUID.
unique_results = {}

for position in range(len(collection_names)):
    field = collection_names[position]
    # results[position] is the search result for `field`; it is iterated as
    # groups of hits (presumably one group per query vector -- TODO confirm).
    for hits in results[position]:
        for hit in hits:
            uuid = hit.entity.get('uuid')
            previous = unique_results.get(uuid)

            # Skip the hit unless it is new or strictly closer than the stored one.
            if previous is not None and not (hit.distance < previous['distance']):
                continue

            unique_results[uuid] = {
                'uuid': uuid,
                field: hit.entity.get(field),
                'distance': hit.distance,
            }

# Flatten the per-UUID mapping into a list of records.
results_object = list(unique_results.values())

# Order the records by ascending distance.
sorted_results = sorted(results_object, key=lambda record: record['distance'])


Top 5 results

In [61]:
# Keep only the five closest matches; sorted_results is ordered by ascending distance.
final_results = sorted_results[:5]

Field completion:

In [62]:
# Each record so far carries only the field whose collection produced its best
# hit; look up every other field by UUID and add it to the record in place.
for record in final_results:
    for field in collection_names:
        if field in record:
            continue  # already present from the search step

        rows = Collection(f"{field}_collection").query(
            expr=f'uuid == "{record["uuid"]}"',
            offset=0,
            limit=1,
            partition_names=[partition_name],
            output_fields=[field],
            consistency_level="Strong",
        )
        # Leave the field out when the lookup returns nothing.
        if rows:
            record[field] = rows[0][field]


Printing

In [63]:
# Print every completed record with its 0-based rank, followed by a blank line.
for rank, record in enumerate(final_results):
    print(f"Result {rank}: ", record, "\n")

Result 0:  {'uuid': 'c3765bd2-df52-484c-b4d0-d78a458325a4', 'text': "?? ?????????????????? The University of San Jose-Recoletos is ?????????????????? ???????????????????? ???? ?????????????? ?????? ???????????????????? (????????) ???????????????????? for the ???????????????? ???????? ????????-????????. Check the DOST-accredited programs.  Further, classes for AY 2023-2024 at USJ-R will begin on August 7, 2023. ???????????????????? ???? ??????????????. For more details, visit https://www.facebook.com/usjr.official/posts/pfbid0RHZHs1MHdut1Mv6YLUK9Rcnfypx8Rgqi7jz9FBRP2RK5UmMjZdULgThUubDgiJXvl Thank you and God bless ????CALLING ALL THE DEPARTMENT OF SCIENCE AND TECHNOLOGY QUALIFIERS FOR ACADEMIC YEAR 2023-2024  (DOST)DEPARTMENT OF SCIENCE AND TECHNOLOGY ACCREDITED PROGRAMS - BACHELOR OF SCIENCE IN PSYCHOLOGY  - BACHELOR OF SCIENCE IN BIOLOGY  - BACHELOR OF SCIENCE IN INFORMATION TECHNOLOGY  - BACHELOR OF SCIENCE IN COMPUTER SCIENCE - BACHELOR OF SCIENCE IN INDUSTRIAL ENGINEERING  - BACHEL

In [64]:
# Bare expression: shows the completed top results as the cell output.
final_results

[{'uuid': 'c3765bd2-df52-484c-b4d0-d78a458325a4',
  'text': "?? ?????????????????? The University of San Jose-Recoletos is ?????????????????? ???????????????????? ???? ?????????????? ?????? ???????????????????? (????????) ???????????????????? for the ???????????????? ???????? ????????-????????. Check the DOST-accredited programs.  Further, classes for AY 2023-2024 at USJ-R will begin on August 7, 2023. ???????????????????? ???? ??????????????. For more details, visit https://www.facebook.com/usjr.official/posts/pfbid0RHZHs1MHdut1Mv6YLUK9Rcnfypx8Rgqi7jz9FBRP2RK5UmMjZdULgThUubDgiJXvl Thank you and God bless ????CALLING ALL THE DEPARTMENT OF SCIENCE AND TECHNOLOGY QUALIFIERS FOR ACADEMIC YEAR 2023-2024  (DOST)DEPARTMENT OF SCIENCE AND TECHNOLOGY ACCREDITED PROGRAMS - BACHELOR OF SCIENCE IN PSYCHOLOGY  - BACHELOR OF SCIENCE IN BIOLOGY  - BACHELOR OF SCIENCE IN INFORMATION TECHNOLOGY  - BACHELOR OF SCIENCE IN COMPUTER SCIENCE - BACHELOR OF SCIENCE IN INDUSTRIAL ENGINEERING  - BACHELOR OF SE

In [65]:
# Serialise the completed results to a JSON string.
string_json = json.dumps(final_results)

In [66]:
# Bare expression: shows the serialised JSON string as the cell output.
string_json

'[{"uuid": "c3765bd2-df52-484c-b4d0-d78a458325a4", "text": "?? ?????????????????? The University of San Jose-Recoletos is ?????????????????? ???????????????????? ???? ?????????????? ?????? ???????????????????? (????????) ???????????????????? for the ???????????????? ???????? ????????-????????. Check the DOST-accredited programs.  Further, classes for AY 2023-2024 at USJ-R will begin on August 7, 2023. ???????????????????? ???? ??????????????. For more details, visit https://www.facebook.com/usjr.official/posts/pfbid0RHZHs1MHdut1Mv6YLUK9Rcnfypx8Rgqi7jz9FBRP2RK5UmMjZdULgThUubDgiJXvl Thank you and God bless ????CALLING ALL THE DEPARTMENT OF SCIENCE AND TECHNOLOGY QUALIFIERS FOR ACADEMIC YEAR 2023-2024  (DOST)DEPARTMENT OF SCIENCE AND TECHNOLOGY ACCREDITED PROGRAMS - BACHELOR OF SCIENCE IN PSYCHOLOGY  - BACHELOR OF SCIENCE IN BIOLOGY  - BACHELOR OF SCIENCE IN INFORMATION TECHNOLOGY  - BACHELOR OF SCIENCE IN COMPUTER SCIENCE - BACHELOR OF SCIENCE IN INDUSTRIAL ENGINEERING  - BACHELOR OF SEC

In [67]:
import openai
import json

# Set up your OpenAI API credentials
# openai.api_key = 'your-api-key'

def generate_response(prompt, database_json):
    """Ask the chat model to answer ``prompt`` with the retrieved records as context.

    Args:
        prompt: The user's question; sent as a user message.
        database_json: JSON-serialisable retrieval results. They are serialised
            with ``json.dumps`` and passed to the model in a system message.

    Returns:
        The message content of the first choice in the chat completion response.
    """
    string_json = json.dumps(database_json)

    # Message order: persona, the question, the retrieved records, and a final
    # empty user turn.
    # NOTE(review): the trailing empty user message is kept exactly as before;
    # confirm whether it is intentional.
    conversation = [
        {'role': 'system', 'content': """You are Josenian Quiri. University of San Jose- Recoletos' general knowledge base assistant. Refer to yourself as JQ."""},
        {'role': 'user', 'content': prompt},
        {'role': 'system', 'content': f'Here is the database JSON from your knowledge base: \n{string_json}'},
        {'role': 'user', 'content': ''}
    ]

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=conversation,
        temperature=1,
        max_tokens=500,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    # Only the text of the first choice is returned to the caller.
    return response['choices'][0]['message']['content']

# Example usage: answer the notebook's question from the retrieved records.
prompt = question

response = generate_response(prompt=prompt, database_json=final_results)
print(response)


For DOST scholars studying at the University of San Jose - Recoletos, there are several privileges:

1. Subsidy of 20,000.00 per semester.
2. Allowance of 7,000.00 per month.

DOST scholars can choose from the following accredited programs at USJR:

1. Bachelor of Science in Psychology
2. Bachelor of Science in Biology
3. Bachelor of Science in Information Technology
4. Bachelor of Science in Computer Science
5. Bachelor of Science in Industrial Engineering
6. Bachelor of Secondary Education major in Math
7. Bachelor of Secondary Education major in Science

You may contact the university through these emails for scholarship related inquiries: scholarshipcenter@usjr.edu.ph or discount@usjr.edu.ph, and can reach them through this hotline: 253-7900 | Local: 375/294.

For more information, you may visit the official University of San Jose - Recoletos Facebook page.
