Querying

Imports

In [1]:
import json
import os
import re
import time

import numpy as np
import openai
import pandas as pd
from openai.embeddings_utils import get_embedding
from pymilvus import connections, DataType, CollectionSchema, FieldSchema, Collection, Partition, utility
from tqdm import tqdm

Constants

In [2]:
# The API key is read from the environment rather than stored in the notebook,
# so that sharing or committing this file does not expose the credential.
# NOTE(review): the key that used to be hardcoded in this cell should be
# treated as leaked and revoked.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

# Default model used by get_embedding() below.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"
max_tokens = 8000
dimensions = 1536
openai.api_key = OPENAI_API_KEY

Definitions

In [18]:
# Partition that the search cells below are restricted to.
partition_name = 'facebook_posts'

# Field names available for each partition. Later cells open one collection
# per field, named "<field>_collection".
bundled_schema = dict(
    rmrj_articles=['author', 'title', 'published_date', 'text'],
    facebook_posts=['text', 'time', 'link'],
    usjr_about=['text', 'content_id'],
    all=['author', 'title', 'published_date', 'text', 'time', 'post', 'link', 'content_id'],
)

# Despite the name, these are the field names of the selected partition.
collection_names = bundled_schema[partition_name]

json_path = 'raw_jsons/posts.json'
description = 'description'

Embedder

In [4]:
def get_embedding(text, model=embedding_model):
    """Return the embedding vector for ``text`` via ``openai.Embedding.create``.

    Newlines in ``text`` are replaced with spaces before the request is sent.
    This definition shadows the ``get_embedding`` imported from
    ``openai.embeddings_utils`` at the top of the notebook.

    Args:
        text: String to embed.
        model: Embedding model name. The default is the value of the
            notebook-level ``embedding_model`` at the time this cell ran.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    return response['data'][0]['embedding']

Connection

In [5]:
# Drop any existing 'default' connection first, then connect again so the
# settings given here are the ones in effect.
_alias = 'default'
if connections.has_connection(_alias):
    connections.remove_connection(_alias)

connections.connect(alias=_alias, host='localhost', port='19530')

Input and vectorization

In [40]:
from datetime import datetime

# Timestamp string that is appended to the question before it is embedded.
now = datetime.now()
current_datetime = f"{now:%Y-%m-%d %H:%M:%S}"

# Embed the question, then make sure the array is 2-D (one row per query)
# before it is passed as `data` to the search call.
query_vectors = np.array(
    get_embedding(f"When was the last class suspended? Current_datetime: {current_datetime}")
)
if query_vectors.ndim == 1:
    query_vectors = query_vectors[np.newaxis, :]

# Distance metric can be L2, IP (Inner Product), etc.
search_params = dict(metric_type="L2", offset=0)

Searching function

In [41]:
# Run the same vector search once per field. Each iteration opens the
# collection named "<field>_collection", restricted to `partition_name`,
# and asks for that field plus the uuid of the 5 nearest entries.
results = []
for name in collection_names:
    collection = Collection(f"{name}_collection")
    collection.load()
    hits = collection.search(
        data=query_vectors,
        anns_field="embeds",
        param=search_params,
        limit=5,
        partition_names=[partition_name],
        output_fields=[name, 'uuid'],
        consistency_level="Strong",
    )
    results += [hits]

Results sorting by distance and removal of duplicates (smaller distance is kept)

In [42]:
# Best hit seen so far for each uuid (the one with the smallest distance).
unique_results = {}

for i, name in enumerate(collection_names):
    for result in results[i]:
        for item in result:
            uuid = item.entity.get('uuid')
            candidate = {
                'uuid': uuid,
                name: item.entity.get(name),
                'distance': item.distance
            }

            # Keep the candidate when the uuid is new or it is strictly closer
            # than the entry already stored for that uuid.
            incumbent = unique_results.get(uuid)
            if incumbent is None or item.distance < incumbent['distance']:
                unique_results[uuid] = candidate

# Flatten the per-uuid mapping into a list of result dicts.
results_object = [*unique_results.values()]

# Order ascending by distance; `results_object` itself is left unsorted.
sorted_results = sorted(results_object, key=lambda entry: entry['distance'])


Top 5 results

In [43]:
# Keep the five entries with the smallest distance. The slice is a new list,
# but it holds the same dict objects as `sorted_results`.
final_results = sorted_results[:5]

Field completion:

In [44]:
# Each result so far carries only the field it was matched on. For every other
# field, look the uuid up in that field's collection and fill the value in.
# The dicts in `final_results` are updated in place.
for result in final_results:
    for name in collection_names:
        if name in result:
            continue  # already present on this result

        rows = Collection(f"{name}_collection").query(
            expr=f'uuid == "{result["uuid"]}"',
            offset=0,
            limit=1,
            partition_names=[partition_name],
            output_fields=[name],
            consistency_level="Strong"
        )
        # If the query returns nothing, the field is left absent.
        if rows:
            result[name] = rows[0][name]


Printing

In [45]:
# Print every result with its index; the trailing "\n" argument leaves a blank
# line between entries.
for i in range(len(final_results)):
    result = final_results[i]
    print(f"Result {i}: ", result, "\n")

Result 0:  {'uuid': '32a6d05f-9cd9-4ee0-8fbf-ba315b318390', 'time': '2023-07-07 July 07, 2023 14:06:47', 'distance': 0.35559046268463135, 'text': 'National Culture Consciousness Week Online Lecture  Catch us on today at 2PM as we explore the wonders of museum exhibitions and how they can help us get to know more of ourselves and our culture! Joining us is Ms. Princess Hernandez of the Nayong Pilipino Foundation Inc., and she will be sharing to us also how we might be able to appreciate more these exhibitions...and even possibly create our own exhibitions!', 'link': 'https://facebook.com/usjr.official/posts/243503025106451'} 

Result 1:  {'uuid': '92f44996-5a55-4a7d-acc1-6cb940f0d834', 'time': '2023-07-07 July 07, 2023 11:06:36', 'distance': 0.3561747074127197, 'text': 'TODAY IN HISTORY  Exactly 76 years ago, the University of San Jose - Recoletos (formerly Colegio de San Jose-Recoletos) opened its first school year.  The student population was 940, a figure beyond the expectations of t

In [46]:
# Display the completed result dicts as the cell output.
final_results

[{'uuid': '32a6d05f-9cd9-4ee0-8fbf-ba315b318390',
  'time': '2023-07-07 July 07, 2023 14:06:47',
  'distance': 0.35559046268463135,
  'text': 'National Culture Consciousness Week Online Lecture  Catch us on today at 2PM as we explore the wonders of museum exhibitions and how they can help us get to know more of ourselves and our culture! Joining us is Ms. Princess Hernandez of the Nayong Pilipino Foundation Inc., and she will be sharing to us also how we might be able to appreciate more these exhibitions...and even possibly create our own exhibitions!',
  'link': 'https://facebook.com/usjr.official/posts/243503025106451'},
 {'uuid': '92f44996-5a55-4a7d-acc1-6cb940f0d834',
  'time': '2023-07-07 July 07, 2023 11:06:36',
  'distance': 0.3561747074127197,
  'text': 'TODAY IN HISTORY  Exactly 76 years ago, the University of San Jose - Recoletos (formerly Colegio de San Jose-Recoletos) opened its first school year.  The student population was 940, a figure beyond the expectations of the Augu

In [47]:
# Serialize the results to a JSON string for inspection. generate_response()
# further down serializes the list it is given on its own.
string_json = json.dumps(final_results)

In [48]:
# Display the serialized JSON string as the cell output.
string_json

'[{"uuid": "32a6d05f-9cd9-4ee0-8fbf-ba315b318390", "time": "2023-07-07 July 07, 2023 14:06:47", "distance": 0.35559046268463135, "text": "National Culture Consciousness Week Online Lecture  Catch us on today at 2PM as we explore the wonders of museum exhibitions and how they can help us get to know more of ourselves and our culture! Joining us is Ms. Princess Hernandez of the Nayong Pilipino Foundation Inc., and she will be sharing to us also how we might be able to appreciate more these exhibitions...and even possibly create our own exhibitions!", "link": "https://facebook.com/usjr.official/posts/243503025106451"}, {"uuid": "92f44996-5a55-4a7d-acc1-6cb940f0d834", "time": "2023-07-07 July 07, 2023 11:06:36", "distance": 0.3561747074127197, "text": "TODAY IN HISTORY  Exactly 76 years ago, the University of San Jose - Recoletos (formerly Colegio de San Jose-Recoletos) opened its first school year.  The student population was 940, a figure beyond the expectations of the Augustinian Recoll

In [49]:
import openai
import json

# Set up your OpenAI API credentials
# openai.api_key = 'your-api-key'

def generate_response(prompt, database_json, model="gpt-4", temperature=1, max_tokens=500):
    """Ask the chat model to answer ``prompt`` using retrieved knowledge-base entries.

    The conversation sent to the API is: the assistant persona (system), the
    serialized knowledge-base entries (system), and finally the user's
    question. The question is deliberately the last turn and no empty user
    message follows it, so the model answers the question with the context
    already in view.

    Args:
        prompt: The user's question.
        database_json: JSON-serializable retrieval results to give the model
            as context (passed through ``json.dumps``).
        model: Chat model name passed to ``openai.ChatCompletion.create``.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on generated tokens passed to the API.

    Returns:
        The ``content`` of the first choice's message in the API response.

    Raises:
        TypeError: If ``database_json`` cannot be serialized by ``json.dumps``.
    """
    string_json = json.dumps(database_json)

    # Context comes before the question so the final message is the one the
    # model is expected to answer.
    conversation = [
        {'role': 'system', 'content': """You are Josenian Quiri. University of San Jose- Recoletos' general knowledge base assistant."""},
        {'role': 'system', 'content': f'Here is the database JSON from your knowledge base: \n{string_json}'},
        {'role': 'user', 'content': prompt},
    ]

    response = openai.ChatCompletion.create(
        model=model,
        messages=conversation,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    # Only the text of the first choice is returned to the caller.
    return response['choices'][0]['message']['content']

# Example usage
# Example usage: answer a question from the results retrieved above.
# NOTE(review): this wording differs from the text that was embedded for
# retrieval ("When was the last class suspended? Current_datetime: ...");
# confirm whether the two are meant to be the same question.
prompt = "When were classes last suspended?"

response = generate_response(prompt, final_results)
print(response)


As an AI, I don't have real-time data. To know when the classes were last suspended at the University of San Jose - Recoletos, it is best to check the latest updates from the university's official website or social media accounts.
