# End-to-End Query Example

In [200]:
### Planning
# pseudo code for a given query is to
# convert a query to triples that can be used to search for it
# find key subjects/objects + predicates to query for
# upon a given search and return triples, there are two options
# use an LLM to 
# 1. check if there is any answer in the given triples
# 2. if there are, return
# 3. if not, filter triples that seem completely unrelated
# 4. get the return values of the triples, check if any have already been searched
# 5. for the ones that haven't loop through again and gather more information
# convert to triple
# query triple
# find top matches and top similar
# check for answers -> if found return
# otherwise 
# filter triples to look for more information
# query top 5

## Load Question Data

In [201]:
import json
import random

# Load JSON data for qa's with answers
with open('strategyqa-data/strategyqa_dataset/strategyqa_train.json', 'r') as file:
    data = json.load(file)

questions_answers = {}
for item in data:
    questions_answers[item['question']] = item['answer']
    
# Load question set from cluster
question_set = set()
with open('question_clusters/cluster4.json', 'r') as f:
    loaded_strings = json.load(f)
    
for l in loaded_strings:
    question_set.add(l)
    
# Helper Functions
def get_query_answer(query):
    return questions_answers[query]

def get_random_query():
    return list(question_set)[random.randint(0, len(question_set))]

## OpenAI Prompts

In [233]:
def get_question_prompt(text):
    multi_line_prompt = """
There exists a theoretical data store that consists of "triples" that come from a paragraph of information that has been deconstructed. A "triple" is an information structure that captures specific relationships or connections between two distinct entities. These entities can take various forms such as objects, facts, direct quotes, numbers, or any other meaningful data from the text.  As an example, look at the "Text" input below and the "Output" that is followed by different triples:

Text:\"\"\"John, a software engineer at Google, moved to San Francisco in 2018. He works on the AI research team, which is led by Dr. Sara Thompson.\"\"\"

Output:
("John", "is a", "software engineer")
("John", "works at", "Google")
("John", "moved to", "San Francisco in 2018")
("John", "works on", "the AI research team")
("AI research team", "is led by", "Dr. Sara Thompson")

Your goal is to take a given question, which comes after "Question:" and convert it into a series of triples that can query the triple data store to find information after the phrase "Output:". The output triples should have at least one "_" (blank character) to denote information to query. For example:

Question: "Where does john work"
Output:
("John", "works at", _)
("John", "is a", _)

The "_" can be anywhere that could be used as a query. For example,

Question: "What does John do at Google?"
Output:
("John", _, "Google")

The goal, is to return as many triples would be useful to query information needed. Sometimes questions, will be "Multi-hop" meaning they require multiple pieces of information to query. When a question appears to be more complex, break down the question so that there are multiple queries that can be used to get information. Additionally, each query will return more triples that can be used to before a linked search towards information. Now convert the following question:

Question: "{}"
Output:
    """.format(text)
    return multi_line_prompt

In [263]:
import openai

def get_response(multi_line_prompt):
    openai.api_key = "<KEY>"
    response = openai.Completion.create(
      model="text-davinci-003",
      prompt=multi_line_prompt,
      max_tokens=300
    )
    return response

def get_chat_response(multi_line_prompt):
    response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=[
            {"role": "system", "content": "You are an experiment information retrieval and query processing machine."},
            {"role": "user", "content": multi_line_prompt}
        ]
    )
    return response

### Query and Query Triples

In [286]:
backup_query = "Can Clouded leopards chase down many Pronghorn antelopes?"
query = get_random_query()
print("Starting Query: {}".format(query))
print("Answer: {}".format(get_query_answer(query)))

Starting Query: Would a northern fur seal pass a driving test?
Answer: False


In [292]:
response = get_response(get_question_prompt(query))
# GPT 3.5 response print(response["choices"][0]["message"]["content"])
# davinci-003 print(get_response["choices"][0]["text"])
print(response["choices"][0]["text"])


("Northern fur seal", _, _)
(_, "pass a", "driving test")


In [293]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from qdrant_client.http import models
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

qdrant_client = QdrantClient('http://localhost:6333')

In [294]:
def process_triple_from_string_to_array(trip):
    return [val[1:-1] for val in trip.strip()[1:-1].split(", ")]

In [295]:
def query_subjects(search_text):
    encoded_search = model.encode(search_text)
    search_result = qdrant_client.search(
            collection_name="subjects",
            query_vector=encoded_search.tolist(),
            query_filter=None,  # If you don't want any filters for now
            limit=5  # 5 the most closest results is enough
        )
    return search_result

## Search Algorithm

In [296]:
def query_algorithm(query, steps=5):
    all_triples = set([])
    response = get_response(get_question_prompt(query))
    query_triples = [process_triple_from_string_to_array(trip) for trip in response["choices"][0]["text"].strip().split("\n")]
    
    query_subject_array = []
    for trip in query_triples:
        if len(trip)>2:
            if not (trip[0] == "" or trip[0] == ""):
                query_subject_array.append(trip[0])
            if not (trip[2] == "" or trip[2] == ""):
                query_subject_array.append(trip[2])
            
    query_subject_set = set(query_subject_array)
    
    for i in range(0, steps-1):
        # print("Starting Subject Set:", query_subject_set)
        query_subject_array = []
        for subject in query_subject_set:
            query_response = query_subjects(subject)
            returned_triples = [process_triple_from_string_to_array(match.payload["triple"]) for match in query_response]
            
            # all_triples += returned_triples
            for trip in returned_triples:
                to_add = ", ".join(["\""+t+"\"" for t in trip])
                all_triples.add("("+to_add+")")
                
            # print("New Triples:", returned_triples)
            
            for trip in returned_triples: 
                if (len(trip) > 2):
                    query_subject_array.append(trip[0])
                    query_subject_array.append(trip[2])
            
        query_subject_set = set(query_subject_array)
        
    return all_triples

In [297]:
all_triples = query_algorithm(query, 2)

## Relevant Informational Triples

In [298]:
def triples_to_string(all_triples):
    triples_as_strings = ""
    for trip in all_triples:
        triples_as_strings += trip+"\n"
    return triples_as_strings

triples_as_strings = triples_to_string(all_triples)
print(triples_as_strings)

("northern fur seal", "found in", "Sea of Okhotsk")
("northern fur seal", "some breed on", "Tyuleniy Island off the coast of Sakhalin in the southwest Sea of Okhotsk")
("driving test", "consists of", "written or oral test (theory test)")
("northern fur seal", "found in", "Bering Sea")
("northern fur seal", "is the largest member of", "fur seal subfamily")
("driving test", "is a procedure", "designed to test a person's ability to drive a motor vehicle")
("northern fur seal", "is the only living species in", "genus Callorhinus")
("driving test", "is often a requirement to", "obtain a driver's license")
("driving test", "exists in", "various forms worldwide")
("driving test", "is used to", "assess a person's driving ability under normal operating conditions")



In [299]:
def get_answer_prompt(triples, query):
    multi_line_prompt = """
A "triple" is an information structure that captures specific relationships or connections between two distinct entities. These entities can take various forms such as objects, facts, direct quotes, numbers, or any other meaningful data from the text.  As an example, look at the "Text" input below and the "Output" that is followed by different triples:

Text:\"\"\"John, a software engineer at Google, moved to San Francisco in 2018. He works on the AI research team, which is led by Dr. Sara Thompson.\"\"\"

Output:
("John", "is a", "software engineer")
("John", "works at", "Google")
("John", "moved to", "San Francisco in 2018")
("John", "works on", "the AI research team")
("AI research team", "is led by", "Dr. Sara Thompson")

Your goal is to take a given set of triples and answer a query about them. Your goal is to take a given set of triples and answer a query about them. To do this, look at all the information present in the triples and try to come up with the most rational answer based on the connections that exist with the triples. Every answer is equally likely to be true or false. The triples will be provided after "Triples", the query will be provided after "Query", and the answer which you will write should be provided after "Answer". An Answer should always be a boolean either true or false. The answer response should always only be "true" or "false" you shouldn't say anything more. For example,

Triples:
("John", "works on", "the AI research team")
("AI research team", "is led by", "Dr. Sara Thompson")

Query: "Does John work with Sara Thompson"
Answer: true

Now try for the following triples and query to provide either a true or false answer:

Triples:
{}

Query: "{}"
Answer:
    """.format(triples, query)
    return multi_line_prompt

## Test Answer

In [300]:
def test():
    attempted_answer = get_response(get_answer_prompt(triples_as_strings, query))["choices"][0]["text"]
    answer = string_to_bool(attempted_answer.lower().strip())
    if answer=="fail":
        # failed_answers+= 1
        print("- Model Failed to Answer. Attempted: {}".format(attempted_answer.lower().strip()))
    else:
        print("Model Answer: {}".format(answer))
        correct_answer = get_query_answer(query)
        print("Correct Answer: {}".format(correct_answer))
        if correct_answer==answer:
            print("+ Model Answered Correct !")
            #correct_answers += 1

In [301]:
## GPT 3.5 Turbo Response
## get_chat_response(get_answer_prompt(triples_as_strings, query))["choices"][0]["message"]["content"]

In [302]:
test()

Model Answer: False
Correct Answer: False
+ Model Answered Correct !


In [195]:
def string_to_bool(string):
    if string=="true":
        return True
    elif string=="false":
        return False
    else:
        return "fail"

In [198]:
def test_query_full(query):
    #failed_answers = 0
    #correct_answers = 0

    all_triples = query_algorithm(query, 3)
    triples_as_strings = triples_to_string(all_triples)
    attempted_answer = get_response(get_answer_prompt(triples_as_strings, query))["choices"][0]["text"]
    answer = string_to_bool(attempted_answer.lower().strip())
    print("Query: {}".format(query))
    
    if answer=="fail":
        # failed_answers+= 1
        print("- Model Failed to Answer. Attempted: {}".format(attempted_answer.lower().strip()))
    else:
        print("Model Answer: {}".format(answer))
        correct_answer = get_query_answer(query)
        print("Correct Answer: {}".format(correct_answer))
        if correct_answer==answer:
            print("+ Model Answered Correct !")
            #correct_answers += 1
            

In [199]:
for q in question_set:
    print(test_query_full(q))

Query: Have rhinoceroses been killed to improve human sex lives?
Model Answer: False
Correct Answer: True
None
Query: Is the Golden eagle considered a scavenger bird?
Model Answer: False
Correct Answer: False
+ Model Answered Correct !
None
Query: Is a jellyfish safe from atherosclerosis?
Model Answer: False
Correct Answer: True
None
Query: Would a diet of ice eventually kill a person?
Model Answer: False
Correct Answer: True
None
Query: Do manta rays live in water above the safe temperature for cold food storage?
Model Answer: False
Correct Answer: True
None
Query: Are aggressive bumblebees suicidal?
Model Answer: False
Correct Answer: False
+ Model Answered Correct !
None
Query: Would WWF be angrier if you killed koala instead of black swan?
Model Answer: False
Correct Answer: True
None
Query: Is Bactrian Camel most impressive animal when it comes to number of humps?
Model Answer: False
Correct Answer: False
+ Model Answered Correct !
None
Query: Has Gorillaz creator been in more ban

KeyboardInterrupt: 