In [4]:
from pathlib import Path
dataRootPath = Path("../master-database-files/master-experimental/navigate_semantic_triples_evaluation/")
dataRootPath.exists()

True

In [42]:
from openai import OpenAI
from random import randint, choice
from os import environ
from pathlib import Path
from json import loads, dumps
environ["OPENAI_API_KEY"] = Path("~/.openaiapikey").expanduser().read_text().strip()

openaiClient = OpenAI()
def gpt_3_5_turbo_completion(query):
    answer = openaiClient.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": query
            }
        ],
        seed = randint(0, 1000000)
    )
    return answer.choices[0].message.content

def gpt_4_turbo_completion(query):
    answer = openaiClient.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            {
                "role": "system",
                "content": query
            }
        ],
        seed = randint(0, 1000000)
    )
    return answer.choices[0].message.content

def tryRecieveAnswer(query, completionFunction = gpt_4_turbo_completion, answerConversion = lambda x: x, maxTries = 5):
    tryNumber = 0
    while tryNumber < maxTries:
        answer = completionFunction(query)
        try:
            answer = answerConversion(answer)
            return (answer, True)
        except:
            pass
        tryNumber += 1
    print(f"Failed to recieve answer for query: {query}. Last answer: {answer}")
    return (None, False)

def listAnswerConversion(answer):
    result = loads(answer)
    assert isinstance(result, list)
    for item in result:
        assert isinstance(item, str)
    return result

In [6]:
def findQuestionWithSemanticTripleAnswer(topicArea):
    query = f'''
    Semantic triples such as ["Star", "emits", "Light"] and ["Rocket", "can bring cargo to", "Space"] consists of a subject, a predicate, and an object. 
    Give me a question of the topc area "{topicArea}" that can be answered with a semantic triple.
    Return the question together with the semantic triple that answers it in the format ["question", ["sub", "pred", "obj"]].
    Return nothing but the array without explanation.'''
    def answerConversion(answer):
        result = loads(answer)
        assert isinstance(result, list)
        assert len(result) == 2
        assert isinstance(result[0], str)
        assert isinstance(result[1], list)
        assert len(result[1]) == 3
        assert all(isinstance(term, str) for term in result[1])
        return result
    answer, success = tryRecieveAnswer(query, answerConversion = answerConversion)
    return answer

In [7]:
def addNewQuestionsThatCanBeAnsweredByTriples(numberOfQuestions):
    termsPath = dataRootPath / "terms.txt"
    terms = termsPath.read_text().split("\n")
    questionPath = dataRootPath / "questions.txt"
    if not questionPath.exists():
        questionPath.write_text("[]")
    questions = loads(questionPath.read_text())
    for i in range(numberOfQuestions):
        topicArea = choice(terms)
        question, triple = findQuestionWithSemanticTripleAnswer(topicArea)
        questions.append([question, triple])
    questionPath.write_text(dumps(questions, indent = 4))

In [27]:
addNewQuestionsThatCanBeAnsweredByTriples(1)

In [41]:
def safeTripleNavigationPath(nameOfTheDataRun, generatingModel, navigatingModel, examinerModel, maxIterationsPerSearch, totalIterations):
    tripleNavigationPath = dataRootPath / "searches" / nameOfTheDataRun / "searchpath.json"
    if not tripleNavigationPath.exists():
        tripleNavigationPath.parent.mkdir(parents = True, exist_ok = True)
        tripleNavigationPath.write_text(dumps({}, indent = 4))
    tripleNavigation = loads(tripleNavigationPath.read_text())
    questionPath = dataRootPath / "questions.txt"
    questions = loads(questionPath.read_text())
    try:
        for question, _ in questions:
            if question not in tripleNavigation:
                tripleNavigation[question] = [[], False]
            currentSearch, tripleFound = tripleNavigation[question]
            # foramt of the currentSearch is [[alteringOption, topicChoices, choosenTopic, tripleChoices, choosenTriple], ...
            if tripleFound:
                continue
            if len(currentSearch) == 0:
                choosenTopic = "Physics"
                choosenTriple = ["Physics", "is a branch of", "Science"]
            else:
                choosenTopic = currentSearch[-1][2]
                choosenTriple = currentSearch[-1][4]
            if len(currentSearch) < 2:
                lastTopic = choosenTopic
            else:
                lastTopic = currentSearch[-2][2]
            while len(currentSearch) < maxIterationsPerSearch:
                totalIterations -= 1
                if totalIterations < 0:
                    break
                # Make one search iteration
                # What part of the triple should be altered?
                query = f'''Semantic triples such as ["Star", "emits", "Light"] and ["Rocket", "can bring cargo to", "Space"] consists of a subject, a predicate, and an object.
The goal is to find a semantic triple that answers the question "{question}".
This should be achieved by altering the current triple {choosenTriple}.
Return A if just the subject should remain the same, B if just the predicate should remain the same, and C if just the object should remain the same.
Return D if only the subject should change, E if only the predicate should change, and F if only the object should change.
Return nothing but the letter.'''
                def answerConversion(answer):
                    assert answer in ["A", "B", "C", "D", "E", "F"]
                    return answer
                alteringOption, success = tryRecieveAnswer(query, navigatingModel, answerConversion = answerConversion)
                if not success:
                    break
                tripleString = "[" + (choosenTriple[0] if alteringOption in ["A", "E", "F"] else "???") + ", " + (choosenTriple[1] if alteringOption in ["B", "D", "F"] else "???") + ", " + (choosenTriple[2] if alteringOption in ["C", "D", "E"] else "???") + "]"
                # What are the subtopics of the current topic and two related topics?
                query = f'Give me five subtopics of the topic "{choosenTopic}" and two related topics. Return them all in an array formatted like ["sub1", "sub2", "sub3", "sub4", "sub5", "rel1", "rel2"]. Return nothing but the array without explanation.'
                def answerConversion(answer):
                    result = loads(answer)
                    assert isinstance(result, list)
                    assert len(result) == 7
                    for item in result:
                        assert isinstance(item, str)
                    return result
                answer, success = tryRecieveAnswer(query, generatingModel, answerConversion = answerConversion)
                if not success:
                    break
                topicChoices = answer + [lastTopic, choosenTopic, "Physics"]
                topicChoicesString = "\n".join([f"{i+1}: {topic}" for i, topic in enumerate(topicChoices)])
                # What is the topic of the next triple?
                query = f'''The semantic triple {tripleString} should be completed to answer the question "{question}".
Therefore, a topic for the triple is needed. Select a topic from the following list:
{topicChoicesString}
Return nothing but the number of the topic.'''
                def answerConversion(answer):
                    ret = int(answer)
                    assert 1 <= ret <= len(topicChoices)
                    return ret
                topicIndex, success = tryRecieveAnswer(query, navigatingModel, answerConversion = answerConversion)
                if not success:
                    break
                lastTopic = choosenTopic
                choosenTopic = topicChoices[topicIndex - 1]
                # What are the possible completions of the triple?
                query = f'''Semantic triples such as ["Star", "emits", "Light"] and ["Rocket", "can bring cargo to", "Space"] consists of a subject, a predicate, and an object.
Create ten different completed versions of the triple {tripleString}. The topic of the triples should be "{choosenTopic}".
Return the completed triples in the format [["sub1", "pred1", "obj1"], ["sub2", "pred2", "obj2"], ...].
Return nothing but the array without explanation.'''
                def answerConversion(answer):
                    result = loads(answer)
                    assert isinstance(result, list)
                    assert all(isinstance(triple, list) for triple in result)
                    assert all(len(triple) == 3 for triple in result)
                    assert all(isinstance(term, str) for triple in result for term in triple)
                    return result
                tripleChoices, success = tryRecieveAnswer(query, generatingModel, answerConversion = answerConversion)
                if not success:
                    break
                numberedTriplesString = "\n".join([f"{i+1}: [" + ", ".join(triple) + "]" for i, triple in enumerate(tripleChoices)])
                # Which triple should be selected?
                query = f'''Select the triple that answers the question "{question}" or is the closest to it.
When you can not decide, select a random triple.
{numberedTriplesString}
Return nothing but the number of the triple without explanation.'''
                def answerConversion(answer):
                    ret = int(answer)
                    assert 1 <= ret <= len(tripleChoices)
                    return ret
                tripleIndex, success = tryRecieveAnswer(query, navigatingModel, answerConversion = answerConversion)
                if not success:
                    break
                choosenTriple = tripleChoices[tripleIndex - 1]
                currentSearch.append([alteringOption, topicChoices, choosenTopic, tripleChoices, choosenTriple])
                choosenTripleString = "[" + ", ".join(choosenTriple) + "]"
                # Check if the triple answers the question
                query = f'''Does the triple {choosenTripleString} explicitly answer the question "{question}"?
Return Y or N.'''
                def answerConversion(answer):
                    answer = answer.upper()
                    assert answer in ["Y", "N"]
                    return answer
                answer, success = tryRecieveAnswer(query, examinerModel, answerConversion = answerConversion)
                if not success:
                    break
                if answer == "Y":
                    tripleNavigation[question] = [currentSearch, True]
                    break
    except:
        pass
    tripleNavigationPath.write_text(dumps(tripleNavigation, indent = 4))
        

In [46]:
safeTripleNavigationPath("gpt4search", gpt_4_turbo_completion, gpt_4_turbo_completion, gpt_4_turbo_completion, 10, 500)

Failed to recieve answer for query: Select the triple that answers the question "What do cathode rays consist of?" or is the closest to it.
When you can not decide, select a random triple.
1: [Photon, is a quantum of, Electromagnetic Spectrum]
2: [Photon, is part of, Electromagnetic Spectrum]
3: [Photon, is an elementary particle in, Electromagnetic Spectrum]
4: [Photon, represents energy in, Electromagnetic Spectrum]
5: [Photon, manifests as light in, Electromagnetic Spectrum]
6: [Photon, travels through, Electromagnetic Spectrum]
7: [Photon, exhibits wave-particle duality within, Electromagnetic Spectrum]
8: [Photon, is absorbed by atoms across, Electromagnetic Spectrum]
9: [Photon, interacts with matter through, Electromagnetic Spectrum]
10: [Photon, is emitted at various wavelengths of, Electromagnetic Spectrum]
Return nothing but the number of the triple without explanation.. Last answer: None of the provided triples directly answer the question "What do cathode rays consist of?" 

In [43]:
safeTripleNavigationPath("gpt3_5search", gpt_3_5_turbo_completion, gpt_3_5_turbo_completion, gpt_4_turbo_completion, 10, 500)

In [25]:
# Count the number of questions
questionPath = dataRootPath / "questions.txt"
questions = loads(questionPath.read_text())
len(questions)
len(set([question for question, _ in questions]))
# Find the dublicate question
from collections import Counter
questionCounter = Counter([question for question, _ in questions])
for question, count in questionCounter.items():
    if count > 1:
        print(question)

What does the first law of thermodynamics state about energy in a closed system?


In [16]:
from math import log10
def countDecisionCostOfSingleSearch(search):
    cost = 0
    for _, topicChoices, _, tripleChoices, _ in search:
        cost += log10(6)
        cost += log10(len(topicChoices))
        cost += log10(len(tripleChoices))
    return cost

In [17]:
def calculateAverageWithError(valueList):
    average = sum(valueList) / len(valueList)
    error = (sum((value - average) ** 2 for value in valueList) / (len(valueList) - 1) / len(valueList)) ** 0.5
    return (average, error)

In [19]:
def calculateAverageDecisionCosts(nameOfTheDataRun):
    tripleNavigationPath = dataRootPath / "searches" / nameOfTheDataRun / "searchpath.json"
    tripleNavigation = loads(tripleNavigationPath.read_text())
    successfullSearchCosts = [countDecisionCostOfSingleSearch(searchpath) for searchpath, success in tripleNavigation.values() if success]
    failedSearchCosts = [countDecisionCostOfSingleSearch(searchpath) for searchpath, success in tripleNavigation.values() if not success]
    averageDecisionCostOfSuccessfulSearches, averageDecisionCostOfSuccessfulSearchesError = calculateAverageWithError(successfullSearchCosts)
    averageDecisionCostOfFailedSearches, averageDecisionCostOfFailedSearchesError = calculateAverageWithError(failedSearchCosts)
    print(len(successfullSearchCosts), len(failedSearchCosts))
    probabilityOfSuccess = len(successfullSearchCosts) / (len(successfullSearchCosts) + len(failedSearchCosts))
    probabilityOfSuccessError = (probabilityOfSuccess * (1 - probabilityOfSuccess) / (len(successfullSearchCosts) + len(failedSearchCosts))) ** 0.5
    averageDecisionCostOfNonInteruptedSearches = (1 / probabilityOfSuccess - 1) * averageDecisionCostOfFailedSearches + averageDecisionCostOfSuccessfulSearches
    averageDecisionCostOfNonInteruptedSearchesError = (averageDecisionCostOfSuccessfulSearchesError ** 2 + (averageDecisionCostOfFailedSearchesError * (1 / probabilityOfSuccess - 1)) ** 2 + (probabilityOfSuccessError * averageDecisionCostOfFailedSearches / probabilityOfSuccess / probabilityOfSuccess) ** 2) ** 0.5
    evaluationPath = dataRootPath / "searches" / nameOfTheDataRun / "evaluation.json"
    evaluation = {
        "averageDecisionCostOfSuccessfulSearches": averageDecisionCostOfSuccessfulSearches,
        "averageDecisionCostOfSuccessfulSearchesError": averageDecisionCostOfSuccessfulSearchesError,
        "averageDecisionCostOfFailedSearches": averageDecisionCostOfFailedSearches,
        "averageDecisionCostOfFailedSearchesError": averageDecisionCostOfFailedSearchesError,
        "averageDecisionCostOfNonInteruptedSearches": averageDecisionCostOfNonInteruptedSearches,
        "averageDecisionCostOfNonInteruptedSearchesError": averageDecisionCostOfNonInteruptedSearchesError,
        "probabilityOfSuccess": probabilityOfSuccess,
        "probabilityOfSuccessError": probabilityOfSuccessError
    }
    evaluationPath.write_text(dumps(evaluation, indent = 4))
    for key, value in evaluation.items():
        print(f"{key}: {value}")
    return evaluation

In [31]:
calculateAverageDecisionCosts("gpt4search")

57 43
averageDecisionCostOfSuccessfulSearches: 15.157983138058126
averageDecisionCostOfSuccessfulSearchesError: 0.9331690721115556
averageDecisionCostOfFailedSearches: 24.357279567317065
averageDecisionCostOfFailedSearchesError: 1.1614932972360248
averageDecisionCostOfNonInteruptedSearches: 33.532772987086794
averageDecisionCostOfNonInteruptedSearchesError: 3.9260499377328277
probabilityOfSuccess: 0.57
probabilityOfSuccessError: 0.04950757517794625


{'averageDecisionCostOfSuccessfulSearches': 15.157983138058126,
 'averageDecisionCostOfSuccessfulSearchesError': 0.9331690721115556,
 'averageDecisionCostOfFailedSearches': 24.357279567317065,
 'averageDecisionCostOfFailedSearchesError': 1.1614932972360248,
 'averageDecisionCostOfNonInteruptedSearches': 33.532772987086794,
 'averageDecisionCostOfNonInteruptedSearchesError': 3.9260499377328277,
 'probabilityOfSuccess': 0.57,
 'probabilityOfSuccessError': 0.04950757517794625}

In [49]:
calculateAverageDecisionCosts("gpt4search")

43 57
averageDecisionCostOfSuccessfulSearches: 16.475082996461143
averageDecisionCostOfSuccessfulSearchesError: 1.0797216048512424
averageDecisionCostOfFailedSearches: 25.83193267900581
averageDecisionCostOfFailedSearchesError: 0.8108730493070732
averageDecisionCostOfNonInteruptedSearches: 50.717412361654894
averageDecisionCostOfNonInteruptedSearchesError: 7.082393221697145
probabilityOfSuccess: 0.43
probabilityOfSuccessError: 0.04950757517794625


{'averageDecisionCostOfSuccessfulSearches': 16.475082996461143,
 'averageDecisionCostOfSuccessfulSearchesError': 1.0797216048512424,
 'averageDecisionCostOfFailedSearches': 25.83193267900581,
 'averageDecisionCostOfFailedSearchesError': 0.8108730493070732,
 'averageDecisionCostOfNonInteruptedSearches': 50.717412361654894,
 'averageDecisionCostOfNonInteruptedSearchesError': 7.082393221697145,
 'probabilityOfSuccess': 0.43,
 'probabilityOfSuccessError': 0.04950757517794625}

In [29]:
calculateAverageDecisionCosts("gpt3_5search")

42 58
averageDecisionCostOfSuccessfulSearches: 15.983633395067661
averageDecisionCostOfSuccessfulSearchesError: 1.072957857840274
averageDecisionCostOfFailedSearches: 27.78151250383644
averageDecisionCostOfFailedSearchesError: 0.0
averageDecisionCostOfNonInteruptedSearches: 54.348579233698935
averageDecisionCostOfNonInteruptedSearchesError: 7.846833318581913
probabilityOfSuccess: 0.42
probabilityOfSuccessError: 0.04935585071701227


{'averageDecisionCostOfSuccessfulSearches': 15.983633395067661,
 'averageDecisionCostOfSuccessfulSearchesError': 1.072957857840274,
 'averageDecisionCostOfFailedSearches': 27.78151250383644,
 'averageDecisionCostOfFailedSearchesError': 0.0,
 'averageDecisionCostOfNonInteruptedSearches': 54.348579233698935,
 'averageDecisionCostOfNonInteruptedSearchesError': 7.846833318581913,
 'probabilityOfSuccess': 0.42,
 'probabilityOfSuccessError': 0.04935585071701227}

In [48]:
calculateAverageDecisionCosts("gpt3_5search")

27 73
averageDecisionCostOfSuccessfulSearches: 16.01450352073513
averageDecisionCostOfSuccessfulSearchesError: 1.2068379508452023
averageDecisionCostOfFailedSearches: 27.78151250383644
averageDecisionCostOfFailedSearchesError: 0.0
averageDecisionCostOfNonInteruptedSearches: 91.12748177184845
averageDecisionCostOfNonInteruptedSearchesError: 16.961870101195196
probabilityOfSuccess: 0.27
probabilityOfSuccessError: 0.044395945760846225


{'averageDecisionCostOfSuccessfulSearches': 16.01450352073513,
 'averageDecisionCostOfSuccessfulSearchesError': 1.2068379508452023,
 'averageDecisionCostOfFailedSearches': 27.78151250383644,
 'averageDecisionCostOfFailedSearchesError': 0.0,
 'averageDecisionCostOfNonInteruptedSearches': 91.12748177184845,
 'averageDecisionCostOfNonInteruptedSearchesError': 16.961870101195196,
 'probabilityOfSuccess': 0.27,
 'probabilityOfSuccessError': 0.044395945760846225}

In [38]:
def revisitingAnswers(nameOfTheDataRun):
    tripleNavigationPath = dataRootPath / "searches" / nameOfTheDataRun / "searchpath.json"
    tripleNavigation = loads(tripleNavigationPath.read_text())
    for question, (search, success) in tripleNavigation.items():
        if success:
            lastTriple = search[-1][4]
            tripleString = "[" + ", ".join(lastTriple) + "]"
            # Check if the triple realy answers the question
            query = f'Does the triple {tripleString} explicitly answer the question "{question}"? Return Y or N.'
            def answerConversion(answer):
                answer = answer.upper()
                assert answer in ["Y", "N"]
                return answer
            answer, success = tryRecieveAnswer(query, gpt_4_turbo_completion, answerConversion = answerConversion)
            tripleNavigation[question][1] = answer == "Y"
    tripleNavigationPath.write_text(dumps(tripleNavigation, indent = 4))
revisitingAnswers("gpt4search")
revisitingAnswers("gpt3_5search")

Failed to recieve answer for query: Does the triple [Quantum Mechanics, uses, Schrodinger Equation] explicitly answer the question "What process allows energy levels in an atom to only have specific discrete values?"? Return Y or N.. Last answer: N

The triple [Quantum Mechanics, uses, Schrodinger Equation] does not explicitly answer the question "What process allows energy levels in an atom to only have specific discrete values?". It implies a method used in quantum mechanics but it does not specifically link to how energy levels in atoms are quantized.
