In [4]:
from requests import get
from json import loads, dumps
from nltk.tokenize import sent_tokenize
import nltk
from pathlib import Path
# Fetch the Punkt sentence-tokenizer data for NLTK (reported as up-to-date when already installed).
nltk.download('punkt')
# Directory that holds every JSON file read or written by the cells below.
datapath = Path("../master-database-files/master-experimental/convert_sentences_to_triples")
# Fail early when the relative path does not resolve from the current working directory.
assert datapath.exists()

[nltk_data] Downloading package punkt to /home/gratach/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
def getWikipediaText(title):
    """Fetch the plain-text extract of an English Wikipedia article.

    Parameters:
        title: Title of the article, e.g. "Electron".

    Returns:
        The "extract" string of the first page listed in the API response.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the API does not answer within 30 seconds.
        IndexError: If the response lists no pages at all.
        KeyError: If the first page carries no "extract" entry.
    """
    response = get(
        "https://en.wikipedia.org/w/api.php",
        # Handing the query over as params lets requests URL-encode the title,
        # so titles containing spaces, "&", "#" or "+" are transmitted intact.
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "explaintext": 1,
        },
        timeout=30,
    )
    # Surface HTTP errors directly instead of failing later while parsing an error page.
    response.raise_for_status()
    pages = loads(response.text)["query"]["pages"]
    # The pages object is keyed by page id; only the first entry is used.
    firstPage = list(pages.values())[0]
    return firstPage["extract"]

In [23]:
# Download the "Electron" article as plain text.
electronText = getWikipediaText("Electron")
# Split the article text into sentences with NLTK's sentence tokenizer.
electronSentences = sent_tokenize(electronText)
# Persist the sentences as JSON; write_text returns the number of characters written,
# which is the value this cell displays.
(datapath/ "sentences.json").write_text(dumps(electronSentences, indent=2))

60365

In [60]:
from os import environ
from pathlib import Path
from json import loads, dumps
from random import choice
# Read the API key from a file in the user's home directory and export it through the
# environment before the client is created, so the key never appears in the notebook.
environ["OPENAI_API_KEY"] = Path("~/.openaiapikey").expanduser().read_text().strip()

from openai import OpenAI
from random import randint

# Shared client used by all completion helpers defined below.
openaiClient = OpenAI()
def _chatCompletion(model, query, temperature):
    """Send a single-message chat completion request and return the reply text.

    Parameters:
        model: Name of the OpenAI chat model to use.
        query: Prompt text; it is sent as the only message, in the "system" role.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The message content of the first choice in the response.
    """
    answer = openaiClient.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": query
            }
        ],
        temperature = temperature,
        # A fresh random seed is drawn for every request.
        seed = randint(0, 1000000)
    )
    return answer.choices[0].message.content

def gpt_3_5_turbo_completion(query, temperature = 1):
    """Return the reply of the "gpt-3.5-turbo" model to the given query."""
    return _chatCompletion("gpt-3.5-turbo", query, temperature)

def gpt_4o_mini_completion(query, temperature = 1):
    """Return the reply of the "gpt-4o-mini" model to the given query."""
    return _chatCompletion("gpt-4o-mini", query, temperature)

def gpt_4_turbo_completion(query, temperature = 1):
    """Return the reply of the "gpt-4-turbo" model to the given query."""
    return _chatCompletion("gpt-4-turbo", query, temperature)

def tryRecieveAnswer(query, completionFunction = gpt_3_5_turbo_completion, answerConversion = lambda x: True, maxTries = 10, temperature = 1):
    """Query a completion function until its answer passes the conversion.

    Parameters:
        query: Prompt handed to completionFunction.
        completionFunction: Callable taking (query, temperature) and returning the raw answer.
        answerConversion: Callable that converts/validates the raw answer; it signals an
            unusable answer by raising an exception.
        maxTries: Maximum number of completion requests to make.
        temperature: Sampling temperature forwarded to completionFunction.

    Returns:
        (convertedAnswer, True) for the first answer that converts successfully, or
        (None, False) after maxTries unusable answers (a message is printed in that case).

    Errors raised by completionFunction itself are not caught and propagate to the caller.
    """
    tryNumber = 0
    while tryNumber < maxTries:
        answer = completionFunction(query, temperature)
        try:
            return (answerConversion(answer), True)
        except Exception:
            # Only conversion/validation failures trigger a retry. Catching Exception
            # instead of using a bare except lets KeyboardInterrupt and SystemExit
            # through, so a long-running loop can still be interrupted.
            pass
        tryNumber += 1
    print(f"Failed to recieve answer for query: {query}")
    return (None, False)

def listAnswerConversion(answer):
    """Parse a JSON answer and check that it is a list of strings.

    Parameters:
        answer: JSON text.

    Returns:
        The parsed list.

    Raises:
        json.JSONDecodeError: If the text is not valid JSON.
        AssertionError: If the parsed value is not a list containing only strings.
    """
    parsed = loads(answer)
    assert isinstance(parsed, list)
    assert all(isinstance(entry, str) for entry in parsed)
    return parsed

In [41]:
# Not useful: coreference resolution helper that is not called by any of the cells below.
def resolveSentences(sourcepath, targetpath):
    """Resolve coreferences sentence by sentence, resuming from earlier results.

    Parameters:
        sourcepath: Path of a JSON file holding the list of unresolved sentences.
        targetpath: Path of the JSON file holding the resolved sentences; when it already
            exists, processing continues after the sentences stored there.

    Each sentence is sent together with the previously resolved sentence as context.
    After every successful answer the full result list is written to targetpath. When no
    usable answer is received, an empty string is appended in memory as a placeholder
    (it reaches the file only with the next successful write).
    """
    unresolvedSentences = loads(sourcepath.read_text())
    resolvedSentences = [] if not targetpath.exists() else loads(targetpath.read_text())

    def answerConversion(answer):
        # The model is asked to wrap the sentence in double quotes; strip them.
        assert answer.startswith('"') and answer.endswith('"')
        return answer[1:-1]

    while len(resolvedSentences) < len(unresolvedSentences):
        # The next unresolved sentence is the one at the index of the first missing result.
        sentence = unresolvedSentences[len(resolvedSentences)]
        contextSentence = resolvedSentences[-1] if len(resolvedSentences) > 0 else ""
        # NOTE(review): the example below spells the name both "Merry" and "Marry";
        # the prompt text is left unchanged here.
        query = f'''
Example: The sentence "He gave her a book and she read it." with the context sentence "John met Merry in the school" can be resolved to "John gave Merry a book and Marry read the book". 
Please resolve the coreferences in the sentence: "{sentence}" with the context sentence: "{contextSentence}". Return only the sentence (not the context sentence) with the coreference resolved quoted in "".'''
        (answer, success) = tryRecieveAnswer(query, answerConversion=answerConversion)
        if success:
            resolvedSentences.append(answer)
            targetpath.write_text(dumps(resolvedSentences, indent=2))
        else:
            resolvedSentences.append("")

In [63]:
def identifyUsfullSentences(sourcepath, evaluationpath, targetpath):
    """Ask the model which sentences are self-contained and store the accepted ones.

    Parameters:
        sourcepath: Path of a JSON file holding the list of candidate sentences.
        evaluationpath: Path of the JSON file with one "y"/"n" verdict per sentence; when
            it already exists, evaluation resumes after the verdicts stored there.
        targetpath: Path that receives the JSON list of sentences whose verdict is "y".

    The verdict list is written to evaluationpath after every successful answer. A
    sentence for which no usable answer is received is counted as "n".
    """
    def answerConversion(answer):
        # Only the two exact single-letter replies are accepted.
        assert answer in ("y", "n")
        return answer

    sentences = loads(sourcepath.read_text())
    evaluations = loads(evaluationpath.read_text()) if evaluationpath.exists() else []

    # Continue with the first sentence that has no stored verdict yet.
    for sentence in sentences[len(evaluations):]:
        query = f'''
Is the following text a single, grammatically correct sentence that does not refer to anything outside of the text?
BEGINNING OF TEXT
{sentence}
END OF TEXT
Answer only with "y" or "n".'''
        verdict, succeeded = tryRecieveAnswer(query, answerConversion=answerConversion, completionFunction=gpt_4o_mini_completion)
        if not succeeded:
            evaluations.append("n")
            continue
        evaluations.append(verdict)
        evaluationpath.write_text(dumps(evaluations, indent=2))

    acceptedSentences = [sentence for sentence, verdict in zip(sentences, evaluations) if verdict == "y"]
    targetpath.write_text(dumps(acceptedSentences, indent=2))

In [64]:
# Evaluate the tokenized sentences and store the accepted ones in usefull-sentences.json.
identifyUsfullSentences(datapath/ "sentences.json", datapath/ "sentences-evaluations.json", datapath/ "usefull-sentences.json")

In [68]:
def extractSemanticTriplesFromSentence(sentence):
    """Ask the model to express a sentence as a list of semantic triples.

    Parameters:
        sentence: The sentence to convert.

    Returns:
        A list of three-element lists of strings as returned by the model, or an empty
        list when no well-formed answer was received.
    """
    def parseTriples(rawAnswer):
        # The answer must be JSON: a list whose items are lists of exactly three strings.
        parsed = loads(rawAnswer)
        assert isinstance(parsed, list)
        for candidate in parsed:
            assert isinstance(candidate, list) and len(candidate) == 3
            assert all(isinstance(element, str) for element in candidate)
        return parsed

    query = f'''
The sentence "The Earth orbits the sun at a distance of 1 AU." can be converted to the semantic triples:
[
  ["Earth", "orbits", "Sun"],
  ["Earth", "has a distance from the sun of", "1 AU"]
]
What are the semantic triples for the sentence: "{sentence}"?
Return nothing but the semantic triples in the format above.'''
    triples, succeeded = tryRecieveAnswer(query, answerConversion=parseTriples)
    return triples if succeeded else []

In [67]:
# Try the triple extraction on a single example sentence; the cell displays the returned triples.
extractSemanticTriplesFromSentence("Electrons belong to the first generation of the lepton particle family, and are generally thought to be elementary particles because they have no known components or substructure.")

[['Electrons', 'belong to', 'first generation of lepton particle family'],
 ['Electrons', 'are generally thought to be', 'elementary particles'],
 ['Electrons', 'have', 'no known components or substructure']]

In [69]:
def convertSentencesToSemanticTriples(sourcepath, targetpath):
    """Convert every sentence of a JSON file into semantic triples, resuming if possible.

    Parameters:
        sourcepath: Path of a JSON file holding the list of sentences.
        targetpath: Path of the JSON file with one list of triples per sentence; when it
            already exists, conversion continues after the entries stored there.

    The complete result list is rewritten to targetpath after each converted sentence.
    """
    sentences = loads(sourcepath.read_text())
    if targetpath.exists():
        tripleGroups = loads(targetpath.read_text())
    else:
        tripleGroups = []
    # Only the sentences without a stored result are still to be processed.
    for sentence in sentences[len(tripleGroups):]:
        tripleGroups.append(extractSemanticTriplesFromSentence(sentence))
        targetpath.write_text(dumps(tripleGroups, indent=2))

In [70]:
# Convert the accepted sentences into semantic triples, stored in semantic-triples.json.
convertSentencesToSemanticTriples(datapath/ "usefull-sentences.json", datapath/ "semantic-triples.json")

In [71]:
def combineSemanticTriplesToSentence(triples):
    """Ask the model to merge a list of semantic triples into one sentence.

    Parameters:
        triples: JSON-serializable list of triples; it is embedded in the prompt as
            indented JSON.

    Returns:
        The sentence returned by the model with its surrounding double quotes removed,
        or an empty string when no usable answer was received.
    """
    def stripQuotes(rawAnswer):
        # The model is asked to quote the sentence; reject anything that is not quoted.
        assert rawAnswer.startswith('"') and rawAnswer.endswith('"')
        return rawAnswer[1:-1]

    serializedTriples = dumps(triples, indent=2)
    query = f'''
The semantic triples
[
  ["Earth", "orbits", "Sun"],
  ["Earth", "has a distance from the sun of", "1 AU"]
]
can be combined to the sentence "The Earth orbits the sun at a distance of 1 AU.".
What is the sentence for the semantic triples:
{serializedTriples}
Return nothing but the sentence quoted in "".'''
    sentence, succeeded = tryRecieveAnswer(query, answerConversion=stripQuotes)
    return sentence if succeeded else ""

In [72]:
# Try the triple-to-sentence conversion on one hand-written group of triples;
# the cell displays the returned sentence.
combineSemanticTriplesToSentence([
    [
      "electrons",
      "exhibit",
      "properties of particles"
    ],
    [
      "electrons",
      "exhibit",
      "properties of waves"
    ],
    [
      "electrons",
      "can collide with",
      "other particles"
    ],
    [
      "electrons",
      "can be diffracted like",
      "light"
    ]
  ])

'Electrons exhibit properties of both particles and waves, can collide with other particles, and can be diffracted like light.'

In [75]:
def convertSemanticTriplesToSentences(sourcepath, targetpath):
    """Turn every group of triples in a JSON file back into a sentence, resuming if possible.

    Parameters:
        sourcepath: Path of a JSON file holding one list of triples per sentence.
        targetpath: Path of the JSON file with the generated sentences; when it already
            exists, conversion continues after the entries stored there.

    The complete sentence list is rewritten to targetpath after each generated sentence.
    """
    tripleGroups = loads(sourcepath.read_text())
    if targetpath.exists():
        generatedSentences = loads(targetpath.read_text())
    else:
        generatedSentences = []
    # Only the groups without a stored sentence are still to be processed.
    for group in tripleGroups[len(generatedSentences):]:
        generatedSentences.append(combineSemanticTriplesToSentence(group))
        targetpath.write_text(dumps(generatedSentences, indent=2))

In [76]:
# Rebuild one sentence per group of triples, stored in converted-sentences.json.
convertSemanticTriplesToSentences(datapath/ "semantic-triples.json", datapath/ "converted-sentences.json")

In [8]:
def compareStringLengths(firstpath, secondpath, resultpath):
    """Compare the character lengths of two parallel lists of strings.

    Parameters:
        firstpath: Path of a JSON file holding the first list of strings.
        secondpath: Path of a JSON file holding the second list; it must have the same
            number of entries as the first.
        resultpath: Path that receives the statistics as a JSON object.

    The result holds, for the first list, the second list and the per-entry differences
    (second minus first), the mean length and the standard error of that mean, plus the
    mean difference as a percentage of the first mean with a propagated error.

    Values that are mathematically undefined are written as null instead of raising
    ZeroDivisionError: means of empty lists, standard errors of fewer than two entries,
    and percentages when the first mean is zero.

    Raises:
        AssertionError: If the two lists differ in length.
    """
    def meanAndStandardError(values):
        # Return (mean, standard error of the mean); None marks an undefined value.
        count = len(values)
        if count == 0:
            return (None, None)
        mean = sum(values) / count
        if count < 2:
            return (mean, None)
        standardError = (sum((value - mean) ** 2 for value in values) / (count * (count - 1))) ** 0.5
        return (mean, standardError)

    first = loads(firstpath.read_text())
    second = loads(secondpath.read_text())
    assert len(first) == len(second)
    lengthDifferences = [len(newText) - len(oldText) for oldText, newText in zip(first, second)]

    averageLengthFirst, averageLengthFirstError = meanAndStandardError([len(sentence) for sentence in first])
    averageLengthSecond, averageLengthSecondError = meanAndStandardError([len(sentence) for sentence in second])
    averageLengthDifference, averageLengthDifferenceError = meanAndStandardError(lengthDifferences)

    lengthChangePercentage = None
    lengthChangePercentageError = None
    # The relative change is only defined for a non-empty list with a non-zero mean length.
    if averageLengthFirst:
        lengthChangePercentage = averageLengthDifference / averageLengthFirst * 100
        if averageLengthFirstError is not None and averageLengthDifferenceError is not None:
            # Error propagation for the quotient difference / first.
            lengthChangePercentageError = ((averageLengthDifferenceError / averageLengthFirst)**2 + (averageLengthFirstError * averageLengthDifference / averageLengthFirst**2)**2)**0.5 * 100

    resultpath.write_text(dumps({
        "averageLengthFirst": averageLengthFirst,
        "averageLengthFirstError": averageLengthFirstError,
        "averageLengthSecond": averageLengthSecond,
        "averageLengthSecondError": averageLengthSecondError,
        "averageLengthDifference": averageLengthDifference,
        "averageLengthDifferenceError": averageLengthDifferenceError,
        "lengthChangePercentage": lengthChangePercentage,
        "lengthChangePercentageError": lengthChangePercentageError
        }, indent=2))

In [9]:
# Compare the lengths of the original sentences with the sentences rebuilt from triples.
compareStringLengths(datapath/ "usefull-sentences.json", datapath/ "converted-sentences.json", datapath/ "length-comparison.json")

In [14]:
def getMostUsedNewAndOldWords(firstpath, secondpath, changedWordsPath):
    """Tally which words were gained or lost between paired old and new sentences.

    Parameters:
        firstpath: Path of a JSON file holding the list of old sentences.
        secondpath: Path of a JSON file holding the list of new sentences.
        changedWordsPath: Path that receives a JSON list of [word, score] pairs sorted by
            descending score.

    Sentences are paired by position. Per pair, every distinct lower-cased word (with "."
    and "," treated as separators) adds 1 to its score when it occurs in the new sentence
    and subtracts 1 when it occurs in the old sentence.
    """
    def distinctWords(text):
        # Lower-case, turn "." and "," into spaces and split on whitespace.
        return set(text.lower().replace(".", " ").replace(",", " ").split())

    oldSentences = loads(firstpath.read_text())
    newSentences = loads(secondpath.read_text())
    scores = {}
    for oldSentence, newSentence in zip(oldSentences, newSentences):
        removedSide = distinctWords(oldSentence)
        addedSide = distinctWords(newSentence)
        for word in addedSide:
            scores[word] = scores.get(word, 0) + 1
        for word in removedSide:
            scores[word] = scores.get(word, 0) - 1
    # Highest score first; the stable sort leaves words with equal scores in insertion order.
    ranking = sorted(scores.items(), key=lambda pair: -pair[1])
    changedWordsPath.write_text(dumps(ranking, indent=2))

In [15]:
# Record which words became more or less frequent after the round trip through triples.
getMostUsedNewAndOldWords(datapath/ "usefull-sentences.json", datapath/ "converted-sentences.json", datapath/ "changed-words.json")

In [26]:
def calculateTriplesStatistics(triplesPath, statisticsPath):
    """Summarize how many triples were produced per entry of a triples file.

    Parameters:
        triplesPath: Path of a JSON file holding a list whose entries are lists of triples.
        statisticsPath: Path that receives the statistics as a JSON object.

    The result holds the number of entries ("triplesCount"), the mean number of triples
    per entry with the standard error of that mean, and the total number of triples.

    Undefined values are written as null instead of raising ZeroDivisionError: the mean
    of an empty file and the standard error of fewer than two entries.
    """
    tripleGroups = loads(triplesPath.read_text())
    triplesCount = len(tripleGroups)
    triplesLengths = [len(group) for group in tripleGroups]
    totalNumberOfTriples = sum(triplesLengths)

    averageTriplesLength = None
    averageTriplesLengthError = None
    if triplesCount > 0:
        averageTriplesLength = totalNumberOfTriples / triplesCount
    if triplesCount > 1:
        # Standard error of the mean; it needs at least two entries.
        averageTriplesLengthError = (sum((length - averageTriplesLength) ** 2 for length in triplesLengths) / (triplesCount * (triplesCount - 1))) ** 0.5

    statisticsPath.write_text(dumps({
        "triplesCount": triplesCount,
        "averageTriplesLength": averageTriplesLength,
        "averageTriplesLengthError": averageTriplesLengthError,
        "totalNumberOfTriples": totalNumberOfTriples
    }, indent=2))

In [27]:
# Summarize how many triples were extracted per sentence.
calculateTriplesStatistics(datapath/ "semantic-triples.json", datapath/ "triples-statistics.json")

In [24]:
def getMostUsedTripleParts(triplesPath, mostUsedPartsPath):
    """Rank the subjects, predicates and objects that occur more than once.

    Parameters:
        triplesPath: Path of a JSON file holding a list whose entries are lists of
            [subject, predicate, object] triples.
        mostUsedPartsPath: Path that receives a JSON object with the keys "subjects",
            "predicates" and "objects"; each maps to a list of [text, count] pairs sorted
            by descending count.

    Texts are compared case-insensitively (lower-cased), and only texts counted more
    than once are reported.
    """
    tripleGroups = loads(triplesPath.read_text())
    # One tally per position in a triple: subject, predicate, object.
    tallies = ({}, {}, {})
    for group in tripleGroups:
        for triple in group:
            for position, tally in enumerate(tallies):
                text = triple[position].lower()
                tally[text] = tally.get(text, 0) + 1

    def repeatedByFrequency(tally):
        # Most frequent first; entries seen only once are dropped.
        ranked = sorted(tally.items(), key=lambda entry: entry[1], reverse=True)
        return [entry for entry in ranked if entry[1] > 1]

    subjectTally, predicateTally, objectTally = tallies
    mostUsedPartsPath.write_text(dumps({
        "subjects": repeatedByFrequency(subjectTally),
        "predicates": repeatedByFrequency(predicateTally),
        "objects": repeatedByFrequency(objectTally)
    }, indent=2))

In [25]:
# List the subjects, predicates and objects that occur more than once in the extracted triples.
getMostUsedTripleParts(datapath/ "semantic-triples.json", datapath/ "most-used-parts.json")

In [28]:
def calculateAveragePredicatesWordLength(triplesPath, averageWordLengthPath):
    """Compute the mean number of words per predicate across all triples.

    Parameters:
        triplesPath: Path of a JSON file holding a list whose entries are lists of
            [subject, predicate, object] triples.
        averageWordLengthPath: Path that receives a JSON object with the mean number of
            whitespace-separated words per predicate and the standard error of that mean.

    Undefined values are written as null instead of raising ZeroDivisionError: the mean
    when there are no predicates and the standard error for fewer than two predicates.
    """
    triplegroups = loads(triplesPath.read_text())
    # The predicate is the middle element of every triple.
    predicates = [triple[1] for triplegroup in triplegroups for triple in triplegroup]
    predicatesWordLengths = [len(predicate.split()) for predicate in predicates]
    predicateCount = len(predicatesWordLengths)

    averageWordLength = None
    averageWordLengthError = None
    if predicateCount > 0:
        averageWordLength = sum(predicatesWordLengths) / predicateCount
    if predicateCount > 1:
        # Standard error of the mean; it needs at least two predicates.
        averageWordLengthError = (sum((length - averageWordLength) ** 2 for length in predicatesWordLengths) / (predicateCount * (predicateCount - 1))) ** 0.5

    averageWordLengthPath.write_text(dumps({
        "averageWordLength": averageWordLength,
        "averageWordLengthError": averageWordLengthError
    }, indent=2))

In [29]:
# Measure how many words the extracted predicates contain on average.
calculateAveragePredicatesWordLength(datapath/ "semantic-triples.json", datapath/ "average-word-length.json")