In [15]:
import requests
from os import environ
from pathlib import Path
from json import loads, dumps
# Read the Semantic Scholar API key from a file in the user's home directory
# (instead of hard-coding it) and expose it through the process environment.
environ["SEMSCHOLAR_KEY"] = Path("~/.semscholarkey").expanduser().read_text().strip()
# Directory used for the output files of this notebook; the path is relative,
# so it resolves against the current working directory.
datapath = Path("../master-database-files/master-experimental/extract_triples_from_semscholar")
# Fail fast when the notebook is started from a location where the path does not resolve.
assert datapath.exists()

In [22]:
import time

def pause_until(timespan):
    """Block until at least ``timespan`` seconds have passed since the previous call.

    The timestamp of the previous call is kept in the function attribute
    ``pause_until.last_time``. While that attribute is ``None`` (no previous
    call) the function returns without sleeping.

    A monotonic clock is used so that adjustments of the system clock can
    neither shorten nor lengthen the pause.

    Args:
        timespan: Minimum number of seconds between the return of the previous
            call and the return of this call.
    """
    now = time.monotonic()
    previous = pause_until.last_time
    if previous is not None:
        remaining = timespan - (now - previous)
        if remaining > 0:
            time.sleep(remaining)
    # Take a fresh reading: the sleep above may have moved the clock forward.
    pause_until.last_time = time.monotonic()


# No call has happened yet, so the first invocation must not wait.
pause_until.last_time = None

In [1]:
from os import environ
from pathlib import Path
from json import loads, dumps
from random import choice
# Read the OpenAI API key from a file in the user's home directory and export
# it through the process environment. This happens before the client is built
# below; OpenAI() is called without arguments, so presumably it picks the key
# up from this variable.
environ["OPENAI_API_KEY"] = Path("~/.openaiapikey").expanduser().read_text().strip()
from openai import OpenAI
from random import randint

# Single client instance shared by all completion helpers in this notebook.
openaiClient = OpenAI()
def _chat_completion(model, query, temperature):
    """Send ``query`` as a single system message to ``model`` and return the reply text.

    Args:
        model: Name of the chat model to use.
        query: Prompt text, sent with the role ``"system"``.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The message content of the first choice in the response.
    """
    response = openaiClient.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": query}],
        temperature=temperature,
        # A new random seed is drawn for every request.
        seed=randint(0, 1000000),
    )
    return response.choices[0].message.content


def gpt_3_5_turbo_completion(query, temperature = 1):
    """Return the reply of ``gpt-3.5-turbo`` to ``query`` (sent as a system message)."""
    return _chat_completion("gpt-3.5-turbo", query, temperature)


def gpt_4_turbo_completion(query, temperature = 1):
    """Return the reply of ``gpt-4-turbo`` to ``query`` (sent as a system message)."""
    return _chat_completion("gpt-4-turbo", query, temperature)

def tryRecieveAnswer(query, completionFunction = gpt_3_5_turbo_completion, answerConversion = lambda x: True, maxTries = 10, temperature = 1):
    """Ask a completion function until its answer passes ``answerConversion``.

    Args:
        query: Prompt passed to ``completionFunction``.
        completionFunction: Callable invoked as ``completionFunction(query, temperature)``
            that returns the raw answer. Exceptions it raises are not caught here.
        answerConversion: Callable that converts the raw answer; it signals an
            unusable answer by raising an exception, which triggers a retry.
        maxTries: Maximum number of attempts.
        temperature: Forwarded unchanged to ``completionFunction``.

    Returns:
        ``(converted_answer, True)`` for the first answer that converts without
        error, or ``(None, False)`` after ``maxTries`` failed attempts. In the
        failure case the query and the last raw answer are printed.
    """
    # Bound up front so the failure report below also works when maxTries <= 0.
    answer = None
    tryNumber = 0
    while tryNumber < maxTries:
        answer = completionFunction(query, temperature)
        try:
            return (answerConversion(answer), True)
        except Exception:
            # Only ordinary errors mean "bad answer, try again"; KeyboardInterrupt
            # and SystemExit must still be able to stop a long-running loop.
            pass
        tryNumber += 1
    print(f"Failed to recieve answer for query: {query}")
    print(f"Answer: {answer}")
    return (None, False)

def listAnswerConversion(answer):
    """Decode ``answer`` as JSON and return it if it is a list of strings.

    Raises:
        AssertionError: If the decoded value is not a list or contains a
            non-string element.
        Errors raised by ``loads`` for text that is not valid JSON propagate.
    """
    parsed = loads(answer)
    assert isinstance(parsed, list)
    assert all(isinstance(entry, str) for entry in parsed)
    return parsed

In [35]:
def extractTripleFromAbstract(abstract, title, subject):
    """Ask the language model for one semantic triple about ``subject`` in an abstract.

    Args:
        abstract: Abstract text inserted into the prompt.
        title: Paper title inserted into the prompt.
        subject: Subject the requested triple has to contain.

    Returns:
        The first element of the pair returned by ``tryRecieveAnswer``: a list
        of three strings when an answer passed validation.
    """
    def parseTriple(raw):
        # Accept nothing but a JSON list of exactly three strings.
        candidate = loads(raw)
        assert isinstance(candidate, list)
        assert len(candidate) == 3
        assert all(isinstance(part, str) for part in candidate)
        return candidate

    query = f"""
Semantic triples such as ["Star", "emits", "Light"] and ["Rocket", "can bring cargo to", "Space"] consists of a subject, a predicate, and an object.
Please extract a triple from the following abstract that contains the subject "{subject}":

Abstract of the paper "{title}":
{abstract}

Return nothing but the triple in the format ["subject", "predicate", "object"].
"""
    # The success flag is not needed here; only the answer is handed back.
    triple, _ = tryRecieveAnswer(query, answerConversion = parseTriple)
    return triple

In [10]:
# Smoke test of the triple extraction on a single hand-picked abstract.
# extractTripleFromAbstract takes (abstract, title, subject); the title was
# missing from this call. NOTE(review): the title below was identified from
# the abstract text - confirm it is the intended paper.
extractTripleFromAbstract(
    abstract = """Based on the established task of identifying boosted, hadronically decaying top
quarks, we compare a wide range of modern machine learning approaches. Unlike
most established methods they rely on low-level input, for instance calorimeter
output. While their network architectures are vastly different, their performance
is comparatively similar. In general, we find that these new approaches are ex-
tremely powerful and great fun.""",
    title = "The Machine Learning Landscape of Top Taggers",
    subject = "machine learning",
)

['machine learning approaches', 'rely on', 'low-level input']

In [36]:
def extractTriplesFromSemanticScholarAbstracts(term, outputFilePath, numberOfTriples = 1):
    """Search Semantic Scholar for ``term`` and write one triple per abstract to a JSON file.

    The file holds a JSON list of ``[triple, paperId]`` records. Records are
    written and flushed one by one. The closing bracket is always written
    without a dangling comma, so the file is valid JSON even when the search
    yields fewer usable papers than requested.

    Papers without an abstract are skipped. The value returned by
    ``extractTripleFromAbstract`` is written as is and counted as a record,
    including ``None`` (serialised as ``null``).

    Args:
        term: Search query; also used as the subject of every triple.
        outputFilePath: Destination file; must not exist yet.
        numberOfTriples: Number of records after which the search stops.

    Raises:
        Exception: If ``outputFilePath`` already exists.
        requests.HTTPError: If the search API answers with an error status.
    """
    outputFilePath = Path(outputFilePath)
    if outputFilePath.exists():
        raise Exception(f"Output file {outputFilePath} already exists!")

    searchUrl = "https://api.semanticscholar.org/graph/v1/paper/search"
    searchStep = 100
    searchOffset = 0
    papers = []
    writtenCount = 0
    with outputFilePath.open("w") as outputFile:
        outputFile.write("[")
        while writtenCount < numberOfTriples:
            if not papers:
                # Leave at least one second between two search requests.
                pause_until(1)
                response = requests.get(
                    searchUrl,
                    # Passing the query through `params` URL-encodes terms that
                    # contain spaces, '&', '#', ...
                    params = {
                        "query": term,
                        "offset": searchOffset,
                        "limit": searchStep,
                        "fields": "abstract,title",
                    },
                    headers = {"x-api-key": environ["SEMSCHOLAR_KEY"]},
                )
                # Report rate limiting / bad requests as an HTTP error instead
                # of a KeyError on the missing "data" field.
                response.raise_for_status()
                papers = response.json().get("data") or []
                searchOffset += searchStep
                if not papers:
                    # Search exhausted before the requested count was reached.
                    break
            paper = papers.pop(0)
            abstract = paper["abstract"]
            title = paper["title"]
            if abstract is None:
                continue
            paperId = paper["paperId"]
            triple = extractTripleFromAbstract(abstract, title, term)
            # The separator precedes every record but the first, so the last
            # record never carries a trailing comma.
            separator = ",\n" if writtenCount else ""
            outputFile.write(f"{separator}    [{dumps(triple)}, {dumps(paperId)}]")
            outputFile.flush()
            writtenCount += 1
        outputFile.write("\n]" if writtenCount else "]")

In [37]:
# Full run for the search term "electron": stop after 100 written records.
# The call raises if the output file is already present.
extractTriplesFromSemanticScholarAbstracts(
    term = "electron",
    outputFilePath = datapath / "electron_triples.json",
    numberOfTriples = 100,
)

63.0