In [87]:
import io
import json
import os

import openai as ai
import PyPDF2

In [88]:
# Set the API key for OpenAI.
# The key is read from the OPENAI_API_KEY environment variable so that the
# secret is never stored in (or committed with) the notebook. When the variable
# is unset this falls back to an empty string, the previous placeholder value.
ai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [89]:
# Function-calling schema passed to the chat completion request.
# It describes a single function, `generate_software_schema`, whose arguments
# are an object holding `software_list`: an array of
# {software_name, citation_present} records, both fields required.
functions = [
    {
        "name": "generate_software_schema",
        "description": "Get the software mentions",
        "parameters": {
            "type": "object",
            "properties": {
                "software_list": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "software_name": {
                                "type": "string",
                                "description": "The name of the software package"
                            },
                            "citation_present": {
                                "type": "boolean",
                                # The flag is about the citation, not the mention itself:
                                # every entry in the list is a mention by construction.
                                "description": "A boolean field to denote whether or not the software mention is accompanied by a citation"
                            }
                        },
                        "required": ["software_name", "citation_present"]
                    }
                }
            },
            "required": ["software_list"]
        }
    }
]


In [90]:
def generate_prompt(input_text):
    """Build the few-shot prompt that asks for software mentions and citations.

    The prompt starts with the assistant's role description, lists worked
    examples of software mentions with and without citations, and ends with
    the task statement followed by ``input_text`` placed between
    ``<begin_text>`` and ``<end_text>`` markers.

    Args:
        input_text: The passage to be scanned for software mentions.

    Returns:
        The full prompt as one string. It begins and ends with a newline.
    """
    # Each example is (input sentence, software name). The citation flag is
    # implied by the list the example lives in. Trailing spaces inside some
    # sentences are part of the prompt text and are kept deliberately.
    cited_examples = [
        (
            "The genes exhibiting high expression (≥100 normalized RPKM values) were mapped on functional bins assigned to different pathways in MapMan (Smith et. al., 2009)",
            "MapMan",
        ),
        (
            "Flowing Software version 2.5.1 (Perttu, 2008) was used for downstream analysis.",
            "Flowing Software",
        ),
        (
            "In case of a range of K values, the true K was determined as a value between the estimates predicted by fastSTRUCTURE (Green and Zuntz, 2009) and based on what made most biological sense",
            "fastSTRUCTURE",
        ),
        (
            "This design was based on the Mixture Design (D-optimal, two mixture components, two factors, the limits: 5%–95%) option in the Design Expert 7.1.3 Software (Acme Corp., 2023) that generated the experimental scheme (13 standard/run) randomly",
            "Design Expert",
        ),
        (
            "Data from this study were processed with GraphPad Prism 8 [GraphPad Software, USA] and expressed as mean ± standard deviation (x ± s) ",
            "GraphPad Prism",
        ),
    ]
    uncited_examples = [
        (
            "This in turn will require improved algorithms, implemented in SHELXE, that take into account the unique aspects of electron scattering.",
            "SHELXE",
        ),
        (
            "The coexpression networks and hub genes in vital modules were visualized and analyzed by Cytoscape software (Version 3.5.1) 13.",
            "Cytoscape",
        ),
        (
            "For algorithms providing multiple transcript-level predictions (i.e., miRanda-MicroCosm, PACCMIT-CDS, and TargetSpy), the transcript with the best score was selected as the representative transcript isoform ",
            "TargetSpy",
        ),
        (
            "We used SPSS version 26.0 to assess hypotheses and research questions",
            "SPSS",
        ),
    ]

    def render_examples(examples, flag):
        # Three labelled lines per example, followed by a blank separator line.
        rendered = []
        for sentence, software in examples:
            rendered.extend([
                f"Input: {sentence}",
                f"software_name: {software}",
                f"citation_present: {flag}",
                "",
            ])
        return rendered

    prompt_lines = [
        "",  # the prompt opens with a newline
        "You are a citation support assistant.",
        "Your job is to identify the names of software in a body of input text together with their citations.",
        "A citation contains metadata like the author and date, and usually directly follows the reference to the software.",
        "Proper nouns such as organizations, businesses, companies, universities, or publishers are not software and should be ignored.",
        "",
        "EXAMPLES OF SOFTWARE MENTIONS WITH CITATIONS:",
        "",
        *render_examples(cited_examples, "true"),
        "EXAMPLES OF SOFTWARE MENTIONS WITHOUT CITATIONS:",
        "",
        *render_examples(uncited_examples, "false"),
        "Now, it’s time to use what you have learned.",
        "",
        "For each of the software references you can find in text between <begin_text> and <end_text>, add it to a JSON list that contains the name of the software, and a boolean field noting whether a citation is present or missing.",
        "",
        "<begin_text>",
        f"{input_text}",
        "<end_text>",
        "",  # the prompt closes with a newline
    ]
    return "\n".join(prompt_lines)

In [91]:
# Sample passage used as the model input. It is written line by line so that
# the line breaks and the trailing spaces of the original text stay explicit.
paper_body = (
    "Due to its importance, many open-source packages contain time-dependent quantum system simulation tools. \n"
    "In Python, these include QuTiP [Green et. al., 2009], C3. C++ packages (also with Python interfaces)\n"
    "include lindbladmpo and Quandary. Packages also exist in other languages, such as the Hamiltonian open quantum system \n"
    "toolkit (HOQST) and a Framework for Quantum Optimal Control in Julia, and Spinach in MATLAB. The features in Qiskit \n"
    "Dynamics for simulating Qiskit Pulse control sequences replace those previously offered in Qiskit Aer."
)


In [92]:
# Generating response back from gpt-3.5-turbo.
# The function is requested by name rather than with function_call='auto':
# with 'auto' the model may answer in plain text, in which case the reply
# carries no function_call arguments for the decoding step to parse.
openai_response = ai.ChatCompletion.create(
    model='gpt-3.5-turbo',
    messages=[{'role': 'user', 'content': generate_prompt(paper_body)}],
    functions=functions,
    function_call={'name': functions[0]['name']},
)


In [95]:
# Read the response from OpenAI.
response_message = openai_response.choices[0].message

# The reply only carries function-call arguments when the model chose to call
# the function; fail with a clear message instead of an AttributeError otherwise.
function_call = getattr(response_message, "function_call", None)
if function_call is None:
    raise ValueError(
        "The model response contains no function_call; "
        "cannot extract the software list from it."
    )

# json.loads tolerates surrounding whitespace, so no explicit strip is needed.
decoded_response = json.loads(function_call.arguments)

json_formatted_str = json.dumps(decoded_response, indent=2)

print(json_formatted_str)

{
  "software_list": [
    {
      "software_name": "QuTiP",
      "citation_present": true
    },
    {
      "software_name": "C3",
      "citation_present": false
    },
    {
      "software_name": "lindbladmpo",
      "citation_present": false
    },
    {
      "software_name": "Quandary",
      "citation_present": false
    },
    {
      "software_name": "HOQST",
      "citation_present": false
    },
    {
      "software_name": "Framework for Quantum Optimal Control",
      "citation_present": false
    },
    {
      "software_name": "Spinach",
      "citation_present": false
    },
    {
      "software_name": "Qiskit Dynamics",
      "citation_present": false
    },
    {
      "software_name": "Qiskit Pulse",
      "citation_present": false
    },
    {
      "software_name": "Qiskit Aer",
      "citation_present": false
    }
  ]
}
