In [232]:
import io
import json
import os

import openai as ai
import PyPDF2

In [233]:
# Set the API key for OpenAI.
# The key is read from the OPENAI_API_KEY environment variable so that a real
# secret never has to be typed into (and saved with) the notebook. The original
# placeholder is kept as the fallback value, so behaviour is unchanged when the
# variable is not set.
ai.api_key = os.environ.get("OPENAI_API_KEY", "<YOUR API KEY>")

In [234]:
# Function declaration handed to the chat completion request through its
# `functions` argument. The declared parameters describe an object with one
# `software_list` array; each array entry must carry a `software_name` and may
# carry a `citation_string`.

# Schema of a single software mention.
_software_entry_schema = {
    "type": "object",
    "properties": {
        "software_name": {
            "type": "string",
            "description": "The name of the software package",
        },
        "citation_string": {
            "type": "string",
            "description": "The existing citation",
        },
    },
    "required": ["software_name"],
}

# Schema of the function's arguments: the list of mentions found in the text.
_software_list_schema = {
    "type": "object",
    "properties": {
        "software_list": {
            "type": "array",
            "items": _software_entry_schema,
        },
    },
    "required": ["software_list"],
}

functions = [
    dict(
        name="generate_software_schema",
        description="Get the software mentions",
        parameters=_software_list_schema,
    ),
]


In [235]:
# Instruction text for the model: states the task (find software names and the
# citation that directly follows them), lists what to ignore, gives four worked
# examples, and finally asks for a JSON list built from the text found between
# the <begin_text> and <end_text> markers. Those markers are not part of this
# string; generate_prompt() appends them around the input text.
# NOTE: this string is sent to the model verbatim, so any edit to its wording
# changes what the model is asked to do.
role_prompt = """
You are a citation support assistant.
Your job is to identify the names of software in a body of input text together with their citations.
A citation contains metadata like the author and date.
Citations are enclosed in '()' or '[]' and directly follow the name of the software.
Proper nouns such as organizations, businesses, companies, universities, or publishers are not software and should be ignored.
Only identify software if you are sure. Otherwise ignore.

Here are some examples of software mentions with citations:

Example: In case of a range of K values, the true K was determined as a value between the estimates predicted by fastSTRUCTURE (Green and Zuntz, 2009) and based on what made most biological sense
Software: fastSTRUCTURE
Citation: Green and Zuntz, 2009

Example: This design was based on the Mixture Design (D-optimal, two mixture components, two factors, the limits: 5%–95%) option in the Design Expert 7.1.3 Software (Acme Corp., 2023) that generated the experimental scheme (13 standard/run) randomly
Software: Design Expert
Citation: Acme Corp., 2023

Example: Data from this study were processed with GraphPad Prism 8 (GraphPad Software, USA) and expressed as mean ± standard deviation (x ± s) 
Software: GraphPad Prism
Citation: GraphPad Software, USA

Example: The genes exhibiting high expression (≥100 normalized RPKM values) were mapped on functional bins assigned to different pathways in MapMan (Smith et. al., 2009)
Software: MapMan
Citation: Smith et. al., 2009

Now, it’s time to use what you have learned.

For each of the software references you can find in text between <begin_text> and <end_text>, add it to a JSON list that contains the name of the software, and the citation string if it exists.

"""

In [236]:
def generate_prompt(role_prompt, input_text):
    """Build the full prompt: the instructions followed by the delimited text.

    Args:
        role_prompt: Instruction text placed first in the prompt.
        input_text: Text to analyse; it is wrapped directly between the
            ``<begin_text>`` and ``<end_text>`` markers with no extra spacing.

    Returns:
        A single string with the instructions, a blank line, and the wrapped
        input text. The leading blank lines, the four-space indents and the
        trailing newline are part of the returned value.
    """
    return f"\n\n    {role_prompt}\n\n    <begin_text>{input_text}<end_text>\n"

In [237]:
# Sample passage 1: names several software packages (QuTiP, C3, lindbladmpo,
# Quandary, HOQST, Spinach, Qiskit Dynamics, ...); QuTiP is the only one that is
# directly followed by an author/date citation in parentheses.
paper_body_1 = """Due to its importance, many open-source packages contain time-dependent quantum system simulation tools. 
In Python, these include QuTiP (Green et. al., 2009), C3. C++ packages (also with Python interfaces)
include lindbladmpo and Quandary. Packages also exist in other languages, such as the Hamiltonian open quantum system 
toolkit (HOQST) and a Framework for Quantum Optimal Control in Julia, and Spinach in MATLAB. The features in Qiskit 
Dynamics for simulating Qiskit Pulse control sequences replace those previously offered in Qiskit Aer."""

# Sample passage 2: mentions bioRxiv (followed by a parenthesised citation) and
# also names organizations - HighWire Press, Stanford University, AAAS, NAS -
# i.e. the kind of proper nouns the role prompt tells the model to ignore.
paper_body_2 = """Given the potentially vast number of biology preprints — several hundred thousand papers each year — 
it was clear that bioRxiv (Green et. al., 2009) would require an industrial scale architecture that could process and display a high volume of 
submissions and stably accommodate millions of online readers with minimal downtimes. bioRxiv’s hosting and manuscript 
management sites would have to include state-of-the-art features biologists had come to expect of online journals and 
be able to accommodate both existing and future integrations with other participants in the scholarly communication 
ecosystem (e.g. search engines, indexing services, journals, and manuscript submission systems). After defining the 
specifications required, we partnered with HighWire Press, a company developed within and part-owned by Stanford University 
that had a proven record of more than 20 years in online manuscript hosting and technology development for clients including the American Academy for the Advancement of Science (AAAS) and The National Academy of Sciences (NAS)."""

# Sample passage 3: a molecular-dynamics methods section containing many
# parenthesised citations, some attached to software (e.g. Amber18) and others
# attached to methods, force fields or algorithms. This is the sample that the
# request cell passes to generate_prompt().
paper_body_3 = """The 3D structure of GBF1 protein is not available; therefore, a structural model of the Sec7 domain of GBF1 (GBF1_Sec7) 
protein was generated using comparative modeling methods (Sali and Blundell, 1993). Homology model of the GBF1_Sec7 
in its autoinhibited form was generated using the crystal structure of the autoinhibited form of Grp1 Arf GTPase exchange 
factor (PDB: 2R0D, resolution 2.0 Å), which shares ~65% homology with GBF1 in the Sec7 domain. A 3D structural model of the 
GBF1_Sec7-Arf1 complex was generated using the crystal structure of Arno_Sec7-Arf1 (PDB: 1R8Q, resolution 1.9 Å) since Arno 
shares ~65% homology with GBF1 in the Sec7 domain. MD simulations were carried out with the pemed.CUDA module of the 
program Amber18 (Case et al., 2018) using standard and well-tested protocols (Kannan et al., 2015). All atom versions of the Amber 
14SB force field (ff14SB) (Maier et al., 2015) were used to represent the protein. Force field parameters for phosphorylated tyrosine 
and GTP were taken as described elsewhere (Homeyer et al., 2006); an overall charge of –2e is assigned to the phosphate groups. 
The Xleap module was used to prepare the system for the MD simulations. All the simulation systems were neutralized with appropriate 
numbers of counterions. Each neutralized system was solvated in an octahedral box with TIP3P (Jorgensen et al., 1983) water molecules, 
leaving at least 10 Å between the solute atoms and the borders of the box. All MD simulations were carried out in explicit solvent 
at 300 K. During the simulations, the long-range electrostatic interactions were treated with the particle mesh Ewald (Darden et al., 1993) 
method using a real space cutoff distance of 9 Å. The SETTLE (Miyamoto and Kollman, 1992) algorithm was used to constrain bond 
vibrations involving hydrogen atoms, which allowed a time step of 2 fs during the simulations. Solvent molecules and counterions were 
initially relaxed using energy minimization with restraints on the protein and inhibitor atoms. This was followed by unrestrained 
energy minimization to remove any steric clashes."""


In [238]:
# Request a completion from the `ft:`-prefixed (fine-tuned) gpt-3.5-turbo model
# for the third sample passage. Instructions and delimited text are sent
# together as a single user message, and the function declaration is attached
# with function_call='auto'.
prompt_text = generate_prompt(role_prompt, paper_body_3)
chat_messages = [{'role': 'user', 'content': prompt_text}]

openai_response = ai.ChatCompletion.create(
    model='ft:gpt-3.5-turbo-0613:personal::8E3PTrF7',
    messages=chat_messages,
    functions=functions,
    function_call='auto',
)


In [239]:
# Read the response from OpenAI.
#
# Only the first choice is inspected. Three outcomes are reported:
#   * "function_call": the arguments string is parsed as JSON and pretty-printed;
#   * "stop":          the model answered without calling the function;
#   * anything else:   reported explicitly instead of being silently ignored.

first_choice = openai_response.choices[0]
finish_reason = first_choice['finish_reason']

if finish_reason == "function_call":
    raw_arguments = first_choice.message.function_call.arguments.strip()
    try:
        decoded_response = json.loads(raw_arguments)
    except json.JSONDecodeError:
        # The arguments are text produced by the model, so they may not be
        # well-formed JSON. Show the raw text before re-raising so the failure
        # can be diagnosed from the cell output.
        print('Function-call arguments are not valid JSON:')
        print(raw_arguments)
        raise
    json_formatted_str = json.dumps(decoded_response, indent=2)
    print(json_formatted_str)
elif finish_reason == "stop":
    print('No software')
else:
    # Any other finish reason previously produced no output at all.
    print(f'Unexpected finish_reason: {finish_reason!r}')


{
  "software_list": [
    {
      "software_name": "Amber18",
      "citation_string": "Case et al., 2018"
    },
    {
      "software_name": "Xleap",
      "citation_string": ""
    }
  ]
}
