### How to access the OpenAI application programming interface

In your command prompt window:
>> pip install openai

https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety



In [1]:
import pandas as pd 
import numpy as np
import sklearn as sklearn
import os as os
import getpass

import matplotlib.pyplot as plt
import seaborn as sns
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import re as re
import datetime as datetime
# Ensure all necessary packages are installed
%pip install kneed

user = getpass.getuser()

if user == "JVARGH7":
    path_equity_precision_llm_folder = "C:/Cloud/OneDrive - Emory University/Papers/Global Equity in Diabetes Precision Medicine LLM"
    path_equity_precision_llm_repo =  'C:/code/external/equity_precision_llm'

elif user == "aamnasoniwala":
    path_equity_precision_llm_folder = "/Users/aamnasoniwala/Library/CloudStorage/OneDrive-Emory/Global Equity in Diabetes Precision Medicine LLM"
    path_equity_precision_llm_repo = '/Users/aamnasoniwala/code/equity_precision_llm'

else:
    raise ValueError("Unrecognized user")

excel_path = path_equity_precision_llm_folder + "/llm training/Development Data.xlsx"
# path_equity_precision_llm_repo = os.path.abspath("").replace("preprocessing", "")


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [2]:
api_key_epl_shared = ""

from openai import OpenAI
# https://stackoverflow.com/questions/36959031/how-to-source-file-into-python-script
execfile(path_equity_precision_llm_repo + "/constants.py")


In [3]:
execfile(path_equity_precision_llm_repo + "/functions/prompt_generator.py")
execfile(path_equity_precision_llm_repo + "/functions/base_prompt_append.py")

base_prompt_files = ['p1v6', 'p2v6', 'p3v6','p4v6']
base_prompts = base_prompt_append(base_prompt_files)

prompt_pmid = prompt_generator(34735502, base_prompts, excel_path, sheet_name='Sheet1')

print(base_prompts[0])
print(base_prompts[1])
print(prompt_pmid)
print(base_prompts[3])

I am going to outline inclusion criteria for three categories: diabetes, precision medicine, and primary study. 

Please wait for me to prompt you on what to do based on these criteria.  
Here are the inclusion criteria:  

DIABETES: Do not exclude any type of diabetes or prediabetes. The presence of certain conditions or risk factors may not definitively confirm that the study is related to diabetes or prediabetes, unless there is a clear link to diabetes pathophysiology, diagnosis, or management. 

PRECISION MEDICINE: Precision medicine is an assessment of genetic or metabolic state to guide preventive and therapeutic decisions in humans. Exclude epidemiological studies using traditional biomarkers only, focusing on omics (genomics, metabolomics, proteomics, lipidomics etc.) or multi-omics studies. 

PRIMARY STUDY: All types of studies, other than meta-analysis and systematic reviews, were evaluated as primary studies. Although it may bias the studies towards counting contributions o

### 1. Preparing Your Batch File

https://platform.openai.com/docs/guides/batch

.jsonl file where each line contains the details of an individual request to the API

Each request must include a custom_id value

{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}


{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}

In [4]:
pmid_list = pd.read_excel(excel_path, sheet_name='Sheet1')['PMID'].tolist()

json_list_scenario1 = []
json_list_scenario2 = []
json_list_scenario3 = []
json_list_scenario4 = []

for index, pmid in enumerate(pmid_list):
    # print(pmid)
    prompt_pmid = prompt_generator(pmid, base_prompts, excel_path, sheet_name='Sheet1')
    dict_pmid_scenario1 = {"custom_id": str(index) + "_" + str(pmid), 
                 "method": "POST", 
                 "url": "/v1/chat/completions", 
                 "body": {"model": "gpt-4o-2024-08-06", 
                          "messages": [ {"role": "system", "content": base_prompts[0]},
                                        {"role": "system", "content": base_prompts[1]},
                                        {"role": "user", "content": prompt_pmid},
                                        {"role": "user", "content": base_prompts[3]}],
                            "max_tokens": 1000,
                            "frequency_penalty": 0.2,
                            "temperature": 0.2,
                            "presence_penalty": 0.3

                        }
                }
    
    dict_pmid_scenario2 = {"custom_id": str(index) + "_" + str(pmid), 
                 "method": "POST", 
                 "url": "/v1/chat/completions", 
                 "body": {"model": "gpt-4o-2024-08-06", 
                          "messages": [ {"role": "system", "content": base_prompts[0]},
                                        {"role": "system", "content": base_prompts[1]},
                                        {"role": "user", "content": prompt_pmid},
                                        {"role": "user", "content": base_prompts[3]}],
                            "max_tokens": 1000,
                            "frequency_penalty": -0.2,
                            "temperature": 0.2,
                            "presence_penalty": -0.3

                        }
                }
    
    dict_pmid_scenario3 = {"custom_id": str(index) + "_" + str(pmid), 
                 "method": "POST", 
                 "url": "/v1/chat/completions", 
                 "body": {"model": "gpt-4o-2024-08-06", 
                          "messages": [ {"role": "system", "content": base_prompts[0]},
                                        {"role": "system", "content": base_prompts[1]},
                                        {"role": "user", "content": prompt_pmid},
                                        {"role": "user", "content": base_prompts[3]}],
                            "max_tokens": 1000,
                            "frequency_penalty": 0.2,
                            "top_p": 0.8,
                            "presence_penalty": 0.3

                        }
                }

    dict_pmid_scenario4 = {"custom_id": str(index) + "_" + str(pmid), 
                 "method": "POST", 
                 "url": "/v1/chat/completions", 
                 "body": {"model": "gpt-4o-2024-08-06", 
                          "messages": [ {"role": "system", "content": base_prompts[0]},
                                        {"role": "system", "content": base_prompts[1]},
                                        {"role": "user", "content": prompt_pmid},
                                        {"role": "user", "content": base_prompts[3]}],
                            "max_tokens": 1000,
                            "frequency_penalty": -0.2,
                            "top_p": 0.8,
                            "presence_penalty": -0.3

                        }
                }
    json_list_scenario1.append(dict_pmid_scenario1)
    
    json_list_scenario2.append(dict_pmid_scenario2)

    json_list_scenario3.append(dict_pmid_scenario3)

    json_list_scenario4.append(dict_pmid_scenario4)




In [6]:
import json
with open(path_equity_precision_llm_folder + '\llm training\Development Scenario 1.jsonl', 'w') as outfile:
    for entry in json_list_scenario1:
        json_line = json.dumps(entry)
        outfile.write(json_line + '\n')

with open(path_equity_precision_llm_folder + '\llm training\Development Scenario 2.jsonl', 'w') as outfile:
    for entry in json_list_scenario2:
        json_line = json.dumps(entry)
        outfile.write(json_line + '\n')

with open(path_equity_precision_llm_folder + '\llm training\Development Scenario 3.jsonl', 'w') as outfile:
    for entry in json_list_scenario3:
        json_line = json.dumps(entry)
        outfile.write(json_line + '\n')

with open(path_equity_precision_llm_folder + '\llm training\Development Scenario 4.jsonl', 'w') as outfile:
    for entry in json_list_scenario4:
        json_line = json.dumps(entry)
        outfile.write(json_line + '\n')

### 2. Uploading Your Batch Input File

In [7]:
client = OpenAI(api_key= api_key_epl_shared)

batch_input_file_scenario1 = client.files.create(
  file=open(path_equity_precision_llm_folder + '\llm training\Development Scenario 1.jsonl', "rb"),
  purpose="batch"
)

batch_input_file_scenario2 = client.files.create(
  file=open(path_equity_precision_llm_folder + '\llm training\Development Scenario 2.jsonl', "rb"),
  purpose="batch"
)

batch_input_file_scenario3 = client.files.create(
  file=open(path_equity_precision_llm_folder + '\llm training\Development Scenario 3.jsonl', "rb"),
  purpose="batch"
)

batch_input_file_scenario4 = client.files.create(
  file=open(path_equity_precision_llm_folder + '\llm training\Development Scenario 4.jsonl', "rb"),
  purpose="batch"
)



### 3. Creating the Batch

In [8]:
batch_input_file_scenario1_id = batch_input_file_scenario1.id
print(batch_input_file_scenario1_id)
batch_created_scenario1 = client.batches.create(
    input_file_id=batch_input_file_scenario1_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "development data for PMID query - scenario 1"
    }
)

batch_input_file_scenario2_id = batch_input_file_scenario2.id
print(batch_input_file_scenario1_id)
batch_created_scenario2 = client.batches.create(
    input_file_id=batch_input_file_scenario2_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "development data for PMID query - scenario 2"
    }
)

batch_input_file_scenario3_id = batch_input_file_scenario3.id
print(batch_input_file_scenario3_id)
batch_created_scenario3 = client.batches.create(
    input_file_id=batch_input_file_scenario3_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "development data for PMID query - scenario 3"
    }
)

batch_input_file_scenario4_id = batch_input_file_scenario4.id
print(batch_input_file_scenario4_id)
batch_created_scenario4 = client.batches.create(
    input_file_id=batch_input_file_scenario4_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "development data for PMID query - scenario 4"
    }
)

file-4yxGZHXkBjBhHAo4erzx9M
file-4yxGZHXkBjBhHAo4erzx9M
file-3Ji8rCg5wPwwFf97WAnNaD
file-U4wG7HfCmHdCFd9dvMJYmQ


In [9]:
batches_file_path = path_equity_precision_llm_repo + "/Batches.txt"

if not os.path.exists(batches_file_path):
    with open(batches_file_path, "w") as f:
        pass

with open(batches_file_path, "a") as f:
    f.write("\n")
    f.write("\n Development Data")
    current_time = datetime.datetime.now()
    f.write(f"# {current_time}\n")
    f.write(f"batch_created_scenario1_id = '{batch_created_scenario1.id}'\n")
    f.write(f"batch_created_scenario2_id = '{batch_created_scenario2.id}'\n")
    f.write(f"batch_created_scenario3_id = '{batch_created_scenario3.id}'\n")
    f.write(f"batch_created_scenario4_id = '{batch_created_scenario4.id}'\n")




In [10]:
batch_created_scenario1_id = batch_created_scenario1.id
batch_created_scenario2_id = batch_created_scenario2.id
batch_created_scenario3_id = batch_created_scenario3.id
batch_created_scenario4_id = batch_created_scenario4.id

### 4. Checking the Status of a Batch

In [17]:
# batch_created_id = 'batch_675d9dc3f25881909544964060c659c3'
client = OpenAI(api_key= api_key_epl_shared)

batch_status_scenario1 = client.batches.retrieve(batch_created_scenario1_id)

batch_status_scenario2 = client.batches.retrieve(batch_created_scenario2_id)
batch_status_scenario3 = client.batches.retrieve(batch_created_scenario3_id)

batch_status_scenario4 = client.batches.retrieve(batch_created_scenario4_id)

print("Scenario 1: " + batch_status_scenario1.status + "; Scenario 2: " + batch_status_scenario2.status + "; Scenario 3:" + batch_status_scenario3.status + "; Scenario 4:" + batch_status_scenario4.status)




Scenario 1: completed; Scenario 2: completed; Scenario 3:completed; Scenario 4:in_progress


### 5. Retrieving the Results

In [142]:
client = OpenAI(api_key= api_key_epl_shared)

file_response_scenario1 = client.files.content(batch_status_scenario1.output_file_id)
file_response_scenario2 = client.files.content(batch_status_scenario2.output_file_id)
file_response_scenario3 = client.files.content(batch_status_scenario3.output_file_id)
file_response_scenario4 = client.files.content(batch_status_scenario4.output_file_id)


In [147]:
execfile(path_equity_precision_llm_repo + "/functions/format_gpt_output.py")

results1 = format_gpt_output(file_response_scenario1)
results2 = format_gpt_output(file_response_scenario2)
results3 = format_gpt_output(file_response_scenario3)
results4 = format_gpt_output(file_response_scenario4)



results1.to_csv(path_equity_precision_llm_folder + '\llm training\Development Scenario 1_results.csv', index=False)
results2.to_csv(path_equity_precision_llm_folder + '\llm training\Development Scenario 2_results.csv', index=False)
results3.to_csv(path_equity_precision_llm_folder + '\llm training\Development Scenario 3_results.csv', index=False)
results4.to_csv(path_equity_precision_llm_folder + '\llm training\Development Scenario 4_results.csv', index=False)


  results1.to_csv(path_equity_precision_llm_folder + '\llm training\Development Scenario 1_results.csv', index=False)
  results2.to_csv(path_equity_precision_llm_folder + '\llm training\Development Scenario 2_results.csv', index=False)
  results3.to_csv(path_equity_precision_llm_folder + '\llm training\Development Scenario 3_results.csv', index=False)
  results4.to_csv(path_equity_precision_llm_folder + '\llm training\Development Scenario 4_results.csv', index=False)
  df = pd.read_json(json_output.content.decode('utf-8'), lines=True)
  df = pd.read_json(json_output.content.decode('utf-8'), lines=True)
  df = pd.read_json(json_output.content.decode('utf-8'), lines=True)
  df = pd.read_json(json_output.content.decode('utf-8'), lines=True)


In [148]:
results4.head()

Unnamed: 0,pmid,title,gpt_precision_medicine,gpt_diabetes,gpt_primary_study,gpt_source_population
0,34735502,The association between triglyceride-glucose ...,no,no,yes,ea
1,31369557,Targeted sequencing of candidate genes of dys...,yes,no,yes,sa
2,20838400,Are endocannabinoid type 1 receptor gene (CNR...,yes,no,yes,cee
3,18820969,Cellulose biosynthesis by the beta-proteobact...,no,no,yes,
4,27677465,No PERV transmission during a clinical trial ...,no,yes,yes,lac
