### How to access the OpenAI application programming interface

In your command prompt window:
>> pip install openai

https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety



In [26]:
import pandas as pd 
import numpy as np
import sklearn as sklearn
import os as os
import getpass

import matplotlib.pyplot as plt
import seaborn as sns
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import re as re
import datetime as datetime
# Ensure all necessary packages are installed
%pip install kneed

user = getpass.getuser()

if user == "JVARGH7":
    path_equity_precision_llm_folder = "C:/Cloud/OneDrive - Emory University/Papers/Global Equity in Diabetes Precision Medicine LLM"
    path_equity_precision_llm_repo =  'C:/code/external/equity_precision_llm'

elif user == "aamnasoniwala":
    path_equity_precision_llm_folder = "/Users/aamnasoniwala/Library/CloudStorage/OneDrive-Emory/Global Equity in Diabetes Precision Medicine LLM"
    path_equity_precision_llm_repo = '/Users/aamnasoniwala/code/equity_precision_llm'

else:
    raise ValueError("Unrecognized user")

excel_path = path_equity_precision_llm_folder + "/llm training/Test Data.xlsx"
# path_equity_precision_llm_repo = os.path.abspath("").replace("preprocessing", "")


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
api_key_epl_shared = ""

from openai import OpenAI
# https://stackoverflow.com/questions/36959031/how-to-source-file-into-python-script
execfile(path_equity_precision_llm_repo + "/constants.py")


In [28]:
execfile(path_equity_precision_llm_repo + "/functions/prompt_generator.py")
execfile(path_equity_precision_llm_repo + "/functions/base_prompt_append.py")

base_prompt_files = ['p1v6', 'p2v6', 'p3v6','p4v6']
base_prompts = base_prompt_append(base_prompt_files)

### 1. Preparing Your Batch File

https://platform.openai.com/docs/guides/batch

.jsonl file where each line contains the details of an individual request to the API

Each request must include a custom_id value

{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}


{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}

In [None]:
pmid_list = pd.read_excel(excel_path, sheet_name='Sheet1')['PMID'].tolist()

json_list_test = []

for index, pmid in enumerate(pmid_list):
    prompt_pmid = prompt_generator(pmid, base_prompts, excel_path, sheet_name='Sheet1')
    dict_pmid_test = {"custom_id": str(index) + "_" + str(pmid), 
                 "method": "POST", 
                 "url": "/v1/chat/completions", 
                 "body": {"model": "gpt-4o-2024-08-06", 
                          "messages": [ {"role": "system", "content": base_prompts[0]},
                                        {"role": "system", "content": base_prompts[1]},
                                        {"role": "user", "content": prompt_pmid},
                                        {"role": "user", "content": base_prompts[3]}],
                            "max_tokens": 2000,
                            "frequency_penalty": -0.2,
                            "top_p": 0.8,
                            "presence_penalty": -0.3

                        }
                }

    json_list_test.append(dict_pmid_test)



### 2. Split the Test Dataset 

In [30]:
# Remove everything inside the folder 'Test Data Splits'


for filename in os.listdir(path_equity_precision_llm_folder + "/llm training/Test Data Splits"):
    os.remove(path_equity_precision_llm_folder + "/llm training/Test Data Splits" +"/" + filename)

# Number of splits
n_json_splits = 4


In [None]:
import json

# Split the JSON data
split_index = len(json_list_test) // n_json_splits # n_json_splits to be specified above

for i in range(n_json_splits):
    part = json_list_test[i*split_index:(i+1)*split_index]
    with(open(path_equity_precision_llm_folder + '/llm training/Test Data Splits/' + f"Test_part{i+1}.jsonl", 'w')) as f:
        for entry in part:
            json_line = json.dumps(entry)
            f.write(json_line + '\n')
    print(f"JSON file saved successfully to OneDrive folder:" + f"Test_part{i+1}.jsonl")




JSON file saved successfully to OneDrive folder:Test Scenario 4_part1.jsonl
JSON file saved successfully to OneDrive folder:Test Scenario 4_part2.jsonl
JSON file saved successfully to OneDrive folder:Test Scenario 4_part3.jsonl
JSON file saved successfully to OneDrive folder:Test Scenario 4_part4.jsonl


### 3. Uploading Your Batch Input File

In [None]:
client = OpenAI(api_key= api_key_epl_shared)

inputs_file_path = path_equity_precision_llm_repo + "/Inputs.txt"

if not os.path.exists(inputs_file_path):
    with open(inputs_file_path, "w") as f:
        pass


with open(inputs_file_path, "a") as f:
    f.write("\n")
    current_time = datetime.datetime.now()
    f.write(f"# {current_time}")
    f.write("\n")

for i in range(n_json_splits):
    batch_input_file_test_partI = client.files.create(
      file = open(path_equity_precision_llm_folder + '/llm training/Test Data Splits/' + f"Test_part{i+1}.jsonl","rb"),
      purpose="batch"
    )
    
    with open(inputs_file_path, "a") as f:
      f.write(f"batch_input_file_test_part{i+1} = '{batch_input_file_test_partI.id}'\n")
    









### 4. Creating the Batch

Navigate to Inputs.txt and enter each batch_input_file_test_partXX_id at a time in the next code chunk.     
For example:    
batch_input_file_test_partI_id = "file-GNgYAgwqibtGtLZ7tDoHYr"
i = 1

In [None]:
# Enter the batch_input_file_test_partI_id from the Inputs.txt file
batch_input_file_test_partI_id = "file-GNgYAgwqibtGtLZ7tDoHYr"
part = 1 # This needs to be changed to the part number


print(batch_input_file_test_partI_id)
batch_created_test_partI = client.batches.create(
    input_file_id=batch_input_file_test_partI_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "test data for PMID query - scenario 4 part " + str(part)
    }
)

batches_file_path = path_equity_precision_llm_repo + "/Batches.txt"

if not os.path.exists(batches_file_path):
    with open(batches_file_path, "w") as f:
        pass

with open(batches_file_path, "a") as f:
    f.write("\n")
    current_time = datetime.datetime.now()
    f.write(f"# {current_time}\n")
    f.write(f"batch_created_test_part{part}_id = '{batch_created_test_partI.id}'\n")


file-GNgYAgwqibtGtLZ7tDoHYr


### 5. Checking the Status of a Batch

Navigate to Batches.txt and enter the bacth_Created_test_partI_id for each part.    
Example:    
batch_created_test_partI_id = 'file-GNgYAgwqibtGtLZ7tDoHYr'    
part = 1 # This needs to be changed to the part number

In [None]:
# batch_created_test_partI_id = batch_created_test_partI.id
batch_created_test_partI_id = 'file-GNgYAgwqibtGtLZ7tDoHYr'
part = 1 # This needs to be changed to the part number
client = OpenAI(api_key= api_key_epl_shared)


batch_status_test_partI = client.batches.retrieve(batch_created_test_partI_id)

print("Scenario 4: " + batch_status_test_partI.status)




ValueError: Expected a non-empty value for `batch_id` but received ''

### 5. Retrieving the Results

In [None]:
client = OpenAI(api_key= api_key_epl_shared)

file_response_test_partI = client.files.content(batch_status_test_partI.output_file_id)


In [None]:
execfile(path_equity_precision_llm_repo + "/functions/format_gpt_output.py")

results4_partI = format_gpt_output(file_response_test_partI)

results4_partI.to_csv(path_equity_precision_llm_folder + '\llm training\Test Data Splits\Test Part ' + str(i) +'_results.csv', index=False)


  df = pd.read_json(json_output.content.decode('utf-8'), lines=True)
  df = pd.read_json(json_output.content.decode('utf-8'), lines=True)
  df = pd.read_json(json_output.content.decode('utf-8'), lines=True)
  df = pd.read_json(json_output.content.decode('utf-8'), lines=True)
