### How to access the OpenAI application programming interface

In your command prompt window:
>> pip install openai

https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety



In [46]:
import pandas as pd 
import numpy as np
import sklearn as sklearn
import os as os
import getpass

import matplotlib.pyplot as plt
import seaborn as sns
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import re as re
import datetime as datetime
# Ensure all necessary packages are installed
%pip install kneed

user = getpass.getuser()

if user == "JVARGH7":
    path_equity_precision_llm_folder = "C:/Cloud/OneDrive - Emory University/Papers/Global Equity in Diabetes Precision Medicine LLM"
    path_equity_precision_llm_repo =  'C:/code/external/equity_precision_llm'

elif user == "aamnasoniwala":
    path_equity_precision_llm_folder = "/Users/aamnasoniwala/Library/CloudStorage/OneDrive-Emory/Global Equity in Diabetes Precision Medicine LLM"
    path_equity_precision_llm_repo = '/Users/aamnasoniwala/code/equity_precision_llm'

else:
    raise ValueError("Unrecognized user")

excel_path = path_equity_precision_llm_folder + "/llm training/Aamna Datasets/Test Data 1.xlsx"
# path_equity_precision_llm_repo = os.path.abspath("").replace("preprocessing", "")


Note: you may need to restart the kernel to use updated packages.


In [47]:
api_key_epl_shared = ""

from openai import OpenAI
# https://stackoverflow.com/questions/36959031/how-to-source-file-into-python-script
execfile(path_equity_precision_llm_repo + "/constants.py")


In [48]:
execfile(path_equity_precision_llm_repo + "/functions/prompt_generator.py")
execfile(path_equity_precision_llm_repo + "/functions/base_prompt_append.py")

base_prompt_files = ['p1v6', 'p2v6', 'p3v6','p4v6']
base_prompts = base_prompt_append(base_prompt_files)

prompt_pmid = prompt_generator(23680249, base_prompts, excel_path, sheet_name = 'Sheet1')


print(base_prompts[0])
print(base_prompts[1])
print(prompt_pmid)
print(base_prompts[3])

I am going to outline inclusion criteria for three categories: diabetes, precision medicine, and primary study. 

Please wait for me to prompt you on what to do based on these criteria.  
Here are the inclusion criteria:  

DIABETES: Do not exclude any type of diabetes or prediabetes. The presence of certain conditions or risk factors may not definitively confirm that the study is related to diabetes or prediabetes, unless there is a clear link to diabetes pathophysiology, diagnosis, or management. 

PRECISION MEDICINE: Precision medicine is an assessment of genetic or metabolic state to guide preventive and therapeutic decisions in humans. Exclude epidemiological studies using traditional biomarkers only, focusing on omics (genomics, metabolomics, proteomics, lipidomics etc.) or multi-omics studies. 

PRIMARY STUDY: All types of studies, other than meta-analysis and systematic reviews, were evaluated as primary studies. Although it may bias the studies towards counting contributions o

### 1. Preparing Your Batch File

https://platform.openai.com/docs/guides/batch

.jsonl file where each line contains the details of an individual request to the API

Each request must include a custom_id value

{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}


{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}

In [49]:
pmid_list = pd.read_excel(excel_path, sheet_name='Sheet1')['PMID'].tolist()

json_list_scenario4 = []

for index, pmid in enumerate(pmid_list):
    dict_pmid_scenario4 = {"custom_id": str(index) + "_" + str(pmid), 
                 "method": "POST", 
                 "url": "/v1/chat/completions", 
                 "body": {"model": "gpt-4o-2024-08-06", 
                          "messages": [ {"role": "system", "content": base_prompts[0]},
                                        {"role": "system", "content": base_prompts[1]},
                                        {"role": "user", "content": prompt_pmid},
                                        {"role": "user", "content": base_prompts[3]}],
                            "max_tokens": 2000,
                            "frequency_penalty": -0.2,
                            "top_p": 0.8,
                            "presence_penalty": -0.3

                        }
                }

    json_list_scenario4.append(dict_pmid_scenario4)

In [50]:
# other option i tried to bypass token limit ********

import json
import pandas as pd

json_list_scenario4 = []

# Generate JSON objects
for index, pmid in enumerate(pmid_list):
    dict_pmid_scenario4 = {
        "custom_id": f"{index}_{pmid}", 
        "method": "POST", 
        "url": "/v1/chat/completions", 
        "body": {
            "model": "gpt-4o-2024-08-06", 
            "messages": [
                {"role": "system", "content": base_prompts[0]},
                {"role": "system", "content": base_prompts[1]},
                {"role": "user", "content": prompt_pmid},
                {"role": "user", "content": base_prompts[3]}
            ],
            "max_tokens": 2000,
            "frequency_penalty": -0.2,
            "top_p": 0.8,
            "presence_penalty": -0.3
        }
    }
    json_list_scenario4.append(dict_pmid_scenario4)

# Split the JSON data
mid_index = len(json_list_scenario4) // 2
part1 = json_list_scenario4[:mid_index]
part2 = json_list_scenario4[mid_index:]

# Define the OneDrive folder path
onedrive_path = "/Users/aamnasoniwala/Library/CloudStorage/OneDrive-Emory/Global Equity in Diabetes Precision Medicine LLM/llm training/Aamna Datasets"

# Ensure the directory exists
os.makedirs(onedrive_path, exist_ok=True)

# Save each part as separate JSON files in OneDrive
file1_path = os.path.join(onedrive_path, "output_part1.json")
file2_path = os.path.join(onedrive_path, "output_part2.json")

with open(file1_path, "w") as f1:
    json.dump(part1, f1, indent=4)

with open(file2_path, "w") as f2:
    json.dump(part2, f2, indent=4)

print(f"JSON files saved successfully to OneDrive folder:\n{file1_path}\n{file2_path}")

JSON files saved successfully to OneDrive folder:
/Users/aamnasoniwala/Library/CloudStorage/OneDrive-Emory/Global Equity in Diabetes Precision Medicine LLM/llm training/Aamna Datasets/output_part1.json
/Users/aamnasoniwala/Library/CloudStorage/OneDrive-Emory/Global Equity in Diabetes Precision Medicine LLM/llm training/Aamna Datasets/output_part2.json


In [51]:
import json

with open(path_equity_precision_llm_folder + '\llm training\Test 1 Scenario 4.jsonl', 'w') as outfile:
    for entry in json_list_scenario4:
        json_line = json.dumps(entry)
        outfile.write(json_line + '\n')

  with open(path_equity_precision_llm_folder + '\llm training\Test 1 Scenario 4.jsonl', 'w') as outfile:


### 2. Uploading Your Batch Input File

In [52]:
client = OpenAI(api_key= api_key_epl_shared)

batch_input_file_scenario4 = client.files.create(
  file=open(path_equity_precision_llm_folder + '\llm training\Test 1 Scenario 4.jsonl', "rb"),
  purpose="batch"
)



  file=open(path_equity_precision_llm_folder + '\llm training\Test 1 Scenario 4.jsonl', "rb"),


### 3. Creating the Batch

In [42]:
batch_input_file_scenario4_id = batch_input_file_scenario4.id
print(batch_input_file_scenario4_id)
batch_created_scenario4 = client.batches.create(
    input_file_id=batch_input_file_scenario4_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "test 1 data for PMID query - scenario 4"
    }
)

file-UdZ1AUr1LdQ3UQfQGikV5y


In [43]:
batches_file_path = path_equity_precision_llm_repo + "/Batches.txt"

if not os.path.exists(batches_file_path):
    with open(batches_file_path, "w") as f:
        pass

with open(batches_file_path, "a") as f:
    f.write("\n")
    current_time = datetime.datetime.now()
    f.write(f"# {current_time}\n")
    f.write(f"batch_created_scenario4_id = '{batch_created_scenario4.id}'\n")




In [44]:
batch_created_scenario4_id = batch_created_scenario4.id

### 4. Checking the Status of a Batch

In [45]:
# batch_created_id = 'batch_675d9dc3f25881909544964060c659c3'
client = OpenAI(api_key= api_key_epl_shared)


batch_status_scenario4 = client.batches.retrieve(batch_created_scenario4_id)

print("Scenario 4: " + batch_status_scenario4.status)




Scenario 4: failed


### 5. Retrieving the Results

In [None]:
client = OpenAI(api_key= api_key_epl_shared)

file_response_scenario4 = client.files.content(batch_status_scenario4.output_file_id)


In [None]:
execfile(path_equity_precision_llm_repo + "/functions/format_gpt_output.py")

results4 = format_gpt_output(file_response_scenario4)

results4.to_csv(path_equity_precision_llm_folder + '\llm training\Test 1 Scenario 4_results.csv', index=False)


  df = pd.read_json(json_output.content.decode('utf-8'), lines=True)
  df = pd.read_json(json_output.content.decode('utf-8'), lines=True)
  df = pd.read_json(json_output.content.decode('utf-8'), lines=True)
  df = pd.read_json(json_output.content.decode('utf-8'), lines=True)


In [26]:
results4.head()

Unnamed: 0,pmid,title,gpt_precision_medicine,gpt_diabetes,gpt_primary_study,gpt_source_population
1,22744164,Acculturation and glycemic control of Asian I...,no,yes,yes,sa
1,15561964,Linkage analysis of diabetes status among hyp...,yes,yes,yes,na
1,28770629,"Ipragliflozin, a sodium glucose co-transporte...",no,yes,yes,ea
1,33764184,Gastrodin protects against high glucose-induc...,yes,yes,yes,not applicable
1,36155119,Curcumin supplementation reduces blood glucos...,no,yes,yes,lac
