### How to access the OpenAI application programming interface

In your command prompt window:
>> pip install openai

https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety



In [6]:
import pandas as pd 
import numpy as np
import sklearn as sklearn
import os as os
import getpass

import matplotlib.pyplot as plt
import seaborn as sns
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import re as re

# Ensure all necessary packages are installed
%pip install kneed

user = getpass.getuser()

if user == "JVARGH7":
    path_equity_precision_llm_folder = "C:/Cloud/OneDrive - Emory University/Papers/Global Equity in Diabetes Precision Medicine LLM"
    path_equity_precision_llm_repo =  'C:/code/external/equity_precision_llm'

elif user == "aamnasoniwala":
    path_equity_precision_llm_folder = "/Users/aamnasoniwala/Library/CloudStorage/OneDrive-Emory/Global Equity in Diabetes Precision Medicine LLM"
    path_equity_precision_llm_repo = '/Users/aamnasoniwala/Desktop/equity_precision_llm'

else:
    raise ValueError("Unrecognized user")

excel_path = path_equity_precision_llm_folder + "/llm training/Methods.xlsx"
# path_equity_precision_llm_repo = os.path.abspath("").replace("preprocessing", "")


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
api_key_epl_shared = ""

from openai import OpenAI
# https://stackoverflow.com/questions/36959031/how-to-source-file-into-python-script
execfile(path_equity_precision_llm_repo + "/constants.py")


In [9]:
execfile(path_equity_precision_llm_repo + "/functions/prompt_generator.py")
execfile(path_equity_precision_llm_repo + "/functions/base_prompt_append.py")


base_prompt_files = ['p1v6', 'p2v6', 'p3v6','p4v6']
base_prompts = base_prompt_append(base_prompt_files)

prompt_pmid = prompt_generator(22744164, base_prompts, excel_path)


print(base_prompts[0])
print(base_prompts[1])
print(prompt_pmid)
print(base_prompts[3])

I am going to outline inclusion criteria for three categories: diabetes, precision medicine, and primary study. 

Please wait for me to prompt you on what to do based on these criteria.  
Here are the inclusion criteria:  

DIABETES: Do not exclude any type of diabetes or prediabetes. The presence of certain conditions or risk factors may not definitively confirm that the study is related to diabetes or prediabetes, unless there is a clear link to diabetes pathophysiology, diagnosis, or management. 

PRECISION MEDICINE: Precision medicine is an assessment of genetic or metabolic state to guide preventive and therapeutic decisions in humans. Exclude epidemiological studies using traditional biomarkers only, focusing on omics (genomics, metabolomics, proteomics, lipidomics etc.) or multi-omics studies. 

PRIMARY STUDY: All types of studies, other than meta-analysis and systematic reviews, were evaluated as primary studies. Although it may bias the studies towards counting contributions o

In [16]:
# https://platform.openai.com/docs/api-reference/chat/create?lang=python

client = OpenAI(api_key= api_key_epl_shared)

completion = client.chat.completions.create(

    model="gpt-4o-mini-2024-07-18",

    messages=[

        {"role": "system", "content": base_prompts[0]},
        {"role": "system", "content": base_prompts[1]},
        {"role": "user", "content": prompt_pmid},
        {"role": "user", "content": base_prompts[3]}

    ],
    temperature= 0.2,
    max_tokens = 1000

)




In [17]:
print(completion.choices[0].message)

ChatCompletionMessage(content='Here is the analysis of the abstract organized in a table format:\n\n| PMID      | Title                                                       | Precision Medicine | Diabetes | Primary Study | Source Population |\n|-----------|-------------------------------------------------------------|--------------------|---------|---------------|-------------------|\n| 22744164  | Acculturation and glycemic control of Asian Indian adults with type 2 diabetes | No                 | Yes     | Yes           | SA                |\n\n### Justifications:\n- **Precision Medicine**: The study does not focus on genetic or metabolic assessments to guide therapeutic decisions but rather examines the association between acculturation and glycemic control. Therefore, it does not fulfill the precision medicine criteria (No).\n  \n- **Diabetes**: The study explicitly involves participants with type 2 diabetes and discusses glycemic control, which is directly related to diabetes man

### 1. Preparing Your Batch File

https://platform.openai.com/docs/guides/batch

.jsonl file where each line contains the details of an individual request to the API

Each request must include a custom_id value

{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}


{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}

In [153]:
pmid_list = pd.read_excel(excel_path, sheet_name='Training Data')['PMID'].tolist()

json_list = []

for index, pmid in enumerate(pmid_list):
    prompt_pmid = prompt_generator(pmid, base_prompts, excel_path)
    dict_pmid = {"custom_id": str(index) + "_" + str(pmid), 
                 "method": "POST", 
                 "url": "/v1/chat/completions", 
                 "body": {"model": "gpt-4o-mini-2024-07-18", 
                          "messages": [ {"role": "system", "content": base_prompts[0]},
                                        {"role": "system", "content": base_prompts[1]},
                                        {"role": "user", "content": prompt_pmid},
                                        {"role": "user", "content": base_prompts[3]}],
                            "max_tokens": 1000,
                            "temperature": 0.2
                        }
                }

    json_list.append(dict_pmid)



In [154]:
import json
with open(path_equity_precision_llm_folder + '\llm training\Training.jsonl', 'w') as outfile:
    for entry in json_list:
        json_line = json.dumps(entry)
        outfile.write(json_line + '\n')

### 2. Uploading Your Batch Input File

In [8]:
client = OpenAI(api_key= api_key_epl_shared)

batch_input_file = client.files.create(
  file=open(path_equity_precision_llm_folder + '\llm training\Training.jsonl', "rb"),
  purpose="batch"
)



### 3. Creating the Batch

In [9]:
batch_input_file_id = batch_input_file.id
print(batch_input_file_id)
batch_created = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "training data for PMID query"
    }
)



file-Qq6iRkY18WUe2SVNNhQkwg


In [14]:
batches_file_path = path_equity_precision_llm_repo + "/Batches.txt"

if not os.path.exists(batches_file_path):
    with open(batches_file_path, "w") as f:
        pass

with open(batches_file_path, "a") as f:
    f.write("\n")
    current_time = datetime.datetime.now()
    f.write(f"# {current_time}\n")
    f.write(f"batch_created_id = '{batch_created.id}'\n")

batch_created_id = batch_created.id


### 4. Checking the Status of a Batch

In [10]:
# batch_created_id = 'batch_675d9dc3f25881909544964060c659c3'
client = OpenAI(api_key= api_key_epl_shared)

batch_status = client.batches.retrieve(batch_created_id)
batch_status.status

'completed'

### 5. Retrieving the Results

In [11]:
client = OpenAI(api_key= api_key_epl_shared)

file_response = client.files.content(batch_status.output_file_id)


In [12]:
import pandas as pd

df = pd.read_json(file_response.content.decode('utf-8'), lines=True)




  df = pd.read_json(file_response.content.decode('utf-8'), lines=True)


In [13]:
df.head()

Unnamed: 0,id,custom_id,response,error
0,batch_req_675da0e1d74481908c436550f9e0e03e,22744164,"{'status_code': 200, 'request_id': '6196abeb0f...",
1,batch_req_675da0e1e7f88190a48c04dd2fa1d620,115561964,"{'status_code': 200, 'request_id': 'de5f29cc0a...",
2,batch_req_675da0e1f9448190ac58f1499e3b1581,228770629,"{'status_code': 200, 'request_id': 'efc0d81bee...",
3,batch_req_675da0e20ffc819097a40762d8e30cf3,333764184,"{'status_code': 200, 'request_id': 'dc2244f3b5...",
4,batch_req_675da0e2274c8190a8da45f01519a53a,436155119,"{'status_code': 200, 'request_id': '7d2c5e378c...",


In [14]:
df['response'][3]['body']['choices'][0]['message']['content']

'Here\'s the analysis of the abstract in table format:\n\n| PMID      | Title                                                                                                  | Precision Medicine | Diabetes | Primary Study | Source Population |\n|-----------|--------------------------------------------------------------------------------------------------------|--------------------|---------|---------------|-------------------|\n| 33764184  | Gastrodin protects against high glucose-induced cardiomyocyte toxicity via GSK-3beta-mediated nuclear translocation of Nrf2 | Yes                | Yes     | Yes           | UNK               |\n\n### Justifications:\n- **Precision Medicine**: Classified as \'yes\' because the study investigates the effects of Gastrodin on cellular responses related to high glucose conditions, which aligns with precision medicine\'s assessment of metabolic states to guide treatment.\n- **Diabetes**: Classified as \'yes\' because the abstract explicitly mentions "Di

In [15]:

results = pd.DataFrame()
for index in range(len(df)):
    markdown_table = df['response'][index]['body']['choices'][0]['message']['content']
    out = pd.read_csv(pd.io.common.StringIO(markdown_table.split('\n\n')[1]), 
                      sep="|", skipinitialspace=False, 
                      skipfooter=0, engine='python',header=0)

    out.columns = out.columns.str.lower().str.strip().str.replace('**', '')
    results = pd.concat([results,out.iloc[[1]]])
    
results = results.filter(regex=r'^(?!unnamed)')


# Rename 'precision medicine' as 'precision_medicine'
results = results.rename(columns={'precision medicine': 'gpt_precision_medicine', 
                                  'diabetes': 'gpt_diabetes', 
                                  'primary study': 'gpt_primary_study', 
                                  'source population': 'gpt_source_population'})
results['pmid'] = results['pmid'].astype('int32')
# Convert to lower to match
results['gpt_precision_medicine'] = results['gpt_precision_medicine'].str.lower().str.strip()
results['gpt_diabetes'] = results['gpt_diabetes'].str.lower().str.strip()
results['gpt_primary_study'] = results['gpt_primary_study'].str.lower().str.strip()
results['gpt_source_population'] = results['gpt_source_population'].str.lower().str.strip()


results.head()

results.to_csv(path_equity_precision_llm_folder + '\llm training\Training_results.csv', index=False)
results.head()


Unnamed: 0,pmid,title,gpt_precision_medicine,gpt_diabetes,gpt_primary_study,gpt_source_population
1,22744164,Acculturation and glycemic control of Asian I...,no,yes,yes,sa
1,15561964,Linkage analysis of diabetes status among hyp...,yes,yes,yes,na
1,28770629,"Ipragliflozin, a sodium glucose co-transporte...",no,yes,yes,ea
1,33764184,Gastrodin protects against high glucose-induc...,yes,yes,yes,unk
1,36155119,Curcumin supplementation reduces blood glucos...,no,yes,yes,lac


In [16]:

print(results['gpt_precision_medicine'].unique())


['no' 'yes']
