In [2]:
import pandas as pd 
import numpy as np
import sklearn as sklearn
import os as os
import getpass

import matplotlib.pyplot as plt
import seaborn as sns
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import re as re
import datetime as datetime
# Ensure all necessary packages are installed
%pip install kneed

# Resolve the machine-specific project locations from the OS login name.
# Each known login maps to (shared OneDrive project folder, local repo checkout).
user = getpass.getuser()

_known_user_paths = {
    "JVARGH7": (
        "C:/Cloud/OneDrive - Emory University/Papers/Global Equity in Diabetes Precision Medicine LLM",
        'C:/code/external/equity_precision_llm',
    ),
    "aamnasoniwala": (
        "/Users/aamnasoniwala/Library/CloudStorage/OneDrive-Emory/Global Equity in Diabetes Precision Medicine LLM",
        '/Users/aamnasoniwala/code/equity_precision_llm',
    ),
}

# Fail fast for anyone who has not registered their paths above.
if user not in _known_user_paths:
    raise ValueError("Unrecognized user")

path_equity_precision_llm_folder, path_equity_precision_llm_repo = _known_user_paths[user]

# Input table of unattributable records (a CSV, despite the variable name).
excel_path = f"{path_equity_precision_llm_folder}/llm training/epldat03_Unattributable Data.csv"
# path_equity_precision_llm_repo = os.path.abspath("").replace("preprocessing", "")


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# API key handed to the OpenAI client in later cells. It is taken from the
# OPENAI_API_KEY environment variable so the secret does not have to be pasted
# into (and saved with) the notebook; when the variable is unset this falls
# back to the original empty-string placeholder.
api_key_epl_shared = os.environ.get("OPENAI_API_KEY", "")

from openai import OpenAI

# Run constants.py inside the notebook namespace so that whatever it defines
# becomes available to the following cells.
# `execfile` was removed in Python 3 and only exists when a tool injects it
# (presumably the IDE/debugger in the original environment), so the file is
# compiled and exec'd explicitly instead. Reading bytes lets `compile` honour
# any encoding declaration in the file.
# https://stackoverflow.com/questions/36959031/how-to-source-file-into-python-script
_constants_path = path_equity_precision_llm_repo + "/constants.py"
with open(_constants_path, "rb") as _source_file:
    exec(compile(_source_file.read(), _constants_path, "exec"), globals())

In [12]:
# Load the prompt helper scripts into the notebook namespace.
# `execfile` is not a Python 3 builtin (it is only present when a tool injects
# it), so each script is compiled and exec'd explicitly; passing the real file
# path to `compile` keeps tracebacks pointing at the right file.
for _helper_file in ("prompt_generator.py", "base_prompt_append.py"):
    _helper_path = path_equity_precision_llm_repo + "/functions/" + _helper_file
    with open(_helper_path, "rb") as _source_file:
        exec(compile(_source_file.read(), _helper_path, "exec"), globals())

# Names of the base prompt pieces, in the order they are indexed later
# (base_prompts[0] .. base_prompts[3]).
base_prompt_files = ['p1v6', 'p2v6', 'p3v6','p4v6']
# base_prompt_append comes from base_prompt_append.py loaded above;
# presumably it returns one prompt text per name -- confirm against the helper.
base_prompts = base_prompt_append(base_prompt_files)

In [None]:
# Build one Batch API request (a dict destined for a .jsonl line) per PMID.

# Step 1: Read the input table exactly once, choosing the reader from the
# file extension. The PMID list is derived from this same frame, so Excel
# inputs work too and the PMIDs can never drift from the rows handed to the
# prompt generator.
if excel_path.endswith(('.xlsx', '.xls')):
    df = pd.read_excel(excel_path, sheet_name= "Sheet1")
elif excel_path.endswith('.csv'):
    df = pd.read_csv(excel_path,sep=',')
else:
    raise ValueError("Unsupported file format. Please provide an Excel or CSV file.")

pmid_list = df['PMID'].tolist()

# Step 2: Assemble the request bodies.
json_list_unattributable = []

for index, pmid in enumerate(pmid_list):
    print(pmid)
    prompt_pmid = prompt_generator_v2(pmid, base_prompts, df)
    # custom_id prefixes the row position so repeated PMIDs stay distinguishable.
    # NOTE(review): base_prompts[2] is not sent as its own message; presumably
    # it is consumed inside prompt_generator_v2 -- confirm.
    dict_pmid_unattributable = {"custom_id": str(index) + "_" + str(pmid),
                 "method": "POST",
                 "url": "/v1/chat/completions",
                 "body": {"model": "gpt-4o-2024-08-06",
                          "messages": [ {"role": "system", "content": base_prompts[0]},
                                        {"role": "system", "content": base_prompts[1]},
                                        {"role": "user", "content": prompt_pmid},
                                        {"role": "user", "content": base_prompts[3]}],
                            "max_tokens": 2000,
                            "frequency_penalty": -0.2,
                            "top_p": 0.8,
                            "presence_penalty": -0.3

                        }
                }

    json_list_unattributable.append(dict_pmid_unattributable)

In [None]:
# Empty the 'Unclassified Splits' folder so files from a previous run cannot
# be mixed up with the splits written next.
# NOTE: every file in the folder is deleted, including any '*_results.csv'
# files that were saved there -- move those elsewhere first if they are needed.
unclassified_splits_dir = path_equity_precision_llm_folder + "/llm training/Unclassified Splits"

# Create the folder on a first run instead of failing in os.listdir.
os.makedirs(unclassified_splits_dir, exist_ok=True)

for filename in os.listdir(unclassified_splits_dir):
    entry_path = unclassified_splits_dir + "/" + filename
    # os.remove cannot delete directories; leave any sub-folder untouched.
    if not os.path.isdir(entry_path):
        os.remove(entry_path)

# Number of splits
n_json_splits = 10

In [None]:
import json

# Split the request list into n_json_splits .jsonl files (one JSON object per
# line). n_json_splits is specified in an earlier cell.
# The list length is rarely an exact multiple of n_json_splits, so the
# remainder is spread over the first parts (one extra request each); plain
# floor division would silently drop the trailing requests.
split_index, split_remainder = divmod(len(json_list_unattributable), n_json_splits)

part_start = 0
for i in range(n_json_splits):
    part_stop = part_start + split_index + (1 if i < split_remainder else 0)
    part = json_list_unattributable[part_start:part_stop]
    part_start = part_stop

    # All n_json_splits files are always written (possibly empty) so that the
    # file names Unattributable_part1 .. part{n_json_splits} always exist.
    with open(path_equity_precision_llm_folder + '/llm training/Unclassified Splits/' + f"Unattributable_part{i+1}.jsonl", 'w') as f:
        for entry in part:
            f.write(json.dumps(entry) + '\n')
    print("JSON file saved successfully to OneDrive folder:" + f"Unattributable_part{i+1}.jsonl")




In [None]:
# Upload every split file to OpenAI (purpose="batch") and log the returned
# file ids to Inputs.txt in the repo so they can be pasted into later cells.
client = OpenAI(api_key= api_key_epl_shared)

inputs_file_path = path_equity_precision_llm_repo + "/Inputs.txt"

# Append mode creates Inputs.txt when it does not exist yet. Each run starts
# with a blank line and a timestamp comment so runs can be told apart.
with open(inputs_file_path, "a") as f:
    current_time = datetime.datetime.now()
    f.write(f"\n# {current_time}\n")

for i in range(n_json_splits):
    split_file_path = path_equity_precision_llm_folder + '/llm training/Unclassified Splits/' + f"Unattributable_part{i+1}.jsonl"

    # Open the split in a context manager so the handle is closed once the
    # upload call returns, rather than leaking one open file per part.
    # The variable keeps its historical "part1" name but is rebound to the
    # current part's upload on every iteration.
    with open(split_file_path, "rb") as split_file:
        batch_input_file_unattributable_part1 = client.files.create(
          file = split_file,
          purpose="batch"
        )

    # Written immediately so ids of completed uploads survive a later failure.
    with open(inputs_file_path, "a") as f:
      f.write(f"batch_input_file_unattributable_part{i+1} = '{batch_input_file_unattributable_part1.id}'\n")
    








In [None]:
# Create one batch job for a single uploaded split and log its id.
# Enter the batch_input_file_unattributable_part{N} id from the Inputs.txt file
batch_input_file_unattributable_part1_id = "file-MEH4Ly2TgTfoVRs283XvZJ"
part = 1 # This needs to be changed to the part number


print(batch_input_file_unattributable_part1_id)
batch_created_unattributable_part1 = client.batches.create(
    input_file_id=batch_input_file_unattributable_part1_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      # The part number is appended directly after "part " so the description
      # reads e.g. "... scenario 4 part 1".
      "description": "unattributable data for PMID query - scenario 4 part " + str(part)
    }
)

batches_file_path = path_equity_precision_llm_repo + "/Batches.txt"

# Append mode creates Batches.txt when it does not exist yet. Each entry is a
# blank line, a timestamp comment, and the batch id labelled with `part`.
with open(batches_file_path, "a") as f:
    current_time = datetime.datetime.now()
    f.write(f"\n# {current_time}\n")
    f.write(f"batch_created_unattributable_part{part}_id = '{batch_created_unattributable_part1.id}'\n")


In [None]:
# Poll a single batch job and print its current status.
# Paste the batch id to check (as logged in Batches.txt) below.
# NOTE(review): the id here matches the one labelled part 2 in the download
# cell while `part` says 1 -- confirm which part is meant before relying on
# batch_status_unattributable_part1 elsewhere.
client = OpenAI(api_key= api_key_epl_shared)

part = 1 # This needs to be changed to the part number
batch_created_unattributable_part1_id = 'batch_67b679736cb88190826795d4d28b08fd'

batch_status_unattributable_part1 = client.batches.retrieve(
    batch_created_unattributable_part1_id
)

_current_status = batch_status_unattributable_part1.status
print("Scenario 4: " + _current_status)


In [None]:
# Download the output file of each finished batch job.
# Batch ids as logged in Batches.txt, one per part.
batch_created_unattributable_part1_id = 'batch_67b66cf9833c8190a3e67e081f9298c9'
batch_created_unattributable_part2_id = 'batch_67b679736cb88190826795d4d28b08fd'
batch_created_unattributable_part3_id = 'batch_67b6802fec0481908c84063ba49399fc'
batch_created_unattributable_part4_id = 'batch_67b682351dc081909e053ce05d48278e'

client = OpenAI(api_key= api_key_epl_shared)

# Retrieve every batch from the ids above, in this cell, so each part's output
# is guaranteed to come from its own batch and the cell does not depend on
# status objects left over from running other cells.
batch_status_unattributable_part1 = client.batches.retrieve(batch_created_unattributable_part1_id)
batch_status_unattributable_part2 = client.batches.retrieve(batch_created_unattributable_part2_id)
batch_status_unattributable_part3 = client.batches.retrieve(batch_created_unattributable_part3_id)
batch_status_unattributable_part4 = client.batches.retrieve(batch_created_unattributable_part4_id)

# A batch without an output file id cannot be downloaded; stop with a message
# that names the part and its current status.
for _part_number, _batch_status in enumerate(
    (
        batch_status_unattributable_part1,
        batch_status_unattributable_part2,
        batch_status_unattributable_part3,
        batch_status_unattributable_part4,
    ),
    start=1,
):
    if not _batch_status.output_file_id:
        raise ValueError(
            f"Batch for part {_part_number} has no output file yet (status: {_batch_status.status})"
        )

file_response_unattributable_part1 = client.files.content(batch_status_unattributable_part1.output_file_id)
file_response_unattributable_part2 = client.files.content(batch_status_unattributable_part2.output_file_id)
file_response_unattributable_part3 = client.files.content(batch_status_unattributable_part3.output_file_id)
file_response_unattributable_part4 = client.files.content(batch_status_unattributable_part4.output_file_id)


In [None]:
# Load format_gpt_output into the notebook namespace.
# `execfile` is not a Python 3 builtin (it is only present when a tool injects
# it), so the script is compiled and exec'd explicitly.
_format_script_path = path_equity_precision_llm_repo + "/functions/format_gpt_output.py"
with open(_format_script_path, "rb") as _source_file:
    exec(compile(_source_file.read(), _format_script_path, "exec"), globals())

# Convert each downloaded batch response with format_gpt_output; the results
# are saved with `.to_csv` below.
results4_part1 = format_gpt_output(file_response_unattributable_part1)
results4_part2 = format_gpt_output(file_response_unattributable_part2)
results4_part3 = format_gpt_output(file_response_unattributable_part3)
results4_part4 = format_gpt_output(file_response_unattributable_part4)

# Save one CSV per part as 'Unattributable Part <n>_results.csv' in the
# 'Unclassified Splits' folder.
_results_prefix = path_equity_precision_llm_folder + '/llm training/Unclassified Splits/Unattributable Part '
for _part_number, _results in enumerate(
    (results4_part1, results4_part2, results4_part3, results4_part4), start=1
):
    _results.to_csv(_results_prefix + str(_part_number) + '_results.csv', index=False)
