In [26]:
import pandas as pd 
import numpy as np
import sklearn as sklearn
import os as os

import matplotlib.pyplot as plt
import seaborn as sns
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import re as re

# Resolve the machine-specific project locations from the OS login name.
# Two locations are needed: the shared OneDrive project folder (the Excel
# workbooks and result CSVs are read from it) and the local checkout of the
# repository (the helper scripts under functions/ are loaded from it).
current_login = os.getlogin()
if current_login == "JVARGH7":
    path_equity_precision_llm_folder = "C:/Cloud/OneDrive - Emory University/Papers/Global Equity in Diabetes Precision Medicine LLM"
    path_equity_precision_llm_repo = 'C:/code/external/equity_precision_llm'

elif current_login == 'aamnasoniwala':
    path_equity_precision_llm_folder = "/Users/aamnasoniwala/Library/CloudStorage/OneDrive-Emory/Global Equity in Diabetes Precision Medicine LLM"
    path_equity_precision_llm_repo = '/Users/aamnasoniwala/code/equity_precision_llm'

else:
    # Fail here with an actionable message instead of a NameError further down
    # when the path variables turn out to be undefined.
    raise RuntimeError(
        "No project paths configured for login '" + current_login + "'. "
        "Add a branch that sets path_equity_precision_llm_folder and "
        "path_equity_precision_llm_repo for this user."
    )

excel_path = path_equity_precision_llm_folder + "/llm training/Methods.xlsx"

# execfile() is not a builtin in Python 3, so the helper script is read and
# exec'd explicitly. At the top level of a cell this runs the script in the
# notebook namespace; it is expected to define crosstab_summary(), which the
# later cells call.
crosstab_summary_path = path_equity_precision_llm_repo + "/functions/crosstab_summary.py"
with open(crosstab_summary_path, encoding="utf-8") as crosstab_summary_source:
    exec(compile(crosstab_summary_source.read(), crosstab_summary_path, "exec"))


In [27]:
# Excel workbooks for the training, development and test splits, all stored
# under "llm training" in the shared project folder.
excel_path_training = path_equity_precision_llm_folder + "/llm training/Methods.xlsx"
excel_path_development = path_equity_precision_llm_folder + "/llm training/Development Data.xlsx"
excel_path_test = path_equity_precision_llm_folder + "/llm training/Test Data.xlsx"

# Load the helper scripts from the repository into the notebook namespace.
# execfile() is not a builtin in Python 3, so each script is read and exec'd
# explicitly; compile() keeps the real file name in any traceback.
# The scripts are expected to define clean_input() and crosstab_summary(),
# which the following cells call.
for helper_module in ("clean_input", "crosstab_summary"):
    helper_path = path_equity_precision_llm_repo + "/functions/" + helper_module + ".py"
    with open(helper_path, encoding="utf-8") as helper_source:
        exec(compile(helper_source.read(), helper_path, "exec"))

In [28]:
# Read the three workbooks through clean_input(), one (workbook, sheet) pair
# per data split, in the order training -> development -> test.
input_training, input_development, input_test = (
    clean_input(input_path=workbook_path, sheet_name=workbook_sheet)
    for workbook_path, workbook_sheet in (
        (excel_path_training, 'Training Data'),
        (excel_path_development, 'Sheet1'),
        (excel_path_test, 'Sheet1'),
    )
)


### Running different scenarios for Training (1 to 4)

In [29]:
# Compare the GPT labels with the original labels for each training scenario
# (1 to 4) and stack the per-scenario summaries into one table.
scenario_summaries_training = []
for scenario in range(1, 5):
    results = pd.read_csv(path_equity_precision_llm_folder + '/llm training/Training Scenario ' + str(scenario) + '_results.csv')
    merged_df_training = input_training.merge(results, left_on='PMID', right_on='pmid', how='left')

    # A row is a match when the GPT label occurs in the original label at the
    # start of the string or directly after whitespace. The prefix is a raw
    # string so "\s" is not parsed as an (invalid) string escape, and the GPT
    # text is escaped so characters such as "(" or "+" are matched literally
    # instead of being interpreted as regex syntax.
    merged_df_training['source_population_match'] = merged_df_training.apply(
        lambda row: bool(re.search(r'(^|\s)' + re.escape(str(row['gpt_source_population'])),
                                   str(row['orig_source_population']))),
        axis=1)

    # Crosstab summaries for the three variables scored with crosstab_summary()
    summary_precision_medicine = crosstab_summary(merged_df_training, truth='orig_precision_medicine', test='gpt_precision_medicine')
    summary_diabetes = crosstab_summary(merged_df_training, truth='orig_diabetes', test='gpt_diabetes')
    summary_primary_study = crosstab_summary(merged_df_training, truth='orig_primary_study', test='gpt_primary_study')

    summary_precision_medicine['variable'] = 'Precision Medicine'
    summary_diabetes['variable'] = 'Diabetes'
    summary_primary_study['variable'] = 'Primary Study'

    # Rows: match flag (True/False); columns: original source population.
    t_source_population = pd.crosstab(merged_df_training['source_population_match'], merged_df_training['orig_source_population'])

    # .loc[True] raises KeyError when no row matched at all, so that case is
    # counted as zero correct instead.
    if True in t_source_population.index:
        n_correct_source_population = t_source_population.loc[True].sum()
    else:
        n_correct_source_population = 0
    prop_correct_source_population = n_correct_source_population / t_source_population.sum().sum()

    summary_source_population = pd.DataFrame({'variable': 'Source Population', 'Accuracy': prop_correct_source_population}, index=[0])

    df_summary = pd.concat([summary_precision_medicine, summary_diabetes, summary_primary_study, summary_source_population])
    df_summary['Scenario'] = 'Scenario' + str(scenario)
    scenario_summaries_training.append(df_summary)

# Concatenate once after the loop rather than growing a DataFrame per iteration.
combined_output_training = pd.concat(scenario_summaries_training, axis=0, ignore_index=True)

combined_output_training.to_csv(path_equity_precision_llm_repo + '/preprocessing/epl03_combined output_Training.csv', index=False)

### Running different scenarios for Development (1 to 4)

In [30]:
# Compare the GPT labels with the original labels for each development
# scenario (1 to 4) and stack the per-scenario summaries into one table.
scenario_summaries_development = []
for scenario in range(1, 5):
    results = pd.read_csv(path_equity_precision_llm_folder + '/llm training/Development Scenario ' + str(scenario) + '_results.csv')
    merged_df_development = input_development.merge(results, left_on='PMID', right_on='pmid', how='left')

    # A row is a match when the GPT label occurs in the original label at the
    # start of the string or directly after whitespace. The prefix is a raw
    # string so "\s" is not parsed as an (invalid) string escape, and the GPT
    # text is escaped so characters such as "(" or "+" are matched literally
    # instead of being interpreted as regex syntax.
    merged_df_development['source_population_match'] = merged_df_development.apply(
        lambda row: bool(re.search(r'(^|\s)' + re.escape(str(row['gpt_source_population'])),
                                   str(row['orig_source_population']))),
        axis=1)

    # Crosstab summaries for the three variables scored with crosstab_summary()
    summary_precision_medicine = crosstab_summary(merged_df_development, truth='orig_precision_medicine', test='gpt_precision_medicine')
    summary_diabetes = crosstab_summary(merged_df_development, truth='orig_diabetes', test='gpt_diabetes')
    summary_primary_study = crosstab_summary(merged_df_development, truth='orig_primary_study', test='gpt_primary_study')

    summary_precision_medicine['variable'] = 'Precision Medicine'
    summary_diabetes['variable'] = 'Diabetes'
    summary_primary_study['variable'] = 'Primary Study'

    # Rows: match flag (True/False); columns: original source population.
    # This must be built from the development merge: using merged_df_training
    # here would report the training accuracy under the development results.
    t_source_population = pd.crosstab(merged_df_development['source_population_match'], merged_df_development['orig_source_population'])

    # .loc[True] raises KeyError when no row matched at all, so that case is
    # counted as zero correct instead.
    if True in t_source_population.index:
        n_correct_source_population = t_source_population.loc[True].sum()
    else:
        n_correct_source_population = 0
    prop_correct_source_population = n_correct_source_population / t_source_population.sum().sum()

    summary_source_population = pd.DataFrame({'variable': 'Source Population', 'Accuracy': prop_correct_source_population}, index=[0])

    df_summary = pd.concat([summary_precision_medicine, summary_diabetes, summary_primary_study, summary_source_population])
    df_summary['Scenario'] = 'Scenario' + str(scenario)
    scenario_summaries_development.append(df_summary)

# Concatenate once after the loop rather than growing a DataFrame per iteration.
combined_output_development = pd.concat(scenario_summaries_development, axis=0, ignore_index=True)

combined_output_development.to_csv(path_equity_precision_llm_repo + '/preprocessing/epl03_combined output_Development.csv', index=False)

### Combining different outputs for Test (1 : n_json_splits)

In [None]:
# Stack the per-part GPT result files for the test set, then score them
# against the original labels in the test workbook.
# NOTE(review): n_json_splits is not defined in the cells visible here; it
# must be set to the number of "Test Part <k>_results.csv" files before this
# cell is run.
test_part_results = [
    pd.read_csv(path_equity_precision_llm_folder + '/llm training/Test Data Splits/Test Part ' + str(part) + '_results.csv')
    for part in range(1, n_json_splits + 1)
]
results = pd.concat(test_part_results)

# Merge onto the test input (not the development input) so the summary
# describes the test set.
merged_df_test = input_test.merge(results, left_on='PMID', right_on='pmid', how='left')

# A row is a match when the GPT label occurs in the original label at the
# start of the string or directly after whitespace. The prefix is a raw string
# so "\s" is not parsed as an (invalid) string escape, and the GPT text is
# escaped so characters such as "(" or "+" are matched literally instead of
# being interpreted as regex syntax.
merged_df_test['source_population_match'] = merged_df_test.apply(
    lambda row: bool(re.search(r'(^|\s)' + re.escape(str(row['gpt_source_population'])),
                               str(row['orig_source_population']))),
    axis=1)

# Crosstab summaries for the three variables scored with crosstab_summary()
summary_precision_medicine = crosstab_summary(merged_df_test, truth='orig_precision_medicine', test='gpt_precision_medicine')
summary_diabetes = crosstab_summary(merged_df_test, truth='orig_diabetes', test='gpt_diabetes')
summary_primary_study = crosstab_summary(merged_df_test, truth='orig_primary_study', test='gpt_primary_study')

summary_precision_medicine['variable'] = 'Precision Medicine'
summary_diabetes['variable'] = 'Diabetes'
summary_primary_study['variable'] = 'Primary Study'

# Rows: match flag (True/False); columns: original source population.
# This must be built from the test merge: using merged_df_training here would
# report the training accuracy under the test results.
t_source_population = pd.crosstab(merged_df_test['source_population_match'], merged_df_test['orig_source_population'])

# .loc[True] raises KeyError when no row matched at all, so that case is
# counted as zero correct instead.
if True in t_source_population.index:
    n_correct_source_population = t_source_population.loc[True].sum()
else:
    n_correct_source_population = 0
prop_correct_source_population = n_correct_source_population / t_source_population.sum().sum()

summary_source_population = pd.DataFrame({'variable': 'Source Population', 'Accuracy': prop_correct_source_population}, index=[0])

df_summary = pd.concat([summary_precision_medicine, summary_diabetes, summary_primary_study, summary_source_population])

df_summary.to_csv(path_equity_precision_llm_repo + '/preprocessing/epl03_combined output_Test.csv', index=False)