In [1]:
"""
    This script processes the validation response JSON file generated by validate_response.py.
    As a reminder, the generated JSON file is a list of dictionaries, where each dictionary is a response from a verbal autopsy record generated by a language model.
    Each record is run 10 times, and the results are aggregated into a single dictionary.
    This code will loads the JSON file, then groupby rowid, aggregate the number of times a similar ICD10 code and CGHR10 code is returned, 
    compile all results into a dataframe, and then export the results to CSV files.
"""


import pandas as pd

PARSED_DATA = "repeated_sampled_0308_parsed.csv"

EXPORT_DIR = "../healsl_rd1to2_rapid_gpt3_v2b_2024_03_11"


df = pd.read_csv(PARSED_DATA)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Remove any ICDs with decimals
df[['cause1_icd10', 'cause2_icd10', 'cause3_icd10', 'cause4_icd10', 'cause5_icd10']] = df[['cause1_icd10', 'cause2_icd10', 'cause3_icd10', 'cause4_icd10', 'cause5_icd10']].map(lambda x: x.split('.')[0] if pd.notnull(x) else x)



In [3]:
grouped_df = df.groupby('rowid')
same_cause_count_df = pd.DataFrame(grouped_df['cause1_icd10'].value_counts())
# blank_df = pd.DataFrame(columns=[x for x in range(1,11)])
blank_df = pd.DataFrame(index=same_cause_count_df.reset_index().rowid.unique(), columns=[x for x in range(1,11)])
dummy_df = pd.get_dummies(same_cause_count_df['count']).astype(int).groupby('rowid').sum()
# same_cause_count_df = blank_df.merge(dummy_df, how='outer', right_index=True)
same_cause_count_df = blank_df.combine_first(dummy_df)
# same_cause_count_df = blank_df.merge(dummy_df, how='outer').infer_objects().fillna(0)
same_cause_count_df = same_cause_count_df.rename(columns=lambda x: f'same_cause1_icd10_{x}x')
same_cause_icd10_colnames = same_cause_count_df.columns

binarized_sum = same_cause_count_df.sum()
nbinarized_sum = same_cause_count_df[same_cause_icd10_colnames].apply(lambda x: x.astype(bool)).sum()

# reduce all non-zero values to 1 and sum
# print("same_cause_count_df.sum() binarized:")
# print(binarized_sum)
# print("same_cause_count_df.sum() non-binarized:")
# print(nbinarized_sum)

print("Binarized and non-binarized sum (binarized reduces repeated counts of a rowid record to 1)")
display(pd.DataFrame({'binarized': binarized_sum, 'non-binarized': nbinarized_sum}))

print(f"Majority repeated similarity (0.0-1.0): {binarized_sum.iloc[-5:].sum()/len(df.rowid.unique())}")

Binarized and non-binarized sum (binarized reduces repeated counts of a rowid record to 1)


Unnamed: 0,binarized,non-binarized
same_cause1_icd10_1x,19,17
same_cause1_icd10_2x,11,10
same_cause1_icd10_3x,6,6
same_cause1_icd10_4x,6,5
same_cause1_icd10_5x,6,4
same_cause1_icd10_6x,2,2
same_cause1_icd10_7x,6,6
same_cause1_icd10_8x,7,7
same_cause1_icd10_9x,13,13
same_cause1_icd10_10x,66,66


Majority repeated similarity (0.0-1.0): 0.94


In [4]:
"""
    Objective: Add the aggregated cause1_icd10 to the end of the dataframe. This completes the ICD10 processing portion. 
    The final dataframe have the following columns:
    - same_cause1_icd10_1x      (binarized, indiciating if the cause1_cghr10 is repeated 1 times)
    - same_cause1_icd10_2x
    - same_cause1_icd10_3x
    - same_cause1_icd10_4x
    - same_cause1_icd10_5x
    - same_cause1_icd10_6x
    - same_cause1_icd10_7x
    - same_cause1_icd10_8x
    - same_cause1_icd10_9x
    - same_cause1_icd10_10x     (binarized, indicating if the cause1_cghr10 is repeated 10 times)
    - cause1_icd10              (dictionary, {ICD10_1: count, ICD10_2: count, ...})
"""
aggregated_cause1_icd10_rows = []

for name, group in grouped_df:
    # print(f"Row ID: {name}", group['cause1_icd10'].value_counts().to_dict())
    aggregated_cause1_icd10_rows.append([name, group['cause1_icd10'].value_counts().to_dict()])
    
combined_icd10_df = pd.DataFrame(aggregated_cause1_icd10_rows, columns=['rowid', 'cause1_icd10']).set_index('rowid')

# pd.set_option('display.max_rows', None)

# combined_df

final_icd_df = same_cause_count_df.merge(combined_icd10_df, left_index=True, right_index=True)
final_icd_df = final_icd_df.merge(df[['rowid', 'age_group', 'round']].drop_duplicates(subset='rowid').set_index('rowid'), left_index=True, right_index=True)
final_icd_df

Unnamed: 0,same_cause1_icd10_1x,same_cause1_icd10_2x,same_cause1_icd10_3x,same_cause1_icd10_4x,same_cause1_icd10_5x,same_cause1_icd10_6x,same_cause1_icd10_7x,same_cause1_icd10_8x,same_cause1_icd10_9x,same_cause1_icd10_10x,cause1_icd10,age_group,round
14000252,0,0,0,0,0,0,0,0,0,1,{'A09': 10},adult,rd1
14000286,0,0,0,0,0,0,0,0,0,1,{'G83': 10},adult,rd1
14000296,0,0,0,0,0,0,0,0,0,1,{'K35': 10},adult,rd1
14000405,0,0,1,0,0,0,1,0,0,0,"{'R50': 7, 'J18': 3}",adult,rd1
14000435,0,0,0,0,0,0,0,0,0,1,{'B54': 10},child,rd1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24002738,0,0,0,0,0,0,0,0,0,1,{'B54': 10},child,rd2
24002795,0,1,0,0,0,0,0,1,0,0,"{'E75': 8, 'A83': 2}",child,rd2
24002976,0,0,0,0,0,0,0,0,0,1,{'G83': 10},adult,rd2
24003163,0,0,0,0,0,0,0,0,0,1,{'G40': 10},adult,rd2


In [5]:
"""
    Convert ICD10 to CGHR10, to find number of repeats.    
"""

# load the mapping
icd10_to_cghr_mapping = pd.read_csv('../data_202402/icd10_cghr10_v1.csv')

icd10_to_cghr_mapping.sample(5)

Unnamed: 0,cghr10_age,cghr10_title,icd10_code,icd10_range
1308,adult,Maternal conditions,O25,O20-O99
68,adult,Unspecified infections,A73,A35-A99
661,adult,Other noncommunicable diseases,H14,H11-H59
302,adult,Cancers,D13,D00-D48
2051,adult,Suicide,X68,X60-X84


In [6]:
# """
#     Objective: Load a minimal set of columns from the original datasets of all age groups and rounds.
# """

# # Get age group and round once again and merge everything together into one dataframe
# path_prefix = "../data_202402/"
# merged_all_df = pd.DataFrame()

# rounds = ['rd1', 'rd2']
# age_groups = ['adult', 'child', 'neo']

# for r in rounds:
#     for a in age_groups:
        
#         questionnaire_df =  pd.read_csv(f"{path_prefix}healsl_{r}_{a}_v1.csv")
#         age_df =            pd.read_csv(f"{path_prefix}healsl_{r}_{a}_age_v1.csv")
#         narrative_df =      pd.read_csv(f"{path_prefix}healsl_{r}_{a}_narrative_v1.csv")

#         narrative_df = narrative_df.rename(columns={'summary': 'open_narrative'})
        
#         # Merge the dataframes
#         narrative_only = narrative_df[['rowid','open_narrative']]
#         sex_only = questionnaire_df[['rowid','sex_cod']]
#         age_only = age_df[['rowid','age_value_death','age_unit_death']]
        
#         merged_df = narrative_only.merge(sex_only, on='rowid').merge(age_only, on='rowid')

#         # Fill in missing values with empty string
#         merged_df['sex_cod'] = merged_df['sex_cod'].fillna('')
        
#         merged_df['age_group'] = f"{a}"
#         merged_df['rd_group'] = f"{r}"

#         assert not merged_df.isnull().values.any(), "Execution halted: NaN values found in merged_df"

#         print(f"round: {r.ljust(10)} age group: {a.ljust(10)} len: {str(merged_df.shape[0]).ljust(10)}")
#         # print(f"Sample of merged_df {merged_df.shape}:")
#         # display(merged_df.sample(5))
        
#         merged_all_df = pd.concat([merged_all_df, merged_df])
        


In [7]:
# The mapping dataframe consists of all age groups. To simplify the mapping in the next step,
# we preemptively split the age group into separate dataframes. Then, we set ICD10 as index
# for easy retrival.
cghr_map_helper = {}
for group in icd10_to_cghr_mapping.cghr10_age.unique():
    cghr_map_helper[group] = icd10_to_cghr_mapping[icd10_to_cghr_mapping.cghr10_age == group].set_index('icd10_code')


In [8]:
# Assign a new column 'cause1_cghr10' which retrieves the equivalent CGHR10 code from ICD10
# This utilizes the cghr_map_helper dictionary using ICD10 code as index. In some cases, the
# ICD10 code is not found, and an exception, 'NA', is added to handle this case.
cghr_df = df.assign(
    cause1_cghr10 = df.apply(lambda row: 
        'NA' if row.cause1_icd10 not in cghr_map_helper[row.age_group].index                  # if index is not in group, return NA
        else cghr_map_helper[row.age_group].loc[row.cause1_icd10]['cghr10_title']       # if index is in group, return the cghr10_title
        , axis=1)
)

In [9]:
# count how many after grouping by rowid
print("Some ICD10 codes could not be mapped to CGHR. For those records, the CGHR10 code is set to 'NA'.")
print(f"Number of NA in cause1_cghr10: {len(cghr_df[cghr_df.cause1_cghr10 == 'NA'].groupby('rowid').size().value_counts())}")


Some ICD10 codes could not be mapped to CGHR. For those records, the CGHR10 code is set to 'NA'.
Number of NA in cause1_cghr10: 1


In [10]:
grouped_cghr_df = cghr_df.groupby('rowid')
same_cause_count_cghr_df = pd.DataFrame(grouped_cghr_df['cause1_cghr10'].value_counts())
# blank_df = pd.DataFrame(columns=[x for x in range(1,11)])
blank_df = pd.DataFrame(index=same_cause_count_cghr_df.reset_index().rowid.unique(), columns=[x for x in range(1,11)])
dummy_df = pd.get_dummies(same_cause_count_cghr_df['count']).astype(int).groupby('rowid').sum()
# same_cause_count_cghr_df = blank_df.merge(dummy_df, how='outer', right_index=True)

In [11]:

same_cause_count_cghr_df = blank_df.combine_first(dummy_df)
# same_cause_count_cghr_df = blank_df.merge(dummy_df, how='outer').infer_objects().fillna(0)
same_cause_count_cghr_df = same_cause_count_cghr_df.rename(columns=lambda x: f'same_cause1_cghr10_{x}x')
same_cause_cghr10_colnames = same_cause_count_cghr_df.columns

binarized_cghr_sum = same_cause_count_cghr_df.sum()
nbinarized_cghr_sum = same_cause_count_cghr_df[same_cause_cghr10_colnames].apply(lambda x: x.astype(bool)).sum()

# reduce all non-zero values to 1 and sum
# print("same_cause_count_cghr_df.sum() binarized:")
# print(binarized_sum)
# print("same_cause_count_cghr_df.sum() non-binarized:")
# print(nbinarized_sum)

print("Binarized and non-binarized sum (binarized reduces repeated counts of a rowid record to 1)")
display(pd.DataFrame({'binarized': binarized_cghr_sum, 'non-binarized': nbinarized_cghr_sum}))

print(f"Majority repeated CGHR10 similarity (0.0-1.0): {binarized_cghr_sum.iloc[-5:].sum()/len(df.rowid.unique())}")

Binarized and non-binarized sum (binarized reduces repeated counts of a rowid record to 1)


Unnamed: 0,binarized,non-binarized
same_cause1_cghr10_1x,10,9
same_cause1_cghr10_2x,5,5
same_cause1_cghr10_3x,4,4
same_cause1_cghr10_4x,3,3
same_cause1_cghr10_5x,6,4
same_cause1_cghr10_6x,2,2
same_cause1_cghr10_7x,3,3
same_cause1_cghr10_8x,5,5
same_cause1_cghr10_9x,7,7
same_cause1_cghr10_10x,79,79


Majority repeated CGHR10 similarity (0.0-1.0): 0.96


In [12]:
aggregated_cause1_cghr10_rows = []

for name, group in grouped_cghr_df:
    # print(f"Row ID: {name}", group['cause1_cghr10'].value_counts().to_dict())
    aggregated_cause1_cghr10_rows.append([name, group['cause1_cghr10'].value_counts().to_dict()])

combined_cghr10_df = pd.DataFrame(aggregated_cause1_cghr10_rows, columns=['rowid', 'cause1_cghr10']).set_index('rowid')

# pd.set_option('display.max_rows', None)

# combined_df

final_cghr_df = same_cause_count_cghr_df.merge(combined_cghr10_df, left_index=True, right_index=True)
# final_cghr_df = final_cghr_df.merge(cghr_df[['rowid', 'age_group', 'round']].drop_duplicates(subset='rowid').set_index('rowid'), left_index=True, right_index=True)


In [13]:
final_agg_df = final_icd_df.merge(final_cghr_df, left_index=True, right_index=True)
final_agg_colnames = [c for c in final_agg_df.columns if c not in ['age_group', 'round']] + ['age_group', 'round']
final_agg_df = final_agg_df[final_agg_colnames]


In [23]:
final_agg_df.reset_index().rename(columns={'index': 'rowid', 'cause1_icd10' : 'cause_icd10', 'cause1_cghr10': 'cause_cghr10'}).to_csv(f"{EXPORT_DIR}/healsl_rd1to2_rapid_gpt3_sample100_v2b.csv", index=False)

In [14]:
# final_agg_df[final_agg_df['round'] == "rd1"].to_csv(f"{EXPORT_DIR}/healsl_rd1_rapid_gpt3_sample_agg_v2b.csv", index=False)
# final_agg_df[final_agg_df['round'] == "rd2"].to_csv(f"{EXPORT_DIR}/healsl_rd2_rapid_gpt3_sample_agg_v2b.csv", index=False)
