In [1]:
import os
import pandas as pd
import numpy as np
from openai import OpenAI
import textwrap
import datetime
import pytz
import json
import re

In [2]:
PROPORTION_SAMPLING = True
N_REPEAT_RESPONSES = 10
N_SAMPLES = 100  # Replace X with the desired number of values to select
OUTPUT_FILE = "output.csv"

In [3]:
path_prefix = "../data_202402/"
merged_all_df = pd.DataFrame()

rounds = ['rd1', 'rd2']
age_groups = ['adult', 'child', 'neo']

for r in rounds:
    for a in age_groups:
        
        questionnaire_df =  pd.read_csv(f"{path_prefix}healsl_{r}_{a}_v1.csv")
        age_df =            pd.read_csv(f"{path_prefix}healsl_{r}_{a}_age_v1.csv")
        narrative_df =      pd.read_csv(f"{path_prefix}healsl_{r}_{a}_narrative_v1.csv")

        narrative_df = narrative_df.rename(columns={'summary': 'open_narrative'})
        
        # Merge the dataframes
        narrative_only = narrative_df[['rowid','open_narrative']]
        sex_only = questionnaire_df[['rowid','sex_cod']]
        age_only = age_df[['rowid','age_value_death','age_unit_death']]
        
        merged_df = narrative_only.merge(sex_only, on='rowid').merge(age_only, on='rowid')

        # Fill in missing values with empty string
        merged_df['sex_cod'] = merged_df['sex_cod'].fillna('')
        
        merged_df['group'] = f"{a}_{r}"

        assert not merged_df.isnull().values.any(), "Execution halted: NaN values found in merged_df"

        print(f"round: {r.ljust(10)} age group: {a.ljust(10)} len: {str(merged_df.shape[0]).ljust(10)}")
        # print(f"Sample of merged_df {merged_df.shape}:")
        # display(merged_df.sample(5))
        
        merged_all_df = pd.concat([merged_all_df, merged_df])

print("")        
print(f"Total length of merged_all_df: {len(merged_all_df)}")

  questionnaire_df =  pd.read_csv(f"{path_prefix}healsl_{r}_{a}_v1.csv")
  questionnaire_df =  pd.read_csv(f"{path_prefix}healsl_{r}_{a}_v1.csv")


round: rd1        age group: adult      len: 4987      
round: rd1        age group: child      len: 2998      
round: rd1        age group: neo        len: 585       
round: rd2        age group: adult      len: 2025      
round: rd2        age group: child      len: 1059      
round: rd2        age group: neo        len: 233       

Total length of merged_all_df: 11887


In [4]:
# Taking a sample of the merged_all_df
if PROPORTION_SAMPLING:    

    # Get the sampling fraction
    sampling_frac = ((merged_all_df.value_counts('group') / len(merged_all_df)) * N_SAMPLES).round(0).astype(int).to_dict()
    
    # Initialize the dictionary to store the sample ids
    sample_ids = {}

    # Get sample based on fraction for each group
    for sample in sampling_frac:
        sample_ids[sample] = merged_all_df[merged_all_df['group'] == sample].sample(sampling_frac[sample], random_state=1).rowid.tolist()    
        print(f"{sample}: {sampling_frac[sample]} records")
        
    # Sort dict from largest group to smallest
    sorted_sample_ids = dict(sorted(sample_ids.items(), key=lambda item: len(item[1]), reverse=True))

    # Get the actual samples count
    sample_values_count = len([item for subitem in sorted_sample_ids.values() for item in subitem])

    # If sample count is more than N_SAMPLES required, remove the excess samples
    # starting from the group with the most samples and more than 10 samples
    if sample_values_count > N_SAMPLES: 
        excess = sample_values_count - N_SAMPLES
        print(f"There are more than {N_SAMPLES} samples. Removing excess samples.")
                
        for _ in range(excess):
            for key in sorted_sample_ids:
                
                if len(sorted_sample_ids[key]) > 10:
                    sorted_sample_ids[key].pop()
                    break
    else:
        print(f"There are {sample_values_count} samples. No need to remove any samples.")
        
    # Flatten the sample dictionary to a list of rowids
    sample_ids_list = [item for sublist in sorted_sample_ids.values() for item in sublist]

    # Compile a dataframe based on the sample rowids
    random_rowids = merged_all_df[merged_all_df['rowid'].isin(sample_ids_list)]
    
    # Construct a unique id rowid_repetition and append to the dataframe
    final_df = pd.concat([random_rowids.assign(u_id=random_rowids['rowid'].astype(str) + "_" + str(r)) for r in range(10)])

    
# Using full dataset
else:
    
    # Duplicate rowid as u_id 
    final_df = merged_all_df.assign(u_id=merged_all_df['rowid'])

# Reorder u_id as first column for presentation
final_df = final_df[['u_id'] + [col for col in final_df.columns if col != 'u_id']]


adult_rd1: 42 records
child_rd1: 25 records
adult_rd2: 17 records
child_rd2: 9 records
neo_rd1: 5 records
neo_rd2: 2 records
There are 100 samples. No need to remove any samples.


In [5]:
# Print results
print(f"Sampling: {PROPORTION_SAMPLING}")
print(f"Shape of final dataframe: {final_df.shape}")
final_df.sample(1)

Sampling: True
Shape of final dataframe: (1000, 7)


Unnamed: 0,u_id,rowid,open_narrative,sex_cod,age_value_death,age_unit_death,group
1705,24001012_3,24001012,As per respondent the deceased was a 33 years ...,Male,33,Years,adult_rd2


In [6]:
import os


try:
    # Check if the output file already exists. If yes, use a different name.
    if os.path.exists(OUTPUT_FILE):
        import pytz
        import datetime
        TIMEZONE = pytz.timezone('US/Eastern')

        current_time = datetime.datetime.now(TIMEZONE)
        formatted_time = current_time.strftime("%y%m%d_%H%M%S")
               
        temp_output_file = OUTPUT_FILE.replace(".csv", f"_{formatted_time}.csv")
        
        print(f"{OUTPUT_FILE} already exists. Saving to {temp_output_file} instead.")
        final_df.to_csv(temp_output_file, index=False)
        
    # Save dataframe as designated output file
    else:
        final_df.to_csv(OUTPUT_FILE, index=False)
        
        print(f"Output saved to {OUTPUT_FILE}")
except Exception as e:
    print(f"Error saving to {OUTPUT_FILE}. Error: {e}")


Output saved to output.csv
