In [7]:
"""
This script extracts and combines neccessary features from various datasets into one dataset.


Pseudo code
-----------
1. Generate file names based on the pattern into individual dataframes.
2. Extract the necessary features from each dataframe.
3. Merge extracted features into a single dataframe using the rowid as the key.
4. If PROPORTIONAL_SAMPLING is True
    4.1. Calculate the proportion of each age group in the dataset.
    4.2. Use the proportion to sample the dataset.
    4.3. Handle oversampling
5. Create unique id (uid) for each row
    5.1 If PROPORTIONAL_SAMPLING is True, uid = f"{age_group}_{rowid}"
6. Save the dataset to a file



Details regarding #1 of the pseudo code:
----------------------------------------

In the context of this project, the HEALSL datset provides verbal autopsy data split into multiple 
different files. Round one and round two; adult, child, and neonate; and questionnaire, age, and 
open narrative are all separate files. The script extracts the deceased's sex from the questionnaire 
dataset, the deceased's age from the age dataset, and the open narrative recorded from the verbal 
autopsy from the narrative dataset. Then, we combine the extracted features (columns) using their 
row id as the key.

    variables:
    rounds (r):     rd1, rd2
    age_groups (a): adult, child, neo

    pattern
    questionnaire:  healsl_{r}_{a}_v1.csv
    age data:       healsl_{r}_{a}_age_v1.csv
    narrative data: healsl_{r}_{a}_narrative_v1.csv

    This will result in 12 files for each round of data.
    e.g.
    healsl_rd1_neo_v1.csv
    healsl_rd1_neo_age_v1.csv
    healsl_rd1_neo_narrative_v1.csv
    ...
    healsl_rd2_adult_v1.csv
    healsl_rd2_adult_age_v1.csv
    healsl_rd2_adult_narrative_v1.csv


Details regarding #4 of the pseudo code:
----------------------------------------

    The setting complements with experimenting the consistency of responses from the language model. 
    The idea is to repeatiedly request a response to the same question and compare the results.
    This subroutine takes a sample proportionally based on number of records per group from the 
    original dataset.
    
    Due to rounding errors, oversampling may happen. This is handled by removing the extra rows
    from the group with the highest proportion to ensure groups with the lowest samples maintains 
    their prescence.
"""
pass

In [2]:
import os
import pandas as pd
import numpy as np
from openai import OpenAI
import datetime
import pytz

# Set PROPORTIONAL_SAMPLING to True to sample the dataset proportionally, and False retain the entire dataset
PROPORTION_SAMPLING = False
N_REPEAT_RESPONSES = 10
N_SAMPLES = 100  # Replace X with the desired number of values to select

# Output filename after processing the dataset
OUTPUT_FILE = "healsl_dataset_all.csv"

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
path_prefix = "../data_202402/"
merged_all_df = pd.DataFrame()

rounds = ['rd1', 'rd2']
age_groups = ['adult', 'child', 'neo']

for r in rounds:
    for a in age_groups:
        
        questionnaire_df =  pd.read_csv(f"{path_prefix}healsl_{r}_{a}_v1.csv")
        age_df =            pd.read_csv(f"{path_prefix}healsl_{r}_{a}_age_v1.csv")
        narrative_df =      pd.read_csv(f"{path_prefix}healsl_{r}_{a}_narrative_v1.csv")

        narrative_df = narrative_df.rename(columns={'summary': 'open_narrative'})
        
        # Merge the dataframes
        narrative_only = narrative_df[['rowid','open_narrative']]
        sex_only = questionnaire_df[['rowid','sex_cod']]
        age_only = age_df[['rowid','age_value_death','age_unit_death']]
        
        merged_df = narrative_only.merge(sex_only, on='rowid').merge(age_only, on='rowid')

        # Fill in missing values with empty string
        merged_df['sex_cod'] = merged_df['sex_cod'].fillna('')
        
        merged_df['group'] = f"{a}_{r}"

        assert not merged_df.isnull().values.any(), "Execution halted: NaN values found in merged_df"

        print(f"round: {r.ljust(10)} age group: {a.ljust(10)} len: {str(merged_df.shape[0]).ljust(10)}")
        # print(f"Sample of merged_df {merged_df.shape}:")
        # display(merged_df.sample(5))
        
        merged_all_df = pd.concat([merged_all_df, merged_df])

print("")        
print(f"Total length of merged_all_df: {len(merged_all_df)}")

  questionnaire_df =  pd.read_csv(f"{path_prefix}healsl_{r}_{a}_v1.csv")
  questionnaire_df =  pd.read_csv(f"{path_prefix}healsl_{r}_{a}_v1.csv")


round: rd1        age group: adult      len: 4987      
round: rd1        age group: child      len: 2998      
round: rd1        age group: neo        len: 585       
round: rd2        age group: adult      len: 2025      
round: rd2        age group: child      len: 1059      
round: rd2        age group: neo        len: 233       

Total length of merged_all_df: 11887


In [4]:
# Taking a sample of the merged_all_df
if PROPORTION_SAMPLING:    

    # Get the sampling fraction
    sampling_frac = ((merged_all_df.value_counts('group') / len(merged_all_df)) * N_SAMPLES).round(0).astype(int).to_dict()
    
    # Initialize the dictionary to store the sample ids
    sample_ids = {}

    # Get sample based on fraction for each group
    for sample in sampling_frac:
        sample_ids[sample] = merged_all_df[merged_all_df['group'] == sample].sample(sampling_frac[sample], random_state=1).rowid.tolist()    
        print(f"{sample}: {sampling_frac[sample]} records")
        
    # Sort dict from largest group to smallest
    sorted_sample_ids = dict(sorted(sample_ids.items(), key=lambda item: len(item[1]), reverse=True))

    # Get the actual samples count
    sample_values_count = len([item for subitem in sorted_sample_ids.values() for item in subitem])

    # If sample count is more than N_SAMPLES required, remove the excess samples
    # starting from the group with the most samples and more than 10 samples
    if sample_values_count > N_SAMPLES: 
        excess = sample_values_count - N_SAMPLES
        print(f"There are more than {N_SAMPLES} samples. Removing excess samples.")
                
        for _ in range(excess):
            for key in sorted_sample_ids:
                
                if len(sorted_sample_ids[key]) > 10:
                    sorted_sample_ids[key].pop()
                    break
    else:
        print(f"There are {sample_values_count} samples. No need to remove any samples.")
        
    # Flatten the sample dictionary to a list of rowids
    sample_ids_list = [item for sublist in sorted_sample_ids.values() for item in sublist]

    # Compile a dataframe based on the sample rowids
    random_rowids = merged_all_df[merged_all_df['rowid'].isin(sample_ids_list)]
    
    # Construct a unique id rowid_repetition and append to the dataframe
    final_df = pd.concat([random_rowids.assign(uid=random_rowids['rowid'].astype(str) + "_" + str(r)) for r in range(10)])

    
# Using full dataset
else:
    
    # Duplicate rowid as uid 
    final_df = merged_all_df.assign(uid=merged_all_df['rowid'])

# Reorder uid as first column for presentation
final_df = final_df[['uid'] + [col for col in final_df.columns if col != 'uid']]


In [5]:
# Print results
print(f"Sampling: {PROPORTION_SAMPLING}")
print(f"Shape of final dataframe: {final_df.shape}")
final_df.sample(1)

Sampling: False
Shape of final dataframe: (11887, 7)


Unnamed: 0,uid,rowid,open_narrative,sex_cod,age_value_death,age_unit_death,group
2242,14003858,14003858,"According to the respondent, the deceased was ...",Female,6,Months,child_rd1


In [6]:
import os


try:
    # Check if the output file already exists. If yes, use a different name.
    if os.path.exists(OUTPUT_FILE):
        TIMEZONE = pytz.timezone('US/Eastern')

        current_time = datetime.datetime.now(TIMEZONE)
        formatted_time = current_time.strftime("%y%m%d_%H%M%S")
               
        temp_output_file = OUTPUT_FILE.replace(".csv", f"_{formatted_time}.csv")
        
        print(f"{OUTPUT_FILE} already exists. Saving to {temp_output_file} instead.")
        final_df.to_csv(temp_output_file, index=False)
        
    # Save dataframe as designated output file
    else:
        final_df.to_csv(OUTPUT_FILE, index=False)
        
        print(f"Output saved to {OUTPUT_FILE}")
except Exception as e:
    print(f"Error saving to {OUTPUT_FILE}. Error: {e}")


Output saved to healsl_dataset_all.csv
