In [14]:
"""
This script extracts and combines neccessary features from various datasets into one dataset.


Pseudo code
-----------
1. Generate filepath to verbal autopsy CSV data based on predefined patterns
2. Extract the necessary features from each file
3. Merge extracted features into a single dataframe using the "rowid" as the key
4. If PROPORTIONAL_SAMPLING == True
    4.1. Calculate the no. of records proportion of (age_group+round)/all_records
    4.2. Sample each based on calculated proportion 
    4.3. Handle potential oversampling
5. Generate unique id (uid) for each row
6. Save completed dataframe to as JSON file for Step 02



Details regarding #1 to #3 of the pseudo code:
----------------------------------------------

The HEALSL datset provides verbal autopsy (VA) data divided into different
files. As of writing, there are two rounds of data, round 1 and 2, study
conducted at different time periods. Each round is divided into three age
groups: adult, child, and neonate. Each record is normalized into into three
files: questionnaire, age, and open narrative, sharing the samw rowid as the
key. The questionnaire provides the sex of the deceased, the age dataset
provides the age of the deceased, and the narrative dataset provides the open
narrative recorded from the VA data.

The purpose of this script is to extract all the necessary features scattered in
different files into one dataset.

The pattern is based on the following

    variables:
    rounds (r):     rd1, rd2
    age_groups (a): adult, child, neo

    filename patterns:
    questionnaire:  healsl_{r}_{a}_v1.csv
    age data:       healsl_{r}_{a}_age_v1.csv
    narrative data: healsl_{r}_{a}_narrative_v1.csv

    This will result in 12 files for each round of data.
    e.g.
    healsl_rd1_neo_v1.csv
    healsl_rd1_neo_age_v1.csv
    healsl_rd1_neo_narrative_v1.csv
    ...
    healsl_rd2_adult_v1.csv
    healsl_rd2_adult_age_v1.csv
    healsl_rd2_adult_narrative_v1.csv


Details regarding #4 of the pseudo code:
----------------------------------------

The script can alternatively generate a sample of the dataset based on the
proportion of records. This can be used to repeatiedly request a response to the
same question and compare the results among different repetitions. Since some
age groups + round segments have limited number of records, we sample from each
segment proportionally to ensure that each segment is represented in the sample.

Due to rounding errors from the calculated proportion, oversampling may happen.
This is handled by removing the extra rows from the group with the highest
proportion to ensure groups with the lowest samples maintains 
their prescence.
    
    
Details regarding #5 of the pseudo code:
----------------------------------------

The format of the unique id (uid) depends on whether PROPORTIONAL_SAMPLING is True or False. 
    
    PROPORTIONAL_SAMPLING is False, the uid is similar to the rowid.
    PROPORTIONAL_SAMPLING is True, rowids will no longer be unique as it is being repeated n 
    times. Hence, uid = f"{rowid}_{n}"


Miscellaneous:
--------------

- age group and round number is appended to the dataset as a feature. These will help splitting 
  the results back into their respective groups, as needed.
- some features, e.g. "sex_cod", were empty in the dataset, and treated as NaN by the script.
  These were filled as empty string "" to avoid any issues during the experiment.
"""
pass

In [15]:
import os
import pandas as pd
import numpy as np
from openai import OpenAI
import datetime
import pytz

TIMEZONE = pytz.timezone('America/Toronto')


# Set PROPORTIONAL_SAMPLING to True to sample the dataset proportionally, and False retain the entire dataset
# N_REPEAT_RESPONSES and N_SAMPLES are only used when PROPORTIONAL_SAMPLING is True
PROPORTION_SAMPLING = False
N_REPEAT_RESPONSES = 10
N_SAMPLES = 100  # Replace X with the desired number of values to select

# Output filename after processing the dataset
# appends datetime, and sampled if PROPORTIONAL_SAMPLING is True
OUTPUT_FILE = "healsl_dataset_all.csv"

In [16]:
path_prefix = "../data_202402/"
merged_all_df = pd.DataFrame()

rounds = ['rd1', 'rd2']
age_groups = ['adult', 'child', 'neo']

for r in rounds:
    for a in age_groups:
        
        questionnaire_df =  pd.read_csv(f"{path_prefix}healsl_{r}_{a}_v1.csv")
        age_df =            pd.read_csv(f"{path_prefix}healsl_{r}_{a}_age_v1.csv")
        narrative_df =      pd.read_csv(f"{path_prefix}healsl_{r}_{a}_narrative_v1.csv")

        narrative_df = narrative_df.rename(columns={'summary': 'open_narrative'})
        
        # Merge the dataframes
        narrative_only = narrative_df[['rowid','open_narrative']]
        sex_only = questionnaire_df[['rowid','sex_cod']]
        age_only = age_df[['rowid','age_value_death','age_unit_death']]
        
        merged_df = narrative_only.merge(sex_only, on='rowid').merge(age_only, on='rowid')

        # Fill in missing values with empty string
        merged_df['sex_cod'] = merged_df['sex_cod'].fillna('')
        
        merged_df['age_group'] = a
        merged_df['round'] = r
        
        # merged_df['group'] = f"{a}_{r}"

        assert not merged_df.isnull().values.any(), "Execution halted: NaN values found in merged_df"

        print(f"round: {r.ljust(10)} age group: {a.ljust(10)} len: {str(merged_df.shape[0]).ljust(10)}")
        # print(f"Sample of merged_df {merged_df.shape}:")
        # display(merged_df.sample(5))
        
        merged_all_df = pd.concat([merged_all_df, merged_df])

print("")        
print(f"Total length of merged_all_df: {len(merged_all_df)}")

  questionnaire_df =  pd.read_csv(f"{path_prefix}healsl_{r}_{a}_v1.csv")
  questionnaire_df =  pd.read_csv(f"{path_prefix}healsl_{r}_{a}_v1.csv")


round: rd1        age group: adult      len: 4987      
round: rd1        age group: child      len: 2998      
round: rd1        age group: neo        len: 585       
round: rd2        age group: adult      len: 2025      
round: rd2        age group: child      len: 1059      
round: rd2        age group: neo        len: 233       

Total length of merged_all_df: 11887


In [18]:
# Taking a sample of the merged_all_df
if PROPORTION_SAMPLING:
    
    # age_group + round is essentially the group, this column will assist collecting samples from different groups
    merged_all_df['group'] = merged_all_df['age_group'] + "_" + merged_all_df['round']

    # Get the sampling fraction
    sampling_frac = ((merged_all_df.value_counts('group') / len(merged_all_df)) * N_SAMPLES).round(0).astype(int).to_dict()
    
    # Initialize the dictionary to store the sample ids
    sample_ids = {}

    # Get sample based on fraction for each group
    for sample in sampling_frac:
        sample_ids[sample] = merged_all_df[merged_all_df['group'] == sample].sample(sampling_frac[sample], random_state=1).rowid.tolist()    
        print(f"{sample}: {sampling_frac[sample]} records")
        
    # Sort dict from largest group to smallest
    sorted_sample_ids = dict(sorted(sample_ids.items(), key=lambda item: len(item[1]), reverse=True))

    # Get the actual samples count
    sample_values_count = len([item for subitem in sorted_sample_ids.values() for item in subitem])

    # If sample count is more than N_SAMPLES required, remove the excess samples
    # starting from the group with the most samples and more than 10 samples
    if sample_values_count > N_SAMPLES: 
        excess = sample_values_count - N_SAMPLES
        print(f"There are more than {N_SAMPLES} samples. Removing excess samples.")
                
        for _ in range(excess):
            for key in sorted_sample_ids:
                
                if len(sorted_sample_ids[key]) > 10:
                    sorted_sample_ids[key].pop()
                    break
    else:
        print(f"There are {sample_values_count} samples. No need to remove any samples.")
        
    # Flatten the sample dictionary to a list of rowids
    sample_ids_list = [item for sublist in sorted_sample_ids.values() for item in sublist]

    # Compile a dataframe based on the sample rowids
    random_rowids = merged_all_df[merged_all_df['rowid'].isin(sample_ids_list)]
    
    # Construct a unique id rowid_repetition and append to the dataframe
    final_df = pd.concat([random_rowids.assign(uid=random_rowids['rowid'].astype(str) + "_" + str(r)) for r in range(N_REPEAT_RESPONSES)])
    
    # done with 'group', so drop it
    final_df = final_df.drop(columns=['group'])

    
# Using full dataset
else:
    
    # Duplicate rowid as uid 
    final_df = merged_all_df.assign(uid=merged_all_df['rowid'])

# Reorder uid as first column for presentation
final_df = final_df[['uid'] + [col for col in final_df.columns if col != 'uid']]


In [21]:
# Fill sex_cod NaN with empty string
final_df['sex_cod'] = final_df['sex_cod'].fillna("")

Unnamed: 0,uid,rowid,open_narrative,sex_cod,age_value_death,age_unit_death,age_group,round
0,14002421,14002421,according to the granddaughter of the deceased...,Female,53,Years,adult,rd1
1,14005966,14005966,"According to respondent, the deceased was a 29...",Male,29,Years,adult,rd1
2,14001514,14001514,"According to the deceased sisters, the decease...",Female,27,Years,adult,rd1
3,14009193,14009193,According to the niece the deceased was a male...,Male,40,Years,adult,rd1
4,14002210,14002210,According to the younger sister who was with h...,Female,42,Years,adult,rd1
...,...,...,...,...,...,...,...,...
228,24002039,24002039,"As per respondent, the deceased was a 7 days ...",Female,7,Days,neo,rd2
229,24002598,24002598,The deceased was a 0 old day female neonate wh...,Female,0,Days,neo,rd2
230,24001849,24001849,"According to the mother of the deceased, she h...",Male,0,Days,neo,rd2
231,24000702,24000702,"According to the respondent, the deceased was ...",Male,0,Days,neo,rd2


In [6]:
# Print results
print(f"Sampling: {PROPORTION_SAMPLING}")
print(f"Shape of final dataframe: {final_df.shape}")
final_df.sample(1)

Sampling: False
Shape of final dataframe: (11887, 8)


Unnamed: 0,uid,rowid,open_narrative,sex_cod,age_value_death,age_unit_death,age_group,round
1903,14002348,14002348,According to the respondent the deceased was w...,Male,65,Years,adult,rd1


In [7]:
import os


current_time = datetime.datetime.now(TIMEZONE)
formatted_time = current_time.strftime("%y%m%d_%H%M%S")

temp_output_file = OUTPUT_FILE

if PROPORTION_SAMPLING:
    temp_output_file = temp_output_file.replace(".csv", f"_sampled.csv")
temp_output_file = temp_output_file.replace(".csv", f"_{formatted_time}.csv")

try:
    final_df.to_csv(temp_output_file, index=False)

    print(f"Output saved to {temp_output_file}")
except Exception as e:
    print(f"Error saving to {temp_output_file}. Error: {e}")


Output saved to healsl_dataset_all_240309_040141.csv
