In [1]:
import pandas as pd
import os
import logging
import numpy as np

In [4]:
# Logging events 
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

In [6]:
# Set directory to files to get list of file_paths excluding subdirectories
directory = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/Data_CSVDownloaded/02_forms'
file_paths = [os.path.join(directory, file) for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]

In [244]:
# Make PtReg file the first item in the list since we shall need it to be set as the base with all the participant registrations
PtReg = os.path.join(directory, 'PtReg.csv')
file_paths.insert(0, file_paths.pop(file_paths.index(PtReg)))
len(file_paths)

340

In [36]:
# Define the aggregation functions
def custom_agg(series):
    # Filter out NaN values before joining
    return ', '.join(series.dropna().astype(str))

def list_agg(series):
    return list(series.dropna())

# Specify the columns you want to group by
group_columns = ['xCohortID', 'xParticipantID']
drop_columns = ['ProtocolID', 'xSiteID']

def aggregate_data(grouping_columns, grouping_df): 
    # Create an empty aggregation dictionary
    agg_dict = {}
    
    # Dynamically add columns to the dictionary with the appropriate function
    for col in grouping_df.columns:
        if col in grouping_columns:
            continue  # Skip group by columns
        #if df[col].dtype == 'float64' or df[col].dtype == 'int64':
        #agg_dict[col] = ['mean', 'max']  # Apply both mean and max
        elif grouping_df[col].dtype in [np.float64, np.int64]:
            agg_dict[col] = list_agg  # Use list aggregation for numeric columns
    
        else:
            agg_dict[col] = custom_agg  # Apply custom concatenation
    
    # Apply the aggregation to the DataFrame
    grouped_df = grouping_df.groupby(group_columns).agg(agg_dict)

    # Flatten the multi-level columns if necessary
    grouped_df.columns = ['_'.join(col) if isinstance(col, tuple) else col for col in grouped_df.columns.values]

    grouped_df.reset_index(inplace=True)
    return grouped_df
    
def combine_echo_files(files):
    # Initialize an empty DataFrame and trackers
    base_df = pd.DataFrame()
    file_count = 0
    merged_files = []
    # Process each file
    for index, file in enumerate(files):
        if file.endswith('.csv'):
            # Read the current file into a DataFrame
            temp_df = pd.read_csv(file, low_memory=False)

            # Dynamic suffixes based on the file names
            base_file = files[index-1].replace(directory, '')
            base_form_name = os.path.splitext(base_file)[0]
            temp_file = file.replace(directory, '')
            temp_form_name = os.path.splitext(temp_file)[0]
            suffixes = (f'_{base_form_name }', f'_{temp_form_name}')
            
            logging.info('Reading %s - %s', file, suffixes)
        
            # Add the first file(PtReg)
            if base_df.empty:
                base_df = temp_df
            else:
               # Remove columns that are repeated and/or not relevant 
                temp_df = temp_df.drop(columns=[col for col in drop_columns if col in temp_df.columns])
               
                # Aggregate the data in the form to combine with the PtReg 
                temp_df = aggregate_data(group_columns, temp_df)
            
                # Merge the new DataFrame with the base DataFrame on the join key
                base_df = pd.merge(base_df, temp_df, on=['xCohortID', 'xParticipantID'], how='left', suffixes=suffixes)
        file_count += 1
        merged_files.append(file)
        logging.info('Files processed: %s', file_count)
    # Remove PtReg file from merged files so that it can be used as base in next batch
    merged_files.remove(PtReg)
    return base_df, merged_files, file_count

In [238]:
combined_df, merged_files, files_count= combine_echo_files(file_paths[0:3]) 

2024-04-18 22:04:57,436 - Reading /Users/faith/Desktop/ECHO/raw_downloaded/csv_files/PtReg.csv - ('_Ess_HSE_HOME_EC', '_PtReg')
2024-04-18 22:04:57,437 - Files processed: 1
2024-04-18 22:04:57,481 - Reading /Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CNH_SDQ_4.csv - ('_PtReg', '_Ess_CNH_SDQ_4')
2024-04-18 22:05:11,794 - Files processed: 2
2024-04-18 22:05:11,812 - Reading /Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_HSE_HOME_EC.csv - ('_Ess_CNH_SDQ_4', '_Ess_HSE_HOME_EC')
2024-04-18 22:05:17,047 - Files processed: 3


In [240]:
combined_df.to_csv('combined_echo_data_3.csv', index=False)

In [221]:
combined_df.shape

(63215, 159)

In [192]:
merged_files_30 = merged_files

In [194]:
merged_files_30

['/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CNH_SDQ_4.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_HSE_HOME_EC.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Rec_RCh_TSR_6_12.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Rec_RCh_SESC_6_12.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CWB_PGLS5a_A.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CHB_BLOCK2.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Rec_RCh_NDSR04_C.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CPH_PDS.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CNP_PPVT3a.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CNH_ASQ_18.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CHB_CFSP.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Rec_RCh_CPrU_PR.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CNH_ASQ_24.csv',
 '/Users/faith/Desktop/ECHO/raw_d

In [246]:
file_paths.index('/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CPH_CG_CAPE.csv')

98

In [248]:
file_paths[99]

'/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_Dem_GI.csv'

In [251]:
file_paths[149]

'/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CNH_ASQ_54.csv'

Notes:  file Rec_RCh_NDSR02_C.csv is large file number 54 and takes a while to execute ... over an hour on local computer resources 

In [39]:
batch1 = pd.read_csv('/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/raw_combined_batches/combined_echo_data_batch_1.csv', low_memory=False)

In [17]:
batch1.columns.to_list()

['xParticipantID',
 'xOthPtID1',
 'ProtocolID',
 'xCohortID',
 'xSiteID',
 'xCohortID_UH3',
 'ExplicitRegistration',
 'ParticipantType',
 'ParticipationLevel',
 'PregnancyNumber',
 'DeliveryOrder',
 'Ethnicity',
 'Race',
 'Sex',
 'xDateOfBirth',
 'YearOfBirth',
 'xExpectedDueDate',
 'ExpectedYearOfBirth',
 'xCohortEnrollmentDate',
 'YearOfCohortEnrollment',
 'xProtocolEnrollmentDate',
 'FinalParticipationLevel',
 'FinalLevelReason',
 'xOtherfinalReason',
 'Withdrawn',
 'xWithdrawnReason',
 'VisitName_Ess_CNH_SDQ_4',
 'xFormDT_Ess_CNH_SDQ_4',
 'SequenceNum_Ess_CNH_SDQ_4',
 'respondent_Ess_CNH_SDQ_4',
 'otherresp_Ess_CNH_SDQ_4',
 'sdq_4_1',
 'sdq_4_2',
 'sdq_4_3',
 'sdq_4_4',
 'sdq_4_5',
 'sdq_4_6',
 'sdq_4_7',
 'sdq_4_8',
 'sdq_4_9',
 'sdq_4_10',
 'sdq_4_11',
 'sdq_4_12',
 'sdq_4_13',
 'sdq_4_14',
 'sdq_4_15',
 'sdq_4_16',
 'sdq_4_17',
 'sdq_4_18',
 'sdq_4_19',
 'sdq_4_20',
 'sdq_4_21',
 'sdq_4_22',
 'sdq_4_23',
 'sdq_4_24',
 'sdq_4_25',
 'sdq_4_emotional',
 'sdq_4_conduct',
 'sdq_4_hyp

In [35]:
def get_form_names(columns_list):
    forms = []
    for item in columns_list: 
        if 'VisitName' in item: 
            print(item)
            form_name = item.strip('VisitName_')
            forms.append(form_name)
    return forms
    


In [40]:
batch1_forms = get_form_names(batch1.columns.to_list())
print(len(batch1_forms))
print(batch1_forms)

VisitName_Ess_CNH_SDQ_4
VisitName_Ess_HSE_HOME_EC
VisitName_Rec_RCh_TSR_6_12
VisitName_Rec_RCh_SESC_6_12
VisitName_Ess_CWB_PGLS5a_A
VisitName_Ess_CHB_BLOCK2
VisitName_Rec_RCh_NDSR04_C
VisitName_Ess_CPH_PDS
VisitName_Ess_CNP_PPVT3a
VisitName_Ess_CNH_ASQ_18
VisitName_Ess_CHB_CFSP
VisitName_Rec_RCh_CPrU_PR
VisitName_Ess_CNH_ASQ_24
VisitName_Ess_SRP_EAC_PR
VisitName_Ess_CNH_ASQ_30
VisitName_Ess_CNH_SCQ
VisitName_Ess_CWB_PLS8b_Ped
VisitName_Rec_RCg_CTS
VisitName_Ess_HSE_ACE_aPV_A
VisitName_Rec_RCh_Caff_PR
VisitName_Rec_RCg_PSOC
VisitName_Ess_CNH_SB5
VisitName_Ess_CNH_RECBQvSF
VisitName_Ess_CNH_PPSE4a_Ped
VisitName_Ess_Prg_MMRAaG
VisitName_Ess_HSE_FES_Coh
VisitName_Rec_RCh_C19SSD_cPR
VisitName_Ess_CHB_BLOCK3
VisitName_Rec_RCg_SRS2CG
VisitName_Ess_CHB_PPA8a_PP
VisitName_Rec_RCh_NTES_SR
VisitName_Ess_CNH_WISC5
VisitName_Rec_RCh_CADHD_PR
VisitName_Rec_Mat_PUQE
VisitName_Ess_SRP_HPAP_CR
VisitName_Ess_HSE_HOME_IT
VisitName_Rec_RCg_PGLS5a
VisitName_Ess_CNP_PGHM2a
VisitName_Ess_CNP_PSS14
VisitName_

In [41]:
'VisitName_Ess_Prg_MMRAaG' in batch1.columns.to_list() #PtReg

True

In [42]:
batch1_forms.append('PtReg')

In [44]:
print(batch1_forms)

['Ess_CNH_SDQ_4', 'Ess_HSE_HOME_EC', 'Rec_RCh_TSR_6_12', 'Rec_RCh_SESC_6_12', 'Ess_CWB_PGLS5a_A', 'Ess_CHB_BLOCK2', 'Rec_RCh_NDSR04_C', 'Ess_CPH_PDS', 'Ess_CNP_PPVT3', 'Ess_CNH_ASQ_18', 'Ess_CHB_CFSP', 'Rec_RCh_CPrU_PR', 'Ess_CNH_ASQ_24', 'Ess_SRP_EAC_PR', 'Ess_CNH_ASQ_30', 'Ess_CNH_SCQ', 'Ess_CWB_PLS8b_Ped', 'Rec_RCg_CTS', 'Ess_HSE_ACE_aPV_A', 'Rec_RCh_Caff_PR', 'Rec_RCg_PSOC', 'Ess_CNH_SB5', 'Ess_CNH_RECBQvSF', 'Ess_CNH_PPSE4a_Ped', 'Ess_Prg_MMRAaG', 'Ess_HSE_FES_Coh', 'Rec_RCh_C19SSD_cPR', 'Ess_CHB_BLOCK3', 'Rec_RCg_SRS2CG', 'Ess_CHB_PPA8a_PP', 'Rec_RCh_NTES_SR', 'Ess_CNH_WISC5', 'Rec_RCh_CADHD_PR', 'Rec_Mat_PUQE', 'Ess_SRP_HPAP_CR', 'Ess_HSE_HOME_IT', 'Rec_RCg_PGLS5', 'Ess_CNP_PGHM2', 'Ess_CNP_PSS14', 'Ess_CNH_WASI2', 'Ess_CNH_ASQ_33', 'Ess_CNH_ASQ_27', 'Ess_HSE_ACE_CR', 'Rec_RCh_PDS8a_Ped', 'Ess_HHx_C19Vac_A', 'Ess_HHx_C19_2_pA', 'Rec_RCg_NE_CE_C', 'Rec_RCh_PDS4a_ECPR', 'PtReg']


In [26]:
batch2 = pd.read_csv('/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/raw_combined_batches/combined_echo_data_batch_2.csv', low_memory=False)

In [27]:
batch2_forms = get_form_names(batch2.columns.to_list())
print(len(batch2_forms))
print(batch2_forms)

49
['Rec_RCh_PDS6a_PP', 'Ess_CWB_PGH8a_ECPR', 'Ess_Prg_MSupp_R', 'Ess_CNH_ASQ_2', 'Ess_CNH_WISC4', 'Ess_CNH_ASQ_6', 'Ess_CSH_PSRI4a_A', 'Ess_CNH_SDQ_2', 'Rec_RCh_NDSR02_C', 'Rec_RCh_MCHATR', 'Ess_CPH_PAI8a_PP', 'Rec_RCh_FES_Con', 'Rec_RCh_ASA_TS', 'Ess_HHx_C19_4_pP', 'Ess_CNH_ASQ_36', 'Ess_CNH_ASQ_22', 'Rec_RCg_NE_CE_P', 'Rec_RCh_HGSW_14_21', 'Rec_RCg_PAnx8', 'Ess_CPH_Air2_Adol', 'Ess_CSH_PSD4a_Ped', 'Ess_CSH_PSRI4a_PP', 'Ess_CNP_PSS10', 'Ess_CSH_SHCA2_SR', 'Ess_CNH_WISC3', 'Ess_CWB_CGB', 'Rec_RCh_SEco_PR', 'Ess_CNP_CRISYS', 'Rec_RCg_FES_Con2', 'Ess_CPH_Air2_MC', 'Ess_CNH_PPSE4a_PP', 'Ess_HSE_CHAOS', 'Ess_Prg_Life_PP', 'Ess_CWB_PLS8b_PP', 'Ess_CNP_ESSI', 'Ess_CPH_Air_MC', 'Ess_CNP_PGHP2', 'Ess_CHB_DSQ_SR2', 'Ess_HHx_MH_BF', 'Ess_CWB_PMP8a_Ped', 'Ess_CNH_ASQ_20', 'Ess_CNP_CESD', 'Rec_RCh_SPIRO', 'Ess_HSE_APQ_P9', 'Rec_RCh_Dent_SR', 'Ess_CPH_DXA', 'Ess_CNP_PInfS4', 'Ess_CPH_CG_CAPE', 'Ess_Dem_GI']


In [45]:
print(batch2_forms)

['Rec_RCh_PDS6a_PP', 'Ess_CWB_PGH8a_ECPR', 'Ess_Prg_MSupp_R', 'Ess_CNH_ASQ_2', 'Ess_CNH_WISC4', 'Ess_CNH_ASQ_6', 'Ess_CSH_PSRI4a_A', 'Ess_CNH_SDQ_2', 'Rec_RCh_NDSR02_C', 'Rec_RCh_MCHATR', 'Ess_CPH_PAI8a_PP', 'Rec_RCh_FES_Con', 'Rec_RCh_ASA_TS', 'Ess_HHx_C19_4_pP', 'Ess_CNH_ASQ_36', 'Ess_CNH_ASQ_22', 'Rec_RCg_NE_CE_P', 'Rec_RCh_HGSW_14_21', 'Rec_RCg_PAnx8', 'Ess_CPH_Air2_Adol', 'Ess_CSH_PSD4a_Ped', 'Ess_CSH_PSRI4a_PP', 'Ess_CNP_PSS10', 'Ess_CSH_SHCA2_SR', 'Ess_CNH_WISC3', 'Ess_CWB_CGB', 'Rec_RCh_SEco_PR', 'Ess_CNP_CRISYS', 'Rec_RCg_FES_Con2', 'Ess_CPH_Air2_MC', 'Ess_CNH_PPSE4a_PP', 'Ess_HSE_CHAOS', 'Ess_Prg_Life_PP', 'Ess_CWB_PLS8b_PP', 'Ess_CNP_ESSI', 'Ess_CPH_Air_MC', 'Ess_CNP_PGHP2', 'Ess_CHB_DSQ_SR2', 'Ess_HHx_MH_BF', 'Ess_CWB_PMP8a_Ped', 'Ess_CNH_ASQ_20', 'Ess_CNP_CESD', 'Rec_RCh_SPIRO', 'Ess_HSE_APQ_P9', 'Rec_RCh_Dent_SR', 'Ess_CPH_DXA', 'Ess_CNP_PInfS4', 'Ess_CPH_CG_CAPE', 'Ess_Dem_GI']


In [28]:
batch3 = pd.read_csv('/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/raw_combined_batches/combined_echo_data_batch_3.csv', low_memory=False)
batch3_forms = get_form_names(batch3.columns.to_list())
print(len(batch3_forms))
print(batch3_forms)

50
['Rec_RCg_PAng5', 'Ess_Dem_ECE', 'Ess_CNH_ASQ_4', 'Rec_Mat_VS_SI', 'Ess_CNH_BDI2', 'Ess_HHx_C19_aA', 'Ess_CNP_EPDS', 'Ess_CNH_SRS2_Sch_SF', 'Ess_CHB_PPA7a_ECPR', 'Ess_HSE_PFR4a_PP', 'Ess_CNP_WASI2A', 'Rec_RCh_PPAff8a_PP', 'Ess_CWB_PGHP2a_A', 'Ess_BPE_HCExp_PP', 'Ess_CNP_BSI', 'Ess_Prg_MSupp_PI', 'Ess_CNP_PHQ9', 'Ess_HSE_ACE_PR', 'Ess_BPE_HESHS_PI', 'Ess_SRP_HPAP_PR', 'Ess_HHx_C19_2_cP', 'Ess_Dem_Dem_B', 'Ess_CNH_BASC2', 'Ess_CSH_PSD4a_A', 'Ess_HHx_C19_2_aP', 'Ess_Prg_PSQI', 'Ess_CNP_ACE_aA', 'Ess_CPH_CAPE_C', 'Rec_RCh_PAnx8a_Ped', 'Ess_CNP_WEDS_S', 'Ess_Prg_PMCI', 'Ess_CNH_RIBQRvSF', 'Rec_RCh_ASA_TNS', 'Rec_RCh_Ho', 'Ess_Prg_PWtGpSR', 'Ess_Dem_Dem_C', 'Ess_CNH_BASC3', 'Ess_Prg_MSuppSF_R', 'Ess_HHx_C19_3_aP', 'Ess_CWB_PGHM2a_A', 'Ess_CNH_PSS4_A', 'Ess_CNH_NEPSY2', 'Ess_HHx_MH_EC', 'Ess_HSE_APQ_C9', 'Ess_BPE_OExp_PP', 'Rec_RCg_C19_aF', 'Ess_CHB_YRB_SB', 'Rec_RCh_SPIROv2', 'Ess_CHB_YRB_SU', 'Ess_CNH_ASQ_54']


In [29]:
batch4 = pd.read_csv('/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/raw_combined_batches/combined_echo_data_batch_4.csv', low_memory=False)
batch4_forms = get_form_names(batch4.columns.to_list())
print(len(batch4_forms))
print(batch4_forms)

50
['Ess_CNH_SDQ_11SR', 'Rec_RCg_C19_CEE2', 'Ess_CNP_PPVT4', 'Ess_Prg_BLOCK4', 'Rec_RCh_SEco_CR', 'Ess_CNH_PSS10_SR', 'Rec_RCg_PMP8', 'Ess_Prg_MFSP', 'Ess_CWB_PGH7_Ped', 'Ess_CNP_ACEBRFSS', 'Rec_RCh_SE_3_5', 'Rec_RCh_WJ3', 'Ess_CPH_Air_Adol', 'Ess_Dem_Dem_CG', 'Ess_Prg_Life_R', 'Ess_CNH_ASQ_42', 'Ess_CSH_SHCA_PR', 'Ess_Prg_DSQ_SR', 'Ess_CNP_PSS4', 'Ess_CNP_LSC', 'Rec_RCh_CMU_PR', 'Rec_RCg_ASR', 'Ess_CNP_WAIS4', 'Ess_BPE_OExp_R', 'Ess_HHx_C19_3_cP', 'Rec_RCh_PAnx8a_PP', 'Ess_ADM_Roster', 'Ess_Dem_CFSM', 'Rec_Mat_SRB_F', 'Ess_CNP_KRIEG', 'Ess_CNH_BRIEF', 'Ess_HHx_MH_MCA', 'Rec_RCh_CKCPT', 'Rec_RCh_SESC_3_5', 'Ess_ADM_WTHD', 'Rec_RCg_EPII_A', 'Ess_Prg_MMRA2', 'Ess_Dem_IAFS_C', 'Ess_CPH_CAPE_I', 'Ess_HSE_APQ_P', 'Ess_CSH_SHCA_SR', 'Ess_CPH_MRA_CAPE', 'Rec_RCh_CMU_SR', 'Rec_RCh_HPBS', 'Ess_CNP_BDI', 'Ess_Dem_CK12E', 'Ess_HHx_C19_2_pP', 'Rec_RCh_AAB', 'Ess_Dem_LA_P', 'Ess_BPE_HESHS_R']


In [30]:
batch5 = pd.read_csv('/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/raw_combined_batches/combined_echo_data_batch_5.csv', low_memory=False)
batch5_forms = get_form_names(batch5.columns.to_list())
print(len(batch5_forms))
print(batch5_forms)

50
['Rec_RCh_ASA_Total', 'Ess_CNH_BAY3', 'Ess_CNP_CTQ', 'Ess_HSE_PFR8a_Ped', 'Ess_BPE_HCExp_R', 'Ess_CNH_PPVT3', 'Rec_RCh_CFPQ', 'Ess_CNH_WPPSI3', 'Ess_CNH_SRS2_Pr', 'Ess_CNH_DAS2', 'Rec_RCg_NE_S_C', 'Ess_CNH_PSS10_PR', 'Rec_Mat_NDSR04', 'Ess_CNP_PES4', 'Rec_RCg_PSRI4a2', 'Ess_CNH_SRS2_Pre_SF', 'Ess_BPE_HESHS_PP', 'Ess_Prg_MWtHtM', 'Ess_Prg_DHQ2', 'Ess_CNH_ASQ_60', 'Ess_CNH_ASQ_48', 'Ess_Prg_MSupp_PP', 'Ess_BPE_HCExp_PI', 'Ess_CPH_TANST', 'Ess_CPH_CAPE', 'Ess_CNP_LES', 'Ess_Prg_FTND', 'Ess_CNP_LSC12', 'Ess_BPE_HESHS_C', 'Ess_HSE_PFR8a_PP', 'Ess_CSH_PSRI4a_Ped', 'Rec_RCh_CEBQ', 'Ess_Dem_LA_C', 'Rec_RCh_SEco_INF', 'Ess_BPE_HCExp_C', 'Ess_HHx_MH2_BF', 'Rec_RCh_PAnx4a_ECPR', 'Ess_HHx_C19Vac_PR', 'Ess_Prg_PSD4', 'Ess_CNH_WPPSI4', 'Ess_HHx_C19_3_pP', 'Ess_CPH_Air_EC', 'Ess_SRP_NTPPI_PR', 'Ess_HSE_APQ_C', 'Rec_RCg_C19_CEE', 'Ess_Dem_IAFS_P', 'Ess_CNH_PPVT4', 'Ess_CPH_PAI8a_Ped', 'Ess_CNP_PDep8', 'Rec_Mat_NDSR02']


In [31]:
batch6 = pd.read_csv('/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/raw_combined_batches/combined_echo_data_batch_6.csv', low_memory=False)
batch6_forms = get_form_names(batch6.columns.to_list())
print(len(batch6_forms))
print(batch6_forms)

50
['Ess_CPH_Air2_EC', 'Ess_CNH_SRS2_Sch', 'Ess_CNH_ASQ_9', 'Ess_Prg_MSHPrg', 'Rec_Mat_PPAQ', 'Ess_CNH_SRS2_A', 'Ess_HHx_C19_2_aA', 'Rec_RCg_CRISYS_SF', 'Ess_CHB_PPA8a_Ped', 'Ess_ADM_REVT_S', 'Ess_CNP_ACE_aP', 'Ess_CWB_PGH7_PP', 'Ess_CHB_DSQ_PR', 'Ess_CNH_ASQ_10', 'Rec_RCh_PPAff4a_ECPR', 'Ess_CNH_MSEL', 'Rec_RCh_Dent_PR', 'Ess_CNP_PInstrS4', 'Ess_CPH_Air2_Inf', 'Ess_Prg_PSRI4', 'Ess_Dem_HHC_C', 'Ess_HHx_C19_cP', 'Ess_CNH_ASQ_8', 'Ess_HHx_HIC', 'Rec_RCg_PSD4a2', 'Ess_CNH_PSS10_A', 'Ess_HHx_CBMRAaJ', 'Ess_HHx_C19_aP', 'Ess_CNH_ASQ_12', 'Ess_Prg_MSuppSF_PP', 'Ess_Prg_Life_PI', 'Ess_CSH_SHCA2_PR', 'Ess_CSH_SHAdult_A', 'Ess_CHB_IFP', 'Ess_CNH_SRS_Pr', 'Ess_HHx_C19_2_cA', 'Rec_RCh_HGSW_8_13', 'Ess_CNH_CBCL_Sch', 'Rec_RCh_CADHD_SR', 'Rec_RCh_TSR_3_5', 'Rec_RCh_PPAff8a_Ped', 'Ess_HHx_C19_4_cP', 'Ess_HHx_CBMRA', 'Ess_HHx_C19Vac', 'Ess_Dem_Occ_Adol', 'Ess_HSE_PFR4a_Ped', 'Ess_HHx_MH_I', 'Rec_RCh_C19SSD_aSR', 'Ess_CNP_PGH', 'Ess_CHB_CFH']


In [32]:
batch7 = pd.read_csv('/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/raw_combined_batches/combined_echo_data_batch_7.csv', low_memory=False)
batch7_forms = get_form_names(batch7.columns.to_list())
print(len(batch7_forms))
print(batch7_forms)

40
['Rec_RCh_SE_6_12', 'Ess_SRP_PComp6a_A', 'Rec_RCg_SHAdul', 'Ess_Dem_FRS', 'Ess_HSE_PARS', 'Ess_CNH_ASQ_16', 'Rec_RCg_NE_PA_C', 'Ess_CNH_SDQ_11', 'Ess_HHx_NNNS', 'Ess_CSH_SHInf', 'Ess_CSH_PSD4a_PP', 'Ess_SRP_PPR8a_Ped', 'Ess_CNH_BRIEF_2', 'Ess_CPH_Air_Inf', 'Ess_HHx_CBMRA2', 'Ess_CNH_RCBQvSF', 'Ess_Dem_HHC_P', 'Ess_HHx_C19_cPR', 'Ess_Prg_PMCI2', 'Ess_HHx_CBI', 'Ess_CNP_WEDS', 'Ess_CPH_BIOIMP', 'Ess_CNP_WASI', 'Ess_CNH_ASQ_14', 'Rec_RCh_CPrU_SR', 'Rec_RCh_Caff_SR', 'Ess_CHB_BLOCK', 'Ess_CWB_PMP8a_A', 'Ess_CNH_CBCL_Pr', 'Ess_CNP_SF36', 'Ess_CWB_PGH_A', 'Ess_HSE_ACE_aAV_A', 'Ess_Prg_MMRAaK', 'Ess_CPH_PPBP', 'Ess_CNH_SDQ_18SR', 'Ess_HHx_C19_4_aP', 'Ess_Dem_Occ_CG', 'Ess_CNH_BRIEF_P', 'Ess_Prg_MMRA', 'Ess_Prg_TPWtGSR']


In [47]:
batch_dict = {'Batch 1':batch1_forms, 'Batch 2':batch2_forms, 'Batch 3': batch3_forms, 'Batch 4': batch4_forms, 'Batch 5': batch5_forms, 'Batch 6': batch6_forms, 'Batch 7': batch7_forms}

In [49]:
batches_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in batch_dict.items()]))
batches_df

Unnamed: 0,Batch 1,Batch 2,Batch 3,Batch 4,Batch 5,Batch 6,Batch 7
0,Ess_CNH_SDQ_4,Rec_RCh_PDS6a_PP,Rec_RCg_PAng5,Ess_CNH_SDQ_11SR,Rec_RCh_ASA_Total,Ess_CPH_Air2_EC,Rec_RCh_SE_6_12
1,Ess_HSE_HOME_EC,Ess_CWB_PGH8a_ECPR,Ess_Dem_ECE,Rec_RCg_C19_CEE2,Ess_CNH_BAY3,Ess_CNH_SRS2_Sch,Ess_SRP_PComp6a_A
2,Rec_RCh_TSR_6_12,Ess_Prg_MSupp_R,Ess_CNH_ASQ_4,Ess_CNP_PPVT4,Ess_CNP_CTQ,Ess_CNH_ASQ_9,Rec_RCg_SHAdul
3,Rec_RCh_SESC_6_12,Ess_CNH_ASQ_2,Rec_Mat_VS_SI,Ess_Prg_BLOCK4,Ess_HSE_PFR8a_Ped,Ess_Prg_MSHPrg,Ess_Dem_FRS
4,Ess_CWB_PGLS5a_A,Ess_CNH_WISC4,Ess_CNH_BDI2,Rec_RCh_SEco_CR,Ess_BPE_HCExp_R,Rec_Mat_PPAQ,Ess_HSE_PARS
5,Ess_CHB_BLOCK2,Ess_CNH_ASQ_6,Ess_HHx_C19_aA,Ess_CNH_PSS10_SR,Ess_CNH_PPVT3,Ess_CNH_SRS2_A,Ess_CNH_ASQ_16
6,Rec_RCh_NDSR04_C,Ess_CSH_PSRI4a_A,Ess_CNP_EPDS,Rec_RCg_PMP8,Rec_RCh_CFPQ,Ess_HHx_C19_2_aA,Rec_RCg_NE_PA_C
7,Ess_CPH_PDS,Ess_CNH_SDQ_2,Ess_CNH_SRS2_Sch_SF,Ess_Prg_MFSP,Ess_CNH_WPPSI3,Rec_RCg_CRISYS_SF,Ess_CNH_SDQ_11
8,Ess_CNP_PPVT3,Rec_RCh_NDSR02_C,Ess_CHB_PPA7a_ECPR,Ess_CWB_PGH7_Ped,Ess_CNH_SRS2_Pr,Ess_CHB_PPA8a_Ped,Ess_HHx_NNNS
9,Ess_CNH_ASQ_18,Rec_RCh_MCHATR,Ess_HSE_PFR4a_PP,Ess_CNP_ACEBRFSS,Ess_CNH_DAS2,Ess_ADM_REVT_S,Ess_CSH_SHInf


# Data from Maternal and Nutrition Paper
https://www.sciencedirect.com/science/article/pii/S2475299123266032?via%3Dihub#kwrds0015

In [22]:
forms_list = ['Rec_RCh_ASA_TNS', 'Rec_RCh_ASA_Totals', 'Rec_RCh_ASA_TS', 'Ess_Prg_DHQ3', 'Ess_CHB_BLOCK', 'Ess_CHB_BLOCK2','Ess_CHB_BLOCK3', 'Ess_Prg_DSQ_SR', 'Ess_CHB_DSQ_SR2', 'Ess_CHB_DSQ_PR', 'Ess_Prg_MFSP', 'Ess_CHB_CFSP', 'Ess_CHB_IFP', 'Ess_CHB_CFH', 'Ess_Prg_MMRA', 'Ess_Prg_MMRA2', 'Ess_Prg_MMRAaG', 'Ess_Prg_MSupp_PP', 'Ess_Prg_MSupp_PI', 'Ess_Prg_MSupp_R', 'Ess_Prg_MSuppSF_PP', 'Ess_Prg_MSuppSF_PI', 'Ess_Prg_MSuppSF_R']

In [23]:
print(len(forms_list))

23


In [24]:
# Get a list of form csv files
specific_form_files = [form + '.csv' for form in forms_list]
print(specific_form_files)

['Rec_RCh_ASA_TNS.csv', 'Rec_RCh_ASA_Totals.csv', 'Rec_RCh_ASA_TS.csv', 'Ess_Prg_DHQ3.csv', 'Ess_CHB_BLOCK.csv', 'Ess_CHB_BLOCK2.csv', 'Ess_CHB_BLOCK3.csv', 'Ess_Prg_DSQ_SR.csv', 'Ess_CHB_DSQ_SR2.csv', 'Ess_CHB_DSQ_PR.csv', 'Ess_Prg_MFSP.csv', 'Ess_CHB_CFSP.csv', 'Ess_CHB_IFP.csv', 'Ess_CHB_CFH.csv', 'Ess_Prg_MMRA.csv', 'Ess_Prg_MMRA2.csv', 'Ess_Prg_MMRAaG.csv', 'Ess_Prg_MSupp_PP.csv', 'Ess_Prg_MSupp_PI.csv', 'Ess_Prg_MSupp_R.csv', 'Ess_Prg_MSuppSF_PP.csv', 'Ess_Prg_MSuppSF_PI.csv', 'Ess_Prg_MSuppSF_R.csv']


In [39]:
# Set directories with the forms 
forms_directory = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/Data_CSVDownloaded/02_forms'
PtReg ='/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/Data_CSVDownloaded/01_research/ptReg.csv'

In [40]:
specific_form_file_paths = [os.path.join(forms_directory, file) for file in os.listdir(forms_directory) if file.endswith('.csv') and file in specific_form_files]
len(specific_form_file_paths)

21

In [41]:
# Add pt_reg file to specific forms files as first file
specific_form_file_paths.insert(0, PtReg)
len(specific_form_file_paths)

22

In [42]:
combined_df, merged_files, files_count= combine_echo_files(specific_form_file_paths)

2024-05-19 20:10:13,253 - Reading /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/Data_CSVDownloaded/01_research/ptReg.csv - ('_/Ess_Prg_MMRA', '_/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/Data_CSVDownloaded/01_research/ptReg')
2024-05-19 20:10:13,254 - Files processed: 1
2024-05-19 20:10:13,377 - Reading /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/Data_CSVDownloaded/02_forms/Ess_CHB_BLOCK2.csv - ('_/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/Data_CSVDownloaded/01_research/ptReg', '_/Ess_CHB_BLOCK2')
2024-05-19 20:10:47,261 - Files processed: 2
2024-05-19 20:10:47,371 - Reading /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/Data_CSVDownloaded/02_forms/Ess_CHB_CFSP.csv - ('_/Ess_CHB_BLOCK2', '_/Ess_CHB_CFSP')
2024-05-19 20:12:01,980 - Files processed: 3
2024-05-19 20:12:02,001 - Reading /Users/faith/D

In [44]:
# Export to csv
combined_df.to_csv('/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/ECHO/data_dump/raw_combined_batches/maternal_child_nutrition')

In [47]:
combined_df.head(2)

Unnamed: 0,xParticipantID,xOthPtID1,ProtocolID,xCohortID,xSiteID,xCohortID_UH3,ExplicitRegistration,ParticipantType,ParticipationLevel,PregnancyNumber,...,mmra_f3h6_c,mmra_f3h7_c,mmra_f3h8_c,mmra_f3h9_c,mmra_sectionf_complete___1,mmra_setting,mmra_mode,mmra_version,mmra_language,MMRA_EmanifestSource
0,A10002-01-0,,EWCP_New,AAX06,,AAX06,0,P,2,1,...,,,,,,,,,,
1,A10670-01-0,,EWCP_New,AAX06,,AAX06,0,P,2,1,...,,,,,,,,,,


In [48]:
combined_df.shape

(63215, 5016)