In [241]:
import pandas as pd
import os
import logging
import numpy as np

In [242]:
# Configure the root logger: timestamped messages, INFO level and above
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
)

In [243]:
# Folder holding the downloaded CSV forms. Only regular files directly inside it
# are listed; subdirectories are excluded.
# NOTE(review): os.listdir returns entries in arbitrary order, so positions in
# file_paths are not guaranteed to be stable between machines or runs.
directory = '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/'
file_paths = list(
    filter(
        os.path.isfile,
        (os.path.join(directory, entry_name) for entry_name in os.listdir(directory)),
    )
)

In [244]:
# PtReg.csv carries all participant registrations and serves as the merge base,
# so it has to be the first entry of file_paths.
PtReg = os.path.join(directory, 'PtReg.csv')
file_paths.remove(PtReg)      # raises ValueError if PtReg.csv is not in the folder
file_paths.insert(0, PtReg)
len(file_paths)

340

In [237]:
# Define the aggregation functions
def custom_agg(series):
    """Join the non-missing values of ``series`` into one ', '-separated string.

    Missing values are dropped first; the remaining values are converted to
    strings with ``astype(str)``. An all-missing or empty series yields ''.
    """
    present = series.dropna()
    as_text = present.astype(str)
    return ', '.join(as_text)

def list_agg(series):
    """Return the non-missing values of ``series`` as a plain Python list."""
    present = series.dropna()
    return [value for value in present]

# Key columns: every form is grouped by, and merged on, this pair
group_columns = [
    'xCohortID',
    'xParticipantID',
]
# Columns removed from each form before it is aggregated and merged
drop_columns = [
    'ProtocolID',
    'xSiteID',
]

def aggregate_data(grouping_columns, grouping_df):
    """Collapse ``grouping_df`` to one row per combination of ``grouping_columns``.

    Columns whose dtype is ``float64`` or ``int64`` are aggregated with
    ``list_agg`` (list of non-missing values); every other non-key column is
    aggregated with ``custom_agg`` (', '-joined string of non-missing values).

    Parameters
    ----------
    grouping_columns : list of str or str
        Column name(s) to group by. These are the keys actually used for the
        grouping (previously the module-level ``group_columns`` was used
        regardless of this argument).
    grouping_df : pandas.DataFrame
        Frame to aggregate. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per key combination, with the key columns as ordinary columns
        followed by the aggregated columns.

    Raises
    ------
    KeyError
        If a grouping column is missing from ``grouping_df``.
    """
    # Accept a single column name as well as a list of names
    if isinstance(grouping_columns, str):
        keys = [grouping_columns]
    else:
        keys = list(grouping_columns)

    # Pick the aggregation for every non-key column based on its dtype
    agg_dict = {}
    for col in grouping_df.columns:
        if col in keys:
            continue  # key columns are grouped on, not aggregated
        if grouping_df[col].dtype in [np.float64, np.int64]:
            agg_dict[col] = list_agg
        else:
            agg_dict[col] = custom_agg

    # Nothing to aggregate (frame holds only key columns): return the distinct
    # key combinations, sorted and without missing keys as groupby would give.
    if not agg_dict:
        return (
            grouping_df[keys]
            .dropna()
            .drop_duplicates()
            .sort_values(keys)
            .reset_index(drop=True)
        )

    grouped_df = grouping_df.groupby(keys).agg(agg_dict)

    # Flatten tuple column labels, if any, into '_'-joined strings
    grouped_df.columns = [
        '_'.join(col) if isinstance(col, tuple) else col
        for col in grouped_df.columns.values
    ]

    # Turn the group keys back into regular columns
    return grouped_df.reset_index()
    
def combine_echo_files(files):
    """Left-merge a batch of form CSVs onto the first CSV in ``files``.

    The first path ending in ``.csv`` is read and used unchanged as the base
    table. Every later ``.csv`` has ``drop_columns`` removed, is collapsed to
    one row per ``group_columns`` key by ``aggregate_data``, and is then
    left-merged onto the base on ``group_columns``. Paths that do not end in
    ``.csv`` are skipped.

    Parameters
    ----------
    files : sequence of str
        File paths; the base file (PtReg in this notebook) is expected first.

    Returns
    -------
    tuple
        ``(base_df, merged_files, file_count)`` where

        * ``base_df`` is the combined DataFrame (empty if no CSV was found),
        * ``merged_files`` lists the CSV paths merged onto the base; the base
          file itself is left out so it can lead the next batch, and skipped
          non-CSV entries are not listed,
        * ``file_count`` is the number of entries of ``files`` examined,
          including skipped ones.
    """
    base_df = None               # None = no base table read yet
    previous_form_name = None    # form name of the CSV handled just before
    file_count = 0
    merged_files = []

    for file in files:
        file_count += 1

        if not file.endswith('.csv'):
            logging.info('Skipping non-CSV file %s', file)
            logging.info('Files processed: %s', file_count)
            continue

        temp_df = pd.read_csv(file, low_memory=False)
        # Form name = file name without folder and extension
        temp_form_name = os.path.splitext(os.path.basename(file))[0]

        if base_df is None:
            # First CSV: becomes the base; no merge, hence no suffixes
            logging.info('Reading %s as the base table', file)
            base_df = temp_df
        else:
            # Suffixes only affect non-key column names present on both sides.
            # NOTE(review): the left suffix names the previously read form,
            # although a clashing base column may come from any earlier file.
            suffixes = (f'_{previous_form_name}', f'_{temp_form_name}')
            logging.info('Reading %s - %s', file, suffixes)

            # Remove columns that are repeated and/or not relevant
            temp_df = temp_df.drop(columns=drop_columns, errors='ignore')

            # One row per key so the left merge cannot duplicate base rows
            temp_df = aggregate_data(group_columns, temp_df)

            base_df = pd.merge(
                base_df,
                temp_df,
                on=list(group_columns),
                how='left',
                suffixes=suffixes,
            )
            merged_files.append(file)

        previous_form_name = temp_form_name
        logging.info('Files processed: %s', file_count)

    if base_df is None:
        base_df = pd.DataFrame()
    return base_df, merged_files, file_count

In [238]:
# Combine the first three entries of file_paths; unpack the merged table,
# the list of merged files and the processed-file count.
combined_df, merged_files, files_count= combine_echo_files(file_paths[0:3]) 

2024-04-18 22:04:57,436 - Reading /Users/faith/Desktop/ECHO/raw_downloaded/csv_files/PtReg.csv - ('_Ess_HSE_HOME_EC', '_PtReg')
2024-04-18 22:04:57,437 - Files processed: 1
2024-04-18 22:04:57,481 - Reading /Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CNH_SDQ_4.csv - ('_PtReg', '_Ess_CNH_SDQ_4')
2024-04-18 22:05:11,794 - Files processed: 2
2024-04-18 22:05:11,812 - Reading /Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_HSE_HOME_EC.csv - ('_Ess_CNH_SDQ_4', '_Ess_HSE_HOME_EC')
2024-04-18 22:05:17,047 - Files processed: 3


In [240]:
# Write the combined table to a CSV in the current working directory, without the index column
combined_df.to_csv('combined_echo_data_3.csv', index=False)

In [221]:
# (rows, columns) of the combined table
combined_df.shape

(63215, 159)

In [192]:
# Keep a second name for the current merged-file list (same list object, not a copy)
# NOTE(review): the _30 suffix presumably refers to a 30-file batch — confirm
merged_files_30 = merged_files

In [194]:
# Show the saved list of merged file paths
merged_files_30

['/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CNH_SDQ_4.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_HSE_HOME_EC.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Rec_RCh_TSR_6_12.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Rec_RCh_SESC_6_12.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CWB_PGLS5a_A.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CHB_BLOCK2.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Rec_RCh_NDSR04_C.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CPH_PDS.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CNP_PPVT3a.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CNH_ASQ_18.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CHB_CFSP.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Rec_RCh_CPrU_PR.csv',
 '/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CNH_ASQ_24.csv',
 '/Users/faith/Desktop/ECHO/raw_d

In [246]:
# Position of Ess_CPH_CG_CAPE.csv within file_paths
file_paths.index('/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CPH_CG_CAPE.csv')

98

In [248]:
# Entry of file_paths at index 99
file_paths[99]

'/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_Dem_GI.csv'

In [251]:
# Entry of file_paths at index 149
file_paths[149]

'/Users/faith/Desktop/ECHO/raw_downloaded/csv_files/Ess_CNH_ASQ_54.csv'

Note: the file Rec_RCh_NDSR02_C.csv (file number 54) is large and takes a long time to process — over an hour on local computer resources.