In [None]:
# imports
import pandas as pd
import os
from glob import glob
import hashlib

## 1. Merging Individual Assessments

**Note:** Each folder consists of all individual patient CSV files for a particular assessment. Each raw CSV file contains records that have a different query/question at a particular date in time. The resulting post-processed dataframes will merge all individual CSV files (for a particular assessment) and process them so that, at a specific date in time, we have the responses to each question within a single record.

In [None]:
# root folder that holds one sub-folder of per-patient CSV files per assessment
DATA_ROOT = '/Users/bk/Desktop/exist_centers/data'

# assessments that have a sub-folder under DATA_ROOT
ASSESSMENTS = ('who', 'gad', 'phq', 'ptsd', 'ders', 'ders2')

# assessment to process in this run -- change this single value instead of
# commenting/uncommenting a separate path for each assessment
ASSESSMENT = 'who'

# catch a typo here rather than ending up with an empty file list later on
if ASSESSMENT not in ASSESSMENTS:
    raise ValueError(f"Unknown assessment {ASSESSMENT!r}; expected one of {ASSESSMENTS}")

# folder path for the selected assessment's CSV files
folder_path = f'{DATA_ROOT}/{ASSESSMENT}'

In [None]:
# get all CSV files in folder; whether glob returns its matches sorted depends
# on the file system, so sort them to process the files in a reproducible order
csv_files = sorted(glob(os.path.join(folder_path, '*.csv')))

In [None]:
# read every CSV into its own data frame, tagging each row with the name of
# the file it was read from
df_list = [
    pd.read_csv(csv_path).assign(file_name=os.path.basename(csv_path))
    for csv_path in csv_files
]

# stack the per-file frames into a single frame with a fresh 0..n-1 index
combined_df = pd.concat(df_list, ignore_index=True)

In [None]:
# preview the first 5 rows of the stacked raw (un-pivoted) data
combined_df.head(5)

In [None]:
# question-related columns; every other column of a raw CSV is treated as an
# assessment date. The list is the same for every file, so define it once.
fixed_columns = ['question', 'code', 'issue', 'issue_code', 'file_name']

# fail early with an actionable message: pd.concat would otherwise raise a
# generic "No objects to concatenate" ValueError on an empty list
if not csv_files:
    raise ValueError('No CSV files to process: csv_files is empty, check folder_path')

# initialize empty list to store data frames
df_list = list()

# loop thru CSV files and process
for file_path in csv_files:
    df = pd.read_csv(file_path)

    # add a new column w/ file name
    df['file_name'] = os.path.basename(file_path)

    # name the offending file when a required column is absent; melt would
    # raise a KeyError too, but without saying which file caused it
    missing = [col for col in fixed_columns if col not in df.columns]
    if missing:
        raise KeyError(f"{os.path.basename(file_path)} is missing required column(s): {missing}")

    # identify date columns
    date_columns = [col for col in df.columns if col not in fixed_columns]

    # reshape using pd.melt so that each date becomes a row
    df_melted = df.melt(id_vars=fixed_columns,
                        value_vars=date_columns,
                        var_name='assessment_date',
                        value_name='response')

    # add to list
    df_list.append(df_melted)

# merge all data frames
df_combined = pd.concat(df_list, ignore_index=True)

# pivot so that question types become columns: one row per
# (file_name, assessment_date); aggfunc='first' picks a single response when
# the same question occurs more than once for that file and date
df_final = df_combined.pivot_table(index=['file_name', 'assessment_date'],
                                   columns='question',
                                   values='response',
                                   aggfunc='first').reset_index()

# drop the 'question' label that the pivot leaves on the column axis
df_final.columns.name = None

In [None]:
# preview the first 20 rows of the pivoted data (one row per file and assessment date)
df_final.head(20)

In [None]:
def _file_name_part(file_name, position):
    """Return the underscore-separated piece of *file_name* at *position*.

    Returns None when the name has fewer than four underscore-separated
    pieces.
    """
    pieces = file_name.split('_')
    return pieces[position] if len(pieces) >= 4 else None


file_names = df_final['file_name']

# pieces 1 and 2 of the file name are used as the first and last name
first_name = file_names.apply(_file_name_part, args=(1,))
last_name = file_names.apply(_file_name_part, args=(2,))

# full name: combine first and last name
df_final['full_name'] = first_name + " " + last_name

# piece 3 of the file name is stored as the date part
df_final['date_part'] = file_names.apply(_file_name_part, args=(3,))

In [None]:
# preview the first 20 rows, now including the full_name and date_part columns
df_final.head(20)

## 2. Merging with Full Patient List

**Note:** Here, we will merge our post-processed dataframe with the full patient list, which contains each patient's full name, MR/ID #, group identifier, and initial group identifier.

In [None]:
# load full patient version 2 CSV file
df_patient = pd.read_csv('/Users/bk/Desktop/exist_centers/data/patient_listv2.csv')

# The merge below is an inner join, so assessment rows whose full_name does
# not appear in the patient list are dropped. Report how many rows that is
# before they disappear, so name mismatches do not go unnoticed. Only counts
# are printed -- no names.
unmatched = ~df_final['full_name'].isin(df_patient['full_name'])
if unmatched.any():
    n_files = df_final.loc[unmatched, 'file_name'].nunique()
    print(f"WARNING: {unmatched.sum()} assessment row(s) from {n_files} file(s) have no "
          "matching full_name in the patient list and will be dropped by the merge.")

# merge dataframes to get patient ID (how='inner' is the pandas default,
# spelled out here so the row-dropping behaviour is visible)
patient_columns = ['full_name', 'patient_ID', 'group_identifier', 'initial_group_identifier']
df_final = df_final.merge(df_patient[patient_columns], on='full_name', how='inner')

# print top 5 rows
df_final.head(5)

In [None]:
# # pseudonymization 
# SECRET_KEY = "THIS IS A SECRET!"

# name_mapping = dict()

# def pseudonymize_function(patient_id):
#     if patient_id not in name_mapping:
#         hash_input = (SECRET_KEY + patient_id).encode()
#         hashed_value = hashlib.sha256(hash_input).hexdigest()[:12]
#         name_mapping[patient_id] = f"{hashed_value}"
#     return name_mapping[patient_id]

# # apply pseudonymization function
# df_final['group_identifier'] = df_final['patient_ID'].apply(lambda x: pseudonymize_function(x) if pd.notnull(x) else None)

# # # # reorder columns 
# # df_final = df_final.iloc[:, [0,8,10,9,1,2,3,4,5,6,7]]

# column_order = ["file_name", "group_identifier", "date_part", "assessment_date"] + \
#                [col for col in df_final.columns if col not in ["file_name", "group_identifier", "date_part", "assessment_date", "full_name"]]

# df_final = df_final[column_order]

**Comment:** The pseudonymization process was moved to the previous notebook *(01_extract_patient_list_notebook)*.

## 3. Removing Personally Identifiable Information (PII) columns

**Note:** This section will remove all PII data for safe and secure handoff to the rest of the team.

In [None]:
# PII columns to strip before the data is handed off
pii_columns = ['file_name', 'full_name', 'patient_ID']

# drop the PII columns, then rename date_part -> file_part for readability
df_anon = df_final.drop(columns=pii_columns).rename(columns={'date_part': 'file_part'})

In [None]:
# (rows, columns) of the frame after the PII columns were removed
df_anon.shape

In [None]:
# preview the first 10 rows to confirm no PII columns remain
df_anon.head(10)

## 4. Handling duplicate records due to multiple MR/ID #'s

**Note:** Because some patients have multiple MR/ID numbers, and we pseudonymized using the patient MR/ID #, the same patient can end up with multiple group identifiers. However, the initial group identifier column in the full patient list dataframe assigns a single unique ID to each patient. Therefore, we will drop the group identifier field and remove duplicate records.

In [None]:
# work on a copy without group_identifier, leaving initial_group_identifier
# as the only identifier column
df_anon2 = df_anon.drop('group_identifier', axis='columns')

In [None]:
# show up to 10 rows that are exact copies of another row
# (keep=False flags every member of a duplicate group, not just the repeats)
df_anon2[df_anon2.duplicated(keep=False)].head(10)

In [None]:
# put the identifier and date columns first for readability; every other
# column follows in its existing order
leading = ['initial_group_identifier', 'file_part', 'assessment_date']
cols = leading + [name for name in df_anon2.columns if name not in leading]
df_anon2 = df_anon2[cols]

In [None]:
# collapse identical rows, keeping the first occurrence of each
df_anon2 = df_anon2.drop_duplicates(keep='first')

In [None]:
# sanity check: list any rows that still have an exact duplicate
# (expected to be empty once duplicates have been dropped)
df_anon2[df_anon2.duplicated(keep=False)].head(10)

In [None]:
# (rows, columns) of the de-duplicated frame
df_anon2.shape

## 5. Save Final Anonymized Data Frames

In [None]:
# # save final merged CSV - WHO
# df_anon2.to_csv('who_merged.csv', index=False)

# # save final merged CSV - GAD
# df_anon2.to_csv('gad_merged.csv', index=False)

# # save final merged CSV - PHQ
# df_anon2.to_csv('phq_merged.csv', index=False)

# # save final merged CSV - PTSD
# df_anon2.to_csv('ptsd_merged.csv', index=False)

# # save final merged CSV - DERS
# df_anon2.to_csv('ders_merged.csv', index=False)

# # save final merged CSV - DERS2 
# df_anon2.to_csv('ders2_merged.csv', index=False)