In [None]:
# imports
import pandas as pd
import numpy as np
import os
import hashlib

## 1. Extract Patient List

**Note:** The final dataframe will contain a list of patients who have completed at least one industry-standard assessment *(WHO, GAD, PHQ, PTSD, DERS, DERS2)*.

In [None]:
# location of the aggregated assessment reports
folder_path = '/Users/bk/Desktop/exist_centers/aggregated_reports'

# one aggregated report per assessment type
csv_files = ['who_aggregated.csv', 'gad_aggregated.csv', 'phq_aggregated.csv',
             'ptsd_aggregated.csv','ders_aggregated.csv','ders2_aggregated.csv']

# read only the 'Patient' column from every report
df_list = [
    pd.read_csv(os.path.join(folder_path, report_name), usecols=['Patient'])
    for report_name in csv_files
]

# stack all reports into one frame, then keep each distinct 'Patient' value once
df_final = pd.concat(df_list, ignore_index=True).drop_duplicates()

In [None]:
# inspect the de-duplicated list of raw 'Patient' strings
df_final

In [None]:
# Derive three columns from the raw 'Patient' string:
#   first_name - leading run of word characters, lower-cased
#   last_name  - second run of word characters (after exactly one space), lower-cased
#   patient_ID - first token shaped like two capitals, four digits, then digits (XX-0000-0...)
# Rows where a pattern does not match get NaN in that column.
patient_raw = df_final['Patient']
df_final['first_name'] = patient_raw.str.extract(r'^(\w+)', expand=False).str.lower()
df_final['last_name'] = patient_raw.str.extract(r'^\w+ (\w+)', expand=False).str.lower()
df_final['patient_ID'] = patient_raw.str.extract(r'([A-Z]{2}-\d{4}-\d+)', expand=False)

In [None]:
# inspect the frame with the derived first_name, last_name and patient_ID columns
df_final

In [None]:
# number of rows in the patient list
df_final.shape[0]

In [None]:
# number of distinct patient IDs (a missing ID counts as one distinct value)
df_final['patient_ID'].nunique(dropna=False)

In [None]:
# number of distinct last names (a missing name counts as one distinct value)
df_final['last_name'].nunique(dropna=False)

In [None]:
# number of distinct first names (a missing name counts as one distinct value)
df_final['first_name'].nunique(dropna=False)

In [None]:
# rows whose last name also appears on at least one other row, ordered by last name
df_final.loc[df_final['last_name'].duplicated(keep=False)].sort_values(by='last_name')

In [None]:
# rows whose (first_name, last_name) pair occurs more than once, ordered by last name
df_final.loc[
    df_final.duplicated(subset=['first_name', 'last_name'], keep=False)
].sort_values(by='last_name')

In [None]:
# distinct patient IDs minus the number of rows that repeat an earlier
# (first_name, last_name) pair
n_patient_ids = df_final['patient_ID'].nunique(dropna=False)
n_repeat_names = int(df_final.duplicated(subset=['first_name', 'last_name']).sum())
n_patient_ids - n_repeat_names

In [None]:
# full_name: first and last name joined by a single space
# (NaN when either part is missing)
df_final["full_name"] = df_final["first_name"].str.cat(df_final["last_name"], sep=" ")

In [None]:
# inspect the frame now that full_name has been added
df_final

## 2. Pseudonymization

**Note:** Here, we pseudonymize patients using their patient MR/ID number and give each of them an anonymized 12-character `group_identifier`.

In [None]:
# pseudonymization
# NOTE(review): the key is hard-coded in the notebook. Changing it changes every
# group_identifier produced below, which the later merge with the intake file
# presumably depends on -- move it to an environment variable / secret store
# before sharing this notebook.
SECRET_KEY = "THIS IS A SECRET!"

# cache of patient_id -> pseudonym, so each ID is hashed only once
name_mapping = dict()

def pseudonymize_function(patient_id):
    """Return the 12-character pseudonym for a patient MR/ID string.

    The pseudonym is the first 12 hex characters of
    SHA-256(SECRET_KEY + patient_id). Results are memoised in ``name_mapping``.

    Parameters
    ----------
    patient_id : str or None/NaN
        The patient MR/ID number.

    Returns
    -------
    str or None
        The 12-character hex pseudonym, or ``None`` when ``patient_id`` is
        missing (``None``/NaN). Missing IDs are not cached.
    """
    # A missing ID cannot be concatenated with the key; report "no pseudonym"
    # instead of raising TypeError.
    if patient_id is None or pd.isna(patient_id):
        return None

    if patient_id not in name_mapping:
        hash_input = (SECRET_KEY + patient_id).encode()
        # keep only the first 12 hex characters of the digest
        name_mapping[patient_id] = hashlib.sha256(hash_input).hexdigest()[:12]
    return name_mapping[patient_id]

# apply pseudonymization function: rows without a patient ID get None
def _pseudonymize_or_none(pid):
    """Pseudonymize a present ID; pass missing IDs through as None."""
    if pd.notnull(pid):
        return pseudonymize_function(pid)
    return None

df_final['group_identifier'] = df_final['patient_ID'].map(_pseudonymize_or_none)

In [None]:
# inspect the frame with the new group_identifier column
df_final

In [None]:
# # save full patient list CSV
# df_final.to_csv('patient_list.csv', index=False)

## 3. Handling Duplicate Patient Names

**Note:** We identified 3 patients who have multiple patient MR/ID numbers. Going forward, we will use the initial group identifier, which assigns a single unique ID to each patient. The resulting dataframe will allow us to correctly map each patient to their assessments in the data wrangling notebook.

In [None]:
# df_final[df_final[['first_name','last_name']].duplicated(keep=False)].sort_values(by=['last_name'])

In [None]:
# load patient_intake_anon.csv
# Both identifier columns are read explicitly as text: group_identifier is the
# join key matched against the hex-digest strings in df_final, and a digest
# made up only of digits (e.g. '216266668847') must not be type-inferred as a
# number. Missing values still load as NaN.
df_intake = pd.read_csv(
    '/Users/bk/Desktop/exist_centers/data/patient_intake_anon.csv',
    dtype={'group_identifier': str, 'initial_group_identifier': str},
)

In [None]:
# preview the first 10 rows of the intake data
df_intake.head(10)

In [None]:
# patients whose (first_name, last_name) pair appears on more than one row,
# ordered by last name
df_final.loc[
    df_final.duplicated(subset=['first_name', 'last_name'], keep=False)
].sort_values(by='last_name')

In [None]:
# preview the first rows of the intake data (default: 5)
df_intake.head()

In [None]:
# Attach each patient's initial_group_identifier from the intake data.
# The lookup is cleaned first so the left join cannot fan out patient rows:
#   - rows with a missing group_identifier are dropped, because pandas matches
#     NaN keys to each other and would pair them with patients that have no ID;
#   - exact duplicate (group_identifier, initial_group_identifier) pairs are
#     collapsed to one row.
intake_lookup = (
    df_intake[['group_identifier', 'initial_group_identifier']]
    .dropna(subset=['group_identifier'])
    .drop_duplicates()
)
df_merge = df_final.merge(intake_lookup, on='group_identifier', how='left')

In [None]:
# inspect the patient list with initial_group_identifier attached
df_merge

In [None]:
# Manual fix-up: this patient's initial_group_identifier is set to their own
# group_identifier (presumably because the intake data has no entry for them
# -- TODO confirm).
# The value is assigned as a string, matching the string it is compared
# against: group_identifier values are 12-character hex strings, so writing an
# int here would leave mixed types in the identifier column.
self_mapped_id = '216266668847'
df_merge.loc[df_merge['group_identifier'] == self_mapped_id, 'initial_group_identifier'] = self_mapped_id

In [None]:
# inspect the merged frame after the manual initial_group_identifier assignment
df_merge

In [None]:
# rows of the merged frame whose full_name occurs more than once, ordered by full_name
df_merge.loc[df_merge['full_name'].duplicated(keep=False)].sort_values(by='full_name')

## 4. Save Final Patient List Data Frame

In [None]:
# # save patient list version 2
# df_merge.to_csv('patient_listv2.csv', index=False)