In [None]:
# Goal: merge MUSE cache with the MUSE-EDW map to get a PatientID for each patient in MUSE. 
%load_ext autoreload
%autoreload 2
import pandas as pd
import sys
import os
from tqdm.notebook import tqdm

sys.path.append('../src/')

from ecg_feature_names import (
    xml_file_to_name_dob,
    map_xml_path_to_afib
)
from ecg_preprocessing_fs import create_name_dob_hash

In [None]:
# Parallel processing setup
# Starts a local ipyparallel cluster and builds the view (`dview`) that the
# cache-building cell uses to fan per-file XML parsing out across engines.
# NOTE(review): nothing in the visible cells stops the cluster again — confirm
# it is shut down elsewhere once the scan is finished.
import ipyparallel as ipp

n_engines = 6  # number of engines requested from the cluster
cluster = ipp.Cluster(n=n_engines)
cluster.start_cluster_sync()
rc = cluster.connect_client_sync()
rc.wait_for_engines(n_engines)  # wait for all requested engines before building the view
dview = rc[:]  # view spanning every engine; used below through dview.map_sync

In [None]:
# Create a dataframe that maps MRN to filepath
def process_file_for_muse_cache(fname, name_dob_f, afib_f):
    """Build one cache record for a single MUSE XML file.

    Args:
        fname: '/'-separated path to the file. The MRN is taken from the
            second '_'-separated token of the file name.
        name_dob_f: callable taking ``fname`` and returning a dict of
            patient fields; the 'afib' entry is added to that dict in place.
        afib_f: callable taking ``fname`` and returning the value stored
            under 'afib'.

    Returns:
        dict with 'muse_mrn' and 'path' first, followed by every entry of the
        dict returned by ``name_dob_f`` (including 'afib').

    Raises:
        IndexError: if the file name contains no '_' separator.

    NOTE(review): the parsers are received as arguments rather than looked up
    as globals — presumably so this function stays self-contained when it is
    shipped to ipyparallel engines; keep it free of notebook-level names.
    """
    file_name = fname.rsplit('/', 1)[-1]
    muse_mrn = file_name.split('_')[1]

    demographics = name_dob_f(fname)
    demographics['afib'] = afib_f(fname)

    # Entries from the parser come last, so they win on a key collision.
    return {'muse_mrn': muse_mrn, 'path': fname, **demographics}

# --- Scan configuration ---
# `fpath` is the final cache CSV for the scanned range; keep it in sync with
# start_year / end_year (the alternatives for other ranges are left commented out).
fpath = '../outputs_intermediate/muse_cache_files/patient_mrn_to_file.csv' # 2016-2019
# fpath = '../outputs_intermediate/muse_cache_files/patient_mrn_to_file_2020_2021_2022.csv' # 2020-2022
# fpath = '../outputs_intermediate/muse_cache_files/patient_mrn_to_file_2023_2024.csv' # 2023-2024

start_year = 2016
end_year = 2019
muse_dir = "/run/user/1419529909/gvfs/smb-share:server=fs.analyticsenclave.org,share=ecgadult/muse_adult/"

# --- Walk <muse_dir>/<hospital>/<year>/<month>/<file> and parse every file ---
hospitals = os.listdir(muse_dir)
patient_mrn_to_file_df = []  # list of per-file record dicts; converted to a DataFrame at the end
for hosp in tqdm(hospitals):
    if hosp == 'CSV_FILES':
        continue

    print(hosp)

    hosp_dir = muse_dir + hosp
    years = os.listdir(hosp_dir)
    print(years)

    # Keep only numeric year directories inside [start_year, end_year]; the
    # isdigit() guard stops a stray non-numeric entry from crashing int().
    years = [y for y in years if y.isdigit() and start_year <= int(y) <= end_year]
    print(years)
    for year in tqdm(years):
        print(year)
        year_dir = hosp_dir + '/' + year
        for month in os.listdir(year_dir):
            print(month)
            month_dir = year_dir + '/' + month
            fnames = [month_dir + '/' + fname for fname in os.listdir(month_dir)]
            if not fnames:
                # Nothing to parse in this month directory.
                continue
            # Parse the month's files in parallel. The two parser functions are
            # passed once per file because map_sync zips its argument sequences.
            cache = dview.map_sync(
                process_file_for_muse_cache,
                fnames,
                [xml_file_to_name_dob] * len(fnames),
                [map_xml_path_to_afib] * len(fnames),
            )
            patient_mrn_to_file_df.extend(cache)

    # Checkpoint after each hospital. The file holds every record collected so
    # far (not just this hospital's) and is labelled with the configured year
    # range so runs over different ranges do not overwrite each other.
    pd.DataFrame(patient_mrn_to_file_df).to_csv(f'{hosp}_muse_cache_{start_year}_{end_year}.csv')
patient_mrn_to_file_df = pd.DataFrame(patient_mrn_to_file_df)

# Clean MUSE mrns
def remove_ME(x):
    """Return ``str(x)`` with a leading 'ME' prefix removed, if present.

    Any input is first converted with ``str``; values that do not start with
    'ME' are returned as that string unchanged.
    """
    mrn_text = str(x)
    return mrn_text[2:] if mrn_text.startswith('ME') else mrn_text

def get_day(path):
    """Extract the day of month from a MUSE file path.

    The day is read from characters 6-7 of the second-to-last '_'-separated
    token of the file name (e.g. '..._20160315_101010.xml' -> 15).

    Args:
        path: '/'-separated path to the file.

    Returns:
        The day as an int, or None when the file name does not have that
        layout. The offending path is printed so it can be inspected.
    """
    try:
        return int(path.split('/')[-1].split('_')[-2][6:8])
    except (AttributeError, IndexError, ValueError):
        # Previously this fell through to `return day` with `day` unbound,
        # raising UnboundLocalError and hiding the real parsing problem.
        print(path)
        return None

# Derive location/date columns from each file's path. Paths are built as
# <muse_dir><hospital>/<year>/<month>/<file>, so the segments are indexed from
# the END of the path: this yields the same values as the former absolute
# indices 7/8/9 for the current muse_dir, and keeps working if the share is
# mounted at a different depth.
path_parts = patient_mrn_to_file_df['path'].str.split('/')
patient_mrn_to_file_df['year'] = path_parts.str[-3].astype(int)
patient_mrn_to_file_df['month'] = path_parts.str[-2].astype(int)
patient_mrn_to_file_df['ecg_location'] = path_parts.str[-4]
patient_mrn_to_file_df['day'] = patient_mrn_to_file_df['path'].apply(get_day)
# Name + date-of-birth hash; the same helper is applied to the EDW map later so
# the two tables can be joined on it.
patient_mrn_to_file_df['UniqueID'] = create_name_dob_hash(patient_mrn_to_file_df, 'PatientFirstName', 'PatientLastName', 'DateofBirth')
patient_mrn_to_file_df.to_csv(fpath)

### Merge MUSE cache with MUSE-EDW Map

In [None]:
from ecg_preprocessing_fs import create_name_dob_hash

# Load the per-range cache files (all columns as strings) and stack them.
# ignore_index gives the combined frame a unique RangeIndex instead of
# repeating each file's own row numbers.
muse_cache_df = pd.concat(
    [
        pd.read_csv('../outputs_intermediate/muse_cache_files/patient_mrn_to_file.csv', dtype='str'),
        pd.read_csv('../outputs_intermediate/muse_cache_files/patient_mrn_to_file_2020_2021_2022.csv', dtype='str'),
    ],
    ignore_index=True,
)

print("MUSE unique locations: ",muse_cache_df['ecg_location'].unique())
print("Muse cache df columns: ",muse_cache_df.keys())
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and does not update
# the frame under pandas Copy-on-Write.
muse_cache_df['PatientFirstName'] = muse_cache_df['PatientFirstName'].fillna('nan')

# Recompute the name + date-of-birth hash after filling missing first names.
muse_cache_df['UniqueID'] = create_name_dob_hash(muse_cache_df, 'PatientFirstName', 'PatientLastName', 'DateofBirth')

print(len(muse_cache_df))
# Collapse to one row per (UniqueID, year).
muse_patients = muse_cache_df.groupby(['UniqueID', 'year']).first().reset_index()
print(len(muse_patients))

# Remove people with dates of birth on January 1, 1900 and January 1 1901
# it's not the age that's suspicious, it's that there are so many people born on these exact days
# (around 500 total)
print(len(muse_patients))
muse_patients = muse_patients[~muse_patients['DateofBirth'].isin(['01-01-1900', '01-01-1901'])]
print(len(muse_patients))

In [None]:
# Read in MUSE-EDW map (provided by Annabel and Brianna)
muse_edw_map = pd.read_csv('muse_edw_map.csv')
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and does not update
# the frame under pandas Copy-on-Write. Missing first names get the same 'nan'
# placeholder that is applied to the MUSE cache.
muse_edw_map['PatientFirstName'] = muse_edw_map['PatientFirstName'].fillna('nan')
muse_edw_map = muse_edw_map[['PatientID', 'PatientFirstName', 'PatientLastName', 'DOB']].drop_duplicates()
# NOTE(review): this file is read without dtype='str', unlike the MUSE cache —
# confirm 'DOB' arrives in the same textual format as MUSE 'DateofBirth',
# otherwise the hashes will not line up.
muse_edw_map['UniqueID'] = create_name_dob_hash(muse_edw_map, 'PatientFirstName', 'PatientLastName', 'DOB')
muse_edw_map = muse_edw_map[['UniqueID','PatientID']]
print("# of Unique patients in MUSE 2016-2022 inclusive: ", muse_patients['UniqueID'].nunique())

# Merge MUSE cache with MUSE-EDW map to get a PatientID for each patient in MUSE. 
muse_patients = muse_patients[['UniqueID', 'PatientFirstName', 'PatientLastName', 'DateofBirth', 'year']]

# Keep only MUSE patients that appear in the map, then attach PatientID.
# A UniqueID that maps to several PatientIDs yields several rows; the two
# prints below (unique IDs vs. row count) expose that.
muse_patients = muse_patients[muse_patients['UniqueID'].isin(muse_edw_map['UniqueID'])]
muse_to_patient_id_map = pd.merge(muse_patients, muse_edw_map, on='UniqueID', how='inner')
print(muse_to_patient_id_map['UniqueID'].nunique())
print(len(muse_to_patient_id_map))
muse_to_patient_id_map.to_csv('../outputs_intermediate/all_muse_to_patient_id.csv')

# Read the file back to check what was actually written.
muse_to_patient_id_map = pd.read_csv('../outputs_intermediate/all_muse_to_patient_id.csv')
print("# of unique patients: ", muse_to_patient_id_map['UniqueID'].nunique())