## get all patient IDs

In [None]:
import os
import csv
import pandas as pd
import shutil
import pickle
import numpy as np
import warnings
warnings.filterwarnings('ignore')


obtain all involved samples

In [None]:

# Load the cohort's person ids. The context manager closes the file handle
# that a bare open() call would leave dangling.
# NOTE(review): pickle can execute arbitrary code on load; only read trusted files.
with open('personids.pkl', 'rb') as fh:
    personids = pickle.load(fh)

# basic input feature preprocessing


demographic information

In [None]:
import pickle
# Load the demographics table; `with` guarantees the file handle is closed.
# NOTE(review): pickle can execute arbitrary code on load; only read trusted files.
with open('NewHome/demographics.pkl', 'rb') as fh:
    demographics = pickle.load(fh)

# Remove fully duplicated rows before deriving any columns.
demographics.drop_duplicates(inplace=True)

def map_eth(x):
    """Translate an ethnicity concept id into a readable label.

    Returns 'Not Hispanic' for 38003564, 'Hispanic' for 38003563 and
    None for every other value.
    """
    labelled_concepts = (
        (38003564, 'Not Hispanic'),
        (38003563, 'Hispanic'),
    )
    for concept_id, label in labelled_concepts:
        if x == concept_id:
            return label
    return None
    
# Derive the readable 'Ethnicity' column from the concept ids; ids that
# map_eth does not recognise become None.
demographics['Ethnicity'] = demographics['ethnicity_concept_id'].apply(map_eth )


def map_gender(x):
    """Abbreviate a gender label to a single letter.

    Returns 'F' for 'Female', 'M' for 'Male' and None for every other value.
    """
    abbreviations = (
        ('Female', 'F'),
        ('Male', 'M'),
    )
    for label, letter in abbreviations:
        if x == label:
            return letter
    return None
# Derive the one-letter 'Gender' column; labels that map_gender does not
# recognise become None.
demographics['Gender'] = demographics['gender'].apply(map_gender )

# Bare expression: the notebook renders the first rows as the cell output.
demographics.head()

In [None]:
import numpy as np

# Keep only the demographic rows whose person_id is listed in `personids`.
demographics = demographics[demographics['person_id'].isin(personids)]
# Report (rows, columns) after the cohort filter.
print(demographics.shape)

In [None]:
# Build one de-duplicated table per demographic attribute:
#   csv0 -> birth date only, csv1 -> ethnicity, csv2 -> gender.
demographics_csv0 = demographics[['person_id', 'date_of_birth']].drop_duplicates()
demographics_csv1 = demographics[['person_id', 'date_of_birth', 'Ethnicity']].drop_duplicates()
demographics_csv2 = demographics[['person_id', 'date_of_birth', 'Gender']].drop_duplicates()

# Reduce the birth timestamps to calendar dates in all three tables.
for _table in (demographics_csv0, demographics_csv1, demographics_csv2):
    _table['date_of_birth'] = _table['date_of_birth'].dt.date

# Drop rows that lack the attribute each table is meant to carry.
demographics_csv0 = demographics_csv0.dropna(subset=['date_of_birth'])
demographics_csv1 = demographics_csv1.dropna(subset=['Ethnicity'])
demographics_csv2 = demographics_csv2.dropna(subset=['Gender'])

print('demographics_csv0', demographics_csv0.shape)

print('demographics_csv1', demographics_csv1.shape, 'demographics_csv2', demographics_csv2.shape)


In [None]:
# Bring all three tables onto the shared (patient_id, start, code) layout.
_event_columns = {'person_id': 'patient_id', 'date_of_birth': 'start'}

# Birth events carry a constant code.
demographics_csv0 = demographics_csv0.rename(columns=_event_columns)
demographics_csv0['code'] = 'Birth/Birth'

# Ethnicity and gender values become namespaced codes.
demographics_csv1 = demographics_csv1.rename(columns={**_event_columns, 'Ethnicity': 'code'})
demographics_csv1['code'] = 'Ethnicity/' + demographics_csv1['code']

demographics_csv2 = demographics_csv2.rename(columns={**_event_columns, 'Gender': 'code'})
demographics_csv2['code'] = 'Gender/' + demographics_csv2['code']


In [None]:
# Stack the birth, ethnicity and gender tables row-wise into a single table.
demographics_csv = pd.concat([demographics_csv0, demographics_csv1, demographics_csv2], axis=0)
# Bare expression: the notebook renders the table as the cell output.
demographics_csv

Mapping and vocabulary harmonization


In [None]:
# load definition code

# Definition codes for the condition of interest (ICD-10 diagnoses and
# RXCUI medications), as listed in the project's code table.
code = pd.read_csv('NewHome/ADRD_dx_med_codes.csv',  index_col=0)

code_icd10 = [c.strip() for c in code.loc[code.Code_type == 'ICD-10', 'Code']]
print('ICD code', len(code_icd10))

code_rxcui = [c.strip() for c in code.loc[code.Code_type == 'RXCUI', 'Code']]
print('RXCUI code', len(code_rxcui))

# SNOMED -> ICD-10-CM map, reduced to the two columns used below.
snomed_2_icd10 = (
    pd.read_csv('NewHome/csv_filter_mapping_snomed_to_Icd10cm.csv',  index_col=0)
    [['referencedComponentId', 'mapTarget']]
    .drop_duplicates()
)
print('Load mapping from snomed_2_icd10 and drop duplicates :', snomed_2_icd10.shape)

# Collect every SNOMED concept that maps to one of the ICD-10 definition
# codes. A single isin() mask replaces rescanning the whole mapping table
# once per ICD-10 code; the resulting set of concepts is the same.
_maps_to_definition = snomed_2_icd10['mapTarget'].isin(code_icd10)
code_snomed = [
    str(cid)
    for cid in snomed_2_icd10.loc[_maps_to_definition, 'referencedComponentId'].unique()
]

# Vocabulary-prefixed versions of the excluded codes.
exc_code_snomed = ['SNOMED/' + str(cid) for cid in code_snomed]
exc_code_rxcui = ['RxNorm/' + str(cid) for cid in code_rxcui]

# Sanity check: no prefixed code may contain a space.
assert not any(' ' in c for c in exc_code_snomed)
assert not any(' ' in c for c in exc_code_rxcui)

print('SNOMED code', len(exc_code_snomed))
print('RXCUI code', len(exc_code_rxcui))


 diagnosis information


In [None]:
# Load the diagnosis records; `with` guarantees the file handle is closed.
# NOTE(review): pickle can execute arbitrary code on load; only read trusted files.
with open('NewHome/dxdata.pkl', 'rb') as fh:
    dxdata = pickle.load(fh)

In [None]:
# Restrict the diagnosis records to the cohort.
_in_cohort = dxdata['person_id'].isin(set(personids))
dxmotor = dxdata[_in_cohort]
print('dxmotor (for the cohort)', dxmotor.shape)

# Keep the three event columns and move them onto the shared
# (patient_id, start, code) layout.
dxmotor = dxmotor[['person_id', 'condition_start_date', 'concept_code']].rename(
    columns={
        'person_id': 'patient_id',
        'condition_start_date': 'start',
        'concept_code': 'code',
    }
)

# Drop rows whose code is one of the SNOMED definition codes, so they
# cannot appear among the input features.
_is_definition_code = dxmotor['code'].isin(set(code_snomed))
dxmotor = dxmotor[~_is_definition_code]
print('dxmotor (remove adrd snomed codes)', dxmotor.shape)

# Namespace the remaining codes and remove repeated events.
dxmotor = dxmotor.assign(code='SNOMED/' + dxmotor['code']).drop_duplicates()
display(dxmotor)


medication information

In [None]:
# Load the medication records; `with` guarantees the file handle is closed.
# NOTE(review): pickle can execute arbitrary code on load; only read trusted files.
with open('NewHome/drugdata_ingredient.pkl', 'rb') as fh:
    rxdata = pickle.load(fh)

# Restrict the medication records to the cohort.
rxmotor = rxdata[rxdata['person_id'].isin(set(personids))]
print('rxmotor (for the cohort)', rxmotor.shape)

# Keep the three event columns and move them onto the shared
# (patient_id, start, code) layout.
rxmotor = rxmotor[['person_id', 'drug_exposure_start_date', 'RXCUI_ingredient']]\
.rename(columns={'drug_exposure_start_date':'start', 'RXCUI_ingredient':'code', 'person_id':'patient_id'})

# Drop rows whose code is one of the RXCUI definition codes, so they
# cannot appear among the input features.
rxmotor = rxmotor[~rxmotor['code'].isin(set(code_rxcui))]
print('rxmotor (remove adrd rxcui codes)', rxmotor.shape)

# Namespace the remaining codes and remove repeated events.
rxmotor = rxmotor.assign(code='RxNorm/' + rxmotor['code']).drop_duplicates()
display(rxmotor)


# filter observation window

In [None]:

def get_all_feature_for_psm_match(input_psm_match_collection, input_match_cci_collection, match_ratio):
    """Summarise a matched cohort and return case/control index dates per window.

    Parameters
    ----------
    input_psm_match_collection : dict
        Maps 'ADRD_filter_cci_match_and_index_obs{year}' (year in 0, 1, 2, 5,
        10) to a DataFrame indexed by ``case_id`` that has a
        ``case_index_date`` column and the columns ``psm_control_1`` ..
        ``psm_control_{match_ratio}``. The control columns of these frames
        are cast to nullable 'Int64' in place.
    input_match_cci_collection : dict
        Same keys; each DataFrame provides the columns ``case_person_id``,
        ``control_person_id`` and ``infer_index_date``.
    match_ratio : int
        Number of ``psm_control_*`` columns to read.

    Returns
    -------
    tuple(dict, dict)
        ``cases_list['{year}_yr_window'][1]`` is a frame with the columns
        ``case_id`` and ``case_index_date``;
        ``controls_list['{year}_yr_window'][1]`` is a frame with the columns
        ``control_id`` and ``control_index_date`` (one row per non-null
        case/control pairing).

    A summary table of case and control counts is shown as a side effect.
    """
    years = (0, 1, 2, 5, 10)
    cps = (1,)
    key_template = 'ADRD_filter_cci_match_and_index_obs{}'
    control_columns = ['psm_control_{}'.format(mi) for mi in range(1, match_ratio + 1)]

    # `display` only exists inside IPython/Jupyter; fall back to print so the
    # function is also usable from a plain interpreter.
    try:
        show = display
    except NameError:
        show = print

    def cast_controls(match_df):
        """Cast the control-id columns to nullable integers (in place)."""
        match_df[control_columns] = match_df[control_columns].astype('Int64')
        return match_df

    def summary(_input_psm_match_collection):
        """Show the number of distinct cases and controls for every window."""
        rows = []
        for year in years:
            match_df = cast_controls(_input_psm_match_collection[key_template.format(year)])
            # Missing matches are stored as <NA>; drop them so that the
            # placeholder is not counted as one extra control.
            matched_controls = pd.Series(match_df[control_columns].to_numpy().ravel()).dropna()
            for cp in cps:
                window = str(year) + "-yr Prediction Window"
                rows.append(["Case", match_df.index.nunique(), "CP " + str(cp), window])
                rows.append(["Control", matched_controls.nunique(), "CP " + str(cp), window])
        show(pd.DataFrame(rows, columns=["type", "num_samples", "cp", "year"]))

    def format_psm_match(_input_psm_match_collection):
        """Return a new dict holding the same frames with Int64 control ids."""
        return {
            key_template.format(year): cast_controls(
                _input_psm_match_collection[key_template.format(year)]
            )
            for year in years
        }

    def format_case_control_list(_input_psm_match_collection, _input_match_cci_collection):
        """Build the per-window case and control index-date frames."""
        _cases_list = {}
        _controls_list = {}

        for year in years:
            match_key = key_template.format(year)
            window_key = '{}_yr_window'.format(year)
            all_df = cast_controls(_input_psm_match_collection[match_key])
            case_control_index = _input_match_cci_collection[match_key]

            for cp in cps:
                # Cases: one row per case with its index date.
                case_df = all_df['case_index_date'].reset_index()
                case_df['case_id'] = case_df['case_id'].astype('Int64')
                _cases_list[window_key] = {cp: case_df}

                # Controls: wide -> long, one row per (case, control) pairing,
                # skipping the slots that have no matched control.
                control_melted = (
                    all_df.reset_index()
                    .melt(id_vars=['case_id'], value_vars=control_columns, value_name='control_id')
                    .drop('variable', axis=1)
                )
                control_melted = control_melted[~control_melted['control_id'].isnull()]

                # Attach the index date recorded for each pairing.
                control_index_melted = control_melted.merge(
                    case_control_index,
                    left_on=['case_id', 'control_id'],
                    right_on=['case_person_id', 'control_person_id'],
                    how='left',
                )
                control_index_melted = control_index_melted[['control_id', 'infer_index_date']]
                control_index_melted.columns = ['control_id', 'control_index_date']
                _controls_list[window_key] = {cp: control_index_melted}

        return _cases_list, _controls_list

    print('Summary of matched cohort: ')
    summary(input_psm_match_collection)

    print('Format matched cohort ')
    _psm_match_collection = format_psm_match(input_psm_match_collection)

    print('Format case and control lists ')
    cases_list, controls_list = format_case_control_list(_psm_match_collection, input_match_cci_collection)

    return cases_list, controls_list



In [None]:

ratio = 10
savepath = 'large_coa'

# Load both pickled match collections; `with` closes each file handle.
# NOTE(review): pickle can execute arbitrary code on load; only read trusted files.
with open(f'NewHome/cci_psm_match_90_ratio_{savepath}_{ratio}.pkl', 'rb') as fh:
    input_match_cci_collection = pickle.load(fh)
with open(f'NewHome/psm_match_90_ratio_{savepath}_{ratio}.pkl', 'rb') as fh:
    input_psm_match_collection = pickle.load(fh)

cases_list, controls_list = get_all_feature_for_psm_match(input_psm_match_collection, input_match_cci_collection, ratio)
        

## get prediction time


In [None]:
import random
import csv
# get prediction time 
random.seed(4533421)

# Length of the prediction window in years; selects which matched cohort is used.
year = 1
_window_key = f'{year}_yr_window'

# Put cases and controls onto a common (patient_id, index_date) layout.
case_index = cases_list[_window_key][1].rename(
    columns={'case_id': 'patient_id', 'case_index_date': 'index_date'}
)
control_index = controls_list[_window_key][1].rename(
    columns={'control_id': 'patient_id', 'control_index_date': 'index_date'}
)
predict_date = pd.concat([case_index, control_index], axis=0)

# The prediction time lies `year` years before the index date; for a
# zero-year window it is moved back by one day instead.
_lookback = pd.DateOffset(years=year, days=1 if year==0 else 0)
predict_date['prediction_time'] = predict_date['index_date'] - _lookback

display(predict_date)


In [None]:
# Lookup table: patient id -> prediction time as a pandas Timestamp.
# If an id occurs more than once, the last occurrence wins.
patient_index = {
    pid: pd.Timestamp(when)
    for pid, when in zip(predict_date.patient_id.values, predict_date.prediction_time.values)
}


## filter data with this prediction window


In [None]:
print('dxmotor |', dxmotor.shape)

# Keep the diagnoses of patients that have a prediction time. The explicit
# copy makes the column assignment below act on an independent frame.
_cohort_ids = set(predict_date.patient_id.values.tolist())
dxmotor_this_year = dxmotor[dxmotor.patient_id.isin(_cohort_ids)].copy()

dxmotor_this_year['start'] = pd.to_datetime(dxmotor_this_year['start'])
print('dxmotor this year |', dxmotor_this_year.shape,'\n\t', dxmotor_this_year['start'].dtype)

# Vectorised cut-off: look up each row's prediction time once and compare
# whole columns, instead of calling a Python lambda per row. Unlike the
# row-wise apply, this also yields a proper boolean mask for an empty frame.
_cutoff = dxmotor_this_year['patient_id'].map(patient_index)
dxmotor_1 = dxmotor_this_year[dxmotor_this_year['start'] < _cutoff]
print('dxmotor earlier than prediction |', dxmotor_1.shape)
dxmotor_1

In [None]:
print('rxmotor |', rxmotor.shape)

# Keep the medications of patients that have a prediction time. The explicit
# copy makes the column assignment below act on an independent frame.
_cohort_ids = set(predict_date.patient_id.values.tolist())
rxmotor_this_year = rxmotor[rxmotor.patient_id.isin(_cohort_ids)].copy()

rxmotor_this_year['start'] = pd.to_datetime(rxmotor_this_year['start'])
print('rxmotor this year |', rxmotor_this_year.shape,'\n\t', rxmotor_this_year['start'].dtype)

# Vectorised cut-off: look up each row's prediction time once and compare
# whole columns, instead of calling a Python lambda per row. Unlike the
# row-wise apply, this also yields a proper boolean mask for an empty frame.
_cutoff = rxmotor_this_year['patient_id'].map(patient_index)
rxmotor_1 = rxmotor_this_year[rxmotor_this_year['start'] < _cutoff]
# The label previously said "dxmotor" although this cell reports rx data.
print('rxmotor earlier than prediction |', rxmotor_1.shape)
rxmotor_1

In [None]:
# Stack the filtered diagnosis and medication events row-wise.
cat_motor = pd.concat([dxmotor_1, rxmotor_1], axis=0)

# Bare expression: the notebook renders the table as the cell output.
cat_motor

In [None]:

cat_motor_patients = cat_motor.patient_id.nunique()
print('patientts in motor', cat_motor_patients)

# Earliest and latest event per patient, and the span between them
# expressed in 365-day years.
_first_last = cat_motor.groupby('patient_id')['start'].agg(['min', 'max'])
date_range = _first_last.assign(
    EHR_length=(_first_last['max'] - _first_last['min']).dt.days / 365
)

# Keep the patients whose records span at least one such year.
date_range = date_range.loc[date_range['EHR_length'] >= 1]
date_range_patients = date_range.index.values.tolist()
date_range_patients_test = date_range_patients

# Restrict every event table to those patients.
dx_motor_1_EHRs = dxmotor_1.loc[dxmotor_1['patient_id'].isin(date_range_patients_test)]
rx_motor_1_EHRs = rxmotor_1.loc[rxmotor_1['patient_id'].isin(date_range_patients_test)]

demo_1_EHRs = demographics_csv.loc[demographics_csv['patient_id'].isin(date_range_patients_test)]


##### save if needed

In [None]:

# Prediction times of the patients that passed the EHR-length filter.
predict_date_1_EHRs = predict_date[predict_date['patient_id'].isin(date_range_patients_test)]

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('NewHome/trash', exist_ok=True)
predict_date_1_EHRs.to_csv(f'NewHome/trash/prediction_times_{year}.csv', index=False)

In [None]:
# Make sure the per-window CSV input directory exists.
os.makedirs(f'NewHome/csvinput/year_{year}/', exist_ok=True)

# Both event tables get the same three empty columns before being written.
_empty_columns = ('value', 'units', 'dosage')

for _column in _empty_columns:
    rx_motor_1_EHRs[_column] = None
rx_motor_1_EHRs.to_csv(f'NewHome/csvinput/year_{year}/rxmotor.csv', sep=',', index=False)

for _column in _empty_columns:
    dx_motor_1_EHRs[_column] = None
dx_motor_1_EHRs.to_csv(f'NewHome/csvinput/year_{year}/dxmotor.csv', sep=',', index=False)


In [None]:

# Add the same three empty columns as the other event tables, then write
# the demographics events next to them.
for _column in ('value', 'units', 'dosage'):
    demo_1_EHRs[_column] = None
demo_1_EHRs.to_csv(f'NewHome/csvinput/year_{year}/demographics_csv.csv', sep=',', index=False)



## Convert to the extract directory
We now convert the CSV dataset created above into an extract, using the [etl_simple_femr](https://github.com/som-shahlab/femr/blob/main/src/femr/etl_pipelines/simple.py#L66) function from the femr repository.

We need to first create folders to save the dataset and associated files 

In [None]:
import shutil
import os

# CSV files written by the export cells for this prediction window.
INPUT_DIR = f'NewHome/csvinput/year_{year}'

# Working directory for this window; it holds the ETL logs and the extract.
TARGET_DIR = f'NewHome/trash/year_{year}'

LOG_DIR = os.path.join(TARGET_DIR, "logs")
EXTRACT_DIR = os.path.join(TARGET_DIR, "extract")

# Always start from an empty working directory.
if os.path.exists(TARGET_DIR):
    shutil.rmtree(TARGET_DIR)

# makedirs also creates missing parent directories, where os.mkdir would
# raise FileNotFoundError.
os.makedirs(TARGET_DIR)

In [None]:

import femr
import femr.etl_pipelines.simple
import subprocess

# Run the femr ETL that turns the CSV input directory into an extract.
# The arguments are passed as a list, so no shell parses the paths, and
# check=True raises CalledProcessError on a non-zero exit status instead of
# silently discarding it.
subprocess.run(
    ["etl_simple_femr", INPUT_DIR, EXTRACT_DIR, LOG_DIR, "--num_threads", "2"],
    check=True,
)

## extract representation by motor

In [None]:
import femr.datasets
# Open the extract that was written for this prediction window.
database = femr.datasets.PatientDatabase(f'NewHome/trash/year_{year}/extract')

# Materialise whatever iterating the database yields — presumably patient
# ids; confirm against the femr API.
patients = list(database)

# NOTE(review): index 10 raises IndexError when fewer than 11 entries exist,
# and the entry printed here is not the one loaded below (index 0).
print('example patient: ', patients[10])
testpatient = database[patients[0]]


In [None]:
!ls NewHome/motor_dir/

!femr_compute_representations --data_path NewHome/trash/year_1/extract --model_path NewHome/motor_dir --prediction_times_path NewHome/trash/prediction_times_1.csv --batch_size 32 NewHome/motor_dir/motor_reprs_1.pkl
