In [5]:
import pandas as pd
import numpy as np
import os
import json
import gcsfs
import urllib.request
import subprocess
import pickle
from tqdm import tqdm
from datetime import datetime
from google.cloud import bigquery
from collections import OrderedDict
import pickle
# Workspace bucket name, read from the environment (None when the variable is unset).
my_bucket = os.environ.get("WORKSPACE_BUCKET")

In [9]:
import requests
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


# Prepare input patient IDs

In [None]:
# Load the pickled cohort object and keep its person_id values as a plain list.
# The object is presumably a pandas DataFrame with a person_id column -- confirm
# against whatever produced ADRD_ids.pkl.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('ADRD_ids.pkl', 'rb') as cohort_file:  # closes the handle deterministically
    all_patient_ids = pickle.load(cohort_file)
all_patient_ids = all_patient_ids.person_id.tolist()
print('Process a  set of patients', len(all_patient_ids))


In [42]:

# Split the patient ids into consecutive slices of at most batch_size ids,
# so each query's IN (...) list stays bounded.
batch_size = 30000
batch_starts = range(0, len(all_patient_ids), batch_size)
batches = [all_patient_ids[start:start + batch_size] for start in batch_starts]


# Diagnosis (dx) data: extract SNOMED condition records and keep the original source codes

In [31]:


class dxData(object):
    """Extract SNOMED condition records for a set of patients.

    Rows come from ``condition_occurrence`` joined to ``concept`` in the
    BigQuery dataset named by the ``WORKSPACE_CDR`` environment variable.
    """

    # Columns selected by the query; also used to build an empty result.
    _COLUMNS = [
        'person_id',
        'condition_concept_id',
        'condition_source_concept_id',
        'condition_source_value',
        'condition_start_date',
    ]

    def __init__(self, save_path):
        """Store the output location.

        Args:
            save_path: Output path without the ``.pkl`` suffix, or ``None``
                to skip saving.
        """
        self.savepath = save_path # save if needed

    def _empty(self):
        """Return an empty frame with the query's columns."""
        return pd.DataFrame(columns=self._COLUMNS)

    def _fetch(self, person_ids):
        """Run the condition query for one group of ids.

        Returns ``None`` when ``person_ids`` is empty, because ``IN ()`` is
        not valid SQL and the query would be rejected.
        """
        id_list = ', '.join(map(str, person_ids))
        if not id_list:
            return None

        cdr = os.environ["WORKSPACE_CDR"]
        # The WHERE clause filters on c.*, so rows without a matching concept
        # are dropped and the LEFT JOIN effectively acts as an inner join.
        dx_encounter_controls_query = f"""
            SELECT DISTINCT co.person_id, co.condition_concept_id, co.condition_source_concept_id, 
                    co.condition_source_value, co.condition_start_date
            FROM {cdr}.condition_occurrence co
            LEFT JOIN {cdr}.concept c 
                ON co.condition_concept_id = c.concept_id
            WHERE co.person_id IN ({id_list})
                AND co.condition_concept_id != 0
                AND c.domain_id = 'Condition'
                AND  c.vocabulary_id = 'SNOMED'
            """
        return pd.read_gbq(dx_encounter_controls_query,
                           dialect="standard",
                           use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
                           progress_bar_type="tqdm_notebook")

    def get_condition(self, ids, batches=True):
        """Query condition rows and optionally pickle the result.

        Args:
            ids: When ``batches`` is ``True``, an iterable of id batches (each
                batch an iterable of person ids); otherwise a flat iterable of
                person ids.
            batches: ``True`` (the default) runs one query per batch and
                concatenates the results; any other value runs a single query.

        Returns:
            A DataFrame of the selected rows. It is empty (with the expected
            columns) when no ids were supplied.
        """
        if batches is True:
            fetched = (self._fetch(batch) for batch in ids)
            frames = [frame for frame in fetched if frame is not None]
            # pd.concat raises on an empty list, so fall back to an empty frame.
            data = pd.concat(frames, ignore_index=True) if frames else self._empty()
        else:
            data = self._fetch(ids)
            if data is None:
                data = self._empty()

        if self.savepath is not None:
            # Write to this instance's own path (not a notebook-level global)
            # and close the file handle deterministically.
            with open(f'{self.savepath}.pkl', 'wb') as handle:
                pickle.dump(data, handle)
        return data



In [None]:
# Output file stem: when a save path is set, get_condition pickles its result to '<stem>.pkl'.
savepath1 = 'uf_large_r_diag'
dxDataProcess = dxData(save_path=savepath1)

# One query per id batch, concatenated into a single frame.
dxdata = dxDataProcess.get_condition(batches)
print('extract dx: ', dxdata.shape)

# Drop any row whose condition concept id is 0.
is_zero_concept = dxdata['condition_concept_id'].isin([0])
dxdata = dxdata[~is_zero_concept]
print('after checking zero concept id: ', dxdata.shape)


# Drug data: extract RxNorm drug exposures and keep the original source codes

In [43]:
class drugData(object):
    """Extract RxNorm / RxNorm Extension drug exposures for a set of patients.

    Rows come from ``drug_exposure`` joined to ``concept`` in the BigQuery
    dataset named by the ``WORKSPACE_CDR`` environment variable.
    """

    # Columns selected by the query; also used to build an empty result.
    _COLUMNS = [
        'person_id',
        'drug_concept_id',
        'drug_source_concept_id',
        'drug_source_value',
        'drug_exposure_start_date',
    ]

    def __init__(self, savepath):
        """Store the output location.

        Args:
            savepath: Output path without the ``.pkl`` suffix, or ``None``
                to skip saving.
        """
        self.savepath = savepath

    def _empty(self):
        """Return an empty frame with the query's columns."""
        return pd.DataFrame(columns=self._COLUMNS)

    def _fetch(self, person_ids):
        """Run the drug exposure query for one group of ids.

        Returns ``None`` when ``person_ids`` is empty, because ``IN ()`` is
        not valid SQL and the query would be rejected.
        """
        id_list = ', '.join(map(str, person_ids))
        if not id_list:
            return None

        cdr = os.environ["WORKSPACE_CDR"]
        # The WHERE clause filters on c.*, so rows without a matching concept
        # are dropped and the LEFT JOIN effectively acts as an inner join.
        drug_unique_exposure_query = f"""
            SELECT DISTINCT de.person_id, de.drug_concept_id, de.drug_source_concept_id, 
                            de.drug_source_value, de.drug_exposure_start_date
            FROM {cdr}.drug_exposure de
            LEFT JOIN {cdr}.concept c 
                ON de.drug_concept_id = c.concept_id
            WHERE de.person_id IN ({id_list})
              AND de.drug_concept_id != 0
              AND c.vocabulary_id IN ('RxNorm', 'RxNorm Extension')
              AND c.domain_id = 'Drug'
            """
        return pd.read_gbq(drug_unique_exposure_query,
                           dialect="standard",
                           use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
                           progress_bar_type="tqdm_notebook")

    def get_medication(self, ids, batches=True):
        """Query drug exposure rows and optionally pickle the result.

        Args:
            ids: When ``batches`` is ``True``, an iterable of id batches (each
                batch an iterable of person ids); otherwise a flat iterable of
                person ids.
            batches: ``True`` (the default) runs one query per batch and
                concatenates the results; any other value runs a single query.

        Returns:
            A DataFrame of the selected rows. It is empty (with the expected
            columns) when no ids were supplied.
        """
        if batches is True:
            fetched = (self._fetch(batch) for batch in ids)
            frames = [frame for frame in fetched if frame is not None]
            # pd.concat raises on an empty list, so fall back to an empty frame.
            data = pd.concat(frames, ignore_index=True) if frames else self._empty()
        else:
            data = self._fetch(ids)
            if data is None:
                data = self._empty()

        if self.savepath is not None:
            # Write to this instance's own path (not a notebook-level global)
            # and close the file handle deterministically.
            with open(f'{self.savepath}.pkl', 'wb') as handle:
                pickle.dump(data, handle)
        return data
    


### Extract drug data and drop zero concept IDs

In [None]:
# Output file stem: when a save path is set, get_medication pickles its result to '<stem>.pkl'.
savepath1 = 'uf_large_r_drug'
drugDataProcess = drugData(savepath1)

# One query per id batch, concatenated into a single frame.
ddrug = drugDataProcess.get_medication(batches, batches=True)
print('get drug data', ddrug.shape)

# Drop any row whose drug concept id is 0.
is_zero_concept = ddrug['drug_concept_id'].isin([0])
ddrug = ddrug[~is_zero_concept]
print('filter drug data by 0 concept id', ddrug.shape)



# Measurement / lab test data: extract LOINC measurements with numeric values and their original codes

### lbData class definition

In [64]:
class lbData(object):
    """Extract LOINC measurement rows with numeric values for a set of patients.

    Rows come from ``measurement`` joined to ``concept`` in the BigQuery
    dataset named by the ``WORKSPACE_CDR`` environment variable.
    """

    # Columns selected by the query; also used to build an empty result.
    _COLUMNS = [
        'person_id',
        'measurement_concept_id',
        'measurement_date',
        'measurement_datetime',
        'value_as_number',
        'unit_concept_id',
        'unit_source_value',
        'range_high',
        'range_low',
        'vocabulary_id',
        'concept_code',
    ]

    def __init__(self, savepath):
        """Store the output location.

        Args:
            savepath: Output path without the ``.pkl`` suffix, or ``None``
                to skip saving.
        """
        self.savepath = savepath

    def _empty(self):
        """Return an empty frame with the query's columns."""
        return pd.DataFrame(columns=self._COLUMNS)

    def _fetch(self, person_ids):
        """Run the measurement query for one group of ids.

        Returns ``None`` when ``person_ids`` is empty, because ``IN ()`` is
        not valid SQL and the query would be rejected.
        """
        id_list = ', '.join(map(str, person_ids))
        if not id_list:
            return None

        cdr = os.environ["WORKSPACE_CDR"]
        # The WHERE clause filters on c.*, so rows without a matching concept
        # are dropped and the LEFT JOIN effectively acts as an inner join.
        query = f"""
            SELECT DISTINCT t.person_id, t.measurement_concept_id, t.measurement_date, t.measurement_datetime, t.value_as_number, t.unit_concept_id, t.unit_source_value, t.range_high, t.range_low, c.vocabulary_id, c.concept_code
            FROM {cdr}.measurement t
            LEFT JOIN {cdr}.concept c
                ON t.measurement_concept_id = c.concept_id
            WHERE t.person_id IN ({id_list})
              AND t.measurement_concept_id  != 0
              AND c.vocabulary_id = 'LOINC'
              AND c.domain_id = 'Measurement'
              AND t.value_as_number IS NOT NULL
            """
        return pd.read_gbq(query,
                           dialect="standard",
                           use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
                           progress_bar_type="tqdm_notebook")

    def get_measurement(self, ids, batches=True):
        """Query measurement rows and optionally pickle the result.

        Args:
            ids: When ``batches`` is ``True``, an iterable of id batches (each
                batch an iterable of person ids); otherwise a flat iterable of
                person ids.
            batches: ``True`` (the default) runs one query per batch and
                concatenates the results; any other value runs a single query.

        Returns:
            A DataFrame of the selected rows. It is empty (with the expected
            columns) when no ids were supplied.
        """
        if batches is True:
            fetched = (self._fetch(batch) for batch in ids)
            frames = [frame for frame in fetched if frame is not None]
            # pd.concat raises on an empty list, so fall back to an empty frame.
            data = pd.concat(frames, ignore_index=True) if frames else self._empty()
        else:
            # The single-query result must be bound to `data`; it is the name
            # that gets saved and returned below.
            data = self._fetch(ids)
            if data is None:
                data = self._empty()

        if self.savepath is not None:
            # Write to this instance's own path (not a notebook-level global)
            # and close the file handle deterministically.
            with open(f'{self.savepath}.pkl', 'wb') as handle:
                pickle.dump(data, handle)
        return data

    def get_filtered_lbdata(self, lb_data, keeplist, chunk_size=500000, save_bucket=False):
        """Keep only the rows whose measurement_concept_id is in ``keeplist``.

        Args:
            lb_data: DataFrame with a ``measurement_concept_id`` column.
            keeplist: List-like of concept ids to keep.
            chunk_size: Accepted for backward compatibility and ignored; the
                filter is applied in a single vectorized pass.
            save_bucket: Accepted for backward compatibility and ignored.

        Returns:
            The matching rows with their original index labels and dtypes.
        """
        print('warning: deprecated')
        # One boolean mask avoids re-copying a growing frame on every chunk and
        # avoids concatenating onto an empty object-dtype frame.
        keep_mask = lb_data['measurement_concept_id'].isin(keeplist)
        filtered_df = lb_data[keep_mask]

        print('After measurement filtering', filtered_df.shape)

        return filtered_df

# Next cell: extract lab data and drop rows whose measurement_concept_id is 0


In [None]:
# Output file stem: when a save path is set, get_measurement pickles its result to '<stem>.pkl'.
savepath1 = 'uf_large_r_lab'

labDataProcess = lbData(savepath1)
# One query per id batch, concatenated into a single frame.
labdata = labDataProcess.get_measurement(batches, batches=True)
# These messages previously said "drug data" (copy-paste from the drug cell).
print('get lab data', labdata.shape)

# Drop any row whose measurement concept id is 0.
labdata = labdata[~labdata['measurement_concept_id'].isin([0])]

print('filter lab data by 0 concept id', labdata.shape)



### Filter labs by a manually curated list of measurement concept IDs

In [None]:
# Measurement concept ids to keep. While this list is empty, no row matches
# and the filtered frame is empty.
keeplist = []  # set your list of labs
filtered_df = labDataProcess.get_filtered_lbdata(labdata, keeplist)
filtered_df
