In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from google.cloud import bigquery

In [None]:
def run_query(query, project_id='som-nero-phi-sywang-starr'):
    """Run a SQL query on BigQuery and return the result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL text to execute.
    project_id : str, optional
        Google Cloud project used both to create the client and to run the
        query job. Defaults to the project this notebook was written against,
        so existing ``run_query(query)`` calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The full query result.
    """
    # Set up the BigQuery client
    client = bigquery.Client(project=project_id)

    # Execute the query and download the whole result set
    return client.query(query, project=project_id).to_dataframe()

## Codes

In [None]:
# OMOP concept IDs, matched against `condition_concept_id` in the cells below.
# Definitive glaucoma diagnoses: patients with at least one of these codes are
# labelled outcome = 1.
glaucoma_codes = [437541, 441284,435262,432312,438155,4318691,436398,441561,4191001,4072218,435543,436972,
                  437273,436687,434928,436975,765264,436110,765051,436108,440396,4246656,37204003,37204004,
                  760891,432626,4041191,4335596,433473,760888,40403168,761205,761148,765904,437269,4078543,
                  36684712,36684647,36684774,4065195,434030,433768,441005,36684734,440105,36684668,36684794,
                  437276,37108932,4336013,437553,37207911,37311952,4334259,37207972,4244668,432908,37207851,
                  441556,36713858,761614,4334260,36684699,4316071,4195502,4194237,761327,36713124,36713123,
                  4213414,4035651,36684765,36716349,42537442,438151,761612,761611,37208211,761610,37208212,
                  761613,761621,36684635,761575,761577,761576,4152558,376688]

# Glaucoma suspect / borderline concepts: patients with one of these codes and
# no definitive glaucoma code are removed from the cohort.
glaucoma_suspect_borderline_codes = [4102183,40481141,4109420,3663197,4210875,435809,433767,432311]

## Load Data

In [None]:
# Pull every condition occurrence together with the patient's crosswalk columns.
# The LEFT JOIN keeps occurrences whose person_id has no crosswalk row, so MRN
# can be missing on those rows.
query = """
SELECT 
    co.condition_occurrence_id,
    co.person_id,
    co.condition_concept_id,
    co.condition_start_date,
    co.condition_start_datetime,
    mc.*
FROM `som-nero-phi-sywang-starr.gps_stanford_clinic.condition_occurrence` AS co
LEFT JOIN `som-nero-phi-sywang-starr.gps_stanford_clinic.mrn_crosswalk` AS mc
ON co.person_id = mc.person_id;

"""
# `mc.*` repeats the join key; drop that duplicate (it arrives as `person_id_1`).
cond_occurrence_dat = run_query(query).drop(columns='person_id_1')

print(f"# of rows: {len(cond_occurrence_dat)}")
# nunique() ignores missing MRNs, whereas len(unique()) would count the missing
# value as one extra patient; this also matches the count used further down.
print(f"# of unique pats: {cond_occurrence_dat['MRN'].nunique()}")

In [None]:
# Preview the first rows of the joined condition / crosswalk table.
# NOTE(review): the rendered output is patient-level data (includes MRN);
# clear it before sharing the notebook.
cond_occurrence_dat.head()

## Identify and remove suspect/borderline glaucoma patients

This version requires at least 1 definitive glaucoma diagnosis (DX) code for a suspect/borderline patient to stay in the cohort; requiring 2 is also an option.

In [None]:
# Suspect/borderline patients stay in the cohort only when they also have
# >= 1 definitive glaucoma diagnosis; every other suspect is removed.

concept_ids = cond_occurrence_dat['condition_concept_id']
mrns = cond_occurrence_dat['MRN']

# MRNs that carry at least one suspect/borderline code
glauc_sus = mrns[concept_ids.isin(glaucoma_suspect_borderline_codes)].unique()

# Number of definitive glaucoma rows for each of those suspect MRNs
definitive_rows_per_suspect = mrns[
    concept_ids.isin(glaucoma_codes) & mrns.isin(glauc_sus)
].value_counts()

# Suspects that reach the >= 1 definitive dx threshold
confirmed_suspects = set(
    definitive_rows_per_suspect[definitive_rows_per_suspect >= 1].index
)

# Suspects that never reach the threshold are dropped from the cohort
pats_to_exclude_from_cohort = list(set(glauc_sus).difference(confirmed_suspects))

# Final dataset used downstream: all rows of every patient that was not excluded
is_excluded = mrns.isin(pats_to_exclude_from_cohort)
dataset_minus_suspect_glauc = cond_occurrence_dat[~is_excluded]

print(f"Number of patients excluded from cohort due to suspect/borderline glaucoma: {len(pats_to_exclude_from_cohort)}")

total_participants = dataset_minus_suspect_glauc['MRN'].nunique()
print(f"There is a total of {total_participants} patients that have visited an eye doctor.")

# (For debugging) the rows of the excluded suspects can be inspected with:
# dataset_suspect_glauc = cond_occurrence_dat[
#     cond_occurrence_dat['MRN'].isin(pats_to_exclude_from_cohort)
# ]

## Identify Glaucoma Patients

In [None]:
# Count definitive glaucoma rows per MRN within the suspect-filtered data.
has_glaucoma_code = dataset_minus_suspect_glauc['condition_concept_id'].isin(glaucoma_codes)
glauc_dx_counts = dataset_minus_suspect_glauc.loc[has_glaucoma_code, 'MRN'].value_counts()

# MRNs with >= 1 definitive glaucoma row, kept in value_counts order
# (most rows first).
glauc_pats = glauc_dx_counts[glauc_dx_counts >= 1].index.tolist()

# One row per glaucoma patient, labelled as a positive case.
glauc_pats_df = pd.DataFrame(glauc_pats, columns=['MRN']).assign(outcome=1)

In [None]:
# Preview the glaucoma patient table (MRN plus outcome = 1).
glauc_pats_df.head()

## Get First Glaucoma Diagnosis Date

In [None]:
# All definitive glaucoma rows in the suspect-filtered data.
glauc_pats_df_temp = dataset_minus_suspect_glauc.loc[
    dataset_minus_suspect_glauc['condition_concept_id'].isin(glaucoma_codes)
]

# Earliest glaucoma condition start per MRN ...
first_dx_datetime = glauc_pats_df_temp.groupby('MRN')['condition_start_datetime'].min()

# ... reduced to its calendar date and returned as a two-column frame
# (MRN, diag_date).
glauc_pats_date = first_dx_datetime.dt.date.rename('diag_date').reset_index()

In [None]:
# Preview the first glaucoma diagnosis date per MRN.
glauc_pats_date.head()

In [None]:
# Attach each glaucoma patient's first diagnosis date.
# Merging from the two base columns keeps this cell idempotent: re-running it
# no longer merges `diag_date` a second time (which would produce
# diag_date_x / diag_date_y). Both frames hold one row per MRN, and `validate`
# makes the merge fail loudly if that ever stops being true.
glauc_pats_df = glauc_pats_df[['MRN', 'outcome']].merge(
    glauc_pats_date, how='left', on='MRN', validate='one_to_one'
)

In [None]:
# Report the size of the glaucoma group. The threshold quoted in the message is
# the one actually applied when `glauc_pats` was built (>= 1 definitive code).
print(f'There is a total of {len(glauc_pats)} patients that have been diagnosed with glaucoma '
      '(>= 1 glaucoma diagnosis).')

## Identify Non-Glaucoma Patients

In [None]:
# Every patient left after the suspect filter. Rows without an MRN (possible
# because the crosswalk is LEFT JOINed) are dropped so that a missing value is
# not turned into a control "patient".
all_pats = dataset_minus_suspect_glauc['MRN'].dropna().unique()

# Controls = everyone without a definitive glaucoma code. Filtering in
# first-appearance order keeps the row order reproducible between runs
# (a set difference has no defined order).
glauc_pat_set = set(glauc_pats)
nonglauc_pats = [mrn for mrn in all_pats if mrn not in glauc_pat_set]

nonglauc_pats_df = pd.DataFrame(nonglauc_pats, columns=['MRN'])
nonglauc_pats_df['outcome'] = 0
nonglauc_pats_df.head()

## Combine and Save Cohort

In [None]:
# Stack cases (outcome = 1) on top of controls (outcome = 0) under a fresh
# 0..n-1 index. Columns present in only one of the frames are filled with
# missing values for the other.
cohort_parts = [glauc_pats_df, nonglauc_pats_df]
cohort = pd.concat(cohort_parts, axis=0, ignore_index=True)
cohort.head()

In [None]:
# Preview the end of the cohort, where the non-glaucoma (outcome = 0) rows sit
# because they were concatenated second.
cohort.tail()

In [None]:
print(f'There is a total of {len(cohort)} patients in this cohort.')

# `to_csv` does not create missing folders, so make sure the output directory
# exists; otherwise a fresh checkout fails at the very last step.
cohort_path = Path('processed_data/cohort.csv')
cohort_path.parent.mkdir(parents=True, exist_ok=True)
cohort.to_csv(cohort_path, index=False)