In [None]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
from datetime import date

In [None]:
def run_query(query, project_id='som-nero-phi-sywang-starr'):
    """Execute a SQL query on BigQuery and return the result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL text to run.
    project_id : str, optional
        Project used both to build the client and to run the query job.
        Defaults to the project this notebook was written against, so
        existing ``run_query(query)`` calls behave as before.

    Returns
    -------
    The object produced by ``to_dataframe()`` on the finished query job.
    """
    # A new client is created per call; credentials are resolved by the library.
    client = bigquery.Client(project=project_id)
    return client.query(query, project=project_id).to_dataframe()

## Load Initial Cohort

In [None]:
# MRN is read with pandas' string dtype instead of letting read_csv infer a type.
cohort = pd.read_csv(
    "processed_data/cohort.csv",
    dtype={'MRN': 'string'},
)

In [None]:
# Expected columns: MRN, outcome (0/1), diag_date
# Preview the first rows to check the file matches that layout.
cohort.head()

In [None]:
# (rows, columns) of the cohort table.
cohort.shape

In [None]:
# This counts rows, so it equals the number of patients only if each MRN
# appears once in cohort.csv — TODO confirm.
print(f'There is a total of {len(cohort)} patients in this cohort.')

## Load Demographic Data

In [None]:
# Fetch every person row together with its MRN crosswalk entry. Because this is
# a LEFT JOIN, persons without a crosswalk row are kept with null crosswalk columns.
query = """
SELECT *  
FROM `som-nero-phi-sywang-starr.gps_stanford_clinic.person` as co
LEFT JOIN `som-nero-phi-sywang-starr.gps_stanford_clinic.mrn_crosswalk` AS mc
ON co.person_id = mc.person_id;
"""
# Columns not used downstream are removed immediately after loading.
# NOTE(review): person_id_1 is presumably the second person_id that SELECT *
# returns from the crosswalk side of the join — confirm.
demo_dat = run_query(query).drop(
    columns=[
        'person_id_1',
        'location_id',
        'provider_id',
        'care_site_id',
        'person_source_value',
    ]
)

In [None]:
# Columns remaining after the person/crosswalk join and the drop of unused fields.
demo_dat.columns

In [None]:
# Preview the loaded demographic rows.
demo_dat.head()

## Age

In [None]:
def age(birthDate, as_of=None):
    """Return the age in completed years on a reference date.

    Parameters
    ----------
    birthDate : datetime.date, datetime.datetime or pandas.Timestamp
        Date of birth. A missing value (None, NaN, NaT) yields NaN.
    as_of : date-like, optional
        Date on which the age is evaluated; it must expose ``year``,
        ``month`` and ``day``. Defaults to today's date, which preserves the
        original behaviour. Pass another date (for example a diagnosis date)
        to get the age at that point instead.

    Returns
    -------
    int or float
        Whole years elapsed, or ``float('nan')`` when ``birthDate`` is missing.
    """
    if pd.isna(birthDate):
        return float('nan')

    reference = date.today() if as_of is None else as_of

    # One year is subtracted while the birthday has not yet been reached in
    # the reference year; comparing (month, day) tuples handles this directly.
    birthday_reached = (reference.month, reference.day) >= (birthDate.month, birthDate.day)
    return reference.year - birthDate.year - (0 if birthday_reached else 1)

In [None]:
# age() is applied to each birth_datetime value. The ages are relative to the
# day the notebook is run, not to each patient's diag_date.
# NOTE(review): confirm that run date is the intended reference.
demo_dat['age'] = demo_dat['birth_datetime'].apply(age)

In [None]:
# Remove the raw birth-date fields now that `age` has been derived from them.
demo_dat = demo_dat.drop(
    columns=[
        'year_of_birth',
        'month_of_birth',
        'day_of_birth',
        'birth_datetime',
        'source_dob',
    ]
)

In [None]:
# Check that `age` is present and the birth-date columns are gone.
demo_dat.head()

## Ethnicity

In [None]:
# Tabulate the ethnicity concept IDs that occur, to check them against the
# label mapping. value_counts() leaves out missing values by default.
demo_dat['ethnicity_concept_id'].value_counts()

In [None]:
# Concept ID -> readable ethnicity label.
ethnicity_map = {38003564: 'nonhispanic', 38003563: 'hispanic', 0: 'other'}

# Add the labelled column (IDs absent from the map become NaN), then drop the
# source ethnicity columns it replaces.
demo_dat = (
    demo_dat
    .assign(ethnicity=demo_dat['ethnicity_concept_id'].map(ethnicity_map))
    .drop(columns=[
        'ethnicity_concept_id',
        'ethnicity_source_value',
        'ethnicity_source_concept_id',
    ])
)

In [None]:
# dropna=False also counts NaN, i.e. rows whose ethnicity_concept_id was not a
# key of ethnicity_map; the default value_counts() would omit them silently.
demo_dat['ethnicity'].value_counts(dropna=False)

In [None]:
# Check the new `ethnicity` column; the source ethnicity columns should be gone.
demo_dat.head()

## Sex at Birth

In [None]:
# Tabulate the gender concept IDs that occur, to check them against the label
# mapping. value_counts() leaves out missing values by default.
demo_dat['gender_concept_id'].value_counts()

In [None]:
# Concept ID -> sex-at-birth label. Only these two IDs are mapped, so any other
# gender_concept_id ends up as NaN in `sx_birth`.
sx_birth_mapping = {8507: 'male', 8532: 'female'}

# Add the labelled column, then drop the source gender columns it replaces.
demo_dat = (
    demo_dat
    .assign(sx_birth=demo_dat['gender_concept_id'].map(sx_birth_mapping))
    .drop(columns=[
        'gender_concept_id',
        'gender_source_value',
        'gender_source_concept_id',
    ])
)

In [None]:
# dropna=False also counts NaN, i.e. rows whose gender_concept_id was not a key
# of sx_birth_mapping; the default value_counts() would omit them silently.
demo_dat['sx_birth'].value_counts(dropna=False)

In [None]:
# Check the new `sx_birth` column; the source gender columns should be gone.
demo_dat.head()

## Race

In [None]:
# Tabulate the race concept IDs that occur, to check them against the label
# mapping. value_counts() leaves out missing values by default.
demo_dat['race_concept_id'].value_counts()

In [None]:
# Concept ID -> short race label.
race_mapping = {
    0: 'other',
    8515: 'asian',
    8527: 'white',
    8516: 'black',
    8557: 'nhpi',
    8657: 'aian',
}

# Add the labelled column (IDs absent from the map become NaN), then drop the
# source race columns it replaces.
demo_dat = (
    demo_dat
    .assign(race=demo_dat['race_concept_id'].map(race_mapping))
    .drop(columns=[
        'race_concept_id',
        'race_source_value',
        'race_source_concept_id',
    ])
)

In [None]:
# dropna=False also counts NaN, i.e. rows whose race_concept_id was not a key
# of race_mapping; the default value_counts() would omit them silently.
demo_dat['race'].value_counts(dropna=False)

In [None]:
# Check the new `race` column; the source race columns should be gone.
demo_dat.head()

In [None]:
# Write the labelled, not yet one-hot-encoded table; index=False leaves the
# row index out of the file.
demo_dat.to_csv(
    'processed_data/demo_not_1h_encoded.csv',
    index=False,
)

## Create Dummies

In [None]:
# Build 0/1 indicator columns for each categorical label column; the column
# name doubles as the prefix of the generated indicator names.
ethnicity_one_hot, sx_birth_one_hot, race_one_hot = (
    pd.get_dummies(demo_dat[label_col], prefix=label_col).astype(int)
    for label_col in ('ethnicity', 'sx_birth', 'race')
)

In [None]:
# Append the indicator columns by index, in the order race, sex, ethnicity.
demo_dat = (
    demo_dat
    .join(race_one_hot)
    .join(sx_birth_one_hot)
    .join(ethnicity_one_hot)
)

In [None]:
# Preview the frame with the one-hot indicator columns appended.
demo_dat.head()

In [None]:
# The label columns are redundant once their indicator columns exist.
demo_dat = demo_dat.drop(
    columns=[
        'ethnicity',
        'sx_birth',
        'race',
    ]
)

## Save File

In [None]:
# Drop person_id before export.
demo_dat.drop(columns=['person_id'], inplace=True)

# Keep only the rows whose MRN appears in the cohort.
demo_dat_final = demo_dat[demo_dat.MRN.isin(cohort.MRN)]

# Same pass/fail condition as before; the message (built only on failure)
# reports the two usual causes of a row-count mismatch.
assert len(demo_dat_final) == len(cohort), (
    f'Row count mismatch: {len(demo_dat_final)} demographic rows for '
    f'{len(cohort)} cohort rows '
    f'({(~cohort.MRN.isin(demo_dat.MRN)).sum()} cohort rows with no matching MRN, '
    f'{demo_dat_final.MRN.duplicated().sum()} duplicated MRNs among the matched rows).'
)

In [None]:
# Preview the cohort-restricted table before it is written out.
demo_dat_final.head()

In [None]:
# Write the cohort-restricted, one-hot-encoded table; index=False leaves the
# row index out of the file.
demo_dat_final.to_csv(
    'processed_data/demo.csv',
    index=False,
)