### Query files and push files to Big Query
* Use Conor's original cohort to join with demographics, order_proc for code status and admission orders
* After processing and joining these files in R (Cohort_queries notebook), push to Big Query:
including the final adjusted cohort (processed in R, saved, called from here, pushed to BQ)

* Generate: admit_code and admit_inpatient from SQL. 
* Process and save as: ordertypes and admit_inpatient.

In [None]:
import pandas as pd
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

%matplotlib inline
%load_ext rpy2.ipython

In [None]:
import os 
from google.cloud import bigquery
from google.cloud.bigquery import dbapi

## Pick the credentials path that matches where the notebook runs (Nero or local).
# Tip: Ctrl + Insert copies and Shift + Insert pastes.

os.environ.update({
    # Nero credentials file
    'GOOGLE_APPLICATION_CREDENTIALS': '/home/jupyter/.config/gcloud/application_default_credentials.json',
    # Nero project
    'GCLOUD_PROJECT': 'som-nero-phi-jonc101',
})

# Local computer instead of Nero: check the path first, then uncomment.
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r'C:\Users\User\AppData\Roaming\gcloud\application_default_credentials.json'

## Method 1: BigQuery API client
client = bigquery.Client()

## Method 2: DB-API connection wrapping the same client
conn = dbapi.connect(client)

### Conor's original queried cohort

In [None]:
# The cohort now comes from the shc_core dataset; pull the full draft table.
q = (
    "SELECT * "
    "FROM `triage.triage_cohort_draft_2019`"
)
query_job = client.query(q)
df = query_job.to_dataframe()

In [None]:
# Write the queried cohort to disk, then load that same CSV back as Ccohort.
ccohort_path = '../../../Data/Ccohort.csv'
df.to_csv(ccohort_path, index=False)
Ccohort = pd.read_csv(ccohort_path)

In [None]:
# Row count, then the label distribution.
# Earlier note: 14.23% labelled 1 (34092 vs 5659); new 2019 cohort: 14.13%.
print(df.shape[0])
df["label"].value_counts()

### Execute SQL queries to get encounters, inpatient-id, code status and demographics

In [None]:
# Run the SQL stored in m1_encounter.sql and save the result as CSV.
# The `with` block closes the SQL file as soon as its text has been read.
with open('../.././SQL/cohort_features/m1_encounter.sql', 'r') as q:
    query_job = client.query(q.read())
# query_job keeps the submitted job (to_csv itself returns None for a path).
query_job.to_dataframe().to_csv('../../../Data/encounters.csv', index=False)

In [None]:
# skip this if not adjusting admission time
# Run the SQL stored in m2_X_admit_inpatient.sql and save the result as CSV.
# The `with` block closes the SQL file as soon as its text has been read.
with open('../.././SQL/cohort_features/m2_X_admit_inpatient.sql', 'r') as q:
    query_job = client.query(q.read())
# query_job keeps the submitted job (to_csv itself returns None for a path).
query_job.to_dataframe().to_csv('../../../Data/admit_inpatients.csv', index=False)

In [None]:
# Run the SQL stored in m3_codestatus.sql and save the result as CSV.
# The `with` block closes the SQL file as soon as its text has been read.
with open('../.././SQL/cohort_features/m3_codestatus.sql', 'r') as q:
    query_job = client.query(q.read())
# query_job keeps the submitted job (to_csv itself returns None for a path).
query_job.to_dataframe().to_csv('../../../Data/code_status.csv', index=False)

In [None]:
# Run the SQL stored in m4_demographic.sql and save the result as CSV.
# The `with` block closes the SQL file as soon as its text has been read.
with open('../.././SQL/cohort_features/m4_demographic.sql', 'r') as q:
    query_job = client.query(q.read())
# query_job keeps the submitted job (to_csv itself returns None for a path).
query_job.to_dataframe().to_csv('../../../Data/demographic.csv', index=False)

### Push updated cohort with inpatient_id_coded from R1 notebook to BQ
- Run R1 notebook
- Output of R1 notebook is *cohort* (same as cohort_demo), pushed to BQ
- Use this cohort size 43,493 to query labs and flowsheet to reduce the size/time -- as in SQL folder

In [None]:
# Load the updated cohort CSV and show its size, column names and first rows.
cohort = pd.read_csv('../../../Data/cohort.csv')
print(cohort.shape[0])
print(cohort.columns.tolist())
cohort.head()

In [None]:
# Column names and BigQuery types for the updated cohort table.
table_schema = [
    {'name': column, 'type': bq_type}
    for column, bq_type in [
        ('anon_id', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('inpatient_data_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('label_max24', 'INTEGER'),
    ]
]

DATASET_NAME = 'triage'
TABLE_NAME = 'cohort'

# Upload the cohort; if_exists='replace' overwrites an existing table.
cohort.to_gbq(
    destination_table='{}.{}'.format(DATASET_NAME, TABLE_NAME),
    project_id='som-nero-phi-jonc101',
    table_schema=table_schema,
    if_exists='replace',
)

### Execute SQL queries to get flowsheet and labs

In [None]:
# Run the SQL stored in m5.1_flowsheet_HW.sql and save the result as CSV.
# The `with` block closes the SQL file as soon as its text has been read.
with open('../.././SQL/cohort_features/m5.1_flowsheet_HW.sql', 'r') as q:
    query_job = client.query(q.read())
# query_job keeps the submitted job (to_csv itself returns None for a path).
query_job.to_dataframe().to_csv('../../../Data/flowsheet_HW.csv', index=False)

In [None]:
# Run the SQL stored in m5_flowsheet.sql and save the result as CSV.
# The `with` block closes the SQL file as soon as its text has been read.
with open('../.././SQL/cohort_features/m5_flowsheet.sql', 'r') as q:
    query_job = client.query(q.read())
# query_job keeps the submitted job (to_csv itself returns None for a path).
query_job.to_dataframe().to_csv('../../../Data/flowsheet.csv', index=False)

In [None]:
# Run the SQL stored in m6_labs_additional_labnames.sql and save the result as CSV.
# The `with` block closes the SQL file as soon as its text has been read.
with open('../.././SQL/m6_labs_additional_labnames.sql', 'r') as q:
    query_job = client.query(q.read())
# query_job keeps the submitted job (to_csv itself returns None for a path).
query_job.to_dataframe().to_csv('../../../Data/labs_additional.csv', index=False)

### Get lab data from BigQuery, from Tiffany's SQL folder "labs_and_flowsheet" 

In [None]:
# Download the filtered labs table and save it as CSV.
q = "SELECT * FROM triage.triage_cohort_2019_filtered_labs"
# Keep the job handle in query_job; chaining through to_csv would leave it as None.
query_job = client.query(q)
query_job.to_dataframe().to_csv('../../../Data/lab_result.csv', index=False)

In [None]:
# Download the all-labs table and save it as CSV.
q = "SELECT * FROM triage.triage_cohort_2019_all_labs"
# Keep the job handle in query_job; chaining through to_csv would leave it as None.
query_job = client.query(q)
query_job.to_dataframe().to_csv('../../../Data/all_labs.csv', index=False)

In [None]:
# Download the basename-filtered labs table and save it as CSV.
q = "SELECT * FROM triage.triage_cohort_2019_labs_basename_filtered"
# Keep the job handle in query_job; chaining through to_csv would leave it as None.
query_job = client.query(q)
query_job.to_dataframe().to_csv('../../../Data/labs_basename.csv', index=False)

### Close queries

In [None]:
# `q` is not always a file here: the lab cells above assign plain SQL strings
# to it, and a str has no close(). Only close objects that support it.
if hasattr(q, 'close'):
    q.close()

### OLD -- Query and save files (demographics, code status, and admission) for joining later

In [None]:
# adapted Tiffany's function to save files
def query_data(sqlfile, savefile, connection=None):
    """Run the SQL stored in a file, save the result as CSV and return it.

    Args:
        sqlfile: Path of the text file containing the SQL statement.
        savefile: Path of the CSV file to write (written without the index).
        connection: Optional DB-API connection to run the query on.
            When omitted, the notebook-level ``conn`` is used.

    Returns:
        The query result as a pandas DataFrame.
    """
    if connection is None:
        connection = conn

    # The context manager closes the SQL file even when a later step fails;
    # previously a failing query left the handle open.
    with open(sqlfile, 'r') as query:
        sql_text = query.read()

    resultsDF = pd.read_sql_query(sql_text, connection)
    resultsDF.to_csv(savefile, index=False)
    return resultsDF

In [None]:
# Demographic table (to be joined with the cohort): query it and save the CSV.
sqlfile, outfile = 'SQL/m4_demographic.sql', './Data/demographic.csv'
df = query_data(sqlfile, outfile)

### OLD -- TABLE schema

In [None]:
# %load_ext google.cloud.bigquery
# Adjusted cohort: column names and BigQuery types.
table_schema = [
    {'name': column, 'type': bq_type}
    for column, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('inpatient_data_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('label', 'INTEGER'),
    ]
]

DATASET_NAME = 'traige_TE'
TABLE_NAME = 'triage_cohort_adjusted'

# NOTE(review): `cohort_adj` is not created in any cell shown here; confirm it
# is loaded before running this upload.
# if_exists='replace' overwrites an existing table.
cohort_adj.to_gbq(
    destination_table='{}.{}'.format(DATASET_NAME, TABLE_NAME),
    project_id='mining-clinical-decisions',
    table_schema=table_schema,
    if_exists='replace',
)

In [None]:
# Load the cohort-with-demographics CSV; show its row count and first rows.
cohort_demo = pd.read_csv('./Data/cohort_demo.csv')
print(cohort_demo.shape[0])
cohort_demo.head()

In [None]:
# Demographics file: column names and BigQuery types.
table_schema = [
    {'name': column, 'type': bq_type}
    for column, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('inpatient_data_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('label', 'INTEGER'),
        ('gender', 'STRING'),
        ('race', 'STRING'),
        ('language', 'STRING'),
        ('insurance', 'STRING'),
        ('recent_height', 'INTEGER'),
        ('recent_weight', 'INTEGER'),
        ('age', 'INTEGER'),
        ('recent_date', 'DATE'),
    ]
]

DATASET_NAME = 'traige_TE'
TABLE_NAME = 'triage_cohort_adjusted_demographics'

# Upload the demographics table; if_exists='replace' overwrites an existing table.
cohort_demo.to_gbq(
    destination_table='{}.{}'.format(DATASET_NAME, TABLE_NAME),
    project_id='mining-clinical-decisions',
    table_schema=table_schema,
    if_exists='replace',
)

In [None]:
# Push the clean demographics file up to BQ, including the new variables
# medis and Engl (English).
table_schema = [
    {'name': column, 'type': bq_type}
    for column, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('inpatient_data_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('label', 'INTEGER'),
        ('gender', 'STRING'),
        ('race', 'STRING'),
        ('age', 'INTEGER'),
        ('medis', 'INTEGER'),
        ('Engl', 'INTEGER'),
        ('Height', 'INTEGER'),
        ('Weight', 'INTEGER'),
    ]
]

DATASET_NAME = 'traige_TE'
TABLE_NAME = 'triage_cohort_adjusted_demographics_clean'

# NOTE(review): `cohort_demo_clean` is not created in any cell shown here;
# confirm it is loaded before running this upload.
# if_exists='replace' overwrites an existing table.
cohort_demo_clean.to_gbq(
    destination_table='{}.{}'.format(DATASET_NAME, TABLE_NAME),
    project_id='mining-clinical-decisions',
    table_schema=table_schema,
    if_exists='replace',
)

In [None]:
# Load the imputed demographics CSV; show its row count and first rows.
cohort_demo_imputed = pd.read_csv('./Data/cohort_demo_imputed.csv')
print(cohort_demo_imputed.shape[0])
cohort_demo_imputed.head()

In [None]:
# Push the imputed demographics file up to BQ.
table_schema = [
    {'name': column, 'type': bq_type}
    for column, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('inpatient_data_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('label', 'INTEGER'),
        ('Engl', 'INTEGER'),
        ('gender1', 'STRING'),
        ('race1', 'STRING'),
        ('age1', 'INTEGER'),
        ('medis1', 'INTEGER'),
        ('Height1', 'INTEGER'),
        ('Weight1', 'INTEGER'),
        ('delta_H', 'INTEGER'),
        ('delta_W', 'INTEGER'),
    ]
]

DATASET_NAME = 'traige_TE'
TABLE_NAME = 'triage_cohort_adjusted_demographics_imputed'

# if_exists='replace' overwrites an existing table.
cohort_demo_imputed.to_gbq(
    destination_table='{}.{}'.format(DATASET_NAME, TABLE_NAME),
    project_id='mining-clinical-decisions',
    table_schema=table_schema,
    if_exists='replace',
)

In [None]:
# Load the cleaned vitals CSV; show its row count and last rows.
cohort_vitals_clean = pd.read_csv('./Data/cohort_vitals_clean.csv')
print(cohort_vitals_clean.shape[0])
cohort_vitals_clean.tail()

In [None]:
# Cleaned vitals: column names and BigQuery types.
table_schema = [
    {'name': column, 'type': bq_type}
    for column, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('features', 'STRING'),
        ('recorded_time', 'TIMESTAMP'),
        ('values', 'FLOAT'),
        ('feature_type', 'STRING'),
    ]
]

DATASET_NAME = 'traige_TE'
TABLE_NAME = 'triage_cohort_adjusted_vitals_clean'

# Upload the cleaned vitals; if_exists='replace' overwrites an existing table.
cohort_vitals_clean.to_gbq(
    destination_table='{}.{}'.format(DATASET_NAME, TABLE_NAME),
    project_id='mining-clinical-decisions',
    table_schema=table_schema,
    if_exists='replace',
)

In [None]:
# Load the cleaned short-labs CSV; show its row count and first rows.
cohort_short_labs_clean_4binning = pd.read_csv('./Data/cohort_short_labs_clean_4binning.csv')
print(cohort_short_labs_clean_4binning.shape[0])
cohort_short_labs_clean_4binning.head()

In [None]:
# Cleaned labs: column names and BigQuery types.
table_schema = [
    {'name': column, 'type': bq_type}
    for column, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('features', 'STRING'),
        ('result_time', 'TIMESTAMP'),
        ('values', 'FLOAT'),
        ('feature_type', 'STRING'),
    ]
]

DATASET_NAME = 'traige_TE'
TABLE_NAME = 'triage_cohort_adjusted_short_labs_clean_4binning'

# Upload the cleaned labs; if_exists='replace' overwrites an existing table.
cohort_short_labs_clean_4binning.to_gbq(
    destination_table='{}.{}'.format(DATASET_NAME, TABLE_NAME),
    project_id='mining-clinical-decisions',
    table_schema=table_schema,
    if_exists='replace',
)

In [None]:
# Load the combined vitals/labs long-format CSV.
cohort_vitals_labs_long = pd.read_csv('./Data/cohort_vitals_labs_long.csv')
print(len(cohort_vitals_labs_long))
# The preview must be the cell's last expression: a notebook only displays the
# final expression, so a tail() placed before print() was silently discarded.
cohort_vitals_labs_long.tail(5)

In [None]:
# Summary statistics for the rows whose `features` value is "DBP".
is_dbp = cohort_vitals_labs_long['features'] == "DBP"
cohort_vitals_labs_long[is_dbp].describe()

In [None]:
# Remove the `time` column. errors='ignore' keeps the cell re-runnable: once
# the column is gone, a plain drop would raise KeyError on the second run.
cohort_vitals_labs_long = cohort_vitals_labs_long.drop(columns='time', errors='ignore')
# Preview the DBP rows after the drop.
cohort_vitals_labs_long.loc[cohort_vitals_labs_long['features'] == "DBP"].head()

In [None]:
# Vitals and labs combined in a specific format: column names and BigQuery types.
table_schema = [
    {'name': column, 'type': bq_type}
    for column, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('features', 'STRING'),
        # ('time', 'TIMESTAMP'),  # left out of the schema
        ('values', 'FLOAT'),
        ('feature_type', 'STRING'),
    ]
]

DATASET_NAME = 'traige_TE'
TABLE_NAME = 'triage_cohort_adjusted_vitals_labs_4binning'

# Upload the combined table; if_exists='replace' overwrites an existing table.
cohort_vitals_labs_long.to_gbq(
    destination_table='{}.{}'.format(DATASET_NAME, TABLE_NAME),
    project_id='mining-clinical-decisions',
    table_schema=table_schema,
    if_exists='replace',
)

In [None]:
# Original remark: "no time col".
# NOTE(review): the upload schema used for this frame lists a `time` column;
# confirm which of the two is correct.
cohort_demo_vitals_labs_long = pd.read_csv('./Data/cohort_demo1hot_vitals_labs_long.csv')
print(cohort_demo_vitals_labs_long.shape[0])
cohort_demo_vitals_labs_long.head()

In [None]:
# Demographics, vitals and labs in the specific long format:
# column names and BigQuery types.
table_schema = [
    {'name': column, 'type': bq_type}
    for column, bq_type in [
        ('jc_uid', 'STRING'),
        ('pat_enc_csn_id_coded', 'INTEGER'),
        ('admit_time', 'TIMESTAMP'),
        ('features', 'STRING'),
        ('time', 'TIMESTAMP'),
        ('values', 'FLOAT'),
        ('feature_type', 'STRING'),
    ]
]

DATASET_NAME = 'traige_TE'
TABLE_NAME = 'triage_cohort_adjusted_demo1hot_vitals_labs_long'

# Upload the combined table; if_exists='replace' overwrites an existing table.
cohort_demo_vitals_labs_long.to_gbq(
    destination_table='{}.{}'.format(DATASET_NAME, TABLE_NAME),
    project_id='mining-clinical-decisions',
    table_schema=table_schema,
    if_exists='replace',
)