In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os 
from google.cloud import bigquery
from tqdm import tqdm
# Tell the Google Cloud libraries which credentials file and project to use.
os.environ.update({
    'GOOGLE_APPLICATION_CREDENTIALS': '/home/ccorbin/.config/gcloud/application_default_credentials.json',
    'GCLOUD_PROJECT': 'som-nero-phi-jonc101',
})

# BigQuery client used by the query cells below.
client = bigquery.Client()



In [2]:
# Evaluation round. "validation" trains on years before 2018 and evaluates on 2018;
# any other value trains on years before 2019 and evaluates on 2019.
ROUND = "validation"
# ROUND = "test"

In [11]:
def read_bq_to_pandas(query, nrows, chunksize=500000):
    """Run `query` on BigQuery in LIMIT/OFFSET pages and return a single DataFrame.

    Parameters
    ----------
    query : str
        SQL text without a trailing LIMIT/OFFSET clause. The text is used
        verbatim, so it may contain literal braces.
        NOTE(review): each page is a separate query, so `query` presumably needs
        an ORDER BY that fully determines row order for pages not to overlap.
    nrows : int
        Number of rows to page through; one query is issued for every offset
        in range(0, nrows, chunksize).
    chunksize : int
        Maximum number of rows requested per page. Must be positive.

    Returns
    -------
    pandas.DataFrame
        The pages concatenated in offset order (the row index is not reset).
        An empty DataFrame when no page is requested.

    Raises
    ------
    ValueError
        If `chunksize` is not positive.
    """
    if chunksize <= 0:
        raise ValueError("chunksize must be a positive integer")

    slices = []
    for offset in tqdm(range(0, nrows, chunksize)):
        # Format only the paging suffix so braces inside `query` are left untouched.
        query_str = query + " LIMIT {} OFFSET {}".format(chunksize, offset)
        query_job = client.query(query_str)
        slices.append(query_job.result().to_dataframe())

    if not slices:
        return pd.DataFrame()
    # Concatenate once at the end instead of re-copying the growing frame per page.
    return pd.concat(slices)

In [None]:
# Long-format feature rows for cohort encounters whose label was observed.
# INNER JOIN rather than RIGHT JOIN: with a RIGHT JOIN, a cohort encounter that has no
# feature rows comes back as an all-NULL `f.*` row. Such rows are identical to one another
# (so they trip the duplicate-row sanity check) and have a NULL year, so neither year
# filter applied later would keep them anyway.
query = """
SELECT f.*, EXTRACT(YEAR from f.index_time) year
FROM mining-clinical-decisions.abx.feature_counts_long f
INNER JOIN  mining-clinical-decisions.abx.final_cohort_table c
USING (pat_enc_csn_id_coded)
WHERE c.label_unobserved = 0
ORDER BY pat_enc_csn_id_coded, features
"""
# NOTE(review): nrows is a hard-coded row count; rows beyond it are never requested.
# Confirm it is still >= the size of the query result before relying on the output.
df = read_bq_to_pandas(query, nrows=2688783, chunksize=500000)

 50%|█████     | 3/6 [02:21<02:21, 47.27s/it]

In [10]:
### Sanity check - the feature frame must not contain any fully duplicated row
assert not df.duplicated().any()

AssertionError: 

In [9]:
# The held-out year depends on the round: 2018 for validation, 2019 otherwise.
# Everything strictly before the held-out year is used for training.
holdout_year = 2018 if ROUND == "validation" else 2019
print(ROUND)
year_column = df['year']
training_examples = df[year_column < holdout_year]
test_examples = df[year_column == holdout_year]


validation


In [None]:
from scipy.sparse import csr_matrix, save_npz

def build_vocab(data):
    """Map every distinct term in `data` to an integer id.

    `data` is an iterable of term sequences. Ids start at 0 and are handed out
    in order of first appearance, so the result has len(result) consecutive ids.
    """
    term_to_index = {}
    for document in data:
        for term in document:
            if term not in term_to_index:
                term_to_index[term] = len(term_to_index)
    return term_to_index

def create_sparse_feature_matrix(train_data, apply_data):
    """Turn a long-form feature frame into a sparse matrix using a training vocabulary.

       The vocabulary (feature name -> column index) is built from `train_data` only
       and then applied to `apply_data`; features in `apply_data` that are not in
       the vocabulary are dropped.

       Parameters
       ----------
       train_data : long form pandas DataFrame
           Data used to build the vocabulary. Needs the columns
           'pat_enc_csn_id_coded' and 'features'.
       apply_data : long form pandas DataFrame
           Data to transform. Needs the columns 'pat_enc_csn_id_coded',
           'features' and 'values'.

       Returns
       -------
       csr_data : scipy csr_matrix
           One row per encounter in `apply_data` (sorted by encounter id) and one
           column per vocabulary entry, so the width always equals len(vocabulary).
       apply_csns : list
           Encounter ids in the same order as the rows of `csr_data`.
       vocabulary : dict
           Feature name -> column index, as built from `train_data`.
    """
    # Only the feature names of the training frame are needed for the vocabulary.
    # groupby sorts by encounter id, which fixes the order in which ids are assigned.
    train_feature_names = train_data.groupby('pat_enc_csn_id_coded').agg({
        'features': lambda x: list(x)})['features'].values
    vocabulary = build_vocab(train_feature_names)

    apply_features = apply_data.groupby('pat_enc_csn_id_coded').agg({
        'features': lambda x: list(x),
        'values': lambda x: list(x)}).reset_index()
    apply_features_names = apply_features['features'].values
    apply_features_values = apply_features['values'].values
    apply_csns = list(apply_features['pat_enc_csn_id_coded'].values)

    # Standard CSR triplet: row i owns the entries indptr[i]:indptr[i + 1].
    indptr = [0]
    indices = []
    data = []
    for names, values in zip(apply_features_names, apply_features_values):
        for term, value in zip(names, values):
            column = vocabulary.get(term)
            if column is None:
                # Feature never seen in the training data: no column for it.
                continue
            indices.append(column)
            data.append(value)
        indptr.append(len(indices))

    # Passing the shape explicitly pins the width to the vocabulary size even when
    # the highest-numbered feature is absent from `apply_data`, with no need to
    # store placeholder zeros.
    csr_data = csr_matrix((data, indices, indptr),
                          shape=(len(apply_csns), len(vocabulary)),
                          dtype=float)

    return csr_data, apply_csns, vocabulary


In [None]:
# Both matrices must share one column space, so the vocabulary is always built from the
# training rows; the held-out rows are only transformed with it. (Building the vocabulary
# from the held-out rows would give columns that do not match the training matrix.)
train_csr, train_csns, train_vocab = create_sparse_feature_matrix(training_examples, training_examples)
test_csr, test_csns, test_and_val_vocab = create_sparse_feature_matrix(training_examples, test_examples)
assert train_csr.shape[1] == test_csr.shape[1], "train and test matrices have different widths"

In [None]:
# Load every column of the cohort table for rows with label_unobserved = 0, in one query.
# The ORDER BY on pat_enc_csn_id_coded matters: a later cell compares these encounter ids
# position by position with the ids returned alongside the sparse feature matrices.
q_cohort = """
SELECT * 
FROM mining-clinical-decisions.abx.final_cohort_table
WHERE label_unobserved = 0
ORDER BY pat_enc_csn_id_coded
"""
query_job = client.query(q_cohort)
df_cohort = query_job.result().to_dataframe()

In [None]:
# Split the cohort labels by year with the same rule used for the feature rows:
# the held-out year is 2018 for the validation round and 2019 otherwise.
label_holdout_year = 2018 if ROUND == 'validation' else 2019
label_years = df_cohort['index_time'].dt.year
train_labels = df_cohort[label_years < label_holdout_year]
test_labels = df_cohort[label_years == label_holdout_year]

# Labels must line up row for row with the feature matrices. np.array_equal also fails
# when the lengths differ, which a zip-based comparison would silently ignore.
assert np.array_equal(train_labels['pat_enc_csn_id_coded'].values, train_csns), \
    "training labels are not aligned with the training feature rows"
assert np.array_equal(test_labels['pat_enc_csn_id_coded'].values, test_csns), \
    "test labels are not aligned with the test feature rows"

In [None]:
from scipy.sparse import csr_matrix, save_npz

path = '/home/ccorbin/er_infection/data/'
os.makedirs(path, exist_ok=True)

# Save feature matrices
for npz_name, sparse_matrix in (('training_examples.npz', train_csr),
                                ('test_examples.npz', test_csr)):
    save_npz(os.path.join(path, npz_name), sparse_matrix)

# Save labels (without the DataFrame index)
for csv_name, label_frame in (('training_labels.csv', train_labels),
                              ('test_labels.csv', test_labels)):
    label_frame.to_csv(os.path.join(path, csv_name), index=None)