In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os 
from google.cloud import bigquery
from tqdm import tqdm
# Point the Google client libraries at an application-default-credentials file and a project
# by setting the corresponding environment variables before the client is created.
# NOTE(review): both values are hard-coded for one user's machine/project; anyone else
# running this notebook has to edit them.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/ccorbin/.config/gcloud/application_default_credentials.json' 
os.environ['GCLOUD_PROJECT'] = 'som-nero-phi-jonc101' 

# BigQuery client shared by the query cells below.
client=bigquery.Client()



In [2]:
# Evaluation round to build folds for. The fold-construction cell treats "validation"
# specially and handles any other value as the test round.
ROUND="validation"
# Alternative setting (swap with the line above to build the test-round folds):
#ROUND="test"

In [3]:
def read_bq_to_pandas(query, nrows, chunksize=500000):
    """Run a BigQuery query page by page and return all pages as one DataFrame.

    The query is executed repeatedly through the notebook-level ``client`` with a
    ``LIMIT {chunksize} OFFSET {offset}`` suffix appended to it.

    Parameters
    ----------
    query : str
        SQL text without a trailing LIMIT/OFFSET clause. It should produce rows
        in a deterministic order so that consecutive pages neither overlap nor
        skip rows.
    nrows : int
        Expected total number of rows. Pages are requested at offsets
        0, chunksize, 2 * chunksize, ... strictly below ``nrows`` (the last page
        may therefore reach past ``nrows``).
    chunksize : int, default 500000
        Number of rows requested per page.

    Returns
    -------
    pandas.DataFrame
        The pages concatenated in offset order. Each page keeps its own index,
        so index labels repeat across pages. An empty DataFrame is returned when
        no page is requested (``nrows <= 0``).
    """
    pages = []
    for offset in tqdm(range(0, nrows, chunksize)):
        # Format only the paging suffix: running str.format over the whole query
        # would choke on (or rewrite) any literal braces in the SQL text.
        paging = " LIMIT {chunksize} OFFSET {offset}".format(chunksize=chunksize, offset=offset)
        pages.append(client.query(query + paging).result().to_dataframe())

    if not pages:
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the growing frame per page.
    return pd.concat(pages)

In [4]:
# Pull the long-format feature table joined to the final cohort table, adding the
# calendar year of index_time as `year` (used later to split folds by year).
# NOTE(review): the RIGHT JOIN keeps every cohort row; cohort encounters without any
# feature rows presumably come back with NULL f.* columns and a NULL year — confirm.
# The ORDER BY gives the LIMIT/OFFSET paging in read_bq_to_pandas a row order to page over;
# this is only fully deterministic if (pat_enc_csn_id_coded, features) pairs are unique — TODO confirm.
# NOTE(review): nrows is hard-coded, presumably the row count of this query when the
# notebook was written; rows past the last requested page would not be fetched.
query = """
SELECT f.*, EXTRACT(YEAR from f.index_time) year
FROM mining-clinical-decisions.abx.feature_counts_long f
RIGHT JOIN  mining-clinical-decisions.abx.final_cohort_table c
USING (pat_enc_csn_id_coded)
ORDER BY pat_enc_csn_id_coded, features
"""
df = read_bq_to_pandas(query, nrows=7754289, chunksize=500000)

100%|██████████| 16/16 [10:48<00:00, 40.55s/it]


In [5]:
### Sanity check - the pulled feature table must not contain any fully duplicated row
assert not df.duplicated().any()

In [6]:
# Fold years start at 2009 and run up to (not including) end_year:
# 2009-2018 for the validation round, 2009-2019 otherwise.
end_year = 2019 if ROUND == "validation" else 2020
fold_years = list(range(2009, end_year))

print(ROUND)
folds = []
for year in fold_years:
    # Hold out one year as the test split; train on every other year before end_year.
    test_examples = df[(df['year'] == year)]
    training_examples = df[(df['year'] < end_year) & (df['year'] != year)]
    folds.append({'training_examples' : training_examples, 'test_examples' : test_examples})

validation


In [7]:
from scipy.sparse import csr_matrix, save_npz

def build_vocab(data):
    """Map every distinct term in ``data`` to an integer id.

    ``data`` is an iterable of term sequences. Ids are handed out in order of
    first appearance, starting at 0, so the result has ids 0..len(vocab)-1.
    """
    term_to_id = {}
    for terms in data:
        for term in terms:
            if term not in term_to_id:
                # Next free id equals the number of terms seen so far.
                term_to_id[term] = len(term_to_id)
    return term_to_id

def create_sparse_feature_matrix(train_data, apply_data):
    """Creates a sparse matrix from a long form dataframe. We build a vocabulary
       from the training set, then apply that vocabulary to the apply set.

       Both frames need the columns 'pat_enc_csn_id_coded', 'features' and 'value'.
       Rows are grouped by 'pat_enc_csn_id_coded' (groups come out sorted by that key);
       each group becomes one matrix row.

       Parameters
       ----------
       train_data : long form pandas DataFrame
           Data to use to build vocabulary (feature name -> column index)
       apply_data : long form pandas DataFrame
           Data to transform to sparse matrix for input to ML models

       Returns
       -------
       csr_data : scipy csr_matrix
           Sparse matrix version of apply_data, one row per encounter in apply_data
           and exactly len(vocabulary) columns. Features of apply_data that are not
           in the vocabulary are dropped.
       apply_csns : list
           The 'pat_enc_csn_id_coded' value of each matrix row, in row order.
       vocabulary : dict
           Mapping from feature name to column index, built from train_data only.
    """
    # Feature names per training encounter; only these are needed for the vocabulary.
    train_feature_names = list(
        train_data.groupby('pat_enc_csn_id_coded')['features'].apply(list).values)

    # Feature names and values per apply encounter. Both come from the same
    # groupby, so they share the same encounter order.
    apply_groups = apply_data.groupby('pat_enc_csn_id_coded')
    apply_names_by_csn = apply_groups['features'].apply(list)
    apply_values_by_csn = apply_groups['value'].apply(list)
    apply_csns = list(apply_names_by_csn.index.values)

    vocabulary = build_vocab(train_feature_names)

    indptr = [0]
    indices = []
    data = []
    for names, values in zip(apply_names_by_csn.values, apply_values_by_csn.values):
        for term, value in zip(names, values):
            column = vocabulary.get(term)
            if column is None:
                # Feature never seen in training: it has no column.
                continue
            indices.append(column)
            data.append(value)
        indptr.append(len(indices))

    # Fix the shape explicitly. Letting scipy infer the width from the largest
    # column index present gives too few columns whenever the last vocabulary
    # entry does not occur in apply_data.
    csr_data = csr_matrix((data, indices, indptr),
                          shape=(len(apply_csns), len(vocabulary)),
                          dtype=float)

    return csr_data, apply_csns, vocabulary


In [8]:
# For every fold, turn the long-format splits into sparse matrices.
# The vocabulary always comes from the fold's training examples.
fold_data = []
for fold in folds:
    train_csr, train_csns, train_vocab = create_sparse_feature_matrix(
        fold['training_examples'], fold['training_examples'])
    test_csr, test_csns, test_and_val_vocab = create_sparse_feature_matrix(
        fold['training_examples'], fold['test_examples'])

    fold_data.append(dict(
        train_csr=train_csr,
        train_csns=train_csns,
        train_vocab=train_vocab,
        test_csr=test_csr,
        test_csns=test_csns,
        test_and_val_vocab=test_and_val_vocab,
    ))
    

In [9]:
# Load the whole cohort table, sorted by encounter id; the per-fold label frames
# are sliced from this DataFrame in the next cell.
# NOTE(review): the ORDER BY presumably exists so that label rows line up with the
# encounter order of the sparse feature matrices — confirm.
q_cohort = """
SELECT * 
FROM mining-clinical-decisions.abx.final_cohort_table
ORDER BY pat_enc_csn_id_coded
"""
query_job = client.query(q_cohort)
df_cohort = query_job.result().to_dataframe()

In [10]:
print(ROUND)
# Year of each cohort row's index_time; computed once because it is the same for every fold.
index_year = df_cohort['index_time'].dt.year
fold_labels = []
for i, year in enumerate(fold_years):
    # Same split rule as for the feature folds: hold out `year`, train on the other years before end_year.
    training_labels = df_cohort[(index_year < end_year) & (index_year != year)]
    test_labels = df_cohort[(index_year == year)]
    fold_labels.append({'training_labels' : training_labels, 'test_labels' : test_labels})

    # Sanity checks: label rows must match the feature-matrix rows one-to-one and in order.
    # The lengths are asserted explicitly because zip() stops at the shorter sequence,
    # so an element-wise comparison alone would not notice missing trailing rows.
    for split_name, split_labels, split_csns in (
            ('training', training_labels, fold_data[i]['train_csns']),
            ('test', test_labels, fold_data[i]['test_csns'])):
        label_csns = split_labels['pat_enc_csn_id_coded'].values
        assert len(label_csns) == len(split_csns), (
            "fold {}: {} labels have {} rows but the feature matrix has {}".format(
                year, split_name, len(label_csns), len(split_csns)))
        assert all(a == b for a, b in zip(label_csns, split_csns)), (
            "fold {}: {} label encounter ids are not aligned with the feature matrix rows".format(
                year, split_name))

validation


In [13]:
from scipy.sparse import csr_matrix, save_npz

# Output location template; the placeholder is filled with the fold's held-out year.
base_path = '/home/ccorbin/er_infection/data/censored_task/{}'

for i, year in enumerate(fold_years):
    path = base_path.format(year)
    os.makedirs(path, exist_ok=True)

    matrices, labels = fold_data[i], fold_labels[i]

    # Sparse feature matrices (.npz)
    save_npz(os.path.join(path, 'training_examples.npz'), matrices['train_csr'])
    save_npz(os.path.join(path, 'test_examples.npz'), matrices['test_csr'])

    # Label tables (.csv, written without the index column)
    labels['training_labels'].to_csv(os.path.join(path, 'training_labels.csv'), index=None)
    labels['test_labels'].to_csv(os.path.join(path, 'test_labels.csv'), index=None)