## Imports

In [None]:
import sys
import time
import importlib
import sparse
import datetime
import pandas as pd
import numpy as np
import scipy.sparse
import matplotlib.pyplot as plt

import Utils.dbutils as dbutils
import Utils.data_utils as data_utils
# import Utils.PopulateAux as PopulateAux
import Generators.CohortGenerator as CohortGenerator
import Generators.FeatureGenerator as FeatureGenerator
import config
# Project-local modules are reloaded on every run of this cell so that edits
# made to them on disk are picked up without restarting the kernel.
local_imports = (
    dbutils,
    data_utils,
    # PopulateAux,
    CohortGenerator,
    FeatureGenerator,
    config
)
for local_module in local_imports:
    importlib.reload(local_module)

## Cohort, Outcome and Feature Collection

### 1. Set up a connection to the OMOP CDM database

The connection parameters are specified in ./config.py

In [None]:
# database connection: credentials and database location are read from config.py
username, password, database_name = (
    config.PG_USERNAME,
    config.PG_PASSWORD,
    config.DB_NAME,
)
connection_template = 'postgresql://{username}:{password}@{database_name}'
config_path = connection_template.format(
    username=username,
    password=password,
    database_name=database_name,
)

# schema in which every table created by this notebook will live
schema_name = 'flexible_windows_fixed_test'

# caching: when True, the schema is dropped and all data is rebuilt from scratch
reset_schema = False

# connect, optionally wipe the schema, then make sure it exists
db = dbutils.Database(config_path, schema_name)
if reset_schema:
    drop_statement = 'drop schema if exists {} cascade'.format(schema_name)
    db.execute(drop_statement)
create_statement = 'create schema if not exists {}'.format(schema_name)
db.execute(create_statement)

### Build a Cohort for the End of Life Prediction Task

##### Create a Cohort Object that can be constructed as specified by SQL File & Params

In [None]:
cohort_name = 'test_cohort_all_prediction_times_heldout'
cohort_script_path = config.SQL_PATH_COHORTS + '/test_cohort_all_prediction_times_heldout.sql'

# Parameters handed to the cohort-generation script
params = dict(
    cohort_table_name=cohort_name,
    schema_name=schema_name,
    gap='3 months',
    outcome_window='6 months',
    eligibility_period='1 year',        # enrollment duration before prediction time
    positive_pred_unif='true',          # if 'true', then uniformly sample deltas...
    positive_pred_delta='6 months',     # ...otherwise, fixed interval
    # negative_pred_date='2017-01-01',
    dummy_date='1900-01-01',            # can be arbitrary, but must be consistent with feature SQL scripts
    max_prediction_date='2017-01-01',   # only use prediction times up to this parameter
)

# The cohort object only records how to build itself here; nothing is run yet.
cohort = CohortGenerator.Cohort(
    cohort_generation_script=cohort_script_path,
    cohort_generation_kwargs=params,
    cohort_table_name=cohort_name,
    schema_name=schema_name,
    outcome_col_name='y',
)

##### Build Cohort

In [None]:
# Build the cohort against the database.
# NOTE(review): replace=True presumably rebuilds the cohort table even if it already exists -- confirm in CohortGenerator.
cohort.build(db, replace=True)

In [None]:
# Number of cohort rows for each value of the outcome column `y`
cohort._cohort.y.value_counts()

In [None]:
import matplotlib.dates as mdates

# Histogram of end dates for rows with y == 1, with one x-axis tick per year
positive_rows = cohort._cohort[cohort._cohort.y == 1]
plt.hist(pd.to_datetime(positive_rows.end_date))

date_axis = plt.gca().xaxis
date_axis.set_major_locator(mdates.YearLocator(1))
date_axis.set_major_formatter(mdates.DateFormatter("%d %b %Y"))
plt.gcf().autofmt_xdate()

plt.title("Prediction time for positive samples")
plt.show()

In [None]:
import matplotlib.dates as mdates

# Histogram of end dates for rows with y == 0, with one x-axis tick per year
negative_rows = cohort._cohort[cohort._cohort.y == 0]
plt.hist(pd.to_datetime(negative_rows.end_date))

date_axis = plt.gca().xaxis
date_axis.set_major_locator(mdates.YearLocator(1))
date_axis.set_major_formatter(mdates.DateFormatter("%d %b %Y"))
plt.gcf().autofmt_xdate()

plt.title("Prediction time for negative samples")
plt.show()

In [None]:
# Range of end dates, reported separately for each outcome class
positive_end_dates = cohort._cohort[cohort._cohort.y == 1].end_date
negative_end_dates = cohort._cohort[cohort._cohort.y == 0].end_date

print("Min end date for positive samples:", min(positive_end_dates))
print("Max end date for positive samples:", max(positive_end_dates))
print("Min end date for negative samples:", min(negative_end_dates))
print("Max end date for negative samples:", max(negative_end_dates))

### Get a Time Series of Features for Cohort Members

In [None]:
# Register the four default feature families on a fresh feature set
# bound to the database connection.
default_feature_names = [
    'drugs_relative',
    'conditions_relative',
    'procedures_relative',
    'specialty_relative',
]
featureSet = FeatureGenerator.FeatureSet(db)
featureSet.add_default_features(default_feature_names, schema_name, cohort_name)

In [None]:
%%time
# Build the Feature Set by executing SQL queries and reading into sparse matrices
# NOTE(review): from_cached=False presumably forces a fresh build, with cache_file naming
# where the result is cached -- confirm in FeatureGenerator.
cache_data_path = '/tmp/cache_data_flexible_fixed_test'
featureSet.build(cohort, from_cached=False, cache_file=cache_data_path)

In [None]:
# Number of entries in the feature set's time map
len(featureSet.time_map)

In [None]:
%%time
# Post-process the built feature set into the four objects used by the rest of the notebook:
# the filtered outcomes, the feature matrix, a remapping, and the retained feature names.
# NOTE(review): the exact filtering rule and the role of training_end_date_col are defined
# in FeatureGenerator.postprocess_feature_matrix -- confirm there.
outcomes_filt, feature_matrix_3d_transpose, remap, good_feature_names = \
    FeatureGenerator.postprocess_feature_matrix(cohort, featureSet, training_end_date_col='dummy_date')

### Build a Model to Predict End of Life using this data

#### Transform the data to get, for each patient, a vector of feature counts over 1-, 6-, 12- and 24-month windows plus an unbounded window (represented by 10000 days)

In [None]:
%%time
# Window lengths in days: roughly 1, 6, 12 and 24 months, plus 10000 days
# standing in for "all available history".
window_lengths_days = [30, 180, 365, 730, 10000]

feature_matrix_counts, feature_names = data_utils.window_data(
    feature_matrix=feature_matrix_3d_transpose,
    all_feature_names=good_feature_names,
    window_lengths=window_lengths_days,
    cohort=cohort,
    featureSet=featureSet,
    cohort_end_date_col='dummy_date',
)

In [None]:
# Display the transposed count matrix; this transposed form is what is later passed to train_test_split as X
feature_matrix_counts.T

In [None]:
# Number of samples for each value of the filtered outcomes
outcomes_filt.value_counts()

#### Set up a standard sklearn modelling pipeline

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, average_precision_score

# L1-penalised, class-balanced logistic regression fitted with the liblinear solver
lr_settings = dict(
    penalty='l1', solver='liblinear', C=0.02,
    class_weight='balanced', fit_intercept=True,
    max_iter=200, tol=1e-1,
    random_state=0, verbose=0,
)
lr = LogisticRegression(**lr_settings)

def sparse_ufunc(f):
    """Lift an array function so that it operates on a scipy sparse matrix.

    The returned wrapper copies the sparse matrix it is given, applies ``f``
    to the copy's ``.data`` array (the explicitly stored entries only), and
    returns the copy. The input matrix is never modified. Entries that are
    not stored are never passed to ``f``, so they remain zero in the result.

    Parameters
    ----------
    f : callable
        Called as ``f(data, *args, **kwargs)``; its return value becomes the
        ``.data`` array of the result.

    Returns
    -------
    callable
        ``wrapper(X, *args, **kwargs)`` returning a transformed copy of ``X``.

    Raises
    ------
    ValueError
        Raised by the wrapper when ``X`` is not a scipy sparse matrix.
    """
    # Local import: functools is not among this notebook's top-level imports.
    import functools

    # functools.wraps keeps f's __name__/__qualname__ on the wrapper, which
    # lets pickle locate a module-level decorated function by name; without
    # it the wrapper is a "local object" and cannot be pickled.
    @functools.wraps(f)
    def wrapper(X, *args, **kwargs):
        if not scipy.sparse.isspmatrix(X):
            raise ValueError(
                'expected a scipy.sparse matrix, got {}'.format(type(X).__name__)
            )
        X2 = X.copy()
        X2.data = f(X2.data, *args, **kwargs)
        return X2
    return wrapper

@sparse_ufunc
def tr_func(X, kwarg=1):
    """Clip the values of ``X`` to the interval ``[0, kwarg]``.

    Because of the ``sparse_ufunc`` decorator, callers pass a sparse matrix
    and ``X`` here is that matrix's ``.data`` array.
    """
    lower_bound, upper_bound = 0, kwarg
    return np.clip(X, lower_bound, upper_bound)

# Transformer that applies tr_func with kwarg=1, i.e. clips every stored count to [0, 1]
transformer_settings = dict(
    accept_sparse=True,
    validate=True,
    kw_args=dict(kwarg=1),
)
func = FunctionTransformer(func=tr_func, **transformer_settings)

# The classifier will transform each data point using func, which here takes a count vector to a binary vector
# Then, it will use logistic regression to classify the transformed data
pipeline_steps = [('func', func), ('lr', lr)]
clf = Pipeline(pipeline_steps)

#### Instantiate, Train and Test Model

In [None]:
from tqdm import tqdm_notebook

In [None]:
# Split samples into train and test, carrying along each sample's original position
indices = range(len(outcomes_filt))
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    feature_matrix_counts.T, outcomes_filt, indices,
    test_size=0.33, random_state=1
)

# The first VAL_SZ test rows are used to choose C; the remaining rows are
# the held-out set on which the selected model is finally scored.
VAL_SZ = 10000
X_select, y_select = X_test[:VAL_SZ, :], y_test[:VAL_SZ]
X_holdout, y_holdout = X_test[VAL_SZ:, :], y_test[VAL_SZ:]

vals = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0]
scores = []
best_score, best_model = -1, None
for C in tqdm_notebook(vals):
    lr = LogisticRegression(
        penalty='l1', solver='liblinear', C=C,
        class_weight='balanced', fit_intercept=True,
        max_iter=200, tol=1e-1,
        random_state=0, verbose=0
    )
    clf = Pipeline([('func', func), ('lr', lr)])
    clf.fit(X_train, y_train)

    score = roc_auc_score(y_select, clf.predict_proba(X_select)[:, 1])
    scores.append(score)
    # keep the first model that attains the highest selection AUC
    if score > best_score:
        best_score, best_model = score, clf

# selection AUC as a function of log(C)
plt.plot(np.log(np.array(vals)), scores)
plt.show()

pred = best_model.predict_proba(X_holdout)[:, 1]
print('Model Test AUC: {0:.2f}'.format(roc_auc_score(y_holdout, pred)))

#### Evaluate Model : ROC and Precision-Recall Curves

In [None]:
# ROC curve on the held-out test rows (those after the first VAL_SZ)
y_eval = y_test[VAL_SZ:]
fpr, tpr, _ = roc_curve(y_eval, pred)
roc_label = 'ROC curve (area = %0.2f)' % roc_auc_score(y_eval, pred)
lw = 2

plt.figure()
plt.fill_between(fpr, tpr, color='b', alpha=0.2, lw=lw, label=roc_label)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance-level diagonal

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

plt.title('Receiver operating characteristic - EoL Model')
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.metrics import average_precision_score

# Precision-recall curve on the held-out test rows (those after the first VAL_SZ)
y_eval = y_test[VAL_SZ:]
precision, recall, _ = precision_recall_curve(y_eval, pred)
average_precision = average_precision_score(y_eval, pred)
pr_label = 'P-R curve (average precision = %0.2f)' % average_precision

plt.figure()
plt.step(recall, precision, where='post', color='b', alpha=0.2)
plt.fill_between(recall, precision, color='b', alpha=0.2, label=pr_label)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])

plt.legend(loc="upper right")
plt.title('Precision-Recall curve - EoL Model')
plt.show()

## SARD test set
From Rohan's notebook

In [None]:
import pickle

# Location of the pickled SARD end-of-life data; replace the placeholder
# with the directory that actually holds the file.
eol_data_path = '{INSERT PATH HERE}/eol_all_data.data'

# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open(eol_data_path, "rb") as f:
    SAVED_MODEL_DATA = pickle.load(f)

In [None]:
# Pull the four saved objects out of the loaded dictionary.
# NOTE(review): the earlier note on this cell described `cohort.cohort_` as a dataframe
# containing PID's and outcomes for each patient, while the rest of this notebook reads
# `cohort._cohort` -- confirm the attribute name for the saved cohort.
cohort_sard, featureSet_sard, dataset_dict, feature_matrix_3d_transpose_sard = [
    SAVED_MODEL_DATA[key]
    for key in ('cohort', 'featureSet', 'dataset_dict', 'feature_matrix_3d_transpose')
]

In [None]:
# collect feature names: look up every entry of the concept remap in the concept map
concept_lookup = dataset_dict['maps']['concept'].get
good_feature_names_sard = np.vectorize(concept_lookup)(dataset_dict['remap']['concept'])

# get feature counts over the given time windows
# The windows are fixed because this set does consistently better; other settings
# can be chosen from the list of options in the paper.
sard_window_lengths = [30, 180, 365, 730, 10000]
sard_counts, feature_names_sard = data_utils.window_data_sorted(
    feature_matrix=feature_matrix_3d_transpose_sard,
    all_feature_names=good_feature_names_sard,
    window_lengths=sard_window_lengths,
    cohort=cohort_sard,
    featureSet=featureSet_sard,
)
# keep the transposed form under the original name
feature_matrix_counts_sard = sard_counts.T

In [None]:
from scipy.sparse import vstack as vstack_spm

In [None]:
# create train, test and validate sets of patients
val_size = 5000
# The validation set may be too small, so the last `val_size_from_train`
# training examples are moved into it as well.
val_size_from_train = 14319

indices_all_sard = range(len(dataset_dict['outcomes_filt']))
(X_fit_sard, X_heldout_sard,
 y_fit_sard, y_heldout_sard,
 indices_train_sard, indices_test_sard) = train_test_split(
    feature_matrix_counts_sard, dataset_dict['outcomes_filt'], indices_all_sard,
    test_size=0.2, random_state=1
)

# validation = tail of the initial train split + head of the initial test split
X_val_sard = vstack_spm((X_fit_sard[-val_size_from_train:], X_heldout_sard[:val_size]))
y_val_sard = pd.concat((y_fit_sard[-val_size_from_train:], y_heldout_sard[:val_size]))

# train = initial train split without its tail
X_train_sard = X_fit_sard[:-val_size_from_train]
y_train_sard = y_fit_sard[:-val_size_from_train]

# test = initial test split without its head
X_test_sard = X_heldout_sard[val_size:]
y_test_sard = y_heldout_sard[val_size:]

In [None]:
# Compare the number of feature columns in the SARD test matrix with that of this notebook's test matrix
print("SARD dataset dimension: %d, our dataset dimension: %d" % (X_test_sard.shape[1], X_test.shape[1]))

In [None]:
# Number of distinct feature names present in both this notebook's feature list and the SARD feature list
print("Common features: %d" % (len(set(feature_names).intersection(set(feature_names_sard)))))

In [None]:
## Reindex SARD test set to fit into this model
# Map each SARD feature name to its column position.
feature_index_sard = {name: position for position, name in enumerate(feature_names_sard)}

# For every feature the model knows, record the SARD column holding it. Features
# missing from SARD point one past the last SARD column (will add zero column later on).
missing_column = len(feature_names_sard)
reindex = [feature_index_sard.get(ft, missing_column) for ft in feature_names]

In [None]:
from scipy.sparse import hstack, csr_matrix

# Append a single all-zero column to the right of the SARD test matrix
n_sard_rows = X_test_sard.shape[0]
zero_column = csr_matrix((n_sard_rows, 1))
X_test_sard_with_zero = csr_matrix(hstack([X_test_sard, zero_column]))

In [None]:
# Pick the columns listed in `reindex`, in that order, from the zero-padded SARD test matrix
X_test_sard_reindexed = X_test_sard_with_zero[:, reindex]

In [None]:
# Score the reindexed SARD test set with the selected model, then report both metrics
sard_scores = best_model.predict_proba(X_test_sard_reindexed)[:, 1]
sard_auc = roc_auc_score(y_test_sard, sard_scores)
sard_average_precision = average_precision_score(y_test_sard, sard_scores)
print('AUC on SARD test set: {0:.2f}'.format(sard_auc))
print('Average precision on SARD test set: %.2f' % (sard_average_precision))

#### Evaluate Model : Extract Feature Weights

In [None]:
# Pair every feature name with its coefficient in the selected model's
# logistic-regression step, sorted by weight in ascending order.
feature_weights = pd.DataFrame({
    'feature_name': feature_names,
    'feature_weight': list(best_model.get_params()['lr'].coef_[0])
}).sort_values(by='feature_weight')

# Show full feature names instead of truncating them. None is the supported
# "no limit" value since pandas 1.0; -1 is deprecated there and rejected by
# pandas 2.x, so it is only used as a fallback for older pandas versions.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [None]:
# feature_weights is sorted ascending by weight, so the last 20 rows are the 20 largest weights
feature_weights.tail(20)

In [None]:
# feature_weights is sorted ascending by weight, so the first 20 rows are the 20 smallest (most negative) weights
feature_weights.head(20)

In [None]:
# For each feature category, count how many of its features kept a nonzero weight.
# A feature belongs to a category when its name contains '- <category> -'.
for category in ['procedure', 'condition', 'drug', 'specialty']:
    category_marker = '- ' + category + ' -'
    category_rows = [
        position
        for position, name in enumerate(feature_names)
        if category_marker in name
    ]
    category_weights = feature_weights.loc[category_rows]['feature_weight']
    nonzero_count = sum(category_weights != 0)
    print('Number of nonzero {} feature weights : {}'.format(category, nonzero_count))