In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from fhir_client import FHIRClient
import logging
import pandas as pd

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

client = FHIRClient(service_base_url='https://r3.smarthealthit.org', logger=logger)



## Querying Patients
There are two general ways of searching for patients with specific properties.

The first one is to search by coding system:

In [48]:
pats = client.get_all_patients()

INFO:root:Received 1 observations in 0.64 seconds.


{'coding': [{'system': 'http://loinc.org', 'code': '69453-9', 'display': 'Cause of Death [US Standard Certificate of Death]'}], 'text': 'Cause of Death [US Standard Certificate of Death]'}


INFO:root:Received 0 observations in 1.53 seconds.


IndexError: list index out of range

In [8]:
procedures = client.get_all_procedures()
pd.DataFrame([prod.code['coding'][0] for prod in procedures]).drop_duplicates().sort_values(by=['display']).head()

INFO:root:Received 18117 procedures in 133.63 seconds.


Unnamed: 0,code,display,system
893,183450002,Admission to burn unit,http://snomed.info/sct
1911,305340004,Admission to long stay hospital,http://snomed.info/sct
83,305428000,Admission to orthopedic department,http://snomed.info/sct
6217,305433001,Admission to trauma surgery department,http://snomed.info/sct
13687,35637008,Alcohol rehabilitation,http://snomed.info/sct


In [5]:
patients_by_procedure_code = client.get_patients_by_procedure_code("http://snomed.info/sct","73761001")
"Retrieved {} patients with a total of {} observations".format( len(patients_by_procedure_code), 
                                                               sum([len(pat.observations) for pat in patients_by_procedure_code]))

INFO:root:Received 47 observations in 0.20 seconds.
INFO:root:Received 165 observations in 0.67 seconds.
INFO:root:Received 162 observations in 0.68 seconds.
INFO:root:Received 54 observations in 2.05 seconds.
INFO:root:Received 178 observations in 2.80 seconds.
INFO:root:Received 63 observations in 0.35 seconds.
INFO:root:Received 45 observations in 0.20 seconds.
INFO:root:Received 48 observations in 0.20 seconds.
INFO:root:Received 61 observations in 0.33 seconds.
INFO:root:Received 98 observations in 0.54 seconds.
INFO:root:Received 155 observations in 0.87 seconds.
INFO:root:Received 49 observations in 0.20 seconds.
INFO:root:Received 89 observations in 0.39 seconds.
INFO:root:Received 41 observations in 0.19 seconds.
INFO:root:Received 139 observations in 0.53 seconds.
INFO:root:Received 33 observations in 0.18 seconds.
INFO:root:Received 163 observations in 0.65 seconds.
INFO:root:Received 163 observations in 0.78 seconds.
INFO:root:Received 37 observations in 0.19 seconds.
INFO:

INFO:root:Received 201 observations in 0.81 seconds.
INFO:root:Received 55 observations in 0.33 seconds.
INFO:root:Received 113 observations in 0.49 seconds.
INFO:root:Received 162 observations in 0.76 seconds.
INFO:root:Received 24 observations in 0.45 seconds.
INFO:root:Received 149 observations in 0.53 seconds.
INFO:root:Received 52 observations in 1.97 seconds.
INFO:root:Received 52 observations in 2.24 seconds.
INFO:root:Received 195 observations in 1.46 seconds.
INFO:root:Received 41 observations in 0.19 seconds.
INFO:root:Received 147 observations in 0.52 seconds.
INFO:root:Received 52 observations in 0.43 seconds.
INFO:root:Received 33 observations in 0.32 seconds.
INFO:root:Received 62 observations in 0.33 seconds.
INFO:root:Received 155 observations in 0.84 seconds.
INFO:root:Received 64 observations in 0.49 seconds.
INFO:root:Received 86 observations in 0.59 seconds.
INFO:root:Received 36 observations in 0.31 seconds.
INFO:root:Received 34 observations in 0.20 seconds.
INFO:

'Retrieved 252 patients with a total of 23145 observations'

The second one is by text. The searched text will be `CodeableConcept.text`, `Coding.display`, or `Identifier.type.text`:

In [29]:
#conditions = client.get_all_conditions()
pd.DataFrame([cond.code['coding'][0] for cond in conditions]).groupby('code').count().sort_values(by='display', ascending=False).head(50)

Unnamed: 0_level_0,display,system
code,Unnamed: 1_level_1,Unnamed: 2_level_1
444814009,1128,1128
195662009,590,590
10509002,502,502
15777000,418,418
38341003,405,405
40055000,271,271
65363002,211,211
72892002,209,209
43878008,178,178
44465007,123,123


In [18]:
patients_by_condition_text = client.get_patients_by_condition_text("Abdominal pain")
"Retrieved {} patients with a total of {} observations".format( len(patients_by_condition_text), 
                                                               sum([len(pat.observations) for pat in patients_by_condition_text]))


INFO:root:Received 154 observations in 0.63 seconds.
INFO:root:Received 59 observations in 0.32 seconds.
INFO:root:Received 162 observations in 0.65 seconds.
INFO:root:Received 206 observations in 0.83 seconds.
INFO:root:Received 4 patients in 3.04 seconds.


'Retrieved 4 patients with a total of 581 observations'

## Machine Learning

In [20]:
from ml_on_fhir import MLOnFHIR
from fhir_objects.patient import Patient
from sklearn.tree import DecisionTreeClassifier

ml_fhir = MLOnFHIR(Patient, feature_attrs=['birthDate'], label_attrs=['gender'])
X, y, trained_clf = ml_fhir.fit(patients_by_condition_text, DecisionTreeClassifier())

from sklearn.metrics import accuracy_score, roc_curve, auc
fpr, tpr, _ = roc_curve(y, trained_clf.predict(X))
print("Prediction accuracy {}".format( auc(fpr, tpr) ) )

INFO:root:Extracting attributes from data set
INFO:root:Preprocessing data
INFO:root:Started training of clf
INFO:root:Training completed


Prediction accuracy 1.0


## Custom Preprocessing Classes

#### The first five values of the ` birthDate` feature that has been preprocessed into an age in years:

In [6]:
X[:5]

array([[57],
       [56],
       [72],
       [61],
       [77]])

If you want to preprocess fhir resources differently, feel free to implement your own preprocessing class. 
We can use the `register_preprocessor` function to do so. It is crucial to stick to the following naming scheme: 
    
    "<FHIR_Object_Name><FHIR_Resource_Name>Processor"
    
Furthemore, at least the methods `fit` and `transform` have to be implemented.

In [17]:
from preprocessing import register_preprocessor

from sklearn.base import BaseEstimator
import datetime as dt
import numpy as np
from fhir_objects.fhir_resources import date_format

class PatientBirthdateProcessor(BaseEstimator):
    """
    Calculates the age in days to use birthdate as a feature 
    """
    def transform(self, X, **transform_params):
        ages = []
        for birthdate in X:
            b_date = dt.datetime.strptime(birthdate[0], date_format)
            ages.append([int(
                            (dt.datetime.now().date() - b_date.date()).days)])
        return np.array(ages)

    def fit(self, X, y=None, **fit_params):
        return self
    
register_preprocessor(PatientBirthdateProcessor)



<module 'preprocessing' from '/Users/bockc/Projects/ml-on-fhir/src/preprocessing.py'>


In [8]:
ml_fhir = MLOnFHIR(Patient, feature_attrs=['birthDate'], label_attrs=['gender'])
X, y, trained_clf = ml_fhir.fit(patients_by_procedure_code, DecisionTreeClassifier())

INFO:root:Extracting attributes from data set
INFO:root:Preprocessing data
INFO:root:Started training of clf
INFO:root:Training completed


#### The `birthDate` feature is now the age in days:

In [9]:
X[:5]

array([[21140],
       [20769],
       [26591],
       [22512],
       [28271]])

# Experimental Section

In [1]:
%load_ext autoreload

In [3]:
%autoreload 2

import preprocessing
from preprocessing import register_preprocessor, PatientProcessorFactory

from sklearn.base import BaseEstimator
import datetime as dt
import numpy as np
from fhir_objects.fhir_resources import date_format

from fhir_client import FHIRClient
import logging
logging.basicConfig(level=logging.INFO)
import pandas as pd
 
class ObservationBmiProcessor(BaseEstimator):
    """
    Gets the latest bmi 
    """
    def __init__(self):
        self.patient_attribute_name = 'bmiLatest'
        
    def transform(self, X, **transform_params):
        condition = lambda x: x.code['coding'][0]['display']=='bmi'
        bmis = list(filter(condition, X))
        bmis = sorted(bmis, reverse=True)
        if len(bmis) >= 1:
            return self.patient_attribute_name, float(bmis[0].valueQuantity['value'])
        else:
            return self.patient_attribute_name, 0.0

    def fit(self, X, y=None, **fit_params):
        return self


class ObservationWeightProcessor(BaseEstimator):
    """
    Gets the latest weight 
    """
    def __init__(self):
        self.patient_attribute_name = 'weightLatest'
        
    def transform(self, X, **transform_params):
        condition = lambda x: x.code['coding'][0]['display']=='Body Weight'
        weights = list(filter(condition, X))
        weights = sorted(weights, reverse=True)
        if len(weights) >= 1:
            return self.patient_attribute_name, float(weights[0].valueQuantity['value'])
        else:
            return self.patient_attribute_name, 0.0

    def fit(self, X, y=None, **fit_params):
        return self

class ObservationHeightProcessor(BaseEstimator):
    """
    Gets the latest height 
    """
    def __init__(self):
        self.patient_attribute_name = 'heightLatest'
        
    def transform(self, X, **transform_params):
        condition = lambda x: x.code['coding'][0]['display']=='Body Height'
        heights = list(filter(condition, X))
        heights = sorted(heights, reverse=True)
        if len(heights) >= 1:
            return self.patient_attribute_name, float(heights[0].valueQuantity['value'])
        else:
            return self.patient_attribute_name, 0.0

    def fit(self, X, y=None, **fit_params):
        return self
    
    
register_preprocessor(ObservationBmiProcessor)
register_preprocessor(ObservationWeightProcessor)
register_preprocessor(ObservationHeightProcessor)


logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
client = FHIRClient(service_base_url='https://r3.smarthealthit.org', logger=logger, preprocessor=preprocessing)


INFO:root:Adding Patient Processor for ObservationBmiProcessor
INFO:root:Name of patient processor will be PatientbmiLatestProcessor
INFO:root:Adding Patient Processor for ObservationWeightProcessor
INFO:root:Name of patient processor will be PatientweightLatestProcessor
INFO:root:Adding Patient Processor for ObservationHeightProcessor
INFO:root:Name of patient processor will be PatientheightLatestProcessor


In [5]:
patients_by_condition_text = client.get_patients_by_condition_code(system="http://snomed.info/sct", code='284549007')

from ml_on_fhir import MLOnFHIR
from fhir_objects.patient import Patient
from sklearn.tree import DecisionTreeClassifier

ml_fhir = MLOnFHIR(Patient, feature_attrs=['weightLatest', 'heightLatest', 'birthDate'], label_attrs=['gender'])
X, y, trained_clf = ml_fhir.fit(patients_by_condition_text, DecisionTreeClassifier())

X

INFO:root:Received 55 observations in 0.93 seconds.
INFO:root:Received 156 observations in 0.91 seconds.
INFO:root:Received 162 observations in 0.79 seconds.
INFO:root:Received 40 observations in 0.17 seconds.
INFO:root:Received 28 observations in 0.15 seconds.
INFO:root:Received 66 observations in 0.31 seconds.
INFO:root:Received 37 observations in 0.43 seconds.
INFO:root:Received 20 observations in 0.15 seconds.
INFO:root:Received 40 observations in 0.46 seconds.
INFO:root:Received 41 observations in 0.19 seconds.
INFO:root:Received 60 observations in 0.55 seconds.
INFO:root:Received 43 observations in 0.17 seconds.
INFO:root:Received 42 observations in 0.17 seconds.
INFO:root:Received 44 observations in 0.17 seconds.
INFO:root:Received 58 observations in 0.54 seconds.
INFO:root:Received 86 observations in 0.34 seconds.
INFO:root:Received 71 observations in 0.34 seconds.
INFO:root:Received 102 observations in 0.71 seconds.
INFO:root:Received 139 observations in 0.57 seconds.
INFO:roo

array([[ 95.36046667, 190.18486579,  73.        ],
       [ 97.49603727, 163.32592941,  73.        ],
       [107.21100215, 184.63812872,  65.        ],
       [104.71454805, 159.07164573,  56.        ],
       [ 89.37567951, 172.70795043,  50.        ],
       [ 68.92199529, 163.62760056,  19.        ],
       [ 36.68892797, 153.4643521 ,  14.        ],
       [ 66.75219922, 155.38576832,  30.        ],
       [ 28.20501251, 160.88897411,  13.        ],
       [ 74.92175223, 183.16681942,  21.        ],
       [101.35309463, 181.78032899,  60.        ],
       [ 96.83218416, 165.03532131,  52.        ],
       [110.18113839, 172.97952962,  56.        ],
       [102.20748302, 163.40519842,  56.        ],
       [107.75321437, 156.4582031 ,  50.        ],
       [ 95.20348358, 163.02683609,  55.        ],
       [ 34.34515917, 138.37715162,  11.        ],
       [ 87.02745417, 175.53448007,  83.        ],
       [ 59.32842537, 165.47740886,  87.        ],
       [ 62.01920851, 185.24532

In [4]:

from sklearn.metrics import accuracy_score, roc_curve, auc
fpr, tpr, _ = roc_curve(y, trained_clf.predict(X))
print("Prediction accuracy {}".format( auc(fpr, tpr) ) )

Prediction accuracy 1.0


In [115]:
np.array([[1.,2.], [2.,3.]]).dtype

dtype('float64')