# OMOP Create Dataset

Connect to database

In [3]:
import __init__

In [4]:
import numpy as np
import pandas as pd
import datetime

# Open a connection to OMOP
from fleming_lib.tools import connect_to_omop
conn = connect_to_omop()

## Select list of patients

### SQL requests


- Not be "limited" (measurement source concept IDs: 2001018843, 2001030812 and 4127294)


In [5]:
# Distinct patients having at least one measurement whose source concept is one
# of the three concept IDs named in the "limited" criterion.
# NOTE(review): presumably these IDs encode limitation-of-care / code-status
# records — confirm against the vocabulary, SOURCE does not show their meaning.
query = """
select
    distinct person_id
from 
    measurement 
where 
    measurement_source_concept_id in (2001018843, 2001030812, 4127294)  
    ;"""

# One-column DataFrame (person_id) used later when building the cohort.
limitation = pd.read_sql_query(query, conn)

- Age >= 15 
- Processed only by intensive care unit

In [49]:
# Patients with at least one visit detail located in an (adult) ICU care site.
#
# The three "is an ICU" alternatives are grouped in parentheses on purpose:
# in SQL, AND binds tighter than OR, so without the parentheses the neonatal
# exclusion would only apply to the last alternative ('Coronary care unit')
# and neonatal intensive care units would still be selected.
query = """
with icu as (
   select
       care_site_id, care_site_name, place_of_service_source_value
   from
       care_site
   where
       (
           lower(place_of_service_source_value) like '%intensive%'
        or
           place_of_service_source_value = 'Cardiac surgery recovery unit' -- c'est aussi ICU
        or
           place_of_service_source_value = 'Coronary care unit'  -- c'est aussi ICU
       )
   and
       lower(place_of_service_source_value) not like '%neonatal%'    -- removing Neonatal ICUs
   )
select
   distinct vd.person_id
from
   visit_detail vd
join
   icu
on
   icu.care_site_id = vd.care_site_id
;"""

# One-column DataFrame (person_id) of patients seen in a selected care site.
icu = pd.read_sql_query(query, conn)

In [50]:
# Age filter: keep (person, visit) pairs whose age at visit start is strictly
# between 15 and 150 years.
# - The CTE joins every person to each of their visits, so the result holds one
#   row per visit and `person_id` may appear several times.
# - Persons without any visit get a NULL age (left outer join) and are dropped
#   by the WHERE clause.
# NOTE(review): the markdown criterion reads "Age >= 15" while the query uses a
# strict `age > 15` — confirm which bound is intended.
query = """
with person_age as (
    select
        p.person_id, (v.visit_start_date - p.birth_datetime)/365.25 age
    from
        person p
    left outer join
        visit_occurrence v
    on
        p.person_id = v.person_id)
select
    person_id, age
from
    person_age
where
    age > 15            -- including persons older than 15
and 
    age < 150             -- removing any outliers
;"""

# DataFrame with columns person_id and age.
age = pd.read_sql_query(query, conn)

### Cohort fusion

In [51]:
# Cohort = patients present in all three query results (ICU, age, limitation).
# NOTE(review): the selection criteria state that patients must NOT be
# "limited", yet `limitation` is intersected here rather than subtracted —
# confirm whether a set difference was intended.
list_patient = list(set(icu.person_id) & set(age.person_id) & set(limitation.person_id))
# Preview the first ten patient IDs (order is arbitrary: it comes from a set).
list_patient[:10]

[62063367,
 62063368,
 62063369,
 62063370,
 62063371,
 62063372,
 62063373,
 62063374,
 62063377,
 62063378]

In [52]:
# Number of patients in the fused cohort.
len(list_patient)

31056

## Build dataset for each patient

### Extract patients meta

In [53]:
# Static (per-patient) attributes: gender, race and birth datetime, for every
# row of the `person` table (not only the cohort).
query = """
select
    distinct p.person_id, p.gender_source_value gender, p.race_source_value race, p.birth_datetime
from
    person p
    ;"""

# DataFrame with columns person_id, gender, race, birth_datetime.
meta = pd.read_sql_query(query, conn)

# Note: ethnicity is empty so we do not add it

In [54]:
# Parse the birth datetime column into pandas datetimes.
meta['birth_datetime'] = pd.to_datetime(meta['birth_datetime'])

In [55]:
# Preview the first ten rows of the patient meta data.
meta.head(10)

Unnamed: 0,person_id,gender,race,birth_datetime
0,62065089,F,ASIAN,2107-06-29
1,62106569,M,ASIAN,2047-11-01
2,62073299,F,ASIAN,2190-04-23
3,62096906,M,ASIAN,2182-09-07
4,62102055,M,ASIAN,2087-02-28
5,62106786,F,ASIAN,2161-03-31
6,62083063,F,ASIAN,2125-12-16
7,62078234,M,ASIAN,2110-06-19
8,62065073,F,ASIAN,2074-05-27
9,62100837,F,ASIAN,2136-07-22


In [56]:
# Print the frequency of each category for the categorical meta variables.
underline = '-' * 30
print('Counts:')
for name in ('gender', 'race'):
    print('\n'.join((name, underline)))
    print(meta[name].value_counts())
    print('')

Counts:
gender
------------------------------
M    26121
F    20399
Name: gender, dtype: int64

race
------------------------------
WHITE                                                       32074
UNKNOWN/NOT SPECIFIED                                        4236
BLACK/AFRICAN AMERICAN                                       3585
HISPANIC OR LATINO                                           1350
ASIAN                                                        1304
OTHER                                                        1256
UNABLE TO OBTAIN                                              792
PATIENT DECLINED TO ANSWER                                    498
ASIAN - CHINESE                                               223
BLACK/CAPE VERDEAN                                            159
HISPANIC/LATINO - PUERTO RICAN                                146
MULTI RACE ETHNICITY                                          111
WHITE - RUSSIAN                                               105
BLACK/HAIT

In [57]:
from fleming_lib.utils import add_categories

In [58]:
# Dictionary containing unique categories for each categorical variable
categories = dict()

categorical_variables = ['gender', 'race']

categories = add_categories(categories, meta, categorical_variables)

In [59]:
# Display the categories collected so far for each categorical variable.
categories

{'gender': Index(['F', 'M', 'NaN'], dtype='object'),
 'race': Index(['AMERICAN INDIAN/ALASKA NATIVE',
        'AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE', 'ASIAN',
        'ASIAN - ASIAN INDIAN', 'ASIAN - CAMBODIAN', 'ASIAN - CHINESE',
        'ASIAN - FILIPINO', 'ASIAN - JAPANESE', 'ASIAN - KOREAN',
        'ASIAN - OTHER', 'ASIAN - THAI', 'ASIAN - VIETNAMESE', 'BLACK/AFRICAN',
        'BLACK/AFRICAN AMERICAN', 'BLACK/CAPE VERDEAN', 'BLACK/HAITIAN',
        'CARIBBEAN ISLAND', 'HISPANIC OR LATINO',
        'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)',
        'HISPANIC/LATINO - COLOMBIAN', 'HISPANIC/LATINO - CUBAN',
        'HISPANIC/LATINO - DOMINICAN', 'HISPANIC/LATINO - GUATEMALAN',
        'HISPANIC/LATINO - HONDURAN', 'HISPANIC/LATINO - MEXICAN',
        'HISPANIC/LATINO - PUERTO RICAN', 'HISPANIC/LATINO - SALVADORAN',
        'MIDDLE EASTERN', 'MULTI RACE ETHNICITY',
        'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER', 'OTHER',
        'PATIENT DECLINED TO ANSWER'

In [60]:
from fleming_lib.utils import to_categorical, to_onehot

# Cast the categorical variables to 'categorical' dtype using the known
# categories, then one-hot encode them (both steps are project helpers).
meta = to_onehot(
    to_categorical(meta, categorical_variables, categories),
    categorical_variables,
)

In [61]:
# Preview the first ten rows of the one-hot encoded meta data.
meta.head(10)

Unnamed: 0,person_id,birth_datetime,gender_F,gender_M,gender_NaN,race_AMERICAN INDIAN/ALASKA NATIVE,race_AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE,race_ASIAN,race_ASIAN - ASIAN INDIAN,race_ASIAN - CAMBODIAN,...,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN/NOT SPECIFIED,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN,race_NaN
0,62065089,2107-06-29,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,62106569,2047-11-01,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,62073299,2190-04-23,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,62096906,2182-09-07,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,62102055,2087-02-28,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,62106786,2161-03-31,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,62083063,2125-12-16,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,62078234,2110-06-19,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,62065073,2074-05-27,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,62100837,2136-07-22,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
# Column names of the meta data after one-hot encoding.
meta_names = meta.columns

### Extract measures

Among the measures, we need to collect all the available classes of each categorical variable, so that each batch of data (i.e. each patient) is one-hot encoded with the same columns:
    - "Heart rate rhythm"

In [63]:
# Every distinct source value recorded for the categorical measurement(s)
# (here only the heart rhythm concept), over the whole measurement table.
query = """
select
    distinct m.measurement_concept_name, m.value_source_value
from 
    measurement m 
where
    m.measurement_concept_id in
    (3022318   -- heart_rhythm
    )
order by
    m.measurement_concept_name, m.value_source_value
;"""

unique_categ_values = pd.read_sql_query(query, conn)

categorical_variables = ['Heart rate rhythm']

# Adding categories of each categorical variables to dict 'categories'
for var in categorical_variables:
    # Build a one-column frame named after the variable. The chain returns new
    # objects instead of mutating a `.loc` slice in place, which avoids pandas'
    # SettingWithCopyWarning and never touches `unique_categ_values`.
    tmp = (
        unique_categ_values
        .loc[unique_categ_values.measurement_concept_name == var]
        .drop('measurement_concept_name', axis=1)
        .rename(index=str, columns={'value_source_value': var})
    )
    # NOTE(review): `var` is passed as a plain string here whereas the meta
    # data cell passes a list of names — confirm `add_categories` accepts both.
    categories = add_categories(categories, tmp, var)

#### SQL requests

In [64]:
# Two example patients used to walk through the pipeline step by step.
patient_id = (62063368, 62106569)  # person_id

In [65]:
# Long-format measurements (one row per person / datetime / concept) for the
# example patients, with the patient's death datetime attached when it exists
# (left join on `death`).
# The `{}` placeholder is filled with the Python tuple `patient_id`, whose text
# form "(a, b)" doubles as a SQL IN-list. A one-element tuple would render as
# "(a,)", which is not a valid IN-list.
query = """
select
    distinct m.person_id, m.measurement_datetime, m.measurement_concept_name, m.value_source_value, m.unit_source_value, d.death_datetime
from 
    measurement m 
left join 
    death d on d.person_id = m.person_id
where
    m.measurement_concept_id in
    (3022318,   -- heart_rhythm
     3024171,   -- respiratory_rate
     3028354,   -- vent_settings
     3012888,   -- diastolic_bp
     3027598,   -- map_bp
     3004249,   -- systolic_bp
     3027018,   -- heart_rate
     3020891,   -- temperature
     3016502,   -- spo2
     3020716,   -- fio2
     3032652    -- glasgow coma scale
    )
and m.person_id in {}
order by m.person_id, m.measurement_datetime
;""".format(patient_id)

measures = pd.read_sql_query(query, conn)

In [66]:
# Preview the first ten raw measurement rows.
measures.head(10)

Unnamed: 0,person_id,measurement_datetime,measurement_concept_name,value_source_value,unit_source_value,death_datetime
0,62063368,2188-11-12 10:00:00,Respiratory rate,38,BPM,2188-11-22 12:00:00
1,62063368,2188-11-12 10:00:00,Oxygen saturation in Arterial blood,97,%,2188-11-22 12:00:00
2,62063368,2188-11-12 10:00:00,Mean blood pressure,107.33300018310547,mmHg,2188-11-22 12:00:00
3,62063368,2188-11-12 10:00:00,Glasgow coma scale,15,points,2188-11-22 12:00:00
4,62063368,2188-11-12 10:00:00,Body temperature,36.388900756835938,Deg. C,2188-11-22 12:00:00
5,62063368,2188-11-12 10:00:00,BP diastolic,88,mmHg,2188-11-22 12:00:00
6,62063368,2188-11-12 10:00:00,BP systolic,146,mmHg,2188-11-22 12:00:00
7,62063368,2188-11-12 10:00:00,Body temperature,97.5,Deg. F,2188-11-22 12:00:00
8,62063368,2188-11-12 10:00:00,Heart rate rhythm,Sinus Tachy,,2188-11-22 12:00:00
9,62063368,2188-11-12 10:30:00,Mean blood pressure,110,mmHg,2188-11-22 12:00:00


#### Change type datetime

In [67]:
# Parse both timestamp columns into pandas datetimes, then preview the result.
for column in ('death_datetime', 'measurement_datetime'):
    measures[column] = pd.to_datetime(measures[column])
measures.head(10)

Unnamed: 0,person_id,measurement_datetime,measurement_concept_name,value_source_value,unit_source_value,death_datetime
0,62063368,2188-11-12 10:00:00,Respiratory rate,38,BPM,2188-11-22 12:00:00
1,62063368,2188-11-12 10:00:00,Oxygen saturation in Arterial blood,97,%,2188-11-22 12:00:00
2,62063368,2188-11-12 10:00:00,Mean blood pressure,107.33300018310547,mmHg,2188-11-22 12:00:00
3,62063368,2188-11-12 10:00:00,Glasgow coma scale,15,points,2188-11-22 12:00:00
4,62063368,2188-11-12 10:00:00,Body temperature,36.388900756835938,Deg. C,2188-11-22 12:00:00
5,62063368,2188-11-12 10:00:00,BP diastolic,88,mmHg,2188-11-22 12:00:00
6,62063368,2188-11-12 10:00:00,BP systolic,146,mmHg,2188-11-22 12:00:00
7,62063368,2188-11-12 10:00:00,Body temperature,97.5,Deg. F,2188-11-22 12:00:00
8,62063368,2188-11-12 10:00:00,Heart rate rhythm,Sinus Tachy,,2188-11-22 12:00:00
9,62063368,2188-11-12 10:30:00,Mean blood pressure,110,mmHg,2188-11-22 12:00:00


#### Add target value (y)

In [6]:
from fleming_lib.metrics import add_target, add_super_target

In [69]:
# Add the two label columns patient by patient, using the project helpers
# `add_target` and `add_super_target` (the resulting `target` and
# `super_target` columns are used as pivot keys afterwards).
# NOTE(review): presumably both derive the patient's death status from
# `death_datetime` — confirm in fleming_lib.metrics.
measures = measures.groupby('person_id').apply(add_target)
measures = measures.groupby('person_id').apply(add_super_target)

#### Row to columns

In [70]:
# Long -> wide: one row per (datetime, target, super_target, person) and one
# column per measurement concept; when several values share the same key, the
# first one is kept.
measures = measures.pivot_table(
    index=['measurement_datetime', 'target', 'super_target', 'person_id'],
    columns='measurement_concept_name',
    values='value_source_value',
    aggfunc='first',
)

Reset index

In [71]:
# Turn the pivot keys back into regular columns (in place).
measures.reset_index(inplace=True)
# Drop the 'measurement_concept_name' label left on the column axis by the pivot.
measures.columns.name = None

In [72]:
# Preview the first ten rows of the wide (pivoted) measurements.
measures.head(10)

Unnamed: 0,measurement_datetime,target,super_target,person_id,BP diastolic,BP systolic,Body temperature,Glasgow coma scale,Heart rate,Heart rate rhythm,Mean blood pressure,Mean pressure Respiratory system airway Calculated,Oxygen concentration breathed,Oxygen saturation in Arterial blood,Respiratory rate
0,2111-04-13 16:00:00,0,0,62106569,,,,,,LBBB (Left Bundle Branch Block),,,,,
1,2111-04-13 16:40:00,0,0,62106569,,,,,93.0,,,,,96.0,
2,2111-04-13 16:42:00,0,0,62106569,,,,,,,,,,,25.0
3,2111-04-13 16:45:00,0,0,62106569,,,,,92.0,SR (Sinus Rhythm),,,,97.0,24.0
4,2111-04-13 16:47:00,0,0,62106569,96.0,181.0,,,,,115.0,,,,
5,2111-04-13 17:00:00,0,0,62106569,84.0,175.0,,,87.0,,102.0,,,96.0,21.0
6,2111-04-13 17:01:00,0,0,62106569,,,99.1,,88.0,LBBB (Left Bundle Branch Block),,,,97.0,22.0
7,2111-04-13 17:15:00,0,0,62106569,92.0,185.0,,,87.0,LBBB (Left Bundle Branch Block),114.0,,,96.0,21.0
8,2111-04-13 17:30:00,0,0,62106569,96.0,184.0,,,88.0,LBBB (Left Bundle Branch Block),114.0,,,95.0,20.0
9,2111-04-13 17:45:00,0,0,62106569,103.0,197.0,,,90.0,LBBB (Left Bundle Branch Block),114.0,,,96.0,22.0


#### Change types

In [73]:
from fleming_lib.utils import to_categorical, convert_frac, to_numeric, to_onehot

Numerical variables

In [74]:
# Measurement columns holding numeric values.
numerical_variables = [
    'BP diastolic',
    'BP systolic',
    'Body temperature',
    'Heart rate',
    'Mean blood pressure',
    'Glasgow coma scale',
    'Oxygen concentration breathed',
    'Mean pressure Respiratory system airway Calculated',
    'Oxygen saturation in Arterial blood',
    'Respiratory rate',
]

# Project helpers: `convert_frac` runs first, then `to_numeric`, both on the
# same columns.
measures = to_numeric(
    convert_frac(measures, numerical_variables),
    numerical_variables,
)

Categorical variables

In [75]:
# Measurement columns holding categorical values.
categorical_variables = ['Heart rate rhythm']

# Cast to 'categorical' dtype with the categories collected earlier, then
# one-hot encode the categorical variables.
measures = to_onehot(
    to_categorical(measures, categorical_variables, categories),
    categorical_variables,
)

#### Add meta data

In [76]:
# Attach the per-patient meta data to every measurement row (default inner
# join on person_id).
data = measures.merge(meta, on='person_id')

In [77]:
# Preview the first ten rows of measurements joined with meta data.
data.head(10)

Unnamed: 0,measurement_datetime,target,super_target,person_id,BP diastolic,BP systolic,Body temperature,Glasgow coma scale,Heart rate,Mean blood pressure,...,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN/NOT SPECIFIED,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN,race_NaN
0,2111-04-13 16:00:00,0,0,62106569,,,,,,,...,0,0,0,0,0,0,0,0,0,0
1,2111-04-13 16:40:00,0,0,62106569,,,,,93.0,,...,0,0,0,0,0,0,0,0,0,0
2,2111-04-13 16:42:00,0,0,62106569,,,,,,,...,0,0,0,0,0,0,0,0,0,0
3,2111-04-13 16:45:00,0,0,62106569,,,,,92.0,,...,0,0,0,0,0,0,0,0,0,0
4,2111-04-13 16:47:00,0,0,62106569,96.0,181.0,,,,115.0,...,0,0,0,0,0,0,0,0,0,0
5,2111-04-13 17:00:00,0,0,62106569,84.0,175.0,,,87.0,102.0,...,0,0,0,0,0,0,0,0,0,0
6,2111-04-13 17:01:00,0,0,62106569,,,99.1,,88.0,,...,0,0,0,0,0,0,0,0,0,0
7,2111-04-13 17:15:00,0,0,62106569,92.0,185.0,,,87.0,114.0,...,0,0,0,0,0,0,0,0,0,0
8,2111-04-13 17:30:00,0,0,62106569,96.0,184.0,,,88.0,114.0,...,0,0,0,0,0,0,0,0,0,0
9,2111-04-13 17:45:00,0,0,62106569,103.0,197.0,,,90.0,114.0,...,0,0,0,0,0,0,0,0,0,0


#### Compute age

In [78]:
from fleming_lib.metrics import add_age, add_rolling_avg

In [79]:
# Add an `age` column, computed patient by patient with the project helper
# `add_age`.
data = data.groupby('person_id').apply(add_age)

In [80]:
# Preview the first ten rows, now including the `age` column.
data.head(10)

Unnamed: 0,measurement_datetime,target,super_target,person_id,BP diastolic,BP systolic,Body temperature,Glasgow coma scale,Heart rate,Mean blood pressure,...,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN/NOT SPECIFIED,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN,race_NaN,age
0,2111-04-13 16:00:00,0,0,62106569,,,,,,,...,0,0,0,0,0,0,0,0,0,63.4
1,2111-04-13 16:40:00,0,0,62106569,,,,,93.0,,...,0,0,0,0,0,0,0,0,0,63.4
2,2111-04-13 16:42:00,0,0,62106569,,,,,,,...,0,0,0,0,0,0,0,0,0,63.4
3,2111-04-13 16:45:00,0,0,62106569,,,,,92.0,,...,0,0,0,0,0,0,0,0,0,63.4
4,2111-04-13 16:47:00,0,0,62106569,96.0,181.0,,,,115.0,...,0,0,0,0,0,0,0,0,0,63.4
5,2111-04-13 17:00:00,0,0,62106569,84.0,175.0,,,87.0,102.0,...,0,0,0,0,0,0,0,0,0,63.4
6,2111-04-13 17:01:00,0,0,62106569,,,99.1,,88.0,,...,0,0,0,0,0,0,0,0,0,63.4
7,2111-04-13 17:15:00,0,0,62106569,92.0,185.0,,,87.0,114.0,...,0,0,0,0,0,0,0,0,0,63.4
8,2111-04-13 17:30:00,0,0,62106569,96.0,184.0,,,88.0,114.0,...,0,0,0,0,0,0,0,0,0,63.4
9,2111-04-13 17:45:00,0,0,62106569,103.0,197.0,,,90.0,114.0,...,0,0,0,0,0,0,0,0,0,63.4


### Data enhancement

#### Add rolling mean 

In [81]:
# Add a rolling average of the respiratory rate, computed per patient with the
# project helper `add_rolling_avg` (window=2).
data = data.groupby('person_id').apply(
    add_rolling_avg,
    column='Respiratory rate',
    window=2,
)

# Preview the first ten rows with the new rolling-average column.
data.head(10)

Unnamed: 0,measurement_datetime,target,super_target,person_id,BP diastolic,BP systolic,Body temperature,Glasgow coma scale,Heart rate,Mean blood pressure,...,race_UNABLE TO OBTAIN,race_UNKNOWN/NOT SPECIFIED,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN,race_NaN,age,Respiratory rate avg h-2
0,2111-04-13 16:00:00,0,0,62106569,,,,,,,...,0,0,0,0,0,0,0,0,63.4,
1,2111-04-13 16:40:00,0,0,62106569,,,,,93.0,,...,0,0,0,0,0,0,0,0,63.4,
2,2111-04-13 16:42:00,0,0,62106569,,,,,,,...,0,0,0,0,0,0,0,0,63.4,
3,2111-04-13 16:45:00,0,0,62106569,,,,,92.0,,...,0,0,0,0,0,0,0,0,63.4,25.0
4,2111-04-13 16:47:00,0,0,62106569,96.0,181.0,,,,115.0,...,0,0,0,0,0,0,0,0,63.4,24.5
5,2111-04-13 17:00:00,0,0,62106569,84.0,175.0,,,87.0,102.0,...,0,0,0,0,0,0,0,0,63.4,24.5
6,2111-04-13 17:01:00,0,0,62106569,,,99.1,,88.0,,...,0,0,0,0,0,0,0,0,63.4,23.333333
7,2111-04-13 17:15:00,0,0,62106569,92.0,185.0,,,87.0,114.0,...,0,0,0,0,0,0,0,0,63.4,23.0
8,2111-04-13 17:30:00,0,0,62106569,96.0,184.0,,,88.0,114.0,...,0,0,0,0,0,0,0,0,63.4,22.6
9,2111-04-13 17:45:00,0,0,62106569,103.0,197.0,,,90.0,114.0,...,0,0,0,0,0,0,0,0,63.4,22.166667


### Create Dataset

In [7]:
import time
import warnings

from fleming_lib.metrics import add_rolling_avg, add_target, add_super_target, add_age
from fleming_lib.utils import to_categorical, to_onehot, to_numeric, convert_frac, add_categories, add_missing_columns, check_length


def create_dataset(list_patients, n_patients_per_batch=10, verbose=False):
    """Create a dataset of measurements and meta data for a list of patients.

    Meta data (gender, race, birth datetime) and the categories of the
    categorical variables are queried once. Measurements are then loaded
    batch by batch, labelled, pivoted to one column per measurement,
    type-converted, one-hot encoded, merged with the meta data and enriched
    with additional features. All batches are finally concatenated.

    The function reads the module-level database connection ``conn``.

    Parameters
    ----------
    list_patients : int or iterable of int
        Patient ID(s) (``person_id``). A single ID is accepted as well.
    n_patients_per_batch : int (default=10)
        Number of patients to sequentially load data for, in order not to
        cause timeout if the query is too long to process by the server.
    verbose : bool (default=False)
        If True, print the progress and elapsed time on a single console line.

    Returns
    -------
    dataset : pd.DataFrame
        Dataset containing all data associated to each patient, with the
        columns ordered as in the first batch.

    Raises
    ------
    ValueError
        If `list_patients` is empty, if one ID cannot be converted to an
        integer, or if `n_patients_per_batch` is lower than 1.

    """
    t0 = time.time()

    def _progress(msg):
        # Overwrite the current console line with `msg` and the elapsed time.
        if verbose:
            delta_t = str(int(time.time() - t0)) + ' s'
            print('{:100s} [{:10s}]'.format(msg, delta_t), end='\r')

    # Validate inputs (before any database access)
    # --------------------------------------------
    is_scalar = (isinstance(list_patients, (str, bytes))
                 or not hasattr(list_patients, '__iter__'))
    if is_scalar:
        list_patients = [list_patients]

    # IDs are interpolated into the SQL text below: casting them to plain int
    # guarantees that only digits reach the query (and that numpy integers are
    # rendered as bare numbers).
    try:
        patient_ids = [int(pid) for pid in list_patients]
    except (TypeError, ValueError):
        raise ValueError('list_patients must only contain integer person IDs')

    if not patient_ids:
        raise ValueError('list_patients is empty: there is no dataset to create')
    if n_patients_per_batch < 1:
        raise ValueError('n_patients_per_batch must be >= 1')

    # Column groups of the measurement matrix (identical for every batch)
    numerical_variables = ['BP diastolic', 'BP systolic', 'Body temperature', 'Heart rate',
                           'Mean blood pressure', 'Glasgow coma scale',
                           'Oxygen concentration breathed',
                           'Mean pressure Respiratory system airway Calculated',
                           'Oxygen saturation in Arterial blood', 'Respiratory rate']
    measure_categorical_variables = ['Heart rate rhythm']

    # Extract meta data
    # -----------------
    _progress('Extracting meta data...')

    query = """
    select
        distinct p.person_id, p.gender_source_value gender, p.race_source_value race, p.birth_datetime
    from
        person p
        ;"""

    meta = pd.read_sql_query(query, conn)

    # Dictionary containing unique categories for each categorical variable
    categories = dict()

    # Extracting categories for each categorical meta variable, then
    # converting them to 'categorical' type and one-hot encoding them
    meta_categorical_variables = ['gender', 'race']
    categories = add_categories(categories, meta, meta_categorical_variables)
    meta = to_categorical(meta, meta_categorical_variables, categories)
    meta = to_onehot(meta, meta_categorical_variables)

    # Extract unique measurements values from categorical variables
    # (here 'Heart rate rhythm'), over the whole table so that every batch is
    # one-hot encoded with the same columns
    query = """
        select
            distinct m.measurement_concept_name, m.value_source_value
        from
            measurement m
        where
            m.measurement_concept_id in
            (3022318   -- heart_rhythm
            )
        order by
            m.measurement_concept_name, m.value_source_value
        ;"""

    unique_categ_values = pd.read_sql_query(query, conn)

    # Adding categories of each categorical variables to dict 'categories'
    for var in measure_categorical_variables:
        # Non-mutating chain: no in-place edit of a `.loc` slice, hence no
        # SettingWithCopyWarning.
        var_values = (
            unique_categ_values
            .loc[unique_categ_values.measurement_concept_name == var]
            .drop('measurement_concept_name', axis=1)
            .rename(index=str, columns={'value_source_value': var})
        )
        categories = add_categories(categories, var_values, var)

    # Create sublist of patients (batch)
    sublists_patients = [patient_ids[i: i + n_patients_per_batch]
                         for i in range(0, len(patient_ids), n_patients_per_batch)]
    n_sublists = len(sublists_patients)

    # Extracting data for each batch of patients
    frame = []
    for i, sublist_patients in enumerate(sublists_patients):
        base_msg = 'Batch {}/{}'.format(i + 1, n_sublists)
        _progress(base_msg)

        # Extract measures
        # ----------------
        _progress(base_msg + ' - ' + 'Extracting measures...')

        # "in (a)" is valid SQL for a single ID, so no special case is needed
        id_list = ', '.join(str(pid) for pid in sublist_patients)

        query = """
        select
            distinct m.person_id, m.measurement_datetime, m.measurement_concept_name, m.value_source_value, m.unit_source_value, d.death_datetime
        from
            measurement m
        left join
            death d on d.person_id = m.person_id
        where
            measurement_concept_id IN
            (3022318,   -- heart_rhythm
             3024171,   -- respiratory_rate
             3028354,   -- vent_settings
             3012888,   -- diastolic_bp
             3027598,   -- map_bp
             3004249,   -- systolic_bp
             3027018,   -- heart_rate
             3020891,   -- temperature
             3016502,   -- spo2
             3020716,   -- fio2
             3032652    -- glasgow coma scale
            )
        and m.person_id in ({})
        order by measurement_datetime
            ;""".format(id_list)

        df = pd.read_sql_query(query, conn)

        # Check if data is empty for a patient
        check_length(df)

        _progress(base_msg + ' - ' + 'Formatting data...')

        df['death_datetime'] = pd.to_datetime(df['death_datetime'])
        df['measurement_datetime'] = pd.to_datetime(df['measurement_datetime'])

        # Add target: patient's death status, relative to the measurement
        # datetime (target) and to the hospital stay (super_target)
        df = df.groupby('person_id').apply(add_target)
        df = df.groupby('person_id').apply(add_super_target)

        # Convert to timeseries matrix: one column per measurement concept
        df = df.pivot_table(
            index=['measurement_datetime', 'target', 'super_target', 'person_id'],
            columns='measurement_concept_name',
            values='value_source_value',
            aggfunc='first')
        df.reset_index(inplace=True)
        df.columns.name = None

        # Convert types
        # -------------
        # Convert to numerical (a batch may lack some measurements entirely,
        # hence the missing columns are added first)
        df = add_missing_columns(df, numerical_variables)
        df = convert_frac(df, numerical_variables)
        df = to_numeric(df, numerical_variables)

        # Convert to categorical and one-hot encode
        df = add_missing_columns(df, measure_categorical_variables)
        df = to_categorical(df, measure_categorical_variables, categories)
        df = to_onehot(df, measure_categorical_variables)

        # Add meta data to measures
        # -------------------------
        _progress(base_msg + ' - ' + 'Adding meta data...')

        df = pd.merge(df, meta, how='inner', on='person_id')

        # Add additional features
        # -----------------------
        _progress(base_msg + ' - ' + 'Adding additional features...')

        # - age
        df = df.groupby('person_id').apply(add_age, round_to_dec=1)
        # - 2h rolling average respiratory rate
        df = df.groupby('person_id').apply(add_rolling_avg, column='Respiratory rate', window=2)

        frame.append(df)

        _progress(base_msg + ' - ' + 'Done')
        if verbose:
            # Move to the next console line so the final status stays visible
            print('')

    # Concat dataframes, keeping the column order of the first batch.
    # `reindex(columns=...)` replaces `reindex_axis(..., axis=1)`, which was
    # deprecated and then removed from pandas.
    dataset = pd.concat(frame)
    dataset = dataset.reindex(columns=frame[0].columns)

    return dataset

### Example on a patient who indeed died

In [8]:
# Look up the death record of one example patient.
query = """SELECT * FROM death WHERE person_id = 62063368 LIMIT 10;"""
df = pd.read_sql_query(query, conn)

In [9]:
# Display the death record(s) found for this patient.
df

Unnamed: 0,person_id,death_date,death_datetime,death_type_concept_id,cause_concept_id,cause_source_value,cause_source_concept_id
0,62063368,2188-11-22,2188-11-22 12:00:00,38003569,,,


In [10]:
# Build the dataset for three example patients (a single batch, since the
# batch size of 10 exceeds the number of patients), printing the progress.
dataset = create_dataset([62063368, 62063384, 62063393], verbose=True, n_patients_per_batch=10)

Batch 1/1 - Done                                                                                     [17 s      ]




#### Export dataset

CSV

In [28]:
# Write the whole dataset to a tab-separated UTF-8 file, without the index.
dataset.to_csv('dataset_omop.csv', sep='\t', encoding='utf-8', index=False)

Export the rows of the dataset belonging to each patient to a separate CSV file.

In [29]:
# One tab-separated file per patient.
# - The '.csv' extension keeps these files distinct from the per-patient
#   pickle files, which otherwise share the exact same name and overwrite them.
# - The loop variable is named `pid` so that it does not clobber the
#   `patient_id` tuple defined earlier in the notebook.
for pid in dataset.person_id.unique():
    fname = 'dataset_omop_{}.csv'.format(pid)
    dataset.loc[dataset.person_id == pid].to_csv(fname, sep='\t', encoding='utf-8', index=False)

Pickle

In [30]:
import pickle as pkl

# Serialize the whole dataset; the context manager guarantees that the file is
# flushed and closed even if pickling fails.
with open('dataset_omop.pkl', 'wb') as pkl_file:
    pkl.dump(dataset, pkl_file)

In [31]:
# One pickle file per patient.
# - The '.pkl' extension keeps these files distinct from the per-patient CSV
#   files, which otherwise share the exact same name and would be overwritten.
# - Each file is opened in a context manager so that it is always closed.
# - The loop variable is named `pid` so that it does not clobber the
#   `patient_id` tuple defined earlier in the notebook.
for pid in dataset.person_id.unique():
    fname = 'dataset_omop_{}.pkl'.format(pid)
    with open(fname, 'wb') as pkl_file:
        pkl.dump(dataset.loc[dataset.person_id == pid], pkl_file)

#### Import dataset

CSV

In [54]:
# Reload the dataset from the tab-separated export. Dtypes are re-inferred by
# pandas, e.g. datetime columns are not parsed here.
df = pd.read_csv('dataset_omop.csv', sep='\t', encoding='utf-8')

Pickle

In [55]:
# Reload the pickled dataset; the context manager closes the file afterwards.
# WARNING: unpickling can execute arbitrary code — only load pickle files that
# were produced by this notebook or come from a trusted source.
with open('dataset_omop.pkl', 'rb') as pkl_file:
    df = pkl.load(pkl_file)