# OMOP Create Dataset

Connect to database

In [19]:
# Get your credentials
import numpy as np
omop_login = np.load('full_omop_login.npy').item()

# Open a connection to OMOP
from fleming_lib.tools import connect_to_omop
conn = connect_to_omop(login_dict=omop_login)

# Get ready for SQL query processing
import pandas as pd
import datetime

## Select list of patients

### SQL requests


- not be "limited" (Concept_id: 2001018843 and 2001030812 and 4127294)


In [2]:
query = """
select
    distinct person_id
from 
    measurement 
where 
    measurement_source_concept_id in (2001018843, 2001030812, 4127294)  
    ;"""

limitation = pd.read_sql_query(query, conn)

KeyboardInterrupt: 

- Age >= 15 
- Processed only by intensive care unit

In [180]:
# in ICU
query = """
with icu as (
    select
        care_site_id, care_site_name, place_of_service_source_value
    from
        care_site
    where
        lower(place_of_service_source_value) like '%intensive%'     -- selecting ICUs
    and
        lower(place_of_service_source_value) not like '%neonatal%'    -- removing Neonatal ICUs
    )
select
    vd.person_id
from
    visit_detail vd
join
    icu
on
    icu.care_site_id = vd.care_site_id
;"""

icu = pd.read_sql_query(query, conn)

In [181]:
# age > 15
query = """
with person_age as (
    select
        p.person_id, (v.visit_start_date - p.birth_datetime)/365.25 age
    from
        person p
    left outer join
        visit_occurrence v
    on
        p.person_id = v.person_id)
select
    person_id, age
from
    person_age
where
    age > 15            -- including persons older than 15
and 
    age < 150             -- removing any outliers
;"""

age = pd.read_sql_query(query, conn)

### Cohort fusion

In [182]:
list_patient = list(set(icu.person_id) & set(age.person_id) & set(limitation.person_id))
list_patient[:10]

[62095360,
 62095362,
 62095364,
 62095365,
 62095367,
 62095368,
 62095369,
 62095372,
 62095375,
 62095376]

In [183]:
len(list_patient)

21404

## Build dataset for each patient

### Extract patients meta

In [2]:
query = """
select
    distinct p.person_id, p.gender_source_value gender, p.race_source_value race, p.birth_datetime
from
    person p
    ;"""

meta = pd.read_sql_query(query, conn)

# Note: ethnicity is empty so we do not add it

In [3]:
meta['birth_datetime'] = pd.to_datetime(meta['birth_datetime'])

In [4]:
meta.iloc[:10]

Unnamed: 0,person_id,gender,race,birth_datetime
0,62065089,F,ASIAN,2107-06-29
1,62106569,M,ASIAN,2047-11-01
2,62073299,F,ASIAN,2190-04-23
3,62096906,M,ASIAN,2182-09-07
4,62102055,M,ASIAN,2087-02-28
5,62106786,F,ASIAN,2161-03-31
6,62083063,F,ASIAN,2125-12-16
7,62078234,M,ASIAN,2110-06-19
8,62065073,F,ASIAN,2074-05-27
9,62100837,F,ASIAN,2136-07-22


In [5]:
print('Counts:')
for name in ['gender', 'race']:
    print(name + '\n' + '-'*30)
    print(meta[name].value_counts())
    print('')

Counts:
gender
------------------------------
M    26121
F    20399
Name: gender, dtype: int64

race
------------------------------
WHITE                                                       32074
UNKNOWN/NOT SPECIFIED                                        4236
BLACK/AFRICAN AMERICAN                                       3585
HISPANIC OR LATINO                                           1350
ASIAN                                                        1304
OTHER                                                        1256
UNABLE TO OBTAIN                                              792
PATIENT DECLINED TO ANSWER                                    498
ASIAN - CHINESE                                               223
BLACK/CAPE VERDEAN                                            159
HISPANIC/LATINO - PUERTO RICAN                                146
MULTI RACE ETHNICITY                                          111
WHITE - RUSSIAN                                               105
BLACK/HAIT

In [6]:
from fleming_lib.utils import to_categorical, to_onehot

# one hot encoding
categorical_variables = ['gender', 'race']

# Convert categorical variable to 'categorical' type
meta = to_categorical(meta, categorical_variables)

In [7]:
# One-hot encode categorical variables
meta = to_onehot(meta, categorical_variables)

In [8]:
meta.iloc[:10]

Unnamed: 0,person_id,birth_datetime,gender_F,gender_M,race_AMERICAN INDIAN/ALASKA NATIVE,race_AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE,race_ASIAN,race_ASIAN - ASIAN INDIAN,race_ASIAN - CAMBODIAN,race_ASIAN - CHINESE,...,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN/NOT SPECIFIED,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
0,62065089,2107-06-29,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,62106569,2047-11-01,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,62073299,2190-04-23,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,62096906,2182-09-07,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,62102055,2087-02-28,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,62106786,2161-03-31,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,62083063,2125-12-16,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,62078234,2110-06-19,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,62065073,2074-05-27,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,62100837,2136-07-22,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
meta_names = meta.columns

### Extract measures

#### SQL requests

In [10]:
patient_id = 62063368  # person_id

In [11]:
query = """
select
    distinct m.person_id, m.measurement_datetime, m.measurement_concept_name, m.value_source_value, m.unit_source_value, d.death_datetime
from 
    measurement m 
left join 
    death d on d.person_id = m.person_id
where
    m.measurement_concept_id in
    (3022318,   -- heart_rhythm
     3024171,   -- respiratory_rate
     3028354,   -- vent_settings
     3012888,   -- diastolic_bp
     3027598,   -- map_bp
     3004249,   -- systolic_bp
     3027018,   -- heart_rate
     3020891,   -- temperature
     3016502,   -- spo2
     3020716,   -- fio2
     3032652    -- glasgow coma scale
    )
and m.person_id = {}
order by m.measurement_datetime
;""".format(patient_id)

measures = pd.read_sql_query(query, conn)

In [12]:
measures.iloc[:10]

Unnamed: 0,person_id,measurement_datetime,measurement_concept_name,value_source_value,unit_source_value,death_datetime
0,62063368,2188-11-12 10:00:00,Mean blood pressure,107.33300018310547,mmHg,2188-11-22 12:00:00
1,62063368,2188-11-12 10:00:00,Respiratory rate,38,BPM,2188-11-22 12:00:00
2,62063368,2188-11-12 10:00:00,Body temperature,36.388900756835938,Deg. C,2188-11-22 12:00:00
3,62063368,2188-11-12 10:00:00,Body temperature,97.5,Deg. F,2188-11-22 12:00:00
4,62063368,2188-11-12 10:00:00,BP diastolic,88,mmHg,2188-11-22 12:00:00
5,62063368,2188-11-12 10:00:00,Heart rate rhythm,Sinus Tachy,,2188-11-22 12:00:00
6,62063368,2188-11-12 10:00:00,Oxygen saturation in Arterial blood,97,%,2188-11-22 12:00:00
7,62063368,2188-11-12 10:00:00,BP systolic,146,mmHg,2188-11-22 12:00:00
8,62063368,2188-11-12 10:30:00,Heart rate rhythm,Sinus Tachy,,2188-11-22 12:00:00
9,62063368,2188-11-12 10:30:00,BP diastolic,95,mmHg,2188-11-22 12:00:00


#### Change type datetime

In [13]:
measures['death_datetime'] = pd.to_datetime(measures['death_datetime'])
measures['measurement_datetime'] = pd.to_datetime(measures['measurement_datetime'])
measures.iloc[:10]

Unnamed: 0,person_id,measurement_datetime,measurement_concept_name,value_source_value,unit_source_value,death_datetime
0,62063368,2188-11-12 10:00:00,Mean blood pressure,107.33300018310547,mmHg,2188-11-22 12:00:00
1,62063368,2188-11-12 10:00:00,Respiratory rate,38,BPM,2188-11-22 12:00:00
2,62063368,2188-11-12 10:00:00,Body temperature,36.388900756835938,Deg. C,2188-11-22 12:00:00
3,62063368,2188-11-12 10:00:00,Body temperature,97.5,Deg. F,2188-11-22 12:00:00
4,62063368,2188-11-12 10:00:00,BP diastolic,88,mmHg,2188-11-22 12:00:00
5,62063368,2188-11-12 10:00:00,Heart rate rhythm,Sinus Tachy,,2188-11-22 12:00:00
6,62063368,2188-11-12 10:00:00,Oxygen saturation in Arterial blood,97,%,2188-11-22 12:00:00
7,62063368,2188-11-12 10:00:00,BP systolic,146,mmHg,2188-11-22 12:00:00
8,62063368,2188-11-12 10:30:00,Heart rate rhythm,Sinus Tachy,,2188-11-22 12:00:00
9,62063368,2188-11-12 10:30:00,BP diastolic,95,mmHg,2188-11-22 12:00:00


#### Add target value (y)

In [14]:
from fleming_lib.metrics import add_target

In [15]:
measures = add_target(measures)

#### Row to columns

In [16]:
measures = measures.pivot_table(index=['measurement_datetime','target','person_id'], columns='measurement_concept_name', values='value_source_value', aggfunc='first')
measures.iloc[:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,measurement_concept_name,BP diastolic,BP systolic,Body temperature,Heart rate,Heart rate rhythm,Mean blood pressure,Mean pressure Respiratory system airway Calculated,Oxygen concentration breathed,Oxygen saturation in Arterial blood,Respiratory rate
measurement_datetime,target,person_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2188-11-12 10:00:00,0,62063368,88.0,146.0,36.38890075683594,,Sinus Tachy,107.33300018310548,,,97.0,38.0
2188-11-12 10:30:00,0,62063368,95.0,140.0,,134.0,Sinus Tachy,110.0,,,98.0,34.0
2188-11-12 10:45:00,0,62063368,97.0,144.0,,134.0,Sinus Tachy,112.66699981689452,,,97.0,41.0
2188-11-12 11:00:00,0,62063368,91.0,140.0,,134.0,Sinus Tachy,107.33300018310548,,,99.0,40.0
2188-11-12 12:00:00,0,62063368,98.0,139.0,36.44440078735352,135.0,Sinus Tachy,111.66699981689452,,,95.0,38.0
2188-11-12 13:00:00,0,62063368,108.0,158.0,36.666698455810554,134.0,Sinus Tachy,124.66699981689452,,0.44999998807907104,93.0,32.0
2188-11-12 13:03:00,0,62063368,,,36.7,,,,,,,
2188-11-12 13:30:00,0,62063368,105.0,159.0,,134.0,Sinus Tachy,123.0,,0.5,97.0,31.0
2188-11-12 14:00:00,0,62063368,107.0,157.0,,137.0,Sinus Tachy,123.66699981689452,,0.5,98.0,34.0
2188-11-12 14:45:00,0,62063368,101.0,149.0,,147.0,Sinus Tachy,117.0,,,98.0,42.0


#### Change type to float

In [17]:
measures.reset_index(inplace=True)
measures.columns.name = None
measures[['BP diastolic','BP systolic','Body temperature','Heart rate','Mean blood pressure','Oxygen saturation in Arterial blood','Respiratory rate']] = measures[['BP diastolic','BP systolic','Body temperature','Heart rate','Mean blood pressure','Oxygen saturation in Arterial blood','Respiratory rate']].apply(pd.to_numeric, errors='ignore')

#### Add meta data

In [18]:
idx = (meta['person_id'] == patient_id)
meta[idx][meta_names]

Unnamed: 0,person_id,birth_datetime,gender_F,gender_M,race_AMERICAN INDIAN/ALASKA NATIVE,race_AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE,race_ASIAN,race_ASIAN - ASIAN INDIAN,race_ASIAN - CAMBODIAN,race_ASIAN - CHINESE,...,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN/NOT SPECIFIED,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
43816,62063368,2164-12-27,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Copy measures into 'data' dataframe
data = measures.copy(deep=True)

# Add meta
for name in meta_names:
    data[name] = meta[idx][name].values.squeeze()

In [20]:
data.iloc[:10]

Unnamed: 0,measurement_datetime,target,person_id,BP diastolic,BP systolic,Body temperature,Heart rate,Heart rate rhythm,Mean blood pressure,Mean pressure Respiratory system airway Calculated,...,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN/NOT SPECIFIED,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
0,2188-11-12 10:00:00,0,62063368,88.0,146.0,36.388901,,Sinus Tachy,107.33300018310548,,...,0,0,0,0,0,0,0,0,0,0
1,2188-11-12 10:30:00,0,62063368,95.0,140.0,,134.0,Sinus Tachy,110.0,,...,0,0,0,0,0,0,0,0,0,0
2,2188-11-12 10:45:00,0,62063368,97.0,144.0,,134.0,Sinus Tachy,112.66699981689452,,...,0,0,0,0,0,0,0,0,0,0
3,2188-11-12 11:00:00,0,62063368,91.0,140.0,,134.0,Sinus Tachy,107.33300018310548,,...,0,0,0,0,0,0,0,0,0,0
4,2188-11-12 12:00:00,0,62063368,98.0,139.0,36.444401,135.0,Sinus Tachy,111.66699981689452,,...,0,0,0,0,0,0,0,0,0,0
5,2188-11-12 13:00:00,0,62063368,108.0,158.0,36.666698,134.0,Sinus Tachy,124.66699981689452,,...,0,0,0,0,0,0,0,0,0,0
6,2188-11-12 13:03:00,0,62063368,,,36.7,,,,,...,0,0,0,0,0,0,0,0,0,0
7,2188-11-12 13:30:00,0,62063368,105.0,159.0,,134.0,Sinus Tachy,123.0,,...,0,0,0,0,0,0,0,0,0,0
8,2188-11-12 14:00:00,0,62063368,107.0,157.0,,137.0,Sinus Tachy,123.66699981689452,,...,0,0,0,0,0,0,0,0,0,0
9,2188-11-12 14:45:00,0,62063368,101.0,149.0,,147.0,Sinus Tachy,117.0,,...,0,0,0,0,0,0,0,0,0,0


In [21]:
categorical_variables = ['Heart rate rhythm']

# Convert categorical variable to 'categorical' type
data = to_categorical(data, categorical_variables)

In [22]:
# One-hot encode categorical variables
data = to_onehot(data, categorical_variables)

In [23]:
data.iloc[:10]

Unnamed: 0,measurement_datetime,target,person_id,BP diastolic,BP systolic,Body temperature,Heart rate,Mean blood pressure,Mean pressure Respiratory system airway Calculated,Oxygen concentration breathed,...,race_UNKNOWN/NOT SPECIFIED,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN,Heart rate rhythm_Idioventricular,Heart rate rhythm_Normal Sinus,Heart rate rhythm_Sinus Brady,Heart rate rhythm_Sinus Tachy
0,2188-11-12 10:00:00,0,62063368,88.0,146.0,36.388901,,107.33300018310548,,,...,0,0,0,0,0,0,0,0,0,1
1,2188-11-12 10:30:00,0,62063368,95.0,140.0,,134.0,110.0,,,...,0,0,0,0,0,0,0,0,0,1
2,2188-11-12 10:45:00,0,62063368,97.0,144.0,,134.0,112.66699981689452,,,...,0,0,0,0,0,0,0,0,0,1
3,2188-11-12 11:00:00,0,62063368,91.0,140.0,,134.0,107.33300018310548,,,...,0,0,0,0,0,0,0,0,0,1
4,2188-11-12 12:00:00,0,62063368,98.0,139.0,36.444401,135.0,111.66699981689452,,,...,0,0,0,0,0,0,0,0,0,1
5,2188-11-12 13:00:00,0,62063368,108.0,158.0,36.666698,134.0,124.66699981689452,,0.44999998807907104,...,0,0,0,0,0,0,0,0,0,1
6,2188-11-12 13:03:00,0,62063368,,,36.7,,,,,...,0,0,0,0,0,0,0,0,0,0
7,2188-11-12 13:30:00,0,62063368,105.0,159.0,,134.0,123.0,,0.5,...,0,0,0,0,0,0,0,0,0,1
8,2188-11-12 14:00:00,0,62063368,107.0,157.0,,137.0,123.66699981689452,,0.5,...,0,0,0,0,0,0,0,0,0,1
9,2188-11-12 14:45:00,0,62063368,101.0,149.0,,147.0,117.0,,,...,0,0,0,0,0,0,0,0,0,1


#### Compute age

In [24]:
from fleming_lib.metrics import add_age, add_rolling_avg

In [25]:
data = add_age(data)

In [26]:
data.iloc[:10]

Unnamed: 0,measurement_datetime,target,person_id,BP diastolic,BP systolic,Body temperature,Heart rate,Mean blood pressure,Mean pressure Respiratory system airway Calculated,Oxygen concentration breathed,...,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN,Heart rate rhythm_Idioventricular,Heart rate rhythm_Normal Sinus,Heart rate rhythm_Sinus Brady,Heart rate rhythm_Sinus Tachy,age
0,2188-11-12 10:00:00,0,62063368,88.0,146.0,36.388901,,107.33300018310548,,,...,0,0,0,0,0,0,0,0,1,23.9
1,2188-11-12 10:30:00,0,62063368,95.0,140.0,,134.0,110.0,,,...,0,0,0,0,0,0,0,0,1,23.9
2,2188-11-12 10:45:00,0,62063368,97.0,144.0,,134.0,112.66699981689452,,,...,0,0,0,0,0,0,0,0,1,23.9
3,2188-11-12 11:00:00,0,62063368,91.0,140.0,,134.0,107.33300018310548,,,...,0,0,0,0,0,0,0,0,1,23.9
4,2188-11-12 12:00:00,0,62063368,98.0,139.0,36.444401,135.0,111.66699981689452,,,...,0,0,0,0,0,0,0,0,1,23.9
5,2188-11-12 13:00:00,0,62063368,108.0,158.0,36.666698,134.0,124.66699981689452,,0.44999998807907104,...,0,0,0,0,0,0,0,0,1,23.9
6,2188-11-12 13:03:00,0,62063368,,,36.7,,,,,...,0,0,0,0,0,0,0,0,0,23.9
7,2188-11-12 13:30:00,0,62063368,105.0,159.0,,134.0,123.0,,0.5,...,0,0,0,0,0,0,0,0,1,23.9
8,2188-11-12 14:00:00,0,62063368,107.0,157.0,,137.0,123.66699981689452,,0.5,...,0,0,0,0,0,0,0,0,1,23.9
9,2188-11-12 14:45:00,0,62063368,101.0,149.0,,147.0,117.0,,,...,0,0,0,0,0,0,0,0,1,23.9


### Data enhancement

#### Add rolling mean 

In [28]:
data = add_rolling_avg(data, 'Respiratory rate', window=2)

data.iloc[:10]

Unnamed: 0,measurement_datetime,target,person_id,BP diastolic,BP systolic,Body temperature,Heart rate,Mean blood pressure,Mean pressure Respiratory system airway Calculated,Oxygen concentration breathed,...,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN,Heart rate rhythm_Idioventricular,Heart rate rhythm_Normal Sinus,Heart rate rhythm_Sinus Brady,Heart rate rhythm_Sinus Tachy,age,Respiratory rate avg h-2
0,2188-11-12 10:00:00,0,62063368,88.0,146.0,36.388901,,107.33300018310548,,,...,0,0,0,0,0,0,0,1,23.9,
1,2188-11-12 10:30:00,0,62063368,95.0,140.0,,134.0,110.0,,,...,0,0,0,0,0,0,0,1,23.9,38.0
2,2188-11-12 10:45:00,0,62063368,97.0,144.0,,134.0,112.66699981689452,,,...,0,0,0,0,0,0,0,1,23.9,36.0
3,2188-11-12 11:00:00,0,62063368,91.0,140.0,,134.0,107.33300018310548,,,...,0,0,0,0,0,0,0,1,23.9,37.666667
4,2188-11-12 12:00:00,0,62063368,98.0,139.0,36.444401,135.0,111.66699981689452,,,...,0,0,0,0,0,0,0,1,23.9,38.25
5,2188-11-12 13:00:00,0,62063368,108.0,158.0,36.666698,134.0,124.66699981689452,,0.44999998807907104,...,0,0,0,0,0,0,0,1,23.9,39.0
6,2188-11-12 13:03:00,0,62063368,,,36.7,,,,,...,0,0,0,0,0,0,0,0,23.9,35.0
7,2188-11-12 13:30:00,0,62063368,105.0,159.0,,134.0,123.0,,0.5,...,0,0,0,0,0,0,0,1,23.9,35.0
8,2188-11-12 14:00:00,0,62063368,107.0,157.0,,137.0,123.66699981689452,,0.5,...,0,0,0,0,0,0,0,1,23.9,33.666667
9,2188-11-12 14:45:00,0,62063368,101.0,149.0,,147.0,117.0,,,...,0,0,0,0,0,0,0,1,23.9,32.333333


### Create Dataset

In [26]:
import time

from fleming_lib.metrics import add_rolling_avg, add_target, add_age
from fleming_lib.utils import to_categorical, to_onehot


def create_dataset(list_patients, verbose=False):
    """Create list of dataset given a list of patients.
    
    Parameters
    ----------
    list_patients : list of int
        List of patients ID.
    verbose : bool
        Verbosity level.
    
    Returns
    -------
    frame : list of pd.DataFrame
        List of datasets, each corresponding to a patient.
    """
    t0 = time.time()
    frame = []
    
    if not isinstance(list_patients, list):
        list_patients = [list_patients]
        
    n_patients = len(list_patients)
    
    # Meta data
    if verbose:
        msg = 'Extracting meta data...'
        delta_t = str(int(time.time() - t0)) + ' s'
        print('{:100s} [{:10s}]'.format(msg, delta_t), end='\r')
            
    query = """
    select
        distinct p.person_id, p.gender_source_value gender, p.race_source_value race, p.birth_datetime
    from
        person p
        ;"""

    meta = pd.read_sql_query(query, conn)

    # Convert categorical variable to 'categorical' type
    categorical_variables = ['gender', 'race']
    meta = to_categorical(meta, categorical_variables)
    meta = to_onehot(meta, categorical_variables)
    # One-hot column names
    meta_names = meta.columns
    
    
    for i, patient in enumerate(list_patients):
        if verbose:
            base_msg = 'Patient {} [{}/{}]'.format(patient, i+1, n_patients)
            msg = base_msg
            delta_t = str(int(time.time() - t0)) + ' s'
            print('{:100s} [{:10s}]'.format(msg, delta_t), end='\r')
        
        # Measures
        if verbose:
            add_msg = 'Extracting measures...'
            msg = base_msg + ' - ' + add_msg
            delta_t = str(int(time.time() - t0)) + ' s'
            print('{:100s} [{:10s}]'.format(msg, delta_t), end='\r')
            
        query = """
        select
            distinct m.person_id, m.measurement_datetime, m.measurement_concept_name, m.value_source_value, m.unit_source_value, d.death_datetime
        from 
            measurement m 
        left join 
            death d on d.person_id = m.person_id
        where
            measurement_concept_id IN
            (3022318,   -- heart_rhythm
             3024171,   -- respiratory_rate
             3028354,   -- vent_settings
             3012888,   -- diastolic_bp
             3027598,   -- map_bp
             3004249,   -- systolic_bp
             3027018,   -- heart_rate
             3020891,   -- temperature
             3016502,   -- spo2
             3020716,   -- fio2
             3032652    -- glasgow coma scale
            )
        and m.person_id = {}
        order by measurement_datetime
            ;""".format(patient)

        df = pd.read_sql_query(query, conn)

        if verbose:
            add_msg = 'Formatting data...'
            msg = base_msg + ' - ' + add_msg
            delta_t = str(int(time.time() - t0)) + ' s'
            print('{:100s} [{:10s}]'.format(msg, delta_t), end='\r')

        df['death_datetime'] = pd.to_datetime(df['death_datetime'])
        df['measurement_datetime'] = pd.to_datetime(df['measurement_datetime'])

        df = add_target(df)

        df = df.pivot_table(index= ['measurement_datetime','target','person_id'], columns='measurement_concept_name', values='value_source_value', aggfunc='first')
        df.reset_index(inplace=True)
        df.columns.name = None
        df[['BP diastolic','BP systolic','Body temperature','Heart rate','Mean blood pressure','Oxygen saturation in Arterial blood','Respiratory rate']] = df[['BP diastolic','BP systolic','Body temperature','Heart rate','Mean blood pressure','Oxygen saturation in Arterial blood','Respiratory rate']].apply(pd.to_numeric, errors='ignore')
        
        # Convert 'Heart rate rhythm' to categorical and one-hot encode it
        categorical_variables = ['Heart rate rhythm']
        df = to_categorical(df, categorical_variables)
        df = to_onehot(df, categorical_variables)
        
        # Add meta data to measures
        if verbose:
            add_msg = 'Adding meta data...'
            msg = base_msg + ' - ' + add_msg
            delta_t = str(int(time.time() - t0)) + ' s'
            print('{:100s} [{:10s}]'.format(msg, delta_t), end='\r')
            
        meta_idx = (meta['person_id'] == patient)
        for meta_name in meta_names:
            df[meta_name] = meta[meta_idx][meta_name].values.squeeze()
            
        df = add_age(df, round_to_dec=1)
        
        # Add additional features
        if verbose:
            add_msg = 'Adding additional features...'
            msg = base_msg + ' - ' + add_msg
            delta_t = str(int(time.time() - t0)) + ' s'
            print('{:100s} [{:10s}]'.format(msg, delta_t), end='\r')
        
        df = add_rolling_avg(df, 'Respiratory rate', window=2)
        
        frame.append(df)
        
        if verbose:
            msg = 'Patient {} done.'.format(patient)
            delta_t = str(int(time.time() - t0)) + ' s'
            print('{:100s} [{:10s}]'.format(msg, delta_t), end='\r')
            print('')
    
    return frame

### Example on a patient who indeed died

In [21]:
query = """SELECT * FROM death LIMIT 10;"""
df = pd.read_sql_query(query, conn)

In [22]:
df

Unnamed: 0,person_id,death_date,death_datetime,death_type_concept_id,cause_concept_id,cause_source_value,cause_source_concept_id
0,62063368,2188-11-22,2188-11-22 12:00:00,38003569,,,
1,62063384,2198-02-18,2198-02-18 03:55:00,38003569,,,
2,62063393,2182-07-31,2182-07-31 06:45:00,38003569,,,
3,62063403,2145-03-19,2145-03-19 07:00:00,38003569,,,
4,62063425,2156-08-26,2156-08-26 12:00:00,38003569,,,
5,62063429,2182-02-28,2182-02-28 14:50:00,38003569,,,
6,62063436,2147-11-11,2147-11-11 22:57:00,38003569,,,
7,62063437,2171-06-27,2171-06-27 17:10:00,38003569,,,
8,62063444,2123-07-16,2123-07-16 23:19:00,38003569,,,
9,62063447,2129-12-20,2129-12-20 01:37:00,38003569,,,


In [27]:
dataset = create_dataset(62063368, verbose=True)

Falsent 62063368 [1/1] - Extracting measures...                                                      [11 s      ]

Falsent 62063368 [1/1] - Formatting data...                                                          [13 s      ]

Falsent 62063368 [1/1] - Adding meta data...                                                         [13 s      ]

Falsent 62063368 [1/1] - Adding additional features...                                               [13 s      ]

Patient 62063368 done.                                                                               [14 s      ]


In [28]:
dataset[0]

Unnamed: 0,measurement_datetime,target,person_id,BP diastolic,BP systolic,Body temperature,Heart rate,Mean blood pressure,Mean pressure Respiratory system airway Calculated,Oxygen concentration breathed,...,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN/NOT SPECIFIED,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN,age,Respiratory rate avg h-2
0,2188-11-12 10:00:00,0,62063368,88.0,146.0,36.388901,,107.33300018310547,,,...,0,0,0,0,0,0,0,0,23.9,
1,2188-11-12 10:30:00,0,62063368,95.0,140.0,,134.0,110,,,...,0,0,0,0,0,0,0,0,23.9,38.000000
2,2188-11-12 10:45:00,0,62063368,97.0,144.0,,134.0,112.66699981689453,,,...,0,0,0,0,0,0,0,0,23.9,36.000000
3,2188-11-12 11:00:00,0,62063368,91.0,140.0,,134.0,107.33300018310547,,,...,0,0,0,0,0,0,0,0,23.9,37.666667
4,2188-11-12 12:00:00,0,62063368,98.0,139.0,36.444401,135.0,111.66699981689453,,,...,0,0,0,0,0,0,0,0,23.9,38.250000
5,2188-11-12 13:00:00,0,62063368,108.0,158.0,36.666698,134.0,124.66699981689453,,.44999998807907104,...,0,0,0,0,0,0,0,0,23.9,39.000000
6,2188-11-12 13:03:00,0,62063368,,,36.700000,,,,,...,0,0,0,0,0,0,0,0,23.9,35.000000
7,2188-11-12 13:30:00,0,62063368,105.0,159.0,,134.0,123,,.5,...,0,0,0,0,0,0,0,0,23.9,35.000000
8,2188-11-12 14:00:00,0,62063368,107.0,157.0,,137.0,123.66699981689453,,.5,...,0,0,0,0,0,0,0,0,23.9,33.666667
9,2188-11-12 14:45:00,0,62063368,101.0,149.0,,147.0,117,,,...,0,0,0,0,0,0,0,0,23.9,32.333333


#### Export the dataset

In [None]:
# pd.concat(dataset).to_csv('dataset_omop', sep='\t', encoding='utf-8', index=False)

Export each dataframe dataset corresponding a given patient to CSV format.

In [37]:
for df in dataset:
    patient_id = df['person_id'][0]
    fname = 'dataset_omop_{}'.format(patient_id)
    df.to_csv(fname, sep='\t', encoding='utf-8', index=False)

Import each dataset corresponding a given patient from CSV format to dataframe.

In [38]:
df = pd.read_csv(fname, sep='\t', encoding='utf-8')

In [39]:
df

Unnamed: 0,measurement_datetime,target,person_id,BP diastolic,BP systolic,Body temperature,Heart rate,Mean blood pressure,Mean pressure Respiratory system airway Calculated,Oxygen concentration breathed,...,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN/NOT SPECIFIED,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN,age,Respiratory rate avg h-2
0,2188-11-12 10:00:00,0,62063368,88.0,146.0,36.388901,,107.33300018310547,,,...,0,0,0,0,0,0,0,0,23.9,
1,2188-11-12 10:30:00,0,62063368,95.0,140.0,,134.0,110,,,...,0,0,0,0,0,0,0,0,23.9,38.000000
2,2188-11-12 10:45:00,0,62063368,97.0,144.0,,134.0,112.66699981689453,,,...,0,0,0,0,0,0,0,0,23.9,36.000000
3,2188-11-12 11:00:00,0,62063368,91.0,140.0,,134.0,107.33300018310547,,,...,0,0,0,0,0,0,0,0,23.9,37.666667
4,2188-11-12 12:00:00,0,62063368,98.0,139.0,36.444401,135.0,111.66699981689453,,,...,0,0,0,0,0,0,0,0,23.9,38.250000
5,2188-11-12 13:00:00,0,62063368,108.0,158.0,36.666698,134.0,124.66699981689453,,0.45,...,0,0,0,0,0,0,0,0,23.9,39.000000
6,2188-11-12 13:03:00,0,62063368,,,36.700000,,,,,...,0,0,0,0,0,0,0,0,23.9,35.000000
7,2188-11-12 13:30:00,0,62063368,105.0,159.0,,134.0,123,,0.50,...,0,0,0,0,0,0,0,0,23.9,35.000000
8,2188-11-12 14:00:00,0,62063368,107.0,157.0,,137.0,123.66699981689453,,0.50,...,0,0,0,0,0,0,0,0,23.9,33.666667
9,2188-11-12 14:45:00,0,62063368,101.0,149.0,,147.0,117,,,...,0,0,0,0,0,0,0,0,23.9,32.333333
