# OMOP Create Dataset

In [341]:
# Get your credentials
import numpy as np
omop_login = np.load('full_omop_login.npy').item()

# Open a connection to OMOP
from fleming_lib.tools import connect_to_omop
conn = connect_to_omop(login_dict=omop_login)

# Get ready for SQL query processing
import pandas as pd
import datetime

## Select list of patients

### SQL requests


- not be "limited" (Concept_id: 2001018843 and 2001030812 and 4127294)


In [320]:
query = """
select
    distinct person_id
from 
    measurement 
where 
    measurement_source_concept_id in (2001018843, 2001030812, 4127294)  
    ;"""

limitation = pd.read_sql_query(query, conn)

- Age >= 15 
- Processed only by intensive care unit

In [323]:
query = """
select
    distinct p.person_id
from 
    person p
    
    inner join

    (select 
        person_id, min(visit_start_date) as first_visit_date
    from
        visit_occurrence
    where
        visit_source_value = 'EMERGENCY'
    group by 
        person_id) v
    
    on p.person_id = v.person_id
    
where
    (v.first_visit_date - p.birth_datetime) >= 15 * 365.25
    ;"""

icu_and_age = pd.read_sql_query(query, conn)

### Cohort fusion

In [324]:
list_patient = list(set(icu_and_age.person_id) & set(limitation.person_id))
list_patient

[62063367,
 62063368,
 62063369,
 62063370,
 62063371,
 62063373,
 62063374,
 62063377,
 62063379,
 62063382,
 62063383,
 62063384,
 62063385,
 62063389,
 62063390,
 62063391,
 62063392,
 62063395,
 62063396,
 62063398,
 62063400,
 62063403,
 62063405,
 62063406,
 62063410,
 62063411,
 62063412,
 62063416,
 62063418,
 62063419,
 62063420,
 62063421,
 62063422,
 62063423,
 62063425,
 62063426,
 62063428,
 62063429,
 62063430,
 62063431,
 62063433,
 62063435,
 62063436,
 62063437,
 62063438,
 62063439,
 62063440,
 62063441,
 62063443,
 62063444,
 62063445,
 62063446,
 62063447,
 62063449,
 62063452,
 62063453,
 62063454,
 62063456,
 62063457,
 62063458,
 62063459,
 62063461,
 62063462,
 62063463,
 62063464,
 62063465,
 62063466,
 62063468,
 62063469,
 62063470,
 62063471,
 62063474,
 62063478,
 62063482,
 62063483,
 62063484,
 62063485,
 62063486,
 62063487,
 62063488,
 62063489,
 62063490,
 62063492,
 62063494,
 62063495,
 62063496,
 62063497,
 62063499,
 62063501,
 62063502,
 62063503,

In [325]:
len(list_patient)

27518

## Add measures

### SQL requests

In [332]:
query = """
select
    distinct m.person_id, m.measurement_datetime, m.measurement_concept_name, m.value_source_value, m.unit_source_value, d.death_datetime
from 
    measurement m 
left join 
    death d on d.person_id = m.person_id
where
    measurement_concept_id IN
    (3022318,   -- heart_rhythm
     3024171,   -- respiratory_rate
     3028354,   -- vent_settings
     3012888,   -- diastolic_bp
     3027598,   -- map_bp
     3004249,   -- systolic_bp
     3027018,   -- heart_rate
     3020891,   -- temperature
     3016502,   -- spo2
     3020716,   -- fio2
     3032652    -- glasgow coma scale
    )
and m.person_id = 62073122
order by measurement_datetime
limit 100
    ;"""

measures = pd.read_sql_query(query, conn)

### Change type datetime

In [333]:
measures['death_datetime'] = pd.to_datetime(measures['death_datetime'])
measures['measurement_datetime'] = pd.to_datetime(measures['measurement_datetime'])
measures

Unnamed: 0,person_id,measurement_datetime,measurement_concept_name,value_source_value,unit_source_value,death_datetime
0,62073122,2108-04-06 16:30:00,Heart rate,115,BPM,NaT
1,62073122,2108-04-06 16:30:00,Body temperature,36.111099243164062,Deg. C,NaT
2,62073122,2108-04-06 16:30:00,Mean blood pressure,100.66699981689453,mmHg,NaT
3,62073122,2108-04-06 16:30:00,Respiratory rate,22,BPM,NaT
4,62073122,2108-04-06 16:30:00,Oxygen saturation in Arterial blood,100,%,NaT
5,62073122,2108-04-06 16:30:00,Heart rate rhythm,Sinus Tachy,,NaT
6,62073122,2108-04-06 16:30:00,BP systolic,130,mmHg,NaT
7,62073122,2108-04-06 16:30:00,BP diastolic,86,mmHg,NaT
8,62073122,2108-04-06 16:30:00,Body temperature,97,Deg. F,NaT
9,62073122,2108-04-06 17:00:00,BP systolic,118,mmHg,NaT


### Add target value (y)

In [334]:
def define_target_value(row):
    if row['death_datetime']-timedelta(days=3) <= row['measurement_datetime']:
        val = 1
    else:
        val = 0
    return val

measures['target'] = measures.apply(define_target_value, axis=1)

### Row to columns

In [335]:
measures = measures.pivot_table(index=['measurement_datetime','target','person_id'], columns='measurement_concept_name', values='value_source_value', aggfunc='first')
measures

Unnamed: 0_level_0,Unnamed: 1_level_0,measurement_concept_name,BP diastolic,BP systolic,Body temperature,Heart rate,Heart rate rhythm,Mean blood pressure,Oxygen saturation in Arterial blood,Respiratory rate
measurement_datetime,target,person_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2108-04-06 16:30:00,0,62073122,86.0,130.0,36.11109924316406,115,Sinus Tachy,100.66699981689452,100,22
2108-04-06 17:00:00,0,62073122,69.0,118.0,,117,Sinus Tachy,85.33329772949219,100,20
2108-04-06 18:00:00,0,62073122,70.0,124.0,,122,Sinus Tachy,88.0,95,18
2108-04-06 19:00:00,0,62073122,,,,117,Sinus Tachy,,100,23
2108-04-06 20:00:00,0,62073122,89.0,136.0,36.333301544189446,117,Sinus Tachy,104.66699981689452,100,18
2108-04-06 21:00:00,0,62073122,54.0,132.0,,118,Sinus Tachy,80.0,95,20
2108-04-06 22:00:00,0,62073122,50.0,145.0,,115,Sinus Tachy,81.66670227050781,95,18
2108-04-06 23:00:00,0,62073122,80.0,141.0,,117,Sinus Tachy,100.33300018310548,96,20
2108-04-07 00:00:00,0,62073122,63.0,114.0,36.55559921264648,112,Sinus Tachy,80.0,97,22
2108-04-07 01:00:00,0,62073122,62.0,118.0,,105,Sinus Tachy,80.66670227050781,99,21


### Change type to float

In [336]:
measures.reset_index(inplace=True)
measures.columns.name = None
measures[['BP diastolic','BP systolic','Body temperature','Heart rate','Mean blood pressure','Oxygen saturation in Arterial blood','Respiratory rate']] = measures[['BP diastolic','BP systolic','Body temperature','Heart rate','Mean blood pressure','Oxygen saturation in Arterial blood','Respiratory rate']].apply(pd.to_numeric, errors='ignore')

## Data enhancement

### Add rolling mean 

In [337]:
df_rm = measures

def add_avg(row, df, time, column):
    filter_df = df[(df['measurement_datetime'] >= (row['measurement_datetime']-timedelta(hours=time))) & (df['measurement_datetime'] < row['measurement_datetime'])]
    try:
        return filter_df[column].mean()
    except:
        return float('nan')

df_rm['Respiratory rate avg h-2'] = df_rm.apply(lambda x: add_avg(x, df, 2, 'Respiratory rate'), axis=1)
df_rm

Unnamed: 0,measurement_datetime,target,person_id,BP diastolic,BP systolic,Body temperature,Heart rate,Heart rate rhythm,Mean blood pressure,Oxygen saturation in Arterial blood,Respiratory rate,Respiratory rate avg h-2
0,2108-04-06 16:30:00,0,62073122,86.0,130.0,36.111099,115,Sinus Tachy,100.667,100,22,
1,2108-04-06 17:00:00,0,62073122,69.0,118.0,,117,Sinus Tachy,85.333298,100,20,22.0
2,2108-04-06 18:00:00,0,62073122,70.0,124.0,,122,Sinus Tachy,88.0,95,18,21.0
3,2108-04-06 19:00:00,0,62073122,,,,117,Sinus Tachy,,100,23,19.0
4,2108-04-06 20:00:00,0,62073122,89.0,136.0,36.333302,117,Sinus Tachy,104.667,100,18,20.5
5,2108-04-06 21:00:00,0,62073122,54.0,132.0,,118,Sinus Tachy,80.0,95,20,20.5
6,2108-04-06 22:00:00,0,62073122,50.0,145.0,,115,Sinus Tachy,81.666702,95,18,19.0
7,2108-04-06 23:00:00,0,62073122,80.0,141.0,,117,Sinus Tachy,100.333,96,20,19.0
8,2108-04-07 00:00:00,0,62073122,63.0,114.0,36.555599,112,Sinus Tachy,80.0,97,22,19.0
9,2108-04-07 01:00:00,0,62073122,62.0,118.0,,105,Sinus Tachy,80.666702,99,21,21.0


## Create Dataset

In [338]:
def create_data_set(list_patients):
    nb = 0
    frame = []
    for patient in list_patients:
        query = """
        select
            distinct m.person_id, m.measurement_datetime, m.measurement_concept_name, m.value_source_value, m.unit_source_value, d.death_datetime
        from 
            measurement m 
        left join 
            death d on d.person_id = m.person_id
        where
            measurement_concept_id IN
            (3022318,   -- heart_rhythm
             3024171,   -- respiratory_rate
             3028354,   -- vent_settings
             3012888,   -- diastolic_bp
             3027598,   -- map_bp
             3004249,   -- systolic_bp
             3027018,   -- heart_rate
             3020891,   -- temperature
             3016502,   -- spo2
             3020716,   -- fio2
             3032652    -- glasgow coma scale
            )
        and m.person_id = %s
        order by measurement_datetime
        limit 100 
            ;""" %patient

        df = pd.read_sql_query(query, conn)

        df['death_datetime'] = pd.to_datetime(df['death_datetime'])
        df['measurement_datetime'] = pd.to_datetime(df['measurement_datetime'])

        df['target'] = df.apply(define_target_value, axis=1)

        df = df.pivot_table(index= ['measurement_datetime','target','person_id'], columns='measurement_concept_name', values='value_source_value', aggfunc='sum')
        df.reset_index(inplace=True)
        df.columns.name = None
        df[['BP diastolic','BP systolic','Body temperature','Heart rate','Mean blood pressure','Oxygen saturation in Arterial blood','Respiratory rate']] = df[['BP diastolic','BP systolic','Body temperature','Heart rate','Mean blood pressure','Oxygen saturation in Arterial blood','Respiratory rate']].apply(pd.to_numeric, errors='ignore')

        df['Respiratory rate avg h-2'] = df.apply(lambda x: add_avg(x, df, 2, 'Respiratory rate'), axis=1)
        
        frame.append(df)
        nb+=1
        print(nb)
    
    return pd.concat(frame)

In [339]:
list_patient_test = list_patient[:8]

In [343]:
dataset = create_data_set(list_patient_test)

1
2
3
4
5
6
7
8


### Export the dataset

In [344]:
dataset.to_csv("dataset_omop", sep='\t', encoding='utf-8')