# Porting your data into FastEHR

In [8]:
import os
from pathlib import Path
import sys
import logging

import pandas as pd
from tabulate import tabulate

logging.basicConfig(level=logging.INFO)

cwd = os.getcwd()
!pwd

%load_ext autoreload
%autoreload 2

/home/ubuntu/Documents/GitHub/SurvivEHR/FastEHR/examples/1_build_database
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Example

Within this repository is an example dataset, which can be found at:

```
<installation_path>/FastEHR/examples/data/
```

This dataset contains an example of Primary Care Electronic Healthcare Records, recorded throughout a patient's lifetime.

Within this directory, there are four folders:

    1) `baseline` - contains the static covariates that remain constant throughout a patient's time in the study.
    2) `diagnoses` - contains events which do not have an associated value.
            * In this example, these are diagnoses.
    3) `timeseries` - a parent directory for all events that have an associated value.
            * In this example, these are measurements, tests, and medications.
    4) `_built` - a directory which will be used to contain any FastEHR output.

In [2]:
# Root of the example dataset, relative to this notebook's working directory
data_root = cwd + "/../data"
display(os.listdir(data_root))

# Peek at (up to) the first three entries of the valued-event folder
timeseries_entries = os.listdir(data_root + "/timeseries/measurement_tests_medications")
display(timeseries_entries[:3])

['diagnoses', '_built', 'timeseries', 'baseline']

['25_Hydroxyvitamin_D2_level_92_.csv', 'bak']

# Static covariates

Let us first inspect the structure of the `baseline` folder contents.

In [3]:
static_csv = cwd + "/../data/baseline/static_data.csv"
static_contents = pd.read_csv(static_csv)

# The table is wide, so preview the first six rows in two column groups:
# columns 0-4 first, then everything from column 5 onwards.
for column_block in (slice(0, 5), slice(5, None)):
    preview = static_contents.iloc[:, column_block].head(6)
    print(tabulate(preview, headers='keys', tablefmt='psql'))


+----+---------------+--------------+-----------+---------------+--------------+
|    | PRACTICE_ID   |   PATIENT_ID | COUNTRY   | HEALTH_AUTH   | INDEX_DATE   |
|----+---------------+--------------+-----------+---------------+--------------|
|  0 | p20960        |            1 | E         | South East    | 2008-07-23   |
|  1 | p20960        |            2 | E         | South East    | 2008-07-24   |
|  2 | p20960        |            3 | E         | South East    | 2008-07-25   |
|  3 | p20960        |            4 | E         | South East    | 2008-07-26   |
|  4 | p20960        |            5 | E         | South East    | 2008-07-27   |
|  5 | p20961        |            1 | E         | North West    | 2008-07-28   |
+----+---------------+--------------+-----------+---------------+--------------+
+----+--------------+-------------+-----------------+-------+-------+--------------+------------+
|    | DEATH_DATE   | ETHNICITY   | YEAR_OF_BIRTH   | SEX   |   IMD | START_DATE   | END_DAT

# Non-valued events

We now inspect the structure of the `diagnoses` folder contents.

In [4]:
diagnosis_csv = cwd + "/../data/diagnoses/diagnosis_data.csv"
nv_event_contents = pd.read_csv(diagnosis_csv)

# Preview only the leading five columns; the remaining columns hold further events
leading_columns = nv_event_contents.iloc[:, :5]
print(tabulate(leading_columns.head(), headers='keys', tablefmt='psql'))



+----+-----------------------+-----------------+--------------+--------------+---------------------------+
|    | PRACTICE_PATIENT_ID   | YEAR_OF_BIRTH   | DEATH_DATE   | BD_MEDI:AF   |   BD_MEDI:STROKE_HAEMRGIC |
|----+-----------------------+-----------------+--------------+--------------+---------------------------|
|  0 | p20960_1              | 1940-07-15      | 2009-08-01   | nan          |                       nan |
|  1 | p20960_2              | 1940-07-16      | 2009-08-01   | nan          |                       nan |
|  2 | p20960_3              | 1940-07-17      | 2009-08-03   | nan          |                       nan |
|  3 | p20960_4              | 1940-07-18      | 2009-08-04   | 1992-06-12   |                       nan |
|  4 | P20960_5              | 1940-07-19      | 2009-08-05   | nan          |                       nan |
+----+-----------------------+-----------------+--------------+--------------+---------------------------+


The table has many columns. Columns whose names begin with the prefix "BD_MEDI:" are events: each row holds the date on which that event was observed for the patient, or is empty (shown as `nan`) if it was never observed.

Death is also included as a column, as this is an event.

In [5]:
# Show the first 20 column headers of the diagnoses table
first_columns = nv_event_contents.columns[:20]
print(first_columns)

Index(['PRACTICE_PATIENT_ID', 'YEAR_OF_BIRTH', 'DEATH_DATE', 'BD_MEDI:AF',
       'BD_MEDI:STROKE_HAEMRGIC', 'BD_MEDI:HYPERTENSION',
       'BD_MEDI:MINFARCTION', 'BD_MEDI:PAD_STRICT', 'BD_MEDI:TYPE1DM',
       'BD_MEDI:TYPE2DIABETES', 'BD_MEDI:CKDSTAGE3TO5', 'BD_MEDI:DEPRESSION',
       'BD_MEDI:ANXIETY', 'BD_MEDI:BIPOLAR', 'BD_MEDI:EATINGDISORDERS',
       'BD_MEDI:AUTISM', 'BD_MEDI:SUBSTANCEMISUSE',
       'BD_MEDI:CHRONIC_LIVER_DISEASE_ALCOHOL',
       'BD_MEDI:OTHER_CHRONIC_LIVER_DISEASE_OPTIMAL',
       'BD_MEDI:ULCERATIVE_COLITIS'],
      dtype='object')


# Valued-events (or events which require their own database table)

In [6]:
# TODO: print relevant docstrings

We now inspect the structure of the `timeseries` parent folder contents.

In [9]:
# NOTE(review): this preview is left disabled. The directory listing printed
# earlier in this notebook does not show Serum_folate_80.csv in
# measurement_tests_medications - confirm the file exists before re-enabling.
# v_event_contents = pd.read_csv(cwd + "/../data/timeseries/measurement_tests_medications/Serum_folate_80.csv")
# print(v_event_contents.head())