In [1]:
import glob
import math
import numpy as np
import os
import pandas as pd
import re
import shutil
import sys

In [2]:
data_path = '/Volumes/GoogleDrive/My Drive/TAU/Code/DrugLab/data/hirid-a-high-time-resolution-icu-dataset-1.1.1/raw_stage/'   #complete data path

## Reading using Pandas
Required python packages are `pandas` and `pyarrow`

### Reading a part

In [3]:
part = 4

In [7]:
df_part = pd.read_csv(os.path.join(data_path, 'observation_tables', 'csv', f'part-{part}.csv'))
df_part['value'].count()

  df_part = pd.read_csv(os.path.join(data_path, 'observation_tables', 'csv', f'part-{part}.csv'))


2412421

### Reading a specific patient

In [8]:
patientid = 3

In [10]:
def load_patient_index(path):
    df_ind = pd.read_csv(path)
    return { pid : part for (pid, part) in zip(df_ind['patientid'], df_ind['part'])}

pat_index = load_patient_index(os.path.join(data_path, 'observation_tables 2', 'observation_tables_index.csv'))

In [13]:
def load_patient(pid, data_path, pat_index):
    df_part = pd.read_csv(os.path.join(data_path, 'observation_tables 2', 'csv', f"part-{pat_index[pid]}.csv"))
    
    return df_part.query(f'patientid == {pid}')

print ("Patient {} in partition {}.".format(patientid, pat_index[patientid]))

Patient 3 in partition 51.


In [14]:
load_patient(3, data_path, pat_index)

  df_part = pd.read_csv(os.path.join(data_path, 'observation_tables 2', 'csv', f"part-{pat_index[pid]}.csv"))


Unnamed: 0,datetime,entertime,patientid,status,stringvalue,type,value,variableid
0,2163-10-17 19:15:00.000,2163-10-17 19:16:33.990,3,8,,,185.000000,10000450
1,2163-10-17 19:15:00.000,2163-10-17 19:16:33.990,3,8,,,80.000000,10000400
2,2163-10-17 19:17:07.110,2163-10-17 19:17:12.383,3,4,,,96.000000,200
3,2163-10-17 19:17:07.310,2163-10-17 19:17:12.460,3,4,,,32.000000,2200
4,2163-10-17 19:17:07.410,2163-10-17 19:17:12.523,3,4,,,100.000000,4000
...,...,...,...,...,...,...,...,...
8139,2163-10-18 13:14:24.540,2163-10-18 13:14:17.730,3,8,,,2.465556,30005010
8140,2163-10-18 13:14:24.540,2163-10-18 13:14:17.920,3,8,,,0.000000,30005075
8141,2163-10-18 13:14:24.540,2163-10-18 13:14:18.750,3,8,,,0.000000,5685
8142,2163-10-18 13:22:00.000,2163-10-18 13:22:37.720,3,8,,,2.466333,30005010


## Load a partition

In [16]:
def load_partition(data_path, part):
    df_part = pd.read_csv(os.path.join(data_path, 'observation_tables 2', 'csv', f"part-{part}.csv"))
    
    return df_part

print ("Partition {}.".format(part))

Partition 4.


In [17]:
part4 = load_partition(data_path, 4)

  df_part = pd.read_csv(os.path.join(data_path, 'observation_tables 2', 'csv', f"part-{part}.csv"))


In [19]:
part4.patientid.value_counts()

9923     253814
25426    172235
14107    143083
28481    102266
33265     98935
          ...  
16706      1862
9597       1849
19033      1496
15429       697
6500        666
Name: patientid, Length: 125, dtype: int64

### Stats over columns

`pandas` can also read several parts at the same time. To save memory, only the needed columns can be selected.

In [None]:
df_ph = pd.read_parquet(os.path.join(data_path, 'pharma_records', 'parquet'), columns=['pharmaid', 'givendose'])

In [None]:
df_ph.info()

In [None]:
# stats over augmentin doses
df_ph.query('pharmaid == 1000274')['givendose'].describe()

## Using spark

Access with `pyspark` (no need for a cluster)

In [None]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

from pyspark.sql import functions as sf

In [None]:
def get_spark_session(cores, memory_per_executor):
    driver_mem = cores * memory_per_executor + 2000 # + some driver overhead
    
    cfg = (SparkConf().set("spark.driver.memory", "{}m".format(driver_mem)).
            set("spark.executor.memory", "{}m".format(memory_per_executor)).
            set("spark.master", "local[{}]".format(cores)).
            set("spark.sql.execution.arrow.enabled", True)
          )
    
    return (SparkSession.
             builder.
             config(conf=cfg).
             getOrCreate())

In [None]:
spark = get_spark_session(4, 1024)

### Stats over columns

In [None]:
df_obs = spark.read.parquet(os.path.join(data_path, 'observation_tables', 'parquet'))

In [None]:
# stats over weights (considering all parts)
df_obs.where('variableid == 10000400').select('value').summary().toPandas()