In [None]:
# default_exp data

# data
> Classes and functions for managing data

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
#hide
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

In [None]:
#export
from lemonade.preprocessing.clean import * #for GVs
from lemonade.preprocessing.transform import *
from fastai.imports import *

In [None]:
#hide
from nbdev.showdoc import *

## Split

- The split is already done on the raw data, before vocab creation.
- The following class just holds everything together.

In [None]:
#export
class EHRDataSplits():
    '''Container for the train / valid / test `PatientList` splits; defaults to loading the 0 to 20 years age span'''
    def __init__(self, path, age_start=0, age_stop=20, age_in_months=False):
        loaded = self._load_splits(path, age_start, age_stop, age_in_months)
        self.train, self.valid, self.test = loaded

    def _load_splits(self, path, age_start, age_stop, age_in_months):
        '''Load the preprocessed `PatientList` of each split (train, valid, test - in that order) from persistent store using path'''
        return tuple(
            PatientList.load(path, split_name, age_start, age_stop, age_in_months)
            for split_name in ('train', 'valid', 'test')
        )

    def get_splits(self):
        '''Return the splits as a (train, valid, test) tuple'''
        return (self.train, self.valid, self.test)

    def get_lengths(self):
        '''Return a single-column dataframe with the number of patients in train, valid, test and their total'''
        sizes = [len(split) for split in (self.train, self.valid, self.test)]
        sizes.append(sum(sizes))
        return pd.DataFrame(sizes, index=['train','valid','test','total'], columns=['lengths'])

    def get_label_counts(self, labels):
        '''Count, for each label, the patients whose label attribute equals 1 - returns a dataframe (one row per label) with per-split and total counts'''
        rows = []
        for label in labels:
            # A patient is counted as positive when its `label` attribute compares equal to 1
            per_split = [
                sum(1 for pt in split if getattr(pt, label) == 1)
                for split in (self.train, self.valid, self.test)
            ]
            rows.append(per_split + [sum(per_split)])
        return pd.DataFrame(rows, index=labels, columns=['train','valid','test','total'])

    def get_pos_wts(self, labels):
        '''Get positive weights (negatives / positives, rounded) per label and split, to be used in `nn.BCEWithLogitsLoss`'''
        positives = self.get_label_counts(labels)
        # Transposed lengths form a single (train, valid, test, total) row that is
        # subtracted from every label row of `positives`
        split_sizes = self.get_lengths().transpose().values
        negatives = split_sizes - positives
        return (negatives / positives).round()

In [None]:
show_doc(EHRDataSplits, title_level=3)

<h3 id="EHRDataSplits" class="doc_header"><code>class</code> <code>EHRDataSplits</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>EHRDataSplits</code>(**`path`**, **`age_start`**=*`0`*, **`age_stop`**=*`20`*, **`age_in_months`**=*`False`*)

Class to hold the PatientList splits; defaults to loading 0 to 20 years age span

In [None]:
show_doc(EHRDataSplits._load_splits)

<h4 id="EHRDataSplits._load_splits" class="doc_header"><code>EHRDataSplits._load_splits</code><a href="__main__.py#L7" class="source_link" style="float:right">[source]</a></h4>

> <code>EHRDataSplits._load_splits</code>(**`path`**, **`age_start`**, **`age_stop`**, **`age_in_months`**)

Load splits of preprocessed [`PatientList`](/lemonade/preprocessing_transform#PatientList)s from persistent store using path

In [None]:
show_doc(EHRDataSplits.get_splits)

<h4 id="EHRDataSplits.get_splits" class="doc_header"><code>EHRDataSplits.get_splits</code><a href="__main__.py#L14" class="source_link" style="float:right">[source]</a></h4>

> <code>EHRDataSplits.get_splits</code>()

Return splits

In [None]:
show_doc(EHRDataSplits.get_lengths)

<h4 id="EHRDataSplits.get_lengths" class="doc_header"><code>EHRDataSplits.get_lengths</code><a href="__main__.py#L18" class="source_link" style="float:right">[source]</a></h4>

> <code>EHRDataSplits.get_lengths</code>()

Return a dataframe with lengths (# of patients) of the splits (train, valid, test) and total

In [None]:
show_doc(EHRDataSplits.get_label_counts)

<h4 id="EHRDataSplits.get_label_counts" class="doc_header"><code>EHRDataSplits.get_label_counts</code><a href="__main__.py#L23" class="source_link" style="float:right">[source]</a></h4>

> <code>EHRDataSplits.get_label_counts</code>(**`labels`**)

Get prevalence counts of labels in each split - returns a dataframe with counts for each split and total count

In [None]:
show_doc(EHRDataSplits.get_pos_wts)

<h4 id="EHRDataSplits.get_pos_wts" class="doc_header"><code>EHRDataSplits.get_pos_wts</code><a href="__main__.py#L34" class="source_link" style="float:right">[source]</a></h4>

> <code>EHRDataSplits.get_pos_wts</code>(**`labels`**)

Get positive weights to be used in `nn.BCEWithLogitsLoss`

**Tests**

In [None]:
PATH_1K, LABELS

('./datasets/synthea/1K',
 ['diabetes', 'stroke', 'alzheimers', 'coronaryheart'])

In [None]:
data_pth = PATH_1K

In [None]:
splits = EHRDataSplits(data_pth)

In [None]:
splits.get_lengths()

Unnamed: 0,lengths
train,664
valid,222
test,222
total,1108


In [None]:
prevalence = splits.get_label_counts(LABELS)
prevalence

Unnamed: 0,train,valid,test,total
diabetes,37,10,14,61
stroke,42,14,14,70
alzheimers,18,3,5,26
coronaryheart,34,12,9,55


**Cross check with raw**
- Check total counts against raw_csv
- Check split counts against split/raw_csv

In [None]:
raw_cnds = pd.read_csv(f'{data_pth}/raw_original/conditions.csv', low_memory=False)

In [None]:
# raw_cnds

In [None]:
print(raw_cnds[raw_cnds.CODE == 44054006].CODE.count()) #diabetes
print(raw_cnds[raw_cnds.CODE == 230690007].CODE.count()) #stroke
print(raw_cnds[raw_cnds.CODE == 26929004].CODE.count()) #alzheimers
print(raw_cnds[raw_cnds.CODE == 53741008].CODE.count()) #coronary_heart

61
70
26
55


In [None]:
raw_cnds_train = pd.read_csv(f'{PATH_1K}/raw_split/train/conditions.csv', low_memory=False)
raw_cnds_valid = pd.read_csv(f'{PATH_1K}/raw_split/valid/conditions.csv', low_memory=False)
raw_cnds_test  = pd.read_csv(f'{PATH_1K}/raw_split/test/conditions.csv', low_memory=False)

In [None]:
print(raw_cnds_train[raw_cnds_train.CODE == 44054006].CODE.count()) #diabetes
print(raw_cnds_valid[raw_cnds_valid.CODE == 44054006].CODE.count()) #diabetes
print(raw_cnds_test [raw_cnds_test.CODE == 44054006].CODE.count()) #diabetes

37
10
14


In [None]:
prevalence.loc['diabetes'].total

61

In [None]:
cnd_codes = [44054006, 230690007, 26929004, 53741008]

In [None]:
for code,name in zip(cnd_codes, LABELS):
    print(code,': ',name)

44054006 :  diabetes
230690007 :  stroke
26929004 :  alzheimers
53741008 :  coronaryheart


In [None]:
# Cross-check: for every label, the count derived from the patient objects must equal the
# number of rows carrying the matching condition code in the raw CSV (overall and per split)
for code, name in zip(cnd_codes, LABELS):
    assert prevalence.loc[name].total == raw_cnds[raw_cnds.CODE == code].CODE.count()
    assert prevalence.loc[name].train == raw_cnds_train[raw_cnds_train.CODE == code].CODE.count()
    assert prevalence.loc[name].valid == raw_cnds_valid[raw_cnds_valid.CODE == code].CODE.count()
    assert prevalence.loc[name].test  == raw_cnds_test [raw_cnds_test.CODE == code]. CODE.count()

## Label

**Labeling**, as defined in fastai: some processes need to be run on `train` and then **applied** to `valid`.

This is completed in preprocessing (vocab & transform) as follows
1. Vocabs created from train data
    - Tokenizing unique values for different record codes & demographic values
    - Calculating mean and std for age
2. Vocabs applied to train, valid and test data
    - With `numericalize` for record codes & demographic values
    - With normalizing of age with the mean / std from train

**Hence labeling in our case will be creating X and y**

- X is the patient object
- y needs to be a tensor made out of - diabetes, stroke, alzheimers, coronaryheart

So **creating the `y` tensor** is simply a matter of ..
1. extracting the values of each of the 4 labels from each `Patient` object 
2. turning it into a `torch.FloatTensor`
3. and stacking them up using `torch.stack`

In [None]:
tst_y = np.array((True, False, False, True), dtype='float')
torch.from_numpy(tst_y), torch.FloatTensor(tst_y)

(tensor([1., 0., 0., 1.], dtype=torch.float64), tensor([1., 0., 0., 1.]))

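There are two ways of creating a torch tensor from a numpy array; we will stick with the latter (`torch.FloatTensor`), which gives a default-precision float tensor instead of `float64`.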

In [None]:
# Build one float tensor per training patient, holding that patient's value for each label in LABELS
y = []
for pt in splits.train:
    y.append(torch.FloatTensor(np.array([getattr(pt,label) for label in LABELS], dtype='float')) )

In [None]:
# y

In [None]:
y = torch.stack(y)

In [None]:
y.shape

torch.Size([664, 4])

In [None]:
y

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        ...,
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 1.]])

Putting it into a function

In [None]:
def label_data(patient_ds, labels) -> 'x,y':
    '''Pair a patient dataset with its targets: returns x=`patient_ds` as passed in, y=stacked float tensor with one row per patient and one column per label'''
    # Each row holds the patient's attribute values for `labels`, cast to float
    rows = [
        torch.FloatTensor(np.array([getattr(pt, name) for name in labels], dtype='float'))
        for pt in patient_ds
    ]
    return patient_ds, torch.stack(rows)

In [None]:
x_train,y_train = label_data(splits.train, LABELS)
x_valid,y_valid = label_data(splits.valid, LABELS)
x_test ,y_test  = label_data(splits.test , LABELS)

In [None]:
y_train.shape, y_valid.shape, y_test.shape

(torch.Size([664, 4]), torch.Size([222, 4]), torch.Size([222, 4]))

In [None]:
#export
class LabelEHRData():
    '''Holds the labeled EHR data splits, each available as separate x / y attributes and as an (x, y) tuple'''
    def __init__(self, train, valid, test, labels):
        '''Extracts y for every split: x is the patient dataset as passed in, y is the tensor of label values built by `_get_y`'''
        y_train = self._get_y(train, labels)
        y_valid = self._get_y(valid, labels)
        y_test  = self._get_y(test,  labels)

        self.x_train, self.x_valid, self.x_test = train, valid, test
        self.y_train, self.y_valid, self.y_test = y_train, y_valid, y_test

        # Same data, grouped per split as (x, y)
        self.train = (train, y_train)
        self.valid = (valid, y_valid)
        self.test  = (test,  y_test)

    def _get_y(self, ds, labels):
        '''Read the `labels` attributes of every patient object in ds as a float tensor and stack them (one row per patient)'''
        rows = [
            torch.FloatTensor(np.array([getattr(pt, name) for name in labels], dtype='float'))
            for pt in ds
        ]
        return torch.stack(rows)

In [None]:
show_doc(LabelEHRData, title_level=3)

<h3 id="LabelEHRData" class="doc_header"><code>class</code> <code>LabelEHRData</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>LabelEHRData</code>(**`train`**, **`valid`**, **`test`**, **`labels`**)

Class to hold labeled EHR data splits

In [None]:
show_doc(LabelEHRData.__init__)

<h4 id="LabelEHRData.__init__" class="doc_header"><code>LabelEHRData.__init__</code><a href="__main__.py#L4" class="source_link" style="float:right">[source]</a></h4>

> <code>LabelEHRData.__init__</code>(**`train`**, **`valid`**, **`test`**, **`labels`**)

Extracts y from patient object, each labelset a tuple of x,y: x=Patient object, y=tensor of conditions

In [None]:
show_doc(LabelEHRData._get_y)

<h4 id="LabelEHRData._get_y" class="doc_header"><code>LabelEHRData._get_y</code><a href="__main__.py#L14" class="source_link" style="float:right">[source]</a></h4>

> <code>LabelEHRData._get_y</code>(**`ds`**, **`labels`**)

Extract y from each patient object in ds and stack them - ds is dataset containing patient objects

In [None]:
labeled = LabelEHRData(*splits.get_splits(), LABELS)

## Dataset

Subclasses PyTorch [`Dataset`](https://pytorch.org/docs/master/data.html?highlight=dataloader#torch.utils.data.Dataset)

In [None]:
#export
class EHRDataset(torch.utils.data.Dataset):
    '''Single EHR dataset - wraps an (x, y) pair and implements `__len__()` and `__getitem__()`'''
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        # The dataset size is defined by x
        return len(self.x)

    def __getitem__(self, i):
        # The same index `i` is applied to both x and y
        return self.x[i], self.y[i]

In [None]:
show_doc(EHRDataset, title_level=3)

<h3 id="EHRDataset" class="doc_header"><code>class</code> <code>EHRDataset</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>EHRDataset</code>(**\*`args`**, **\*\*`kwds`**) :: `Dataset`

Class to hold a single EHR dataset - holds a tuple of x and y and implements `__len__()` and `__getitem__()`

In [None]:
def get_ds(x_train,y_train, x_valid,y_valid) -> 'train_ds, valid_ds':
    '''Wrap the train and valid x / y pairs in `EHRDataset`s and return them as (train_ds, valid_ds)'''
    return EHRDataset(x_train, y_train), EHRDataset(x_valid, y_valid)

In [None]:
train_ds, valid_ds = get_ds(*labeled.train, *labeled.valid)

In [None]:
len(train_ds), len(valid_ds)

(664, 222)

In [None]:
len(labeled.train), len(labeled.x_train)

(2, 664)

In [None]:
assert len(train_ds)==len(labeled.x_train)==len(labeled.y_train)
assert len(valid_ds)==len(labeled.y_valid)==len(labeled.x_valid)

In [None]:
xb,yb = train_ds[0:5]
xb,yb

([ptid:12b42348-29bb-4cf4-b878-d419a4c8f067, birthdate:1951-09-04, diabetes:False, device:cpu,
  ptid:4c59df3b-042b-42c2-901a-3783d6d77919, birthdate:1979-09-18, diabetes:False, device:cpu,
  ptid:8f07e577-7ab4-4e89-ae98-6d394b3929a9, birthdate:1967-01-13, diabetes:False, device:cpu,
  ptid:653753d9-a52f-4ef3-b285-bdc3c671293c, birthdate:1971-12-26, diabetes:False, device:cpu,
  ptid:1b801001-0bf9-4ad9-b175-9de9bc2d905f, birthdate:1979-11-26, diabetes:False, device:cpu],
 tensor([[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 1.]]))

In [None]:
yb.shape

torch.Size([5, 4])

## DataLoader - Using Pytorch DataLoader

**We need to define a custom collate function**, because the default collate cannot handle the list of patient objects in `x` and fails with the following error:
```
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class '__main__.Patient'>
```

In [None]:
valid_ds[0:4]

([ptid:cb64a8e7-c65c-464c-bb27-270dc00d7ec5, birthdate:1960-05-11, diabetes:False, device:cpu,
  ptid:8349dd1f-b916-4e4d-b132-837c1e021c58, birthdate:1963-11-22, diabetes:False, device:cpu,
  ptid:bd44cef1-d656-480a-9c10-21d2e26e261f, birthdate:2008-11-10, diabetes:False, device:cpu,
  ptid:ace46042-749e-44ba-9c7c-92cf42c00eb1, birthdate:1995-09-21, diabetes:False, device:cpu],
 tensor([[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 1.],
         [0., 0., 0., 0.]]))

In [None]:
x_tmps,y_tmps = valid_ds[0:4]

In [None]:
x_tmps

[ptid:cb64a8e7-c65c-464c-bb27-270dc00d7ec5, birthdate:1960-05-11, diabetes:False, device:cpu,
 ptid:8349dd1f-b916-4e4d-b132-837c1e021c58, birthdate:1963-11-22, diabetes:False, device:cpu,
 ptid:bd44cef1-d656-480a-9c10-21d2e26e261f, birthdate:2008-11-10, diabetes:False, device:cpu,
 ptid:ace46042-749e-44ba-9c7c-92cf42c00eb1, birthdate:1995-09-21, diabetes:False, device:cpu]

In [None]:
y_tmps

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 0.]])

**Old collate fns**

**1. removed cuda calls**
```python
def collate(b):
    xs,ys = zip(*b)
    return [x.to_gpu() for x in xs], torch.unsqueeze(torch.tensor(ys), 1).cuda()
```
**2. removed unsqueeze**
```python
def collate(b):
    xs,ys = zip(*b)
    return xs, torch.unsqueeze(torch.tensor(ys), 1)
```

In [None]:
def collate_ehr(b):
    '''Custom collate function for use in `DataLoader` - `b` is a batch of (x, y) items; returns the xs as a tuple and the ys stacked into one tensor'''
    patients, targets = zip(*b)
    return patients, torch.stack(targets)

In [None]:
bs = 2

In [None]:
def get_dls(train_ds, valid_ds, bs, collate_fn=collate_ehr, **kwargs) -> 'train_dl, valid_dl':
    '''Create the train and valid `DataLoader`s - train is shuffled with batch size `bs`, valid uses batch size `bs*2`; `kwargs` are passed to both'''
    train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True, collate_fn=collate_fn, **kwargs)
    valid_dl = DataLoader(valid_ds, batch_size=bs*2, collate_fn=collate_fn, **kwargs)
    return train_dl, valid_dl

In [None]:
train_dl, valid_dl = get_dls(train_ds, valid_ds, bs)

**Tests - `iter()`, `next()` - Next Batch**

In [None]:
it = iter(valid_dl)
first_x, first_y = next(it)
second_x, second_y = next(it)

In [None]:
first_x, first_y

((ptid:cb64a8e7-c65c-464c-bb27-270dc00d7ec5, birthdate:1960-05-11, diabetes:False, device:cpu,
  ptid:8349dd1f-b916-4e4d-b132-837c1e021c58, birthdate:1963-11-22, diabetes:False, device:cpu,
  ptid:bd44cef1-d656-480a-9c10-21d2e26e261f, birthdate:2008-11-10, diabetes:False, device:cpu,
  ptid:ace46042-749e-44ba-9c7c-92cf42c00eb1, birthdate:1995-09-21, diabetes:False, device:cpu),
 tensor([[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 1.],
         [0., 0., 0., 0.]]))

In [None]:
second_x, second_y

((ptid:4cc576ef-e1d9-45b0-9d83-7eb7d091cc6f, birthdate:1943-09-25, diabetes:False, device:cpu,
  ptid:ee09a493-5684-498f-b46c-851eb1e7a3db, birthdate:1955-06-05, diabetes:False, device:cpu,
  ptid:26b6efd5-7bf6-47b5-9d6a-f3f7338afcba, birthdate:1989-11-22, diabetes:False, device:cpu,
  ptid:c118629a-db21-4c34-803e-8249ad79ae51, birthdate:1952-02-18, diabetes:False, device:cpu),
 tensor([[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]))

In [None]:
second_x[0].alg_nums

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
#export
class EHRData:
    '''All encompassing class for EHR data - holds Splits, Labels, Datasets, DataLoaders and provides convenience fns for training and prediction'''
    def __init__(self, path, labels, age_start=0, age_stop=20, age_in_months=False):
        self.path = path
        self.labels = labels
        self.age_start = age_start
        self.age_stop = age_stop
        self.age_in_months = age_in_months

    def load_splits(self):
        '''Load data splits from the dataset path, using the stored age settings'''
        self.splits = EHRDataSplits(self.path, self.age_start, self.age_stop, self.age_in_months)

    def label(self):
        '''Run labeler - i.e. extract y from patient objects (requires `load_splits` to have run)'''
        loaded = self.splits.get_splits()
        self.labeled = LabelEHRData(*loaded, self.labels)

    def create_datasets(self):
        '''Create one `EHRDataset` per split from the labeled data (requires `label` to have run)'''
        labeled = self.labeled
        self.train_ds = EHRDataset(*labeled.train)
        self.valid_ds = EHRDataset(*labeled.valid)
        self.test_ds  = EHRDataset(*labeled.test)

    def ehr_collate(b):
        '''Custom collate function for use in `DataLoader`'''
        # Defined without `self`: it is used as the plain-function default of `create_dls` below
        patients, targets = zip(*b)
        return patients, torch.stack(targets)

    def create_dls(self, bs, collate_fn=ehr_collate, **kwargs):
        '''Create `DataLoader`s - train is shuffled with batch size `bs`, valid and test use `bs*2`; `kwargs` go to all three'''
        shared = dict(collate_fn=collate_fn, **kwargs)
        eval_bs = bs*2
        self.train_dl = DataLoader(self.train_ds, batch_size=bs, shuffle=True, **shared)
        self.valid_dl = DataLoader(self.valid_ds, batch_size=eval_bs, **shared)
        self.test_dl  = DataLoader(self.test_ds,  batch_size=eval_bs, **shared)

    def _prepare(self, bs, num_workers):
        '''Run the whole pipeline (splits, labels, datasets, dataloaders) and return the positive weights dataframe'''
        self.load_splits()
        self.label()
        self.create_datasets()
        self.create_dls(bs, num_workers=num_workers)
        return self.splits.get_pos_wts(self.labels)

    def get_data(self, bs=64, num_workers=0):
        '''Convenience function - returns everything needed for training: train_dl, valid_dl, train_pos_wts, valid_pos_wts'''
        pos_wts = self._prepare(bs, num_workers)
        train_pos_wts = torch.Tensor(pos_wts['train'].values)
        valid_pos_wts = torch.Tensor(pos_wts['valid'].values)
        # NOTE: an earlier version also returned embedding dims (via `get_all_emb_dims`); that is disabled
        return self.train_dl, self.valid_dl, train_pos_wts, valid_pos_wts

    def get_test_data(self, bs=64, num_workers=0):
        '''Convenience function - returns everything needed for prediction using test data: test_dl, test_pos_wts'''
        pos_wts = self._prepare(bs, num_workers)
        test_pos_wts = torch.Tensor(pos_wts['test'].values)
        # NOTE: an earlier version also returned embedding dims (via `get_all_emb_dims`); that is disabled
        return self.test_dl, test_pos_wts

In [None]:
show_doc(EHRData.load_splits)

<h4 id="EHRData.load_splits" class="doc_header"><code>EHRData.load_splits</code><a href="__main__.py#L8" class="source_link" style="float:right">[source]</a></h4>

> <code>EHRData.load_splits</code>()

Load data splits given dataset path

In [None]:
show_doc(EHRData.label)

<h4 id="EHRData.label" class="doc_header"><code>EHRData.label</code><a href="__main__.py#L12" class="source_link" style="float:right">[source]</a></h4>

> <code>EHRData.label</code>()

Run labeler - i.e. extract y from patient objects

In [None]:
show_doc(EHRData.create_datasets)

<h4 id="EHRData.create_datasets" class="doc_header"><code>EHRData.create_datasets</code><a href="__main__.py#L16" class="source_link" style="float:right">[source]</a></h4>

> <code>EHRData.create_datasets</code>()

Create `EHRDataset`s

In [None]:
show_doc(EHRData.ehr_collate)

<h4 id="EHRData.ehr_collate" class="doc_header"><code>EHRData.ehr_collate</code><a href="__main__.py#L22" class="source_link" style="float:right">[source]</a></h4>

> <code>EHRData.ehr_collate</code>(**`b`**)

Custom collate function for use in `DataLoader`

In [None]:
show_doc(EHRData.create_dls)

<h4 id="EHRData.create_dls" class="doc_header"><code>EHRData.create_dls</code><a href="__main__.py#L27" class="source_link" style="float:right">[source]</a></h4>

> <code>EHRData.create_dls</code>(**`bs`**, **`collate_fn`**=*`'ehr_collate'`*, **\*\*`kwargs`**)

Create `DataLoader`s

In [None]:
show_doc(EHRData.get_data)

<h4 id="EHRData.get_data" class="doc_header"><code>EHRData.get_data</code><a href="__main__.py#L33" class="source_link" style="float:right">[source]</a></h4>

> <code>EHRData.get_data</code>(**`bs`**=*`64`*, **`num_workers`**=*`0`*)

Convenience function - returns everything needed for training

In [None]:
show_doc(EHRData.get_test_data)

<h4 id="EHRData.get_test_data" class="doc_header"><code>EHRData.get_test_data</code><a href="__main__.py#L47" class="source_link" style="float:right">[source]</a></h4>

> <code>EHRData.get_test_data</code>(**`bs`**=*`64`*, **`num_workers`**=*`0`*)

Convenience function - returns everything needed for prediction using test data

## Export -

In [None]:
#hide
from nbdev.export import *
notebook2script()

Converted 01_preprocessing_clean.ipynb.
Converted 02_preprocessing_vocab.ipynb.
Converted 03_preprocessing_transform.ipynb.
Converted 04_data.ipynb.
Converted 05_metrics.ipynb.
Converted index.ipynb.
