In [1]:
import os
import pickle
import random
import sys

import numpy as np
import pandas as pd

In [2]:
# Reserved ids for the special tokens; every other vocabulary entry starts at 4.
#   START - first token of every patient sequence
#   PAD   - filler used to bring sequences to a common length
#   SEP   - boundary between two consecutive rows of the same patient
#   END   - last token of every patient sequence
START, PAD, SEP, END = 0, 1, 2, 3

In [3]:
def tokenize_dataset(data_input, vocab):
    """Turn a per-row condition table into one token sequence per patient.

    Rows are grouped by ``person_id``. Each patient's sequence starts with
    START; every row contributes the vocabulary ids of all columns (other
    than ``person_id``) whose value is not 0, followed by SEP. The SEP that
    closes the patient's final row is replaced with END.

    Args:
        data_input: DataFrame with a ``person_id`` column plus one column
            per condition.
        vocab: dict holding a ``"token2idx"`` mapping from column name to id.

    Returns:
        ``(tokenized, max_length)``: the list of token-id lists (in groupby
        order) and the length of the longest one; ``([], 0)`` when
        ``data_input`` has no rows.
    """
    condition_cols = [col for col in data_input.columns if col != "person_id"]
    tokenized = []
    max_length = 0
    for _, group in data_input.groupby("person_id"):
        sequence = [START]
        for _, row in group.iterrows():
            sequence.extend(
                vocab["token2idx"][cond]
                for cond in condition_cols
                if row[cond] != 0
            )
            sequence.append(SEP)
        # The separator after the last row doubles as the end marker.
        sequence[-1] = END
        # Track the true maximum over all patients, not just the last one.
        max_length = max(max_length, len(sequence))
        tokenized.append(sequence)
    return tokenized, max_length

In [2]:
def tokenize_age(data_input, vocab, src):
    """Build, for every token sequence in ``src``, a parallel age-id sequence.

    Each token receives the vocabulary id of the ``age`` value of the row it
    belongs to. The row counter advances whenever a SEP or END token is
    passed and keeps running across sequences.

    NOTE(review): rows are fetched with ``data_input.at[row, "age"]``, i.e.
    by index label. This presumably expects ``data_input`` to carry a
    0..n-1 index laid out in the same order the sequences in ``src`` were
    generated -- confirm against the caller.

    Args:
        data_input: DataFrame with an ``age`` column.
        vocab: dict holding a ``"token2idx"`` mapping from age value to id.
        src: iterable of token-id sequences.

    Returns:
        List of age-id lists, one per sequence in ``src``.
    """
    row_label = 0
    age_sequences = []
    for sequence in src:
        ages = []
        for tok in sequence:
            ages.append(vocab["token2idx"][data_input.at[row_label, "age"]])
            # SEP / END close the current row; following tokens use the next.
            if tok == SEP or tok == END:
                row_label += 1
        age_sequences.append(ages)
    return age_sequences

In [5]:
def seq_padding(tokens, max_len, token2idx=None, symbol=None):
    """Return ``tokens`` cut or padded to exactly ``max_len`` entries.

    Args:
        tokens: indexable sequence of tokens.
        max_len: length of the returned list.
        token2idx: accepted but not used.
        symbol: filler value for missing positions; PAD when ``None``.

    Returns:
        A new list with the first ``max_len`` tokens, followed by ``symbol``
        for every position beyond the end of ``tokens``.
    """
    filler = PAD if symbol is None else symbol
    available = len(tokens)
    return [
        tokens[pos] if pos < available else filler
        for pos in range(max_len)
    ]

In [6]:
def position_idx(tokens, symbol=SEP):
    """Number each token by how many ``symbol`` tokens came before it.

    A ``symbol`` token itself still gets the count of the segment it closes;
    the counter is bumped only afterwards.

    Args:
        tokens: iterable of tokens.
        symbol: token that marks a segment boundary (SEP by default).

    Returns:
        List of position indices, one per token.
    """
    positions = []
    segment_no = 0
    for tok in tokens:
        positions.append(segment_no)
        if tok == symbol:
            segment_no += 1
    return positions

In [7]:
 def build_vocab(data_input):
    """Create the condition vocabulary from the columns of ``data_input``.

    The four special tokens are inserted first with their reserved ids;
    every column except ``person_id`` then receives consecutive ids starting
    at 4, in column order.

    Args:
        data_input: DataFrame whose column names are the condition tokens.

    Returns:
        dict with ``"token2idx"`` (name -> id) and ``"idx2token"`` (id -> name).
    """
    specials = (("<start>", START), ("<pad>", PAD), ("<sep>", SEP), ("<end>", END))
    token2idx = dict(specials)
    idx2token = {number: label for label, number in specials}
    conditions = (col for col in data_input.columns if col != "person_id")
    for number, cond in enumerate(conditions, start=4):
        token2idx[cond] = number
        idx2token[number] = cond
    return {"token2idx": token2idx, "idx2token": idx2token}

In [8]:
def index_seg(tokens, symbol=SEP):
    """Label tokens with a segment id that alternates between 0 and 1.

    The id starts at 0 and flips after every ``symbol`` token; the
    ``symbol`` token itself keeps the id of the segment it ends.

    Args:
        tokens: iterable of tokens.
        symbol: token that marks a segment boundary (SEP by default).

    Returns:
        List of 0/1 segment ids, one per token.
    """
    segments = []
    current = 0
    for tok in tokens:
        segments.append(current)
        if tok == symbol:
            current = 1 - current
    return segments

In [9]:
 def build_age_vocab(data_input):
    """Create an age vocabulary covering the observed range in 0.25 steps.

    The range runs from the smallest to the largest value in the ``age``
    column. Every step ``min_age, min_age + 0.25, ...`` up to ``max_age``
    gets a consecutive id starting at 0.

    Args:
        data_input: DataFrame with a numeric ``age`` column.

    Returns:
        dict with ``"token2idx"`` (age -> id) and ``"idx2token"`` (id -> age).
        Both mappings are empty when the column has no values.
    """
    ages = data_input["age"]
    # Column-wise min/max replaces a row-by-row iterrows() scan. 999 and 0
    # were the scan's starting values, so they remain as bounds here to keep
    # results identical (including the empty-column case, where min/max are
    # NaN and the bounds win).
    min_age = min(999, ages.min())
    max_age = max(0, ages.max())

    # max_age + 0.25 as the stop value makes max_age itself part of the range.
    steps = np.arange(min_age, max_age + 0.25, 0.25)
    token2idx = {age: idx for idx, age in enumerate(steps)}
    idx2token = {idx: age for idx, age in enumerate(steps)}
    return {"token2idx": token2idx, "idx2token": idx2token}

In [10]:
def create_folder(path):
    """Create directory ``path`` (including parents) unless it already exists.

    Args:
        path: directory to create.
    """
    if not os.path.exists(path):
        # exist_ok covers the window in which another process may create the
        # directory between the check above and this call.
        os.makedirs(path, exist_ok=True)


def save_obj(obj, name):
    """Pickle ``obj`` into the file ``<name>.pkl``.

    Args:
        obj: any picklable object.
        name: target path without the ``.pkl`` suffix; a ``str`` or any
            ``os.PathLike`` object.
    """
    # os.fspath lets pathlib.Path names work as well as plain strings.
    with open(os.fspath(name) + '.pkl', 'wb') as f:
        pickle.dump(obj, f)


def load_obj(name):
    """Load and return the object pickled in the file ``<name>.pkl``.

    Args:
        name: source path without the ``.pkl`` suffix; a ``str`` or any
            ``os.PathLike`` object.

    Returns:
        The unpickled object.
    """
    # WARNING: unpickling can execute arbitrary code. Only load files that
    # were produced by a trusted source (e.g. save_obj in this project).
    with open(os.fspath(name) + '.pkl', 'rb') as f:
        return pickle.load(f)

In [11]:
def random_mask(tokens, token2idx):
    """Apply BERT-style random masking to a token-id sequence.

    Every token with an id above 3 (ids 0-3 are skipped) is selected with
    probability 0.15. A selected token is
      * replaced by ``token2idx["MASK"]`` in 80% of the cases,
      * replaced by a random id drawn from the vocabulary values after the
        first four entries in 10% of the cases,
      * kept unchanged in the remaining 10%.

    Args:
        tokens: sequence of token ids.
        token2idx: vocabulary mapping; must contain ``"MASK"`` and is
            expected to list its four special tokens first.

    Returns:
        ``(tokens, output_token, output_label)`` where ``output_token`` is
        the masked sequence and ``output_label`` holds the original id at
        selected positions and -1 everywhere else. All three have the same
        length.
    """
    output_label = []
    output_token = []
    # Candidate ids for random replacement; built once instead of on every
    # replacement.
    replacement_pool = list(token2idx.values())[4:]

    for token in tokens:
        prob = random.random()
        # mask token with 15% probability (ids 0-3 are never touched)
        if prob < 0.15 and token > 3:
            # rescale to [0, 1) to split the selected tokens 80/10/10
            prob /= 0.15

            if prob < 0.8:
                # 80%: replace with the mask token
                output_token.append(token2idx["MASK"])
            elif prob < 0.9:
                # 10%: replace with a random token
                output_token.append(random.choice(replacement_pool))
            else:
                # 10%: keep the current token. It must still be appended,
                # otherwise output_token ends up shorter than tokens.
                output_token.append(token)

            # the original token is what the model has to predict here
            output_label.append(token)
        else:
            # not selected (will be ignored by the loss function later)
            output_label.append(-1)
            output_token.append(token)

    return tokens, output_token, output_label