In [2]:
#!pip install tensorflow

In [3]:
from Hopper import *
import tensorflow as tf
import numpy as np
import Prepper

In [4]:
# Create the Hopper database manager that every later cell uses as `h`.
# With verbose on, the SQL it runs shows up in the cell output
# (presumably echoed by dbms itself - confirm in Hopper).
verbose = True
h = dbms(verbose=verbose)

DROP TABLE IF EXISTS i_ids;

CREATE TABLE i_ids (
	i_id TEXT NOT NULL,
	PRIMARY KEY (i_id)
);

DROP TABLE IF EXISTS i_windows;

CREATE TABLE i_windows (
	i_id TEXT NOT NULL,
	i_st INTEGER NOT NULL,
	i_et INTEGER NOT NULL,
	PRIMARY KEY (i_id),
	FOREIGN KEY (i_id) REFERENCES i_ids (i_id)
);

DROP TABLE IF EXISTS i_partitions;

CREATE TABLE i_partitions (
	i_id TEXT NOT NULL,
	partition TEXT NOT NULL,
	PRIMARY KEY (i_id),
	FOREIGN KEY (i_id) REFERENCES i_ids (i_id)
);

DROP TABLE IF EXISTS i_relcal;

CREATE TABLE i_relcal (
	i_id TEXT NOT NULL,
	i_st INTEGER NOT NULL,
	i_et INTEGER NOT NULL
);



In [5]:
# Declarative description of every CSV-backed table to load:
#   (table name, feature names, feature types, has_times, primary_key)
# Each table is read from data/<table name>.csv. Keeping the specs in one list
# replaces five copy-pasted table_config/create_fvm_with_csv pairs.
TABLE_SPECS = [
    ("characteristics_0",
     ["age", "sex", "eth", "bin_ldc", "job"],
     ["real", "bin", "ldc", "ldc", "hdc"],
     False, True),
    ("samples_0",
     ["SBP", "DBP", "type"],
     ["real", "real", "ldc"],
     True, False),
    ("samples_1",
     ["tmps", "hrs", "sbps", "rrs", "dbps", "sats", "wgts"],
     ["real", "real", "real", "real", "real", "real", "real"],
     True, False),
    ("samples_2",
     ["dx"],
     ["hdc"],
     True, False),
    ("samples_3",
     ["outcome"],
     ["bin"],
     True, False),
]

for _name, _feature_names, _feature_types, _has_times, _primary_key in TABLE_SPECS:
    # Catch a mis-edited spec early instead of deep inside Hopper.
    assert len(_feature_names) == len(_feature_types), _name
    _tc = table_config(_name,
                       _feature_names,
                       _feature_types,
                       has_times=_has_times,
                       primary_key=_primary_key)
    h.create_fvm_with_csv(_tc, "data/{}.csv".format(_name), delimiter=',')

DROP TABLE IF EXISTS characteristics_0;

CREATE TABLE characteristics_0 (
	i_id TEXT NOT NULL,
	age REAL NOT NULL,
	sex TEXT NOT NULL,
	eth TEXT NOT NULL,
	bin_ldc TEXT NOT NULL,
	job TEXT NOT NULL,
	PRIMARY KEY (i_id),
	FOREIGN KEY (i_id) REFERENCES i_ids (i_id)
);

INSERT OR IGNORE INTO i_ids (i_id) VALUES (?);

INSERT  INTO characteristics_0 (i_id, age, sex, eth, bin_ldc, job) VALUES (?, ?, ?, ?, ?, ?);

CREATE INDEX i_index_characteristics_0_i_id ON characteristics_0(i_id);

CREATE INDEX i_index_characteristics_0_age ON characteristics_0(age);

CREATE INDEX i_index_characteristics_0_sex ON characteristics_0(sex);

CREATE INDEX i_index_characteristics_0_eth ON characteristics_0(eth);

CREATE INDEX i_index_characteristics_0_bin_ldc ON characteristics_0(bin_ldc);

CREATE INDEX i_index_characteristics_0_job ON characteristics_0(job);

10000 rows loaded

DROP TABLE IF EXISTS samples_0;

CREATE TABLE samples_0 (
	i_id TEXT NOT NULL,
	i_st INTEGER NOT NULL,
	i_et INTEGER NOT NULL,
	SBP RE

In [6]:
# Run Hopper's pipeline over the loaded tables. Per the SQL logged in the
# output of this cell, after_first=20 gives each i_id the window
# [MIN(i_st), MIN(i_st)+20] in i_windows, and ids are assigned to
# train/dev/test with SQLite RANDOM(), so the split differs between runs.
h.dew_it(after_first=20)

INSERT  INTO i_windows (i_id, i_st, i_et) VALUES (?, ?, ?);

gen_range_table_sql called: af=20, bf=None
INSERT OR IGNORE INTO i_windows 
SELECT * FROM (
	SELECT i_id, MIN(i_st) AS i_st, MIN(i_st)+20 AS i_et
	FROM (
	    SELECT i_id, i_st, i_et
		FROM samples_0
		UNION
			SELECT i_id, i_st, i_et
		FROM samples_1
		UNION
			SELECT i_id, i_st, i_et
		FROM samples_2
		UNION
			SELECT i_id, i_st, i_et
		FROM samples_3
		
	)
	GROUP BY i_id
);

INSERT  INTO i_partitions (i_id, partition) VALUES (?, ?);

INSERT OR IGNORE INTO i_partitions 
SELECT * FROM (
	SELECT i_id, CASE 
		 WHEN r <= 0.8 THEN "train" 
		 WHEN r <= 0.9 THEN "dev" 
		 WHEN r <= 1.0 THEN "test" 
		 ELSE "test" 
	END
	FROM (SELECT i_id, ABS(RANDOM())/(9223372036854775807.0) AS r FROM i_ids)
);

CREATE VIEW win_samples_0 AS
    SELECT samples_0.i_id,
	MAX(samples_0.i_st, i_windows.i_st) AS i_st,
	MIN(samples_0.i_et, i_windows.i_et) AS i_et,
	SBP,
	DBP,
	type
	FROM (
	    samples_0 JOIN i_windows ON samples_0.i_id=i_windows.i_id

INSERT  INTO i_relcal (i_id, i_st, i_et) VALUES (?, ?, ?);

CREATE VIEW fil_characteristics_0 AS
    SELECT i_id,
	CASE
	    WHEN 19.417789193703243>age THEN 19.0
	    WHEN age>70.26516520900998 THEN 71.0 ELSE age
	END AS age,
	CASE
	    WHEN sex IN ('M') THEN 1
	    ELSE 0
	END AS sex,
	CASE
	    WHEN eth IN ('e0', 'e1', 'e2', 'e3', 'e4', 'e5') THEN eth
	    ELSE '_OTHER_'
	END AS eth,
	CASE
	    WHEN bin_ldc IN ('A', 'B') THEN bin_ldc
	    ELSE '_OTHER_'
	END AS bin_ldc,
	CASE
	    WHEN job IN ('j000', 'j001', 'j002', 'j003', 'j004', 'j005', 'j007', 'j006', 'j008', 'j009', 'j010', 'j011', 'j016', 'j012', 'j014', 'j013', 'j015', 'j017', 'j018', 'j020', 'j023', 'j019', 'j022', 'j031', 'j030', 'j029', 'j028', 'j024', 'j027', 'j032', 'j036', 'j035', 'j026', 'j025', 'j021', 'j034', 'j038', 'j074', 'j039', 'j041', 'j050', 'j048', 'j053', 'j045', 'j033', 'j037', 'j047', 'j044', 'j049', 'j065', 'j052', 'j043', 'j055', 'j071', 'j089', 'j096', 'j042', 'j063', 'j064', 'j060', 'j070', 'j078', 'j

SELECT SUM((avg_SBP-(119.99315288597323))*(avg_SBP-(119.99315288597323)))/(177662-1),
SUM((min_SBP-(117.44854613052787))*(min_SBP-(117.44854613052787)))/(177662-1),
SUM((max_SBP-(122.5356084419721))*(max_SBP-(122.5356084419721)))/(177662-1),
SUM((avg_DBP-(79.99572049826698))*(avg_DBP-(79.99572049826698)))/(177662-1),
SUM((min_DBP-(77.45026540533095))*(min_DBP-(77.45026540533095)))/(177662-1),
SUM((max_DBP-(82.53845938853067))*(max_DBP-(82.53845938853067)))/(177662-1),
SUM((count-(2.2232047370850267))*(count-(2.2232047370850267)))/(177662-1) 
FROM agg_samples_0;

SELECT AVG(avg_tmps),
COUNT(avg_tmps),
AVG(min_tmps),
COUNT(min_tmps),
AVG(max_tmps),
COUNT(max_tmps),
AVG(avg_hrs),
COUNT(avg_hrs),
AVG(min_hrs),
COUNT(min_hrs),
AVG(max_hrs),
COUNT(max_hrs),
AVG(avg_sbps),
COUNT(avg_sbps),
AVG(min_sbps),
COUNT(min_sbps),
AVG(max_sbps),
COUNT(max_sbps),
AVG(avg_rrs),
COUNT(avg_rrs),
AVG(min_rrs),
COUNT(min_rrs),
AVG(max_rrs),
COUNT(max_rrs),
AVG(avg_dbps),
COUNT(avg_dbps),
AVG(min_dbps),
COUNT

In [None]:
# Ids of all individuals that have an observation window (first column of each row).
idxs = [row[0] for row in h.cur_man.execute_fetchall("SELECT i_id FROM i_windows;")]

In [None]:
# One (i_id, i_st, i_et) row per individual.
windows = h.cur_man.execute_fetchall("SELECT * FROM i_windows;")

# Map each id to the end time of its window. convert() allocates
# durations[idx]+1 time steps, i.e. i_et is used as the last step index
# (presumably windows start at 0 - confirm against Hopper).
durations = dict((i_id, i_et) for i_id, _i_st, i_et in windows)

In [None]:
# Peek at the normalised ("nrm") view of samples_3: print its table config
# (name, columns, features) and pull all of its rows for inspection.
tc = h.fvm_lookup["samples_3"].view_tc["nrm"]
print(tc.name, tc.column_names, tc.feature_names)
table_data = h.cur_man.execute_fetchall("SELECT * FROM nrm_samples_3;")
# Alternative fetch, kept for reference:
#table_data = h.cur_man.execute_fetchmany("SELECT * FROM nrm_samples_3;")

In [None]:
def tokenize_and_sample(obs_str, tokenizer, channels=5):
    """Encode one observation string as token ids, padded or truncated to ``channels``.

    Args:
        obs_str: Text to encode. ``None`` (e.g. a SQL NULL) is treated as the
            empty string, so it yields a row made only of padding tokens.
        tokenizer: Object exposing ``texts_to_sequences(list_of_str)`` that
            returns one list of ids per input text (a fitted Keras Tokenizer
            in this notebook).
        channels: Number of ids to return.

    Returns:
        A new list of ids. Longer sequences keep only their first
        ``channels`` ids; shorter ones are right-padded with the id(s) the
        tokenizer assigns to ``'_empty_'``. The result has exactly
        ``channels`` entries only if ``'_empty_'`` maps to a single id.
    """
    # Missing values cannot be tokenized; encode them as "no tokens".
    if obs_str is None:
        obs_str = ""

    obs = tokenizer.texts_to_sequences([obs_str])[0]

    # Too many tokens: keep the leading ones.
    if len(obs) > channels:
        return obs[:channels]

    # Otherwise pad on the right with the '_empty_' token.
    empty_token = tokenizer.texts_to_sequences(['_empty_'])[0]
    return obs + empty_token * (channels - len(obs))

In [None]:
def _fit_hdc_tokenizer(h, tn, fn):
    """Fit a Keras text tokenizer on every value of column ``fn`` of table/view ``tn``."""
    hdc_text_sql = "SELECT {cn} FROM {tn};".format(cn=fn, tn=tn)
    rows = h.cur_man.execute_fetchall(hdc_text_sql)
    # NULL cells cannot be tokenized; they are encoded as padding later on.
    hdc_texts = [row[0] for row in rows if row[0] is not None]
    # Make sure the padding token is part of the vocabulary.
    hdc_texts.append('_empty_')

    tokenizer = tf.keras.preprocessing.text.Tokenizer(split=',', oov_token='_unknown_', filters="")
    tokenizer.fit_on_texts(hdc_texts)
    return tokenizer


def _encode_step(val, f_type, tokenizer, channels):
    """Encode one cell as one time step: ``[number]``, or a list of token ids for hdc."""
    if f_type != "hdc":
        return [0.0 if val is None else val]
    return tokenize_and_sample("" if val is None else val, tokenizer, channels=channels)


def convert(h, offsets = [], label_cns = [], drop_cns = [], channels=5):
    """Turn every normalised ("nrm") view into per-id, per-time-step Python lists.

    Reads the module-level ``idxs`` (ids to convert) and ``durations``
    (id -> last time-step index); both must be defined before calling.

    Args:
        h: Hopper database manager; ``h.fvms`` and ``h.cur_man`` are used.
        offsets: Forward shifts (in time steps) at which labels are produced.
        label_cns: Column names ("<view>/<feature>") to build labels from.
        drop_cns: Column names removed from ``data`` and ``meta`` after the
            labels have been built. Each must exist, otherwise KeyError.
        channels: Number of token ids kept per time step for "hdc" features
            of time-stamped tables (static tables always use 1).

    Returns:
        ``(data, labels, meta, hdc_tokenizer)`` where
        * ``data[idx][cn]`` is a list of ``durations[idx]+1`` steps, each a
          list (``[value]`` or token ids), plus an all-ones ``"i_mask"`` series;
        * ``labels[idx][cn]`` holds, for each offset, the series shifted left
          by that offset and right-filled with its last step;
        * ``meta[cn]`` describes each column (table, feature, type, channels);
        * ``hdc_tokenizer[cn]`` is the fitted tokenizer of each hdc column.
    """
    col_name = "{tn}/{fn}"
    data = {idx: {} for idx in idxs}
    meta = {}
    hdc_tokenizer = {}

    for fvm in h.fvms:
        tc = fvm.view_tc["nrm"]
        tn = tc.name
        table_data = h.cur_man.execute_fetchall("SELECT * FROM %s;" %(tc.name))

        # Static (untimed) tables keep a single token per hdc feature.
        hdc_channels = channels if tc.has_times else 1

        # Pass 1: register each column and pre-fill every id with default steps
        # (0.0 for numeric features, padding tokens for hdc features).
        for fn, f_type in zip(tc.feature_names, tc.feature_types):
            cn = col_name.format(tn=tn, fn=fn)
            # NOTE(review): "has_times" is True even for static tables, as in
            # the original code - possibly because values are tiled over time; confirm.
            meta[cn] = {"tn": tn, "fn": fn, "type": f_type, "has_times": True, "channels": 1}

            if f_type == "hdc":
                meta[cn]["channels"] = hdc_channels
                hdc_tokenizer[cn] = _fit_hdc_tokenizer(h, tn, fn)

            for idx in idxs:
                default_step = _encode_step(None, f_type, hdc_tokenizer.get(cn), hdc_channels)
                data[idx][cn] = [default_step]*(durations[idx]+1)

        # Pass 2: overwrite the defaults with the values actually observed.
        if tc.has_times:
            # Rows are (id, start, end, *features); the value is stored at step `st`.
            for idx, st, _et, *row_data in table_data:
                for fn, val in zip(tc.feature_names, row_data):
                    cn = col_name.format(tn=tn, fn=fn)
                    data[idx][cn][st] = _encode_step(
                        val, meta[cn]["type"], hdc_tokenizer.get(cn), hdc_channels)
        else:
            # Rows are (id, *features); the single value is repeated over the whole window.
            for idx, *row_data in table_data:
                for fn, val in zip(tc.feature_names, row_data):
                    cn = col_name.format(tn=tn, fn=fn)
                    step = _encode_step(
                        val, meta[cn]["type"], hdc_tokenizer.get(cn), hdc_channels)
                    data[idx][cn] = [step]*(durations[idx]+1)

    labels = {idx: {} for idx in idxs}

    for idx, idx_data in data.items():
        for cn in label_cns:
            series = idx_data[cn]
            labels[idx][cn] = []
            for offset in offsets:
                # Shift left by `offset` steps and pad the tail with the last step
                # so every label series keeps the length of the input series.
                o = min(offset, len(series))
                labels[idx][cn].append(series[o:] + [series[-1]]*o)

        for cn in drop_cns:
            del idx_data[cn]

        # All-ones series marking real (non-padded) time steps.
        idx_data["i_mask"] = [[1]]*(durations[idx]+1)

    # Registered once, outside the per-id loop, so it is present even without ids.
    meta["i_mask"] = {"tn": "i_mask", "fn": "i_mask", "type": "mask", "has_times": True, "channels": 1}

    for cn in drop_cns:
        del meta[cn]

    return data, labels, meta, hdc_tokenizer

In [None]:
# Label horizons, in time steps ahead of the current step.
# Earlier setting, kept for reference:
#offsets = [1, 5, 10]
offsets = [0, 1, 2]

# Column to predict; the count column is dropped from the model inputs.
label_cns = ["nrm_samples_3/avg_outcome"]

data, labels, meta, hdc_tokenizer = convert(
    h,
    offsets=offsets,
    label_cns=label_cns,
    drop_cns=["nrm_samples_3/count"],
)

In [None]:
# Spot-check: converted feature series of the individual with id '00000'.
data['00000']

In [None]:
# Column descriptions (table, feature, type, channels) returned by convert().
meta

In [None]:
# Spot-check: label series (one per offset) of the individual with id '00000'.
labels['00000']

In [None]:
# One tf.data.Dataset per input column, plus the shape each is padded to.
x_dss = {}
x_padded_shapes = {}

def make_data_gen(cn_data):
    """Return a zero-argument callable that yields a fresh iterator over ``cn_data``."""
    def _gen():
        return iter(cn_data)
    return _gen

@tf.function
def expand(x):
    """Append a trailing length-1 axis to ``x``."""
    return tf.expand_dims(x, axis=-1)

for cn, cn_info in meta.items():
    # hdc columns carry integer token ids; every other column is fed as floats.
    # `output_types` is deliberately left as a module-level name: the
    # label-dataset cell may read it.
    output_types = tf.int32 if cn_info["type"] == "hdc" else tf.float32

    # Per-individual series of this column, in the iteration order of `data`.
    cn_data = [idx_data[cn] for idx_data in data.values()]
    data_gen = make_data_gen(cn_data)

    cn_ds = tf.data.Dataset.from_generator(data_gen, output_types)
    x_dss[cn] = cn_ds

    # Variable number of time steps, fixed number of channels per step.
    x_padded_shapes[cn] = [None, cn_info["channels"]]
    

In [None]:
# Sanity check: padded shapes registered for the mask and for the outcome column.
x_padded_shapes["i_mask"], x_padded_shapes["nrm_samples_3/avg_outcome"]

In [None]:
# One tf.data.Dataset per label column, plus the shape each is padded to.
y_dss = []
y_padded_shapes = []

for label_cn in label_cns:
    # Each element has shape (len(offsets), T, 1): one shifted copy of the
    # series per offset, each step wrapped in a one-element list.
    label_data = [np.array(idx_labels[label_cn]) for idx_labels in labels.values()]

    data_gen = make_data_gen(label_data)
    # State the dtype explicitly instead of relying on whatever value the
    # feature-dataset loop happened to leave behind in `output_types`.
    label_ds = tf.data.Dataset.from_generator(data_gen, tf.float32)

    # (offsets, T, 1) -> (offsets, T) -> (T, offsets).
    # Only the trailing axis is squeezed, so a single offset or a one-step
    # series is not collapsed and the rank always matches the padded shape.
    label_ds = label_ds.map(lambda x: tf.transpose(tf.squeeze(x, axis=-1)))

    y_dss.append(label_ds)
    y_padded_shapes.append([None, len(offsets)])

# Dataset.zip and padded_batch are given tuples, matching the model's outputs.
y_dss = tuple(y_dss)
y_padded_shapes = tuple(y_padded_shapes)

In [None]:
# Feature dict and label tuple, aligned element by element.
x_ds = tf.data.Dataset.zip(x_dss)
y_ds = tf.data.Dataset.zip(y_dss)

# Shuffle with a 20-element buffer (only local mixing), then pad each batch
# of 32 individuals to the longest series in the batch.
ds = (
    tf.data.Dataset.zip((x_ds, y_ds))
    .shuffle(buffer_size=20)
    .padded_batch(32, padded_shapes=(x_padded_shapes, y_padded_shapes))
)

In [None]:
# Print the shape of every input and label tensor of one batch.
for batch in ds.take(1):
    x, y = batch

    for name, tensor in x.items():
        print(name, '\n', tensor.shape, '\n')

    for pos, label_batch in enumerate(y):
        print("y[%s]" %(pos), '\n', label_batch.shape, '\n')

# Mean label value of that batch (padding included).
print("Avg y value: ", np.mean(y))

In [None]:
# Build the input ("ingestion") model: one Input per column, hdc columns are
# embedded, and everything is concatenated along the channel axis.
inputs = {}
cat_list = []

for cn, cn_info in meta.items():
    # (time steps, channels) with a variable number of time steps.
    in_layer = tf.keras.layers.Input(shape=(None, cn_info["channels"]), name=cn)
    inputs[cn] = in_layer
    cat_layer = in_layer

    if cn_info["type"] == "hdc":
        # +1 because index 0 is never assigned by the tokenizer's word_index.
        input_dim = len(hdc_tokenizer[cn].word_index)+1
        # Heuristic embedding width: fourth root of the vocabulary size, plus one.
        output_dim = round(input_dim**(0.25)+1)
        emb_layer = tf.keras.layers.Embedding(input_dim, output_dim, mask_zero=True)(in_layer)
        # Flatten the per-channel embeddings of each time step into one vector.
        cat_layer = tf.keras.layers.Reshape([-1, cn_info["channels"]*output_dim])(emb_layer)

        #TODO: consider averaging - to reduce parameter space
        print(cn, input_dim, output_dim)

    elif cn == "i_mask":
        # The mask column is 1 on real steps and 0 on padded ones.
        cat_layer = tf.keras.layers.Masking()(in_layer)

    cat_list.append(cat_layer)

input_list = list(inputs.values())

# These two statements must sit at top level; a stray indent here previously
# made the whole cell fail with an IndentationError.
cat_inputs = tf.keras.layers.concatenate(cat_list, axis=-1, name="cat_inputs")
ingestion_model = tf.keras.models.Model(inputs=input_list, outputs=cat_inputs, name="ingestion")

In [None]:
# Recurrent layer over the concatenated inputs; one output per time step.
rnn = tf.keras.layers.LSTM(
    units=2,
    return_sequences=True,
    name="rnn_1",
)(ingestion_model.output)

middle_model = tf.keras.models.Model(
    inputs=ingestion_model.input,
    outputs=rnn,
    name="middle",
)

In [None]:
# Per-time-step sigmoid head with one output per label offset.
dense = tf.keras.layers.Dense(
    units=len(offsets),
    activation="sigmoid",
)(middle_model.output)

# Alternative head with an extra hidden layer, kept for reference:
#dense = tf.keras.layers.Dense(2, activation="relu")(middle_model.output)
#dense = tf.keras.layers.Dense(units=len(offsets), activation="sigmoid")(dense)

#TODO: should the final size be len(offsets)*(number of blah)?

final_model = tf.keras.models.Model(
    inputs=ingestion_model.input,
    outputs=dense,
    name="final",
)

In [None]:
# Binary cross-entropy matches the sigmoid outputs. No optimizer is passed,
# so Keras uses its default (rmsprop per the Keras docs - confirm for the
# installed version).
final_model.compile(loss="binary_crossentropy")

In [None]:
# Print the layer-by-layer summary of the assembled model.
final_model.summary()

In [None]:
# Train on the padded batches in `ds` for 10 epochs.
final_model.fit(ds, epochs=10)

In [None]:
def display(idx=0, ds=ds, model=None, cn="nrm_samples_3/avg_outcome"):
    """Print, step by step, one input series next to its labels (and predictions).

    NOTE: this function shadows IPython's built-in ``display`` for the rest
    of the notebook; the name is kept so existing calls continue to work.

    Args:
        idx: Position of the individual inside the batch.
        ds: Batched dataset of ``(features, labels)``; its first batch is used.
            The default is bound to the ``ds`` that exists when this cell runs.
        model: Optional model; when given, its rounded predictions for the
            same individual are printed as a third column.
        cn: Name of the input column to show.
    """
    batch = next(iter(ds.take(1)), None)
    if batch is None:
        print("dataset is empty")
        return
    ds_x, ds_y = batch

    x = ds_x[cn][idx].numpy()
    # Drop only the trailing single-channel axis, so a one-step series stays
    # iterable instead of collapsing to a scalar.
    if x.ndim > 1 and x.shape[-1] == 1:
        x = x.squeeze(axis=-1)

    # Labels of the first label column for the same individual.
    y = ds_y[0][idx].numpy()

    if model is None:
        for _x, _y in zip(x, y):
            print(_x, _y)
    else:
        y_hat = model.predict(ds_x)[idx].round(2)
        for _x, _y, _y_hat in zip(x, y, y_hat):
            print(_x, _y, _y_hat)


display(0, model=final_model)

In [None]:
# (intentionally empty: a stray, unterminated quote character was removed from this cell)

In [None]:
# Tutorials / scratch: stand-alone experiments with tf.data zip, shuffle and padded_batch.

In [None]:
# Scratch: batch the features on their own, two individuals per batch.
# This rebinds x_ds and shuffles it independently of the labels.
x_ds = (
    tf.data.Dataset.zip(x_dss)
    .shuffle(buffer_size=20)
    .padded_batch(2, padded_shapes=x_padded_shapes)
)

In [None]:
# Scratch: batch the labels on their own, two individuals per batch.
# This rebinds y_ds and shuffles it independently of the features.
y_ds = (
    tf.data.Dataset.zip(y_dss)
    .shuffle(buffer_size=20)
    .padded_batch(2, padded_shapes=y_padded_shapes)
)

In [None]:
# Two static columns, collected per individual for the zip/padded_batch demo.
a_fn = "nrm_characteristics_0/age"
b_fn = "nrm_characteristics_0/sex"
a = [idx_data[a_fn] for idx_data in data.values()]
# Use b_fn here: the series was previously built from a_fn by mistake,
# which made `b` a copy of `a`.
b = [idx_data[b_fn] for idx_data in data.values()]

In [None]:
# Wrap both series lists as datasets of float tensors.
a_ds = tf.data.Dataset.from_generator(lambda: iter(a), tf.float32)
b_ds = tf.data.Dataset.from_generator(lambda: iter(b), tf.float32)

z_ds = tf.data.Dataset.zip((a_ds, b_ds))
z_ds = z_ds.shuffle(buffer_size = 20)

# Each series is a list of one-element steps, i.e. a (time, 1) tensor, so the
# padded shape must be rank 2; a rank-1 [None] is rejected by padded_batch.
z_ds = z_ds.padded_batch(2, padded_shapes=([None, 1], [None, 1]))

In [None]:
# Show the first padded batch only.
for first_batch in z_ds.take(1):
    print(first_batch)

In [None]:
# Minimal padded_batch demo: ragged rows are padded to the longest row of each batch of two.
elements = [
    [1, 2],
    [3, 4, 5],
    [6, 7],
    [8],
]
A = tf.data.Dataset.from_generator(lambda: iter(elements), tf.float32)
B = A.padded_batch(2, padded_shapes=[None])

for padded in B.as_numpy_iterator():
    print(padded)