<a href="https://colab.research.google.com/github/david-ak/t2/blob/master/DxO_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Packages

In [None]:
import pickle
#import feather
import pandas as pd
import numpy as np
import math
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics
#import keras as keras
from tensorflow.keras import backend as K
from tensorflow.keras import models
#from keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, TimeDistributed

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('\n\n\nYour runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, Runtime > "Change runtime type"')
else:
  print('You are using a high-RAM runtime!\n\n')

print(tf.__version__) # 2.2.0
print(keras.__version__) # 2.3.1 (tf imports 2.3.0-tf)


# Load data

In [None]:
data = pd.read_csv('https://davidak.s3-us-west-1.amazonaws.com/SID+SEDD/Data+main/nat_CA_60k.csv')

# Format data

In [None]:
print(data.shape) # 604,034 rows, 115 columns
print('{:.0f} unique IDs'.format(len(pd.unique(data['ID2'])))) # 60,000 unique IDs
data.sort_values(by=['ID2', 'Visit_no'], ascending=[True, False], inplace=True) # Sort by ID2, -Visit_no
# data['Interval'] = (np.log(math.e + data.Interval))**-1 # Format interval

In [178]:
data.head()

Unnamed: 0,ID2,Case,Visit_no,Interval,Type,Age,Sex,Race,Payer,Pt_zip_inc_qrtl,Dispo,Adm_LOS,Adm_charges,EC_prin,EC_1,EC_2,EC_3,EC_4,Dx_prin,Dx_1,Dx_2,Dx_3,Dx_4,Dx_5,Dx_6,Dx_7,Dx_8,Dx_9,Dx_10,Dx_11,Dx_12,Dx_13,Dx_14,Dx_15,Dx_16,Dx_17,Dx_18,Dx_19,Dx_20,Dx_21,...,CCS_Dx_6,CCS_Dx_7,CCS_Dx_8,CCS_Dx_9,CCS_Dx_10,CCS_Dx_11,CCS_Dx_12,CCS_Dx_13,CCS_Dx_14,CCS_Dx_15,CCS_Dx_16,CCS_Dx_17,CCS_Dx_18,CCS_Dx_19,CCS_Dx_20,CCS_Dx_21,CCS_Dx_22,CCS_Dx_23,CCS_Dx_24,CCS_Proc_prin,CCS_Proc_1,CCS_Proc_2,CCS_Proc_3,CCS_Proc_4,CCS_Proc_5,CCS_Proc_6,CCS_Proc_7,CCS_Proc_8,CCS_Proc_9,CCS_Proc_10,CCS_Proc_11,CCS_Proc_12,CCS_Proc_13,CCS_Proc_14,CCS_Proc_15,CCS_Proc_16,CCS_Proc_17,CCS_Proc_18,CCS_Proc_19,CCS_Proc_20
0,C1000056,0,9,Interval_67_115,Type_Adm,Age_40_50,Sex_M,Race_White,Payer_Private,Pt_zip_inc_qrtl_4,Dispo_Home,2.0,25196.0,,,,,,28800,1629.0,1977.0,1985.0,1978.0,7806,27651.0,4240.0,V1251,V554,,,,,,,,,,,,,...,CCS_Dx_118,CCS_Dx_155,,,,,,,,,,,,,,,,,,CCS_Proc_93,,,,,,,,,,,,,,,,,,,,
1,C1000056,0,8,Interval_-001_2,Type_Adm,Age_30_40,Sex_M,Race_White,Payer_Private,Pt_zip_inc_qrtl_4,Dispo_Other,16.0,108244.0,,,,,,53551,5184.0,1978.0,1628.0,1977.0,1985,1980.0,57410.0,1976,2639,4240.0,2111.0,2766.0,79902.0,V1251,V5861,,,,,,,...,CCS_Dx_96,CCS_Dx_47,CCS_Dx_55,CCS_Dx_244,CCS_Dx_118,CCS_Dx_257,,,,,,,,,,,,,,CCS_Proc_84,CCS_Proc_184,CCS_Proc_70,CCS_Proc_93,CCS_Proc_71,CCS_Proc_223,CCS_Proc_222,,,,,,,,,,,,,,
2,C1000056,0,7,Interval_-001_2,Type_ED,Age_30_40,Sex_M,Race_White,Payer_Private,Pt_zip_inc_qrtl_4,Dispo_Home,,,E9342,E8499,,,,78906,41519.0,79092.0,1629.0,5739.0,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,C1000056,0,6,Interval_2_8,Type_ED,Age_30_40,Sex_M,Race_White,Payer_Private,Pt_zip_inc_qrtl_4,Dispo_Home,,,,,,,,7847,1629.0,1977.0,28522.0,41519.0,V5861,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,C1000056,0,5,Interval_19_37,Type_ED,Age_30_40,Sex_M,Race_White,Payer_Private,Pt_zip_inc_qrtl_4,Dispo_Home,,,,,,,,49390,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Define max_visit and add visit counter

In [188]:
max_visit = 15
data['Visit_no_rev'] = data.groupby('ID2').cumcount() + 1

In [None]:
cols_EC_CCS = [str('CCS_EC_'+str(i)) for i in ['prin',*range(1,5)]]
cols_Dx_CCS = [str('CCS_Dx_'+str(i)) for i in ['prin',*range(1,25)]]
cols_Proc_CCS = [str('CCS_Proc_'+str(i)) for i in ['prin',*range(1,21)]]
from itertools import chain
cols_all = list(chain.from_iterable([['Interval','Type','Age','Sex','Race','Payer','Pt_zip_inc_qrtl','Dispo'],cols_EC_CCS,cols_Dx_CCS,cols_Proc_CCS]))

In [None]:
cols_all

## Bin age

In [None]:
data['Age'] = data.Age.astype('int')
data['Age'] = pd.cut(data['Age'], bins=np.linspace(0,130,14))
data['Age'] = data.Age.astype(str)
data['Age'] = data.Age.apply(lambda x:x.replace('.0',''))
data['Age'] = data.Age.apply(lambda x:x.replace(', ','_'))
data['Age'] = data.Age.apply(lambda x:x.replace(']',''))
data['Age'] = data.Age.apply(lambda x:x.replace('(',''))
pd.value_counts(data.Age)

## Bin interval -- use the same bins on other states

In [None]:
data['Interval'] = pd.qcut(data['Interval'],q=10).astype(str)
data['Interval'] = data.Interval.apply(lambda x:x.replace('-001','0'))
data['Interval'] = data.Interval.astype(str)
data['Interval'] = data.Interval.apply(lambda x:x.replace('.0',''))
data['Interval'] = data.Interval.apply(lambda x:x.replace(', ','_'))
data['Interval'] = data.Interval.apply(lambda x:x.replace(']',''))
data['Interval'] = data.Interval.apply(lambda x:x.replace('(',''))

In [None]:
pd.value_counts(data.Interval)

## Append var names to all fields

In [None]:
data.Interval = 'Interval_'+data.Interval
data.Type = 'Type_'+data.Type
data.Age = 'Age_'+data.Age.astype('str')
data.Sex = 'Sex_'+data.Sex
data.Race = 'Race_'+data.Race
data.Payer = 'Payer_'+data.Payer
data.Pt_zip_inc_qrtl = np.nan_to_num(data.Pt_zip_inc_qrtl,nan=0)
data.Pt_zip_inc_qrtl = data.Pt_zip_inc_qrtl.astype('int').astype('str')
data.Pt_zip_inc_qrtl = 'Pt_zip_inc_qrtl_'+data.Pt_zip_inc_qrtl
data.Dispo = 'Dispo_'+data.Dispo

In [None]:
for i in cols_EC_CCS:
  data.loc[:,i] = np.nan_to_num(data.loc[:,i],nan=0)
  data.loc[:,i] = 'CCS_EC_'+data.loc[:,i].astype('int').astype('str')
  data.loc[:,i] = data[i].replace('CCS_EC_0',np.NaN)

for i in cols_Dx_CCS:
  data.loc[:,i] = np.nan_to_num(data.loc[:,i],nan=0)
  data.loc[:,i] = 'CCS_Dx_'+data.loc[:,i].astype('int').astype('str')
  data.loc[:,i] = data[i].replace('CCS_Dx_0',np.NaN)

for i in cols_Proc_CCS:
  data.loc[:,i] = np.nan_to_num(data.loc[:,i],nan=0)
  data.loc[:,i] = 'CCS_Proc_'+data.loc[:,i].astype('int').astype('str')
  data.loc[:,i] = data[i].replace('CCS_Proc_0',np.NaN)

In [None]:
data.loc[0:10,list(chain.from_iterable([cols_EC_CCS,cols_Dx_CCS,cols_Proc_CCS]))]

## Make lists of tokenized fields (tokenized) for each visit

In [None]:
data.loc[:,cols_all].shape # (604034, 59)

### Split X/y/ID2

In [None]:
y = data.loc[:,'Case'].to_numpy().flatten()
print(y)

In [None]:
ID2 = data.loc[:,'ID2'].to_numpy().flatten()
print(ID2)

In [191]:
X = data.loc[:,cols_all] # leave case in here?

In [192]:
fields_unique = pd.unique(X.loc[:,cols_all].values.ravel('K'))
fields_unique = fields_unique.astype(str)
fields_unique.sort
fields_unique

array(['Interval_67_115', 'Interval_-001_2', 'Interval_2_8',
       'Interval_19_37', 'Interval_37_67', 'Interval_322_601',
       'Interval_8_19', 'Interval_nan', 'Interval_191_322',
       'Interval_601_3830', 'Interval_115_191', 'Type_Adm', 'Type_ED',
       'Age_40_50', 'Age_30_40', 'Age_20_30', 'Age_0_10', 'Age_80_90',
       'Age_70_80', 'Age_60_70', 'Age_50_60', 'Age_90_100', 'Age_10_20',
       'Age_nan', 'Age_100_110', 'Sex_M', 'Sex_F', 'Sex_U', 'Race_White',
       'Race_Hispanic', 'Race_Asian', 'Race_Other', 'Race_Black',
       'Race_Unknown', 'Race_Native', 'Payer_Private', 'Payer_Self',
       'Payer_Other', 'Payer_Medicaid', 'Payer_Medicare',
       'Pt_zip_inc_qrtl_4', 'Pt_zip_inc_qrtl_3', 'Pt_zip_inc_qrtl_2',
       'Pt_zip_inc_qrtl_0', 'Pt_zip_inc_qrtl_1', 'Dispo_Home',
       'Dispo_Other', 'Dispo_AMA', 'Dispo_3', 'Dispo_2', 'Dispo_6',
       'Dispo_5', 'Dispo_Psych', 'Dispo_4', 'Dispo_21', 'Dispo_8',
       'Dispo_70', 'Dispo_62', 'Dispo_63', 'Dispo_66', 'Dispo_Died

In [193]:
matching = [s for s in fields_unique if any(xs in s for xs in ['nan'])]
matching # ['Interval_nan', 'Age_nan', 'nan']

['Interval_nan', 'Age_nan', 'nan']

In [194]:
fields_unique = [x for x in fields_unique if x not in matching]

### Dict field <> token

In [195]:
my_dict = dict(zip(list(fields_unique), list(range(len(fields_unique)))))

In [196]:
for i in cols_all:
  X[i] = X[i].map(my_dict).astype('float32')
  #print(i)

In [197]:
X.head()

Unnamed: 0,Interval,Type,Age,Sex,Race,Payer,Pt_zip_inc_qrtl,Dispo,CCS_EC_prin,CCS_EC_1,CCS_EC_2,CCS_EC_3,CCS_EC_4,CCS_Dx_prin,CCS_Dx_1,CCS_Dx_2,CCS_Dx_3,CCS_Dx_4,CCS_Dx_5,CCS_Dx_6,CCS_Dx_7,CCS_Dx_8,CCS_Dx_9,CCS_Dx_10,CCS_Dx_11,CCS_Dx_12,CCS_Dx_13,CCS_Dx_14,CCS_Dx_15,CCS_Dx_16,CCS_Dx_17,CCS_Dx_18,CCS_Dx_19,CCS_Dx_20,CCS_Dx_21,CCS_Dx_22,CCS_Dx_23,CCS_Dx_24,CCS_Proc_prin,CCS_Proc_1,CCS_Proc_2,CCS_Proc_3,CCS_Proc_4,CCS_Proc_5,CCS_Proc_6,CCS_Proc_7,CCS_Proc_8,CCS_Proc_9,CCS_Proc_10,CCS_Proc_11,CCS_Proc_12,CCS_Proc_13,CCS_Proc_14,CCS_Proc_15,CCS_Proc_16,CCS_Proc_17,CCS_Proc_18,CCS_Proc_19,CCS_Proc_20
0,0.0,10.0,12.0,23.0,26.0,33.0,38.0,43.0,,,,,,84.0,89.0,158.0,164.0,112.0,184.0,239.0,93.0,,,,,,,,,,,,,,,,,,347.0,,,,,,,,,,,,,,,,,,,,
1,1.0,10.0,13.0,23.0,26.0,33.0,38.0,44.0,,,,,,85.0,144.0,158.0,89.0,201.0,307.0,184.0,223.0,112.0,100.0,239.0,116.0,,,,,,,,,,,,,,348.0,555.0,387.0,347.0,417.0,380.0,383.0,,,,,,,,,,,,,,
2,1.0,11.0,13.0,23.0,26.0,33.0,38.0,43.0,62.0,72.0,,,,86.0,121.0,209.0,89.0,265.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2.0,11.0,13.0,23.0,26.0,33.0,38.0,43.0,,,,,,87.0,89.0,158.0,124.0,121.0,116.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,3.0,11.0,13.0,23.0,26.0,33.0,38.0,43.0,,,,,,88.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [198]:
maxlen = 59 # num cols -c(ID2,Case,Visit_no,Adm_LOS,Adm_charges)

In [199]:
X = X.to_numpy()
print(X)

[[ 0. 10. 12. ... nan nan nan]
 [ 1. 10. 13. ... nan nan nan]
 [ 1. 11. 13. ... nan nan nan]
 ...
 [ 7. 11. 21. ... nan nan nan]
 [ 7. 11. 21. ... nan nan nan]
 [nan 11. 21. ... nan nan nan]]


In [200]:
clean = [[i for i in row if str(i) != 'nan'] for row in X]
clean = np.asarray(clean, dtype=object)
clean

### Pad to (maxvisit,maxlen) matrices for each trajectory

In [202]:
X = keras.preprocessing.sequence.pad_sequences(clean, maxlen=maxlen)
ID2_temp = data['ID2'].to_numpy().reshape((data.shape[0],1))
Visit = data['Visit_no_rev'].to_numpy().reshape((data.shape[0],1))
X = np.concatenate((ID2_temp,Visit,X),axis=1)

In [209]:
np.split(X,'ID2')

TypeError: ignored

## Split train/val/test sets

# Embedding
Two seperate embedding layers, one for tokens, one for token index (positions).


In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, emded_dim, name=None):
        super(TokenAndPositionEmbedding, self).__init__(name=name)
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=emded_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=emded_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

# Multi head self attention


In [None]:
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output

# Transformer

In [None]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate, name=None):
        super(TransformerBlock, self).__init__(name=name)
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = Sequential(
            [layers.Dense(ff_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# Model

Transformer layer outputs one vector for each time step of our input sequence.
Here, we take the mean across all time steps and
use a feed forward network on top of it to classify text.


In [None]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
dense_units = 20
dropout_rate = 0.1
vocab_size=1000

visit_input = layers.Input(shape=(maxlen,), name='visit_input') 
# I/O(batch, max_vars)

visit_embedding = TokenAndPositionEmbedding(maxlen=maxlen, vocab_size=vocab_size, 
                                            emded_dim=embed_dim, name='visit_embedding')(visit_input) 
# O(batch, max_vars, embed_dim)
 
transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, 
                               dropout_rate=dropout_rate, name='transformer')(visit_embedding)
# O(batch, max_vars, embed_dim)

x = layers.GlobalAveragePooling1D()(transformer) # ...
# O(batch, embed_dim)
x = layers.Dropout(dropout_rate)(x)
# O(batch, embed_dim)
x = layers.Dense(dense_units, activation="relu")(x) # nonlinear activation
# O(batch, dense_units)
x = layers.Dropout(dropout_rate)(x)
# O(batch, dense_units)


#outputs = layers.Dense(2, activation="softmax")(x)
visit_output = layers.Dense(1, activation="sigmoid", name='visit_output')(x)
# O(batch, 1)

model = keras.Model(inputs=visit_input, outputs=visit_output)

model.summary()

# Train and Evaluate


In [None]:
callbacks_list = [keras.callbacks.EarlyStopping(monitor='val_acc',patience=1,min_delta=.001),
                  keras.callbacks.ModelCheckpoint(filepath='my_model.h5',monitor='val_loss',save_best_only=True)]


#model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])

history = model.fit(x_train, y_train,
                    batch_size=32,
                    epochs=5,
                    #callbacks=callbacks_list,
                    validation_data=(x_val, y_val))