In [2]:
"""Attention based model for sepsis."""
import torch
import torch.nn as nn
import torch.nn.functional as F

import argparse
import numpy as np
from functools import partial

from torch.utils.data import DataLoader, Subset
import pytorch_lightning as pl

from sklearn.metrics import (
    average_precision_score, roc_auc_score, balanced_accuracy_score)
# from test_tube import HyperOptArgumentParser

# import src.torch.datasets
# from src.torch.datasets import CombinedDataset
# from src.evaluation import physionet2019_utility
# from src.torch.torch_utils import (
#     variable_length_collate, ComposeTransformations, LabelPropagation)
# from src.torch.cli_utils import str2bool



In [3]:



def get_subsequent_mask(seq, offset=0):
    """Build the causal ("no peeking ahead") attention mask for ``seq``.

    Args:
        seq: 3-dimensional tensor; only its second dimension (sequence
            length) and its device are used.
        offset: number of extra positions added to both mask dimensions.

    Returns:
        Square boolean tensor of side ``seq.size(1) + offset`` that is True
        strictly above the main diagonal, i.e. at (query, key) pairs where
        the key lies in the future of the query.
    """
    # Unpacking into three names keeps the implicit "input must be 3-D" check.
    _, seq_len, _ = seq.size()
    side = seq_len + offset
    all_true = torch.ones((side, side), device=seq.device, dtype=bool)
    return torch.triu(all_true, diagonal=1)


def length_to_mask(length, max_len=None, dtype=None, offset=0):
    """Turn per-sequence lengths into a padding mask.

    Args:
        length: 1-dimensional tensor of sequence lengths.
        max_len: width of the mask; when falsy it defaults to
            ``length.max() + offset``.
        dtype: optional dtype to convert the mask to (boolean otherwise).
        offset: shift added to every length before comparing.

    Returns:
        Tensor of shape ``(len(length), max_len)`` that is True (non-zero)
        at every position ``p`` with ``p >= length + offset``.
    """
    assert len(length.shape) == 1, 'Length shape should be 1 dimensional.'
    width = max_len or (length.max().item() + offset)

    # One row of position indices per sequence, compared against its cutoff.
    positions = torch.arange(width, device=length.device, dtype=length.dtype)
    positions = positions.expand(len(length), width)
    cutoff = length.unsqueeze(1) + offset
    mask = positions >= cutoff

    if dtype is None:
        return mask
    return mask.to(dtype=dtype, device=length.device)


class MaskedLayerNorm(nn.LayerNorm):
    """LayerNorm variant that normalises with running statistics over axis 1.

    Position ``t`` along axis 1 is normalised using sums over positions
    ``0..t`` only, then scaled and shifted with the inherited ``weight`` and
    ``bias``.
    """

    def forward(self, x):
        # Number of positions accumulated so far: 1, 2, ..., x.shape[1],
        # shaped so it broadcasts over the first and last axes.
        n_steps = x.shape[1]
        counts = torch.arange(start=1., end=n_steps + 1, device=x.device)
        counts = counts[None, :, None]

        # Cumulative summary statistics along axis 1.
        running_mean = torch.cumsum(x, 1) / counts
        centered = x - running_mean
        running_var = torch.cumsum(centered ** 2, 1) / counts
        running_std = torch.sqrt(running_var + self.eps)

        return (centered / running_std) * self.weight + self.bias


class ReZero(nn.Module):
    """Residual connection gated by a single learnable scalar.

    The scalar ``resweight`` starts at zero, so a freshly created instance
    returns its first input unchanged.
    """

    def __init__(self):
        super().__init__()
        self.resweight = nn.Parameter(torch.zeros(1))

    def forward(self, x1, x2):
        """Return ``x1`` plus ``x2`` scaled by the learnable gate."""
        gated = self.resweight * x2
        return x1 + gated


class TransformerEncoderLayer(nn.Module):
    """TransformerEncoderLayer is made up of self-attn and feedforward network.
    This standard encoder layer is based on the paper "Attention Is All You
    Need".  Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion
    Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention
    is all you need. In Advances in Neural Information Processing Systems,
    pages 6000-6010. Users may modify or implement in a different way during
    application.

    This class is adapted from the pytorch source code. Each sub-layer
    (self-attention, then feed-forward) is followed by dropout, a residual
    connection and a normalisation step, in that order.

    Args:
        d_model: the number of expected features in the input (required).
        nhead: the number of heads in the multiheadattention models (required).
        dim_feedforward: the dimension of the feedforward network model
            (default=2048).
        dropout: the dropout value (default=0.1).
        norm: Normalization to apply, one of 'layer' or 'rezero'.
            'layer' uses a plain additive residual followed by
            ``nn.LayerNorm``; 'rezero' uses a ``ReZero`` gated residual
            followed by ``nn.Identity``.

    Raises:
        ValueError: if ``norm`` is neither 'layer' nor 'rezero'.

    Examples::
        >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8)
        >>> src = torch.rand(10, 32, 512)
        >>> out = encoder_layer(src)
    """

    @staticmethod
    def _add_residual(x1, x2):
        """Plain skip connection: element-wise sum of input and sub-layer output."""
        return x1 + x2

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 norm='layer'):
        super(TransformerEncoderLayer, self).__init__()
        if norm == 'layer':
            def get_residual():
                # A class-level function (rather than a closure created
                # inside __init__) keeps the module picklable.
                return TransformerEncoderLayer._add_residual

            def get_norm():
                return nn.LayerNorm(d_model)
        elif norm == 'rezero':
            def get_residual():
                return ReZero()

            def get_norm():
                return nn.Identity()
        else:
            raise ValueError('Invalid normalization: {}'.format(norm))

        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        # Separate norm/residual instances per sub-layer so that learnable
        # state (LayerNorm affine terms, ReZero gates) is not shared.
        self.norm1 = get_norm()
        self.norm2 = get_norm()
        self.residual1 = get_residual()
        self.residual2 = get_residual()
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = F.relu

    def __setstate__(self, state):
        # Restore a default activation when the restored state carries none.
        if 'activation' not in state:
            state['activation'] = F.relu
        super(TransformerEncoderLayer, self).__setstate__(state)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        r"""Pass the input through the encoder layer.

        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional); forwarded to
                the attention module as ``attn_mask``.
            src_key_padding_mask: the mask for the src keys per batch
                (optional); forwarded as ``key_padding_mask``.

        Returns:
            Tensor with the same shape as ``src``.

        Shape:
            ``self_attn`` is built without ``batch_first``, so ``src`` is
            expected as (sequence, batch, d_model).
        """
        # Self-attention sub-layer; [0] drops the returned attention weights.
        src2 = self.self_attn(
            src, src, src,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask
        )[0]
        src = self.residual1(src, self.dropout1(src2))
        src = self.norm1(src)

        # Position-wise feed-forward sub-layer.
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = self.residual2(src, self.dropout2(src2))
        src = self.norm2(src)
        return src


class AttentionModel(pl.LightningModule):
    """Sequence to sequence model based on MultiHeadAttention.

    The network is a linear input projection, ``n_layers`` causal
    ``TransformerEncoderLayer`` blocks and a linear head producing one value
    per time step.
    """

    def __init__(self, d_model, n_layers, n_heads, dropout, norm, indicators=False,
                 d_in=7, **kwargs):
        """AttentionModel.

        Args:
            d_model: Dimensionality of the model
            n_layers: Number of MultiHeadAttention layers
            n_heads: Number of attention heads
            dropout: Dropout rate passed to every encoder layer
            norm: Normalization passed to every encoder layer
                ('layer' or 'rezero')
            indicators: flag if missingness indicators should be applied
                (not used by the code visible in this notebook)
            d_in: Number of input features per time step (default 7, the
                value that used to be hard-coded)
            **kwargs: forwarded to ``pl.LightningModule.__init__``
        """
        super().__init__(**kwargs)
        ff_dim = 4*d_model # dimensionality of ff layers: hard-coded default
        #self.to_observation_tuples = to_observation_tuples if indicators else to_observation_tuples_without_indicators 
        self.save_hyperparameters()

        self.layers = nn.ModuleList(
            [nn.Linear(d_in, d_model)]
            + [
                TransformerEncoderLayer(
                    d_model, n_heads, ff_dim, dropout, norm=norm)
                for _ in range(n_layers)
            ]
            + [nn.Linear(d_model, 1)]
        )

    @property
    def transforms(self):
        """Parent transforms extended with positional encoding and
        observation-tuple conversion.

        NOTE(review): ``super().transforms``, ``PositionalEncoding`` and
        ``self.to_observation_tuples`` are not defined in the visible part of
        this notebook (the ``to_observation_tuples`` assignment in
        ``__init__`` is commented out), so reading this property presumably
        fails here -- confirm against the original project before relying on
        it.
        """
        parent_transforms = super().transforms
        parent_transforms.extend([
            PositionalEncoding(1, 500, 10),  # apply positional encoding
            self.to_observation_tuples            # mask nan with zero add indicator
        ])
        return parent_transforms

    def forward(self, x, lengths):
        """Apply attention model to input x.

        Args:
            x: tensor of shape (batch, time, d_in).
            lengths: 1-dimensional tensor with the number of valid time
                steps of each sequence in the batch.

        Returns:
            Tensor of shape (batch, time, 1).
        """
        offset = 0
        # Invert mask as multi head attention ignores values which are true
        mask = length_to_mask(lengths, offset=offset, max_len=x.shape[1])
        # Causal mask: a time step may only attend to itself and the past.
        future_mask = get_subsequent_mask(x, offset=offset)
        x = self.layers[0](x)

        # The encoder layers work on (time, batch, feature) tensors.
        x = x.permute(1, 0, 2)
        for layer in self.layers[1:]:
            if isinstance(layer, TransformerEncoderLayer):
                x = layer(
                    x, src_key_padding_mask=mask, src_mask=future_mask)
            else:
                x = layer(x)
        x = x.permute(1, 0, 2)
        # Remove first element if statics are present
        x = x[:, offset:, :]
        return x


In [4]:
from torchinfo import summary


# Dimensions of the dummy batch used to smoke-test the model.
BATCH_SIZE = 32
SEQ_LEN = 336
N_FEATURES = 7   # must equal the model's input width
MIN_LEN = 24

# Seed so that the random weights, inputs and lengths are reproducible
# after a kernel restart.
torch.manual_seed(0)

# Create an instance of the model
model = AttentionModel(d_model=128, n_layers=1, n_heads=8, dropout=0.1, norm='rezero')


# Create some dummy data for x and lengths
x = torch.rand(BATCH_SIZE, SEQ_LEN, N_FEATURES)
# torch.randint excludes the upper bound: lengths fall in [MIN_LEN, SEQ_LEN - 1].
lengths = torch.randint(MIN_LEN, SEQ_LEN, (BATCH_SIZE,))

# Call the model
output = model(x, lengths)


summary(model, input_data=[x, lengths])


Layer (type:depth-idx)                   Output Shape              Param #
AttentionModel                           [32, 336, 1]              --
├─ModuleList: 1-1                        --                        --
│    └─Linear: 2-1                       [32, 336, 128]            1,024
│    └─TransformerEncoderLayer: 2-2      [336, 32, 128]            --
│    │    └─MultiheadAttention: 3-1      [336, 32, 128]            66,048
│    │    └─Dropout: 3-2                 [336, 32, 128]            --
│    │    └─ReZero: 3-3                  [336, 32, 128]            1
│    │    └─Identity: 3-4                [336, 32, 128]            --
│    │    └─Linear: 3-5                  [336, 32, 512]            66,048
│    │    └─Dropout: 3-6                 [336, 32, 512]            --
│    │    └─Linear: 3-7                  [336, 32, 128]            65,664
│    │    └─Dropout: 3-8                 [336, 32, 128]            --
│    │    └─ReZero: 3-9                  [336, 32, 128]            1
│ 

In [34]:
# Shape check on the forward pass: one value per time step, i.e.
# (batch, time, 1).
output.shape

torch.Size([32, 336, 1])

In [2]:
import keras.layers
from keras_nlp.layers import PositionEmbedding
import numpy as np

# Named `dummy_batch` rather than `input` so the Python builtin `input`
# is not shadowed for the rest of the kernel session.
dummy_batch = np.zeros((32, 336, 7))

# Learned position embedding for sequences of up to 336 steps; the cell
# displays the shape of what the layer returns for the dummy batch.
layer = PositionEmbedding(336)
layer(dummy_batch).shape

TensorShape([32, 336, 7])

In [4]:
import keras.layers
from keras_nlp.layers import SinePositionEncoding
import numpy as np

# Named `dummy_batch` rather than `input` so the Python builtin `input`
# is not shadowed for the rest of the kernel session.
dummy_batch = np.zeros((32, 336, 7))

# Parameter-free sinusoidal encoding; the cell displays the shape of what
# the layer returns for the dummy batch.
layer = SinePositionEncoding()
layer(dummy_batch).shape

TensorShape([32, 336, 7])

In [14]:
import requests

def generate_bibtex_from_doi(doi, timeout=10):
    """Fetch the BibTeX record of a DOI through doi.org content negotiation.

    Args:
        doi: the DOI, e.g. ``"10.1097/CCE.0000000000000744"``.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The response body as text (a BibTeX entry on success).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so
            an error page is never returned as if it were BibTeX.
        requests.RequestException: on connection problems or timeout.
    """
    url = f"https://doi.org/{doi}"
    # The Accept header asks the resolver for BibTeX instead of the landing page.
    headers = {"Accept": "application/x-bibtex"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.text

In [22]:
# Look up a single DOI and print the BibTeX entry returned for it.
doi = "10.1097/CCE.0000000000000744"
bibtex = generate_bibtex_from_doi(doi)
print(bibtex)

 @article{Veldhuis_2022, title={Artificial Intelligence for the Prediction of In-Hospital Clinical Deterioration: A Systematic Review}, volume={4}, ISSN={2639-8028}, url={http://dx.doi.org/10.1097/CCE.0000000000000744}, DOI={10.1097/cce.0000000000000744}, number={9}, journal={Critical Care Explorations}, publisher={Ovid Technologies (Wolters Kluwer Health)}, author={Veldhuis, Lars I. and Woittiez, Nicky J. C. and Nanayakkara, Prabath W. B. and Ludikhuize, Jeroen}, year={2022}, month=aug, pages={e0744} }



In [1]:
import tensorflow as tf 
import keras
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Masking, Bidirectional, GRU, Embedding, TimeDistributed, Concatenate, MultiHeadAttention
from tcn import TCN, tcn_full_summary
import keras.backend as K
from keras.layers import *
from keras import initializers, regularizers, constraints
from keras_nlp.layers import PositionEmbedding, TransformerEncoder, SinePositionEncoding
from tensorflow import linalg, ones, math, cast, float32


2024-04-15 10:04:55.349439: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-15 10:04:55.349515: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-15 10:04:55.401621: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-15 10:04:55.509502: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using TensorFlow backend


In [3]:

def create_padding_mask(input, pad_value=100000.0):
    """Return a boolean mask that is True for real time steps, False for padding.

    Args:
        input: tensor of shape (batch, time, features) in which padded time
            steps are filled with ``pad_value``.
        pad_value: sentinel that marks padding (default 100000.0).

    Returns:
        Boolean tensor of shape (batch, time). Padded steps are False so that
        they receive no attention.
    """
    # A step is padding only when *every* feature equals the sentinel, which
    # is the same rule keras.layers.Masking(mask_value=...) applies.
    # not_equal already yields a boolean tensor, so no cast is needed.
    return tf.math.reduce_any(tf.math.not_equal(input, pad_value), axis=-1)
 
def create_lookahead_mask(input):
    """Build a look-ahead mask that marks future positions with 1.0.

    Args:
        input: tensor (or array) of shape (batch, time, ...). The time
            dimension must be statically known; the batch dimension may be
            dynamic.

    Returns:
        float32 tensor of shape (batch, time, time) with 1.0 strictly above
        the main diagonal (future entries) and 0.0 elsewhere.

    NOTE(review): Keras attention masks use 1 for "may attend"; this mask has
    the opposite polarity -- confirm before passing it as ``attention_mask``.
    """
    seq_len = input.shape[1]
    # tf.shape is resolved at run time, so this also works for symbolic Keras
    # inputs whose static batch size is None.
    batch_size = tf.shape(input)[0]

    # band_part(..., -1, 0) keeps the lower triangle incl. the diagonal;
    # subtracting it from 1 leaves ones on the future entries only.
    mask = 1 - linalg.band_part(ones((seq_len, seq_len)), -1, 0)
    mask = tf.repeat(tf.expand_dims(mask, 0), batch_size, axis=0)

    return mask
    


input_shape = (336, 7)       # (time steps, features per time step)
model_dim = 128              # width of the per-step projection fed to the encoder
intermediate_dim = 128       # width of the encoder's feed-forward sub-layer
num_heads=8
num_transformer_layers=1

position_embedding = PositionEmbedding(input_shape[0])

inputs = Input(shape=input_shape)
mask_layer = keras.layers.Masking(mask_value=100000.0)(inputs)


dense_in = TimeDistributed(Dense(model_dim))(mask_layer)

# PositionEmbedding returns only the learned position vectors, so they have
# to be added to the projected features; using the layer's output on its own
# would throw the input features away.
x = dense_in + position_embedding(dense_in)

#x=mask_layer

padding_mask = create_padding_mask(inputs)
# NOTE(review): no look-ahead mask is applied, so every step can attend to
# later steps -- confirm this is intended.
#lookahead_mask = create_lookahead_mask(inputs)

for _ in range(num_transformer_layers):
    transformer_block = TransformerEncoder(
        num_heads=num_heads,
        intermediate_dim=intermediate_dim,
        dropout=0.1,
    )
    x = transformer_block(x, padding_mask=padding_mask)

# One sigmoid probability per time step.
outputs = Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()
    
 

2024-04-15 10:05:44.347323: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-15 10:05:44.575152: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-15 10:05:44.575349: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 336, 7)]             0         []                            
                                                                                                  
 masking (Masking)           (None, 336, 7)               0         ['input_1[0][0]']             
                                                                                                  
 tf.math.not_equal (TFOpLam  (None, 336, 7)               0         ['input_1[0][0]']             
 bda)                                                                                             
                                                                                                  
 time_distributed (TimeDist  (None, 336, 128)             1024      ['masking[0][0]']         

In [2]:
import tensorflow as tf
from tensorflow import linalg, ones, math, cast, float32
import numpy as np

def create_padding_mask(input):
    """Flag padded time steps.

    Returns a boolean tensor over the first two dimensions of ``input`` that
    is True where the first feature of a time step equals the padding value
    100000.0 and False elsewhere.
    """
    is_padding = tf.math.equal(input, 100000.0)
    is_padding = tf.cast(is_padding, tf.bool)
    # Only the first feature of every time step is inspected.
    return is_padding[:, :, 0]

# Dummy batch of one sequence with 30 steps and 7 features: 20 real steps
# (zeros) followed by 10 padded steps filled with the padding value.
arr = np.concatenate(
    [np.zeros((1, 20, 7)), np.full((1, 10, 7), 100000.0)],
    axis=1,
)
create_padding_mask(arr)

2024-04-17 12:46:31.985102: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-17 12:46:32.010633: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-17 12:46:32.010777: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

<tf.Tensor: shape=(1, 30), dtype=bool, numpy=
array([[False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True]])>

In [5]:
def create_lookahead_mask(input):
    """Build a causal attention mask that marks *allowed* positions with 1.0.

    Args:
        input: tensor (or array) of shape (batch, time, ...). The time
            dimension must be statically known; the batch dimension may be
            dynamic.

    Returns:
        float32 tensor of shape (batch, time, time) with 1.0 on and below the
        main diagonal (the current and earlier steps) and 0.0 above it (the
        future steps).
    """
    seq_len = input.shape[1]
    # tf.shape is resolved at run time, so this also works for symbolic Keras
    # inputs whose static batch size is None.
    batch_size = tf.shape(input)[0]

    # band_part(..., -1, 0) keeps the lower triangle including the diagonal.
    mask = linalg.band_part(ones((seq_len, seq_len)), -1, 0)
    mask = tf.repeat(tf.expand_dims(mask, 0), batch_size, axis=0)

    return mask

# Display the mask for the 30-step dummy array `arr`: a (1, 30, 30)
# lower-triangular matrix of ones.
create_lookahead_mask(arr)

<tf.Tensor: shape=(1, 30, 30), dtype=float32, numpy=
array([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.,