## Transformer Model 

In [4]:
%reload_ext autoreload
%autoreload 2
import IPython, IPython.display, os, datetime
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 


import tensorflow as tf
from tensorflow import keras
from keras.callbacks  import EarlyStopping
from keras.models import Sequential, Model
from keras.layers import Lambda, Dropout, Input
from keras import regularizers
from keras.utils import plot_model


mpl.rcParams['figure.figsize'] = (14, 4)
mpl.rcParams['axes.grid'] = True

print(f"Tensorflow Version {tf.__version__}, Keras Vesion: {keras.__version__}")

Tensorflow Version 2.10.0, Keras Vesion: 2.10.0


## Transformer

In [None]:
import tensorflow
from tensorflow import keras
from keras import layers

x = np.array([[[1.,2.,3.], [4.,5.,6.]] ])
o = layers.MultiHeadAttention(key_dim=1, num_heads=2, dropout=0)(x, x, x)
x.shape, o.shape, o

In [43]:
performance = {}
models = []

In [102]:
import tensorflow
from tensorflow import keras
from keras import layers

'''
    input shape = windows_len, features_length  <== this is your input features dimentions
    output shape= output_len , ouput_feat_len   <== This is your output features dimentions
'''
class TransformerModel():
    def __init__(self, window_len, feat_len, output_len, output_feat_len, head_size=256, 
                num_heads=4, ff_dim=64, num_transformer_blocks=6, mlp_units=[128], 
                dropout=0.3, mlp_dropout=0.25, use_norm= "layer|batch", **kwargs ):
        super().__init__()
        input_shape = (window_len, feat_len)
        self.input_shape = input_shape
        self.output_len, self.ouput_feat_len, self.head_size, self.num_heads, \
            self.ff_dim,self.num_transformer_blocks, self.mlp_units, self.dropout, self.mlp_dropout = \
                  output_len, ouput_feat_len, head_size, num_heads, ff_dim, \
                num_transformer_blocks, mlp_units, dropout, mlp_dropout

        #self.mha = layers.MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout, name="mha")
        self.mha = layers.MultiHeadAttention(key_dim=feat_len, num_heads=num_heads, dropout=dropout, name="mha")
        self.layernorm = layers.LayerNormalization()
        self.add = layers.Add()

        self.ff = Sequential([
            layers.Dense(ff_dim, activation='relu'),
            layers.Dense(feat_len),
            layers.Dropout(mlp_dropout)
            ], name="FeedForward")

        op_len = output_len * output_feat_len
        
        self.last = Sequential([layers.Flatten(), 
                                layers.Dropout(mlp_dropout), 
                                layers.Dense( op_len, activation="relu"),
                                layers.Reshape([output_len, output_feat_len])],
                                name = "LastDecoderLyer")
        self.build()

    def position_encoding(self, x):
        return x

    def self_attention(self, inputs):
        x = self.mha(inputs, inputs, inputs )
        x = self.layernorm(x)
        self.add( [x, inputs])
        return x

    def ff(self, inputs):
        x = self.ff (inputs)
        x = self.layernorm(x)
        self.add( [inputs, x])
        return x

    def build(self, inputs =None):
        if inputs is None:
            inputs = keras.Input(shape=self.input_shape)
        x = self.position_encoding(inputs)

        for _ in range( self.num_transformer_blocks ):
            x = self.self_attention(x)
            x = self.ff(x)

        x = self.last(x)
        return keras.Model(inputs, x)


tm = TransformerModel(2, 2, output_len=1, output_feat_len=10, num_transformer_blocks =2, ff_dim=1)
model = tm.build()
model.summary()

Model: "model_84"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_87 (InputLayer)          [(None, 2, 2)]       0           []                               
                                                                                                  
 mha (MultiHeadAttention)       (None, 2, 2)         90          ['input_87[0][0]',               
                                                                  'input_87[0][0]',               
                                                                  'input_87[0][0]',               
                                                                  'FeedForward[2][0]',            
                                                                  'FeedForward[2][0]',            
                                                                  'FeedForward[2][0]']     

In [111]:
import tensorflow
from tensorflow import keras
from keras import layers

'''
    input shape = windows_len, features_length  <== this is your input features dimentions
    output shape= output_len , ouput_feat_len   <== This is your output features dimentions
'''
class TransformerModel():
    def __init__(self, window_len, feat_len, output_len, output_feat_len, head_size=256, 
                num_heads=4, ff_dim=64, num_transformer_blocks=6, mlp_units=[128], 
                dropout=0.3, mlp_dropout=0.25, use_norm= "layer|batch", **kwargs ):
        super().__init__()
        input_shape = (window_len, feat_len)
        self.input_shape = input_shape
        self.output_len, self.ouput_feat_len, self.head_size, self.num_heads, \
            self.ff_dim,self.num_transformer_blocks, self.mlp_units, self.dropout, self.mlp_dropout = \
                  output_len, ouput_feat_len, head_size, num_heads, ff_dim, \
                num_transformer_blocks, mlp_units, dropout, mlp_dropout

        #self.mha = layers.MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout, name="mha")
        self.mha = [layers.MultiHeadAttention(key_dim=feat_len, num_heads=num_heads, dropout=dropout, name="mha")]*num_transformer_blocks
        self.layernorm = layers.LayerNormalization()
        self.add = layers.Add()

        self.ff = Sequential([
            layers.Dense(ff_dim, activation='relu'),
            layers.Dense(feat_len),
            layers.Dropout(mlp_dropout)
            ] * num_transformer_blocks, name="FeedForward")

        op_len = output_len * output_feat_len
        
        self.last = Sequential([layers.Flatten(), 
                                layers.Dropout(mlp_dropout), 
                                layers.Dense( op_len, activation="relu"),
                                layers.Reshape([output_len, output_feat_len])],
                                name = "LastDecoderLyer")
        self.build()

    def position_encoding(self, x):
        return x

    def self_attention(self, inputs):
        x = self.mha(inputs, inputs, inputs )
        x = self.layernorm(x)
        self.add( [x, inputs])
        return x

    def ff(self, inputs):
        x = self.ff (inputs)
        x = self.layernorm(x)
        self.add( [inputs, x])
        return x

    def build(self, inputs =None):
        if inputs is None:
            inputs = keras.Input(shape=self.input_shape)
        x = self.position_encoding(inputs)

        for i in range( self.num_transformer_blocks ):
            x = self.mha[i](inputs, inputs, inputs )
            x = self.layernorm(x)
            self.add( [x, inputs])
            x = self.ff(x)

        x = self.last(x)
        return keras.Model(inputs, x)


tm = TransformerModel(2, 2, output_len=1, output_feat_len=10, num_transformer_blocks =6, ff_dim=1)
model = tm.build()
model.summary()    

Model: "model_90"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_95 (InputLayer)          [(None, 2, 2)]       0           []                               
                                                                                                  
 mha (MultiHeadAttention)       (None, 2, 2)         90          ['input_95[0][0]',               
                                                                  'input_95[0][0]',               
                                                                  'input_95[0][0]']               
                                                                                                  
 layer_normalization_56 (LayerN  (None, 2, 2)        4           ['mha[11][0]']                   
 ormalization)                                                                             

In [113]:
tm.ff.weights

[<tf.Variable 'dense_123/kernel:0' shape=(2, 1) dtype=float32, numpy=
 array([[0.59179795],
        [0.12576783]], dtype=float32)>,
 <tf.Variable 'dense_123/bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>,
 <tf.Variable 'dense_124/kernel:0' shape=(1, 2) dtype=float32, numpy=array([[0.39387882, 1.3983084 ]], dtype=float32)>,
 <tf.Variable 'dense_124/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>]

In [46]:
history = ts_utils.compile_fit(model, window_trn, window_tst, epochs=1, verbose=1)



In [None]:
# Linear Model
model1 = tf.keras.Sequential([
    # Take the last time-step.
    # Shape [batch, time, features] => [batch, 1, features]
    tf.keras.layers.Lambda(lambda x: x[:, -1:, :])
    ] + ts_utils.getCommonLayer(ouput_len, ouput_feat_len),
    name = "Linear"
)

history1 = ts_utils.compile_fit(model1, window_trn, window_tst, epochs=1, verbose=1)

models.append(model)x

In [None]:
#hist = model.fit( window_trn, epochs=200, batch_size=batch_size, callbacks=callbacks)
#model.evaluate(window_tst)

## Compile and Evaluate All Models

In [None]:
performace={}
models.append(model)

In [None]:
for i, model in enumerate(models):
    print(f"Now Compiling {i+1}/{len(models)} {model.name} ")
    history = ts_utils.compile_fit(model, window_trn, window_tst, epochs=3, verbose=1)
    IPython.display.clear_output()

# Plot graphs
performance = ts_plot_utils.plot_performance(models, window_trn, window_tst, performance=performance)

## Model Architecture

In [None]:
from keras.utils import plot_model
for i, model in enumerate(models):
    print(f"PLotting {i} Model: {model.name}")
    d= plot_model(model, show_shapes=True)
    display(d)

## Predictions

In [None]:
import matplotlib.pyplot as plt

model = models[0]
rw=slice(400, 500)
pc = 4

for x in window_tst100:
    #yp = model(x)
    #yhat = yp[:, np.newaxis, :]
    #y = x[1]
    break;
#y, yhat = ts_utils.model_predict( model , window_tst100)
#ydf = scaler.inverse_transform(pd.DataFrame(y, columns=scaler.feature_names_in_[label_slice]))
#pdf = scaler.inverse_transform(pd.DataFrame(yhat, columns=scaler.feature_names_in_[label_slice]))
'''y = y.numpy().reshape((47754,20))
yhat = yhat.numpy().reshape((47754,20))
ydf = pd.DataFrame(y, columns=scaler.feature_names_in_[label_slice])
pdf = pd.DataFrame(yhat, columns=scaler.feature_names_in_[label_slice])
'''
nc = len(ydf.columns)
pr = nc //pc
pr += 1 if len(ydf.columns)%pc else 0
fig, axs = plt.subplots(pr, pc, figsize=(20, 3 *pc))
for i, c in enumerate(ydf.columns):
    pr1 , pc1 = i // pc, i % pc
    plt1 = axs[ pr1, pc1 ]
    plt1.plot(range(len(ydf))[rw], ydf[c][rw], "-."  , label=f"y")
    plt1.plot(range(len(pdf))[rw], pdf[c][rw], "r-." , label=f"yhat")
    plt1.legend()
    plt1.set_title(f'{model.name} - {c} - {rw}/{len(ydf)}')

In [None]:
#model = models[1]
#ydf, pdf = ts_plot_utils.predict_and_plot( model, window_trn100, window_tst100, howmany=1024* 1024,
#                        plot_start=0, df=None, scaler=None, label_slice=None);

## Anomaly graph

In [None]:
import ts_anom_utils
from ts_anom_utils import compute_scores

y, yhat = ts_utils.model_predict( model , window_trn100)
ydf = scaler.inverse_transform(pd.DataFrame(y, columns=scaler.feature_names_in_[label_slice]))
pdf = scaler.inverse_transform(pd.DataFrame(yhat, columns=scaler.feature_names_in_[label_slice]))

ret, error, se, errorDF, escaler, fscore = compute_scores(ydf, pdf, errorDF= None, escaler=None)

y, yhat = ts_utils.model_predict( model , window_tst100)
ydf = scaler.inverse_transform(pd.DataFrame(y, columns=scaler.feature_names_in_[label_slice]))
pdf = scaler.inverse_transform(pd.DataFrame(yhat, columns=scaler.feature_names_in_[label_slice]))

ret, error, se, errorDF, escaler, fscore = compute_scores(ydf, pdf, errorDF, escaler, fscore=fscore )

errorDF

In [None]:
plt.plot(pd.to_datetime(df_tst.index, unit='ms')[0:len(ret)], ret.norm_score)

### The END