## Chris, TFの自作らへんを色々見てみる

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc, os

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow import keras
from tensorflow.keras import layers

print("Using TensorFlow version", tf.__version__)

Using TensorFlow version 2.10.0


In [14]:
class TransformerBlock(layers.Layer):
    """Post-norm Transformer encoder block: self-attention then a feed-forward net.

    Each sub-layer is followed by dropout, a residual addition and layer
    normalization.

    Args:
        embed_dim: Forwarded to ``MultiHeadAttention`` as ``key_dim`` (the per-head
            query/key projection size in the Keras API).
        feat_dim: Output width of the feed-forward network. It must equal the size
            of the last axis of the block input, because the feed-forward output
            is added back onto the attention sub-layer output.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied to the output of both sub-layers.
        **kwargs: Standard ``Layer`` keyword arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embed_dim, feat_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Remember the constructor arguments so get_config() can reproduce the layer.
        self.embed_dim = embed_dim
        self.feat_dim = feat_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="gelu"),
                layers.Dense(feat_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``feat_dim``.
            training: Forwarded to the dropout layers; ``None`` lets Keras decide.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the same tensor is used as query and value (and key).
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "feat_dim": self.feat_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

# --- Hyperparameters shared by the model-building cells below ---

# Stack layout
num_blocks = 2      # number of TransformerBlock layers stacked in the model
dropout_rate = 0.3  # dropout rate handed to every TransformerBlock

# Widths
feat_dim = 188      # feature width flowing through the transformer blocks
ff_dim = 128        # hidden layer size of the feed-forward network inside a block

# Attention
num_heads = 4       # number of attention heads
embed_dim = 64      # embedding size for attention (passed on as key_dim)

In [15]:
def build_model(seq_len=13, n_features=188, n_categorical=11,
                cat_vocab_size=10, cat_embed_dim=4):
    """Build and compile the transformer-based binary classifier.

    The first ``n_categorical`` feature columns are each passed through their own
    ``Embedding`` layer; the remaining columns are used as-is. Everything is
    concatenated, projected to ``feat_dim`` and passed through ``num_blocks``
    transformer blocks. Only the last time step feeds the classification head.

    Relies on the module-level hyperparameters ``feat_dim``, ``embed_dim``,
    ``num_heads``, ``ff_dim``, ``dropout_rate`` and ``num_blocks``.

    Args:
        seq_len: Number of time steps per sample.
        n_features: Number of feature columns per time step.
        n_categorical: Number of leading columns treated as categorical.
        cat_vocab_size: ``input_dim`` of every categorical embedding.
        cat_embed_dim: ``output_dim`` of every categorical embedding.

    Returns:
        A compiled ``keras.Model`` mapping ``(batch, seq_len, n_features)`` to a
        sigmoid probability of shape ``(batch, 1)``.

    Raises:
        ValueError: If ``n_categorical`` is not in ``(0, n_features)``.
    """
    if not 0 < n_categorical < n_features:
        raise ValueError(
            f"n_categorical must be in (0, {n_features}), got {n_categorical}"
        )

    # Input embedding layer. The shape excludes the batch dimension.
    inp = layers.Input(shape=(seq_len, n_features))

    # One independent embedding table per categorical column.
    # NOTE(review): presumably columns 0..n_categorical-1 hold integer category
    # codes below cat_vocab_size -- confirm against the data preparation.
    embeddings = [
        layers.Embedding(cat_vocab_size, cat_embed_dim)(inp[:, :, k])
        for k in range(n_categorical)
    ]

    # Width after concatenation:
    # (n_features - n_categorical) + n_categorical * cat_embed_dim
    # = 177 + 44 = 221 with the defaults.
    x = layers.Concatenate()([inp[:, :, n_categorical:]] + embeddings)
    # Project to feat_dim so the residual additions inside the blocks line up.
    x = layers.Dense(feat_dim)(x)

    # Transformer blocks.
    for _ in range(num_blocks):
        x_old = x
        transformer_block = TransformerBlock(embed_dim, feat_dim, num_heads, ff_dim, dropout_rate)
        x = transformer_block(x)
        x = 0.9 * x + 0.1 * x_old  # weighted skip connection around the whole block

    # Classification head on the last time step only.
    x = layers.Dense(64, activation="relu")(x[:, -1, :])
    x = layers.Dense(32, activation="relu")(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs=inp, outputs=outputs)
    opt = tf.keras.optimizers.Adam(learning_rate=0.001)
    loss = tf.keras.losses.BinaryCrossentropy()
    model.compile(loss=loss, optimizer=opt)

    return model

model1 = build_model()

In [20]:
model1.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 13, 188)]    0           []                               
                                                                                                  
 tf.__operators__.getitem (Slic  (None, 13)          0           ['input_1[0][0]']                
 ingOpLambda)                                                                                     
                                                                                                  
 tf.__operators__.getitem_1 (Sl  (None, 13)          0           ['input_1[0][0]']                
 icingOpLambda)                                                                                   
                                                                                              

In [19]:
# Plain MLP for comparison: flatten each (13, 188) sample, one hidden layer
# with dropout, then 10 linear outputs.
model2 = keras.Sequential()
model2.add(layers.Flatten(input_shape=(13, 188)))
model2.add(layers.Dense(128, activation="relu"))
model2.add(layers.Dropout(0.2))
model2.add(layers.Dense(10))

In [21]:
model2.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 2444)              0         
                                                                 
 dense_15 (Dense)            (None, 128)               312960    
                                                                 
 dropout_7 (Dropout)         (None, 128)               0         
                                                                 
 dense_16 (Dense)            (None, 10)                1290      
                                                                 
Total params: 314,250
Trainable params: 314,250
Non-trainable params: 0
_________________________________________________________________


### メモ
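* `layers.Input(shape=(13, 188))` の `shape` にはバッチ次元を含めず、1サンプル分の形状 (時系列長13, 特徴量188) を2Dで指定している
* `x = layer(x)` と層を順に呼び出していく書き方は PyTorch の `forward(x)` に似ているが、これは Keras の Functional API で、実データではなくシンボリックなテンソルをつないでモデルを定義している(まだ理解しきれていないので、下で1行ずつ確認する)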

## ChrisのTFモデルを1行ずつやっていく --

In [26]:
# Demo input: 100 samples of shape (13, 188), filled with ones.
demo_shape = (100, 13, 188)
X_train = np.ones(demo_shape)
X_train.shape

(100, 13, 188)

In [54]:
def build_model():
    """Build and compile a minimal model: one linear ``Dense(1)`` on a (13, 188) input.

    Note: this redefines the ``build_model`` declared earlier in the notebook.

    The Dense layer has no activation, so the model emits raw logits. The loss is
    therefore created with ``from_logits=True``; the default (``False``) would
    treat those logits as probabilities.

    Returns:
        A compiled ``keras.Model``. Dense acts on the last axis only, so the
        output keeps the time axis: ``(batch, 13, 1)``.
    """
    inp = layers.Input(shape=(13, 188))
    output = layers.Dense(1, activation=None)(inp)
    model = keras.Model(inputs=inp, outputs=output)

    opt = tf.keras.optimizers.Adam(learning_rate=0.001)
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    model.compile(loss=loss, optimizer=opt)
    return model
model = build_model()

In [99]:
# Step-by-step version of the transformer model, keeping every intermediate
# tensor (x1..x8) under its own name so the shapes can be inspected afterwards.
inp = layers.Input(shape=(13, 188))
embeddings = []
for k in range(11):
    emb = layers.Embedding(10, 4)
    embeddings.append(emb(inp[:, :, k]))
# Remaining columns (index 11 onward) followed by the 11 embedded columns.
x1 = layers.Concatenate()([inp[:, :, 11:]] + embeddings)
# Project back to feat_dim (= 188) with no activation. The input must be x1:
# no global `x` exists on a fresh kernel.
x2 = layers.Dense(feat_dim)(x1)

tb1 = TransformerBlock(embed_dim, feat_dim, num_heads, ff_dim, dropout_rate)

x3 = tb1(x2)
x4 = 0.9*x3 + 0.1*x2  # weighted skip connection around the first block

tb2 = TransformerBlock(embed_dim, feat_dim, num_heads, ff_dim, dropout_rate)

x5 = tb2(x4)
x6 = 0.9*x5 + 0.1*x4  # weighted skip connection around the second block

# Classification head: keep only the last step along the time axis.
x7 = layers.Dense(64, activation="relu")(x6[:, -1, :])
x8 = layers.Dense(32, activation="relu")(x7)
outputs = layers.Dense(1, activation="sigmoid")(x8)


In [100]:
# `k` and `emb` are left over from the embedding loop: k == 10 and `emb` is the
# last Embedding layer created there.
print(inp[:, :, k].shape)
# (batch_size, sequence_length) -> emb -> (batch_size, sequence_length, embed_dim)
print(emb(inp[:, :, k]).shape)


print(inp.shape)
print(x1.shape)  # shape[2] = (188 - 11) + 11 * 4 = 221
print(x2.shape)
print(x3.shape)
print(x4.shape)
print(x5.shape)
print(x6.shape)
print(x7.shape)
print(x8.shape)
# There is no x9: the tensor after x8 is `outputs`.
print(outputs.shape)

(None, 13)
(None, 13, 4)
(None, 13, 188)
(None, 13, 221)
(None, 13, 188)
(None, 13, 188)
(None, 13, 188)
(None, 13, 188)
(None, 13, 188)
(None, 64)


## embedding layerについて

In [84]:
# An Embedding layer works much like a dictionary: index i returns row i of
# its weight table.
lookup_rows = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
initializer = tf.keras.initializers.Constant(value=lookup_rows)
embedding_layer = tf.keras.layers.Embedding(
    input_dim=5,
    output_dim=2,
    embeddings_initializer=initializer,
)
token_ids = tf.constant([[0, 1, 3], [0, 1, 2]])
embedding_layer(inputs=token_ids)  # returns the initializer row for each index

<tf.Tensor: shape=(2, 3, 2), dtype=float32, numpy=
array([[[1., 2.],
        [3., 4.],
        [7., 8.]],

       [[1., 2.],
        [3., 4.],
        [5., 6.]]], dtype=float32)>

In [88]:
# The embedding table itself is the layer's trainable parameter. Here it is
# 5 x 2, so the summary reports 10 params.
# Sequential is documented as taking a list of layers, so the single layer is wrapped.
model = keras.Sequential([embedding_layer])
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_93 (Embedding)    (None, None, 2)           10        
                                                                 
Total params: 10
Trainable params: 10
Non-trainable params: 0
_________________________________________________________________


## TransformerBlockについて