In [None]:
import os
import h5py
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, LayerNormalization, Conv2D, Add, Dropout, Reshape, UpSampling2D, BatchNormalization, AveragePooling2D
import matplotlib.pyplot as plt
from tensorflow.keras.saving import register_keras_serializable
import random
from google.colab import drive
import cv2
# Colab-only side effect: mounts the user's Google Drive at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Lookup table from class index (0..9) to its human-readable label; it is
# indexed with the argmax of the model's prediction further down.
onehot2label = dict(enumerate((
    "Slow Motion",
    "Wide Shot",
    "Close-up",
    "High Angle",
    "Low Angle",
    "Cinematic Lighting",
    "Blurred Background",
    "Fast Motion",
    "Dynamic Movement",
    "Hyper-realistic Detail",
)))

# Sample and label containers; they start empty and are meant to be filled by
# the data-loading code that belongs below.
X = []
Y = []

#Data Loading Here


Folder: close-up
(512, 512)
(512, 512)

Folder: medium-shot

Folder: profile-shot

Folder: titles

Folder: wide-shot
(512, 512)


In [None]:
@register_keras_serializable()
class embedify(tf.keras.layers.Layer):
  """Patch-embedding front end: image -> token sequence with a leading CLS token.

  A Conv2D whose kernel size and stride both equal ``patch_size`` turns every
  non-overlapping patch into one ``emb_dim``-channel vector. The patch grid is
  flattened into a sequence, a learnable CLS token is prepended, and a
  learnable positional embedding is added.

  Args:
    patch_size: Side length of a square patch (used as kernel size and stride).
    emb_dim: Channel width of every output token.

  Input: 4-D tensor whose axes 1 and 2 are treated as height and width
    (assumes the channels-last Conv2D default — TODO confirm for your config).
  Output: ``(batch, num_patches + 1, emb_dim)``; the ``+ 1`` is the CLS token
    stored at sequence position 0.
  """

  def __init__(self, patch_size, emb_dim, **kwargs):
    super(embedify, self).__init__(**kwargs)
    self.emb_dim = emb_dim
    self.patch_size = patch_size
    self.conv_emb = Conv2D(filters=self.emb_dim, kernel_size=self.patch_size, strides=self.patch_size)
    # The CLS token does not depend on the input shape, so it is created eagerly.
    self.CLS_token = self.add_weight(
        name="CLS_token",
        shape=(1, 1, self.emb_dim),
        initializer="random_normal",
        trainable=True)

  def build(self, input_shape):
    """Create the positional embedding; needs static height/width in ``input_shape``."""
    num_patches = (input_shape[1] // self.patch_size) * (input_shape[2] // self.patch_size)
    # One position per patch plus one for the CLS token.
    self.pos_emb = self.add_weight(
        name="pos_emb",
        shape=(1, num_patches+1, self.emb_dim),
        initializer="random_normal",
        trainable=True)
    self.conv_emb.build(input_shape)
    super().build(input_shape)

  def call(self, x):
    """Return the embedded token sequence ``(batch, num_patches + 1, emb_dim)``."""
    batch_size = tf.shape(x)[0]
    x = self.conv_emb(x) #one emb_dim vector per patch_size x patch_size patch
    x = tf.reshape(x, (batch_size,  tf.shape(x)[1]* tf.shape(x)[2], self.emb_dim)) #flatten patch grid into a sequence
    CLS = tf.tile(self.CLS_token, [batch_size, 1, 1]) #one CLS token per sample
    x = tf.concat([CLS, x], axis=1) #CLS sits at sequence position 0
    x += self.pos_emb #add learnable pos enc (no sinusoid)
    return x

  def get_config(self):
    """Return the constructor arguments needed to re-create the layer."""
    config = super().get_config()
    config.update({"patch_size": self.patch_size, "emb_dim": self.emb_dim})
    return config

  def compute_output_shape(self, input_shape):
    """Return ``(batch, num_patches + 1, emb_dim)``, matching what ``call`` emits."""
    h, w = input_shape[1], input_shape[2]
    if h is None or w is None:
      # Spatial size unknown, so the sequence length is unknown as well.
      return (input_shape[0], None, self.emb_dim)
    num_patches = (h // self.patch_size) * (w // self.patch_size)
    # +1 accounts for the CLS token that call() prepends to the patch tokens.
    return (input_shape[0], num_patches + 1, self.emb_dim)

@register_keras_serializable()
class attentify(tf.keras.layers.Layer):
  """One self-attention head with learned query and key projections only.

  There is no value projection: the softmax-normalised attention matrix mixes
  the raw input tokens, and the input is added back as a residual. The output
  therefore has the same shape as the input.

  Args:
    emb_dim: Size of the last axis of the input; Q and K are
      ``(emb_dim, emb_dim)`` matrices.
  """

  def __init__(self, emb_dim, **kwargs):
    super().__init__(**kwargs)
    self.emb_dim = emb_dim
    square = (emb_dim, emb_dim)
    self.Q = self.add_weight(shape=square, initializer='glorot_uniform', name='Q', trainable=True)
    self.K = self.add_weight(shape=square, initializer='glorot_uniform', name='K', trainable=True)

  def build(self, input_shape):
    # All weights already exist (created in __init__); only mark the layer built.
    super().build(input_shape)

  def call(self, x):
    """Return ``softmax(xQ (xK)^T / sqrt(emb_dim)) @ x + x``."""
    queries = tf.matmul(x, self.Q)
    keys = tf.matmul(x, self.K)
    scale = tf.math.sqrt(tf.cast(self.emb_dim, tf.float32))
    scores = tf.matmul(queries, keys, transpose_b=True) / scale
    weights = tf.nn.softmax(scores)
    return tf.matmul(weights, x) + x

  def compute_output_shape(self, input_shape):
    # Attention re-mixes tokens without changing the tensor shape.
    return input_shape

  def get_config(self):
    """Return the constructor arguments needed to re-create the layer."""
    return {**super().get_config(), "emb_dim": self.emb_dim}

@register_keras_serializable()
class MLPify(tf.keras.layers.Layer):
  """Residual feed-forward block: ``x + Dropout(Dense_down(Dense_up(x)))``.

  ``Dense_up`` widens the last axis to ``emb_dim * expansion_multiplier`` and
  ``Dense_down`` brings it back to ``emb_dim``; both use GELU. Dropout with a
  fixed rate of 0.1 is applied to the branch before it is added to the input.

  Args:
    emb_dim: Width of the input/output last axis.
    expansion_multiplier: Factor by which the hidden layer is wider than ``emb_dim``.
  """

  def __init__(self, emb_dim, expansion_multiplier, **kwargs):
    super().__init__(**kwargs)
    self.emb_dim = emb_dim
    self.expansion_multiplier = expansion_multiplier
    self.denseUp = Dense(self.emb_dim*self.expansion_multiplier, activation="gelu")
    self.denseDown = Dense(self.emb_dim, activation="gelu")
    self.dropout = Dropout(0.1)

  def call(self, x, training=False):
    """Apply the feed-forward branch and add it to ``x``; dropout is active only when ``training`` is true."""
    dx = self.denseUp(x)
    dx = self.denseDown(dx)
    dx = self.dropout(dx,training=training)
    x = x + dx
    return x

  def build(self, input_shape):
    """Build the sublayers with the shape each one actually receives in ``call``."""
    self.denseUp.build(input_shape)  # input: (batch, context, emb_dim)
    up_out_shape = self.denseUp.compute_output_shape(input_shape)
    self.denseDown.build(up_out_shape)
    # Dropout is applied to the down-projected tensor, not the expanded one.
    down_out_shape = self.denseDown.compute_output_shape(up_out_shape)
    self.dropout.build(down_out_shape)
    super().build(input_shape)

  def compute_output_shape(self, input_shape):
    # The residual add keeps the input shape.
    return input_shape

  def get_config(self):
    """Return the constructor arguments needed to re-create the layer."""
    config = super().get_config()
    config.update({"emb_dim": self.emb_dim, "expansion_multiplier": self.expansion_multiplier})
    return config

@register_keras_serializable()
class transformify(tf.keras.layers.Layer):
  """Transformer encoder block built from ``attentify`` heads and an ``MLPify``.

  Pipeline in ``call``:
  LayerNorm -> ``head_no`` attention heads (run on the same normalised input)
  -> concat on the last axis -> Dense projection back to ``emb_dim`` -> Dropout
  -> LayerNorm -> MLPify -> Dropout.

  The block adds no skip connection of its own; the only residual adds are the
  ones performed inside each ``attentify`` head and inside ``MLPify``.

  Args:
    emb_dim: Token width; also the output width of the projection.
    head_no: Number of attention heads.
    dropout: Rate shared by the two Dropout layers of this block.
  """

  def __init__(self, emb_dim, head_no, dropout, **kwargs):
    super().__init__(**kwargs)
    self.attentifys = [attentify(emb_dim) for _ in range(head_no)]
    self.emb_dim = emb_dim
    self.mlp = MLPify(emb_dim, 4)
    self.layernorm1 = LayerNormalization(epsilon=1e-6)
    self.dense_projection = Dense(emb_dim)
    self.layernorm2 = LayerNormalization(epsilon=1e-6)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)

  def call(self, x, training=False):
    """Run the block on ``x``; dropout is active only when ``training`` is true."""
    x = self.layernorm1(x)
    # Every head sees the same normalised input; their outputs are stacked on
    # the channel axis, giving head_no * emb_dim channels.
    attn = [att(x) for att in self.attentifys]
    x = tf.concat(attn, axis=-1)
    x = self.dense_projection(x)  # back to emb_dim channels
    x = self.dropout1(x, training=training)
    x = self.layernorm2(x)
    x = self.mlp(x, training=training)
    x = self.dropout2(x, training=training)
    return x

  def get_config(self):
    """Return the constructor arguments, recovered from the built sublayers."""
    config = super().get_config()
    config.update({"emb_dim": self.emb_dim, "head_no": len(self.attentifys), "dropout": self.dropout1.rate})
    return config

  def compute_output_shape(self, input_shape):
    # The projection restores emb_dim, so the block is shape-preserving.
    return input_shape

  def build(self, input_shape):
    """Build all sublayers explicitly from the block's input shape."""
    for attn in self.attentifys:
      attn.build(input_shape)
    self.layernorm1.build(input_shape)
    self.layernorm2.build(input_shape)
    self.mlp.build(input_shape)
    self.dropout1.build(input_shape)
    self.dropout2.build(input_shape)
    self.dense_projection.build((input_shape[0], input_shape[1], len(self.attentifys) * self.emb_dim))  # (batch_size, num_tokens, head_no * emb_dim)
    super().build(input_shape)

In [None]:
def create_model(patch_size=16, emb_dim=300, head_no=4, dropout=0.1, transformer_layers=6,):
  """Build the ViT encoder classifier for 512x512 RGB images.

  Args:
    patch_size: Side length of the square patches produced by ``embedify``.
    emb_dim: Token width used by every layer of the encoder.
    head_no: Number of attention heads per ``transformify`` block.
    dropout: Dropout rate inside each ``transformify`` block.
    transformer_layers: Number of stacked ``transformify`` blocks.

  Returns:
    An uncompiled ``tf.keras.Model`` mapping ``(batch, 512, 512, 3)`` images to
    a 10-way softmax computed from the CLS token (sequence position 0).
  """
  inputs = Input(shape=(512, 512, 3))

  #__ViT__
  # Use the function arguments rather than hard-coded 16/300 so the
  # hyperparameters in the signature actually take effect.
  x = embedify(patch_size, emb_dim)(inputs)
  for _ in range(transformer_layers):
    # Do not pin `training` here: an explicit training=False is recorded in the
    # graph and would keep dropout switched off even during fit().
    x = transformify(emb_dim, head_no, dropout)(x)
  x = MLPify(emb_dim, 4)(x)
  CLS = x[:,0,:]  # classification head reads only the CLS token
  CLS = Dense(10, activation='softmax')(CLS)
  return tf.keras.Model(inputs, CLS, name="Lightweight-ViT-Encoder-Classification")

In [None]:
class ViTDataGen(tf.keras.utils.Sequence):
  """Batch generator over in-memory samples ``X`` and labels ``Y``.

  Args:
    X: Indexable collection of equally shaped samples.
    Y: Indexable collection of labels, aligned with ``X`` by index.
    batch_size: Number of samples per batch; the last batch of an epoch may be
      smaller when ``len(X)`` is not a multiple of it.
    shuffle: If true, the sample order is reshuffled at construction time and
      after every epoch.
    **kwargs: Forwarded unchanged to the base-class constructor.

  Raises:
    ValueError: If ``batch_size`` is smaller than 1.
  """

  def __init__(self, X, Y, batch_size=16, shuffle=True, **kwargs):
    # The base constructor must run so Keras can set up its dataset options.
    super().__init__(**kwargs)
    if batch_size < 1:
      raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    self.X = X
    self.Y = Y
    self.batch_size = batch_size
    self.shuffle = shuffle

    self.indices = np.arange(len(X))
    if self.shuffle:
        np.random.shuffle(self.indices)

  def __len__(self):
    """Number of batches per epoch, counting a trailing partial batch."""
    # Ceiling division: flooring would silently drop the remainder every epoch
    # and report zero batches for datasets smaller than batch_size.
    return (len(self.indices) + self.batch_size - 1) // self.batch_size

  def __getitem__(self, idx):
    """Return batch ``idx`` as an ``(X_batch, Y_batch)`` tuple of stacked arrays."""
    indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]

    X = np.stack([self.X[i] for i in indices])
    Y = np.stack([self.Y[i] for i in indices])
    return (X, Y)  # Return a tuple, not a list

  def on_epoch_end(self):
    """Reshuffle the sample order between epochs when shuffling is enabled."""
    if self.shuffle:
        np.random.shuffle(self.indices)

In [None]:
# Build the classifier with create_model's default hyperparameters and print
# its layer-by-layer summary.
ViT = create_model()
ViT.summary()

In [None]:
# The sparse loss variant takes integer class ids as targets.
# NOTE(review): how Y is encoded is not visible here (the lookup table is named
# onehot2label) — confirm Y holds integer ids rather than one-hot vectors.
ViT.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Smoke test before training: run a single sample through the model.
# predict() expects a batch, so a leading axis of length 1 is added first.
x = np.array(X[0])[np.newaxis, ...]
print(x.shape)
ViT.predict(x)

(1, 512, 512, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


array([[0.00353501, 0.01144199, 0.05601081, 0.40686488, 0.4272279 ,
        0.00886884, 0.01637472, 0.04429984, 0.00210371, 0.02327227]],
      dtype=float32)

In [None]:
# The generator already yields complete batches, so the batch size is set on
# the generator; fit()'s own batch_size argument must not be used with it.
ViT.fit(ViTDataGen(X, Y, batch_size=16), epochs=3)

In [None]:
# Write the model to a .keras file in the current working directory.
# NOTE(review): despite the "Weights" file name, this file is later reloaded as
# a complete model via load_model.
ViT.save('ViT_Weights.keras')

In [None]:
# Reload the saved model under a new name (`vit`, distinct from `ViT`).
# The custom layers are decorated with @register_keras_serializable(), which is
# presumably why no custom_objects argument is passed here — verify on a fresh kernel.
vit = tf.keras.models.load_model('ViT_Weights.keras')

In [None]:
# Draw a random sample index that is guaranteed to exist: randrange(len(X))
# covers 0..len(X)-1, whereas the previous randint(0, 5000) assumed a fixed
# dataset size and could index past the end of X.
sample_idx = random.randrange(len(X))
pdf = vit.predict(np.expand_dims(np.array(X[sample_idx]), axis=0))
print(pdf)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[[0.00379678 0.00297844 0.26176834 0.12760867 0.08178452 0.25893638
  0.0057682  0.00677026 0.14552607 0.10506228]]


In [None]:
# Translate the most probable class into its label. np.argmax works on the
# flattened array, which equals the class index because pdf holds one row.
onehot2label[np.argmax(pdf)]

'Close-up'