### ADD LIBRARY

In [1]:
import os
import io
import cv2
import imageio
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers, Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report


### VARIABLE DECLARATION

In [2]:
DATASET_NAME = "violence_and_nonviolence_videos"
BATCH_SIZE = 32
AUTO = tf.data.AUTOTUNE
INPUT_SHAPE = (36, 28, 28, 3)
NUM_CLASSES = 2
LABELS = ["no","fi"]

# OPTIMIZER
LEARNING_RATE = 0.0001
WEIGHT_DECAY = 0.00001

# TRAINING
EPOCHS = 60

# TUBELET EMBEDDING
PATCH_SIZE = (8, 8, 8)
NUM_PATCHES = (INPUT_SHAPE[0] // PATCH_SIZE[0]) ** 2

# ViViT ARCHITECTURE
LAYER_NORM_EPS = 1e-6
PROJECTION_DIM = 128
NUM_HEADS = 8
NUM_LAYERS = 8

### DATASET PREPARATION

In [3]:
data_path = "C:/Users/Emirhan/Downloads/data"
classes = os.listdir(data_path)

videos = []
labels = []

for cls in classes:
    cls_path = os.path.join(data_path, cls)
    files = os.listdir(cls_path)
    for video in os.listdir(cls_path):
        video_path = os.path.join(cls_path, video)
        videos.append(video_path)
        label = os.path.basename(video_path).split('_')[0][0:2]
        if label not in LABELS:
            continue
        labels.append(label)



In [4]:
labels = np.array(labels)
videos = np.array(videos)

In [5]:
print(labels)

['no' 'no' 'no' ... 'fi' 'fi' 'fi']


#### ONE-HOT ENCODING

In [6]:
encoder = OneHotEncoder(sparse=False)
labels = encoder.fit_transform(np.array(labels).reshape(-1, 1))

print(labels)


[[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]




### DATA PREPROCESSING

In [7]:
temp_videos = videos
temp_labels = labels

X_train, X_test, y_train1, y_test1 = train_test_split(temp_videos, temp_labels, test_size=0.20,random_state=1)

# Bu kısım prediction kısmında test verilerini video halinde görüntüleyebilmek için yazılmıştır.


In [8]:
video_paths = videos

# Fındıng minimum frame count
min_frame_count = float('inf')

for video_path in video_paths:
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if frame_count < min_frame_count:
        min_frame_count = frame_count


frames = []

for video_path in video_paths:
    cap = cv2.VideoCapture(video_path)
    selected_frames = []
    for i in range(min_frame_count):
        ret, frame = cap.read()
        if ret:
            frame = cv2.resize(frame, (28,28))
            selected_frames.append(frame)
    frames.append(np.array(selected_frames))

frames_array = np.array(frames)


print(frames_array.shape)


(2000, 36, 28, 28, 3)


In [9]:
X_train_frames, X_test_frames, y_train, y_test = train_test_split(frames_array, labels, test_size=0.25,random_state=1)

In [10]:
X_train_frames, X_val_frames, y_train, y_val = train_test_split(X_train_frames, y_train, test_size=0.15, random_state=1)

In [11]:
X_train_frames

array([[[[[0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          ...,
          [0, 0, 0],
          [0, 0, 0],
          [0, 0, 0]],

         [[0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          ...,
          [0, 0, 0],
          [0, 0, 0],
          [0, 0, 0]],

         [[0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          ...,
          [0, 0, 0],
          [0, 0, 0],
          [0, 0, 0]],

         ...,

         [[0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          ...,
          [0, 0, 0],
          [0, 0, 0],
          [0, 0, 0]],

         [[0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          ...,
          [0, 0, 0],
          [0, 0, 0],
          [0, 0, 0]],

         [[0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          ...,
          [0, 0, 0],
          [0, 0, 0],
          [0, 0, 0]]],


        [[[0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          ...,
          [0, 0, 0],
          [0, 0, 0],
     

In [12]:
len(X_train_frames)

1275

In [13]:
len(X_test_frames)

500

In [14]:
len(X_val_frames)

225

##### DATA STANDARDIZATION

In [15]:
X_train_frames = X_train_frames/255
X_test_frames = X_test_frames/255
X_val_frames = X_val_frames/255

In [16]:
@tf.function
def preprocess(frames: tf.Tensor, label: tf.Tensor):
    """Preprocess the frames tensors and parse the labels."""
    # Preprocess images
    frames = tf.image.convert_image_dtype(
        frames[
            ..., tf.newaxis
        ],  # The new axis is to help for further processing with Conv3D layers
        tf.float32,
    )
    # Parse label
    label = tf.cast(label, tf.float32)
    return frames, label


def prepare_dataloader(
    videos: np.ndarray,
    labels: np.ndarray,
    loader_type: str = "train",
    batch_size: int = BATCH_SIZE,
):
    """Utility function to prepare the dataloader."""
    dataset = tf.data.Dataset.from_tensor_slices((videos, labels))

    if loader_type == "train":
        dataset = dataset.shuffle(BATCH_SIZE * 2)

    dataloader = (
        dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )
    return dataloader


trainloader = prepare_dataloader(X_train_frames, y_train, "train")
validloader = prepare_dataloader(X_val_frames, y_val, "valid")
testloader = prepare_dataloader(X_test_frames, y_test, "test")

In [17]:
testloader

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 36, 28, 28, 3, 1), dtype=tf.float32, name=None), TensorSpec(shape=(None, 2), dtype=tf.float32, name=None))>

### BUILDING THE MODEL

In [18]:
# TUBELET EMBEDDING

class TubeletEmbedding(layers.Layer):
    def __init__(self, embed_dim, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.projection = layers.Conv3D(
            filters=embed_dim,
            kernel_size=patch_size,
            strides=patch_size,
            padding="VALID",
        )
        self.flatten = layers.Reshape(target_shape=(-1, embed_dim))

    def call(self, videos):
        projected_patches = self.projection(videos)
        flattened_patches = self.flatten(projected_patches)
        return flattened_patches


In [19]:
# POSITIONAL 

class PositionalEncoder(layers.Layer):
    def __init__(self, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim

    def build(self, input_shape):
        _, num_tokens, _ = input_shape
        self.position_embedding = layers.Embedding(
            input_dim=num_tokens, output_dim=self.embed_dim
        )
        self.positions = tf.range(start=0, limit=num_tokens, delta=1)

    def call(self, encoded_tokens):
        # Encode the positions and add it to the encoded tokens
        encoded_positions = self.position_embedding(self.positions)
        encoded_tokens = encoded_tokens + encoded_positions
        return encoded_tokens

In [20]:
def create_vivit_classifier(
    tubelet_embedder,
    positional_encoder,
    input_shape=INPUT_SHAPE,
    transformer_layers=NUM_LAYERS,
    num_heads=NUM_HEADS,
    embed_dim=PROJECTION_DIM,
    layer_norm_eps=LAYER_NORM_EPS,
    num_classes=NUM_CLASSES,
):
    # Get the input layer
    inputs = layers.Input(shape=input_shape)
    # Create patches.
    patches = tubelet_embedder(inputs)
    # Encode patches.
    encoded_patches = positional_encoder(patches)

    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization and MHSA
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim // num_heads, dropout=0.1
        )(x1, x1)

        # Skip connection
        x2 = layers.Add()([attention_output, encoded_patches])

        # Layer Normalization and MLP
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        x3 = keras.Sequential(
            [
                layers.Dense(units=embed_dim * 4, activation=tf.nn.gelu),
                layers.Dense(units=embed_dim, activation=tf.nn.gelu),
            ]
        )(x3)

        # Skip connection
        encoded_patches = layers.Add()([x3, x2])

    # Layer normalization and Global average pooling.
    representation = layers.LayerNormalization(epsilon=layer_norm_eps)(encoded_patches)
    representation = layers.GlobalAvgPool1D()(representation)

    # Classify outputs.
    outputs = layers.Dense(units=num_classes, activation="softmax")(representation)

    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

### MODEL TRAINING

In [21]:
def run_experiment():
    # Initialize model
    model = create_vivit_classifier(
        tubelet_embedder=TubeletEmbedding(
            embed_dim=PROJECTION_DIM, patch_size=PATCH_SIZE
        ),
        positional_encoder=PositionalEncoder(embed_dim=PROJECTION_DIM),
    )

    # Compile the model with the optimizer, loss function
    # and the metrics.
    optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=['accuracy'
        ],
    )

    # Train the model.
    _ = model.fit(trainloader, epochs=EPOCHS, validation_data=validloader)


    return model


model = run_experiment()

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [22]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 36, 28, 28,  0           []                               
                                 3)]                                                              
                                                                                                  
 tubelet_embedding (TubeletEmbe  (None, 36, 128)     196736      ['input_1[0][0]']                
 dding)                                                                                           
                                                                                                  
 positional_encoder (Positional  (None, 36, 128)     4608        ['tubelet_embedding[0][0]']      
 Encoder)                                                                                     

### EVALUATE THE MODEL

In [23]:
x_test = X_test_frames

y_pred = model.predict(x_test)

print(y_pred) # (num_samples, num_classes)


[[4.48849732e-07 9.99999523e-01]
 [9.45725739e-01 5.42742833e-02]
 [9.33666289e-01 6.63337484e-02]
 [9.99999881e-01 1.08852106e-07]
 [1.00000000e+00 5.13897653e-08]
 [9.99966860e-01 3.31102747e-05]
 [3.94628108e-01 6.05371892e-01]
 [8.48010984e-07 9.99999166e-01]
 [9.99879003e-01 1.20944176e-04]
 [9.99996543e-01 3.41696364e-06]
 [9.99999762e-01 2.62031108e-07]
 [1.00000000e+00 1.14523777e-08]
 [1.00000000e+00 2.62141526e-08]
 [9.99999881e-01 9.65381020e-08]
 [1.29524466e-07 9.99999881e-01]
 [9.99935746e-01 6.42558516e-05]
 [1.74745178e-08 1.00000000e+00]
 [9.98468339e-01 1.53165066e-03]
 [2.22630310e-03 9.97773707e-01]
 [9.99999881e-01 1.77220045e-07]
 [9.99999642e-01 3.87506589e-07]
 [1.00000000e+00 4.25772129e-08]
 [3.10622752e-01 6.89377248e-01]
 [1.15573539e-06 9.99998808e-01]
 [2.81876206e-01 7.18123794e-01]
 [7.18904687e-08 9.99999881e-01]
 [9.99838710e-01 1.61240489e-04]
 [9.99999762e-01 1.98545152e-07]
 [2.80499171e-06 9.99997139e-01]
 [2.07853716e-08 1.00000000e+00]
 [1.000000

In [24]:
predictions = y_pred > 0.5

In [25]:
predictions = predictions.astype(float)
predictions

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.

In [26]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       245
           1       0.84      0.82      0.83       255

   micro avg       0.82      0.82      0.82       500
   macro avg       0.82      0.82      0.82       500
weighted avg       0.82      0.82      0.82       500
 samples avg       0.82      0.82      0.82       500



In [27]:
y_test

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.

### PREDICTIONS

In [28]:

def video_show(single_video):
    
    video_path = single_video

    cap = cv2.VideoCapture(video_path)

    gif_frames = []

    while True:
     
        ret, frame = cap.read()

        if not ret:
            break

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        gif_frames.append(frame_rgb)

    cap.release()

    imageio.mimsave("video.gif", gif_frames, fps=30)

def predict(frames):
    frames = [frames] 
    frames = np.array(frames)
    y_pred = model.predict(frames)

   
    prediction = y_pred > 0.5
    prediction = prediction.astype(float)

    print(prediction)  # (num_samples, num_classes)


In [31]:
video_show(X_test[122])
predict(X_test_frames[122])

[[1. 0.]]
