In [1]:
import os
import io
import imageio
import ipywidgets
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers



In [2]:
DATASET_NAME = "hockey_fight_videos"
BATCH_SIZE = 64
AUTO = tf.data.AUTOTUNE
INPUT_SHAPE = (36, 28, 28, 3)
NUM_CLASSES = 2
LABELS = ["no","fi"]

# OPTIMIZER
LEARNING_RATE = 0.0001
WEIGHT_DECAY = 0.00001

# TRAINING
EPOCHS = 100

# TUBELET EMBEDDING
PATCH_SIZE = (8, 8, 8)
NUM_PATCHES = (INPUT_SHAPE[0] // PATCH_SIZE[0]) ** 2

# ViViT ARCHITECTURE
LAYER_NORM_EPS = 1e-6
PROJECTION_DIM = 128
NUM_HEADS = 8
NUM_LAYERS = 8

## DATASET

In [3]:
import os
import random
from sklearn.model_selection import train_test_split

# veri kümenizi yükleyin
data_path = "C:/Users/Emirhan/Downloads/data"
classes = os.listdir(data_path)

videos = []
labels = []
train_files = []
val_files = []
for cls in classes:
    cls_path = os.path.join(data_path, cls)
    files = os.listdir(cls_path)
    for video in os.listdir(cls_path):
        video_path = os.path.join(cls_path, video)
        videos.append(video_path)
        label = os.path.basename(video_path).split('_')[0][0:2]
        if label not in LABELS:
            continue
        labels.append(label)



In [4]:
labels = np.array(labels)
videos = np.array(videos)

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Örnek veri etiketleri

# One-Hot Encoding
encoder = OneHotEncoder(sparse=False)
labels = encoder.fit_transform(np.array(labels).reshape(-1, 1))

# Kodlanmış etiketleri kontrol etme
print(labels)


[[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]




In [6]:
import cv2
import numpy as np

# Videoların yollarını içeren bir liste oluşturma
video_paths = videos

# En kısa video uzunluğunu belirleme
min_frame_count = float('inf')

for video_path in video_paths:
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if frame_count < min_frame_count:
        min_frame_count = frame_count

# Tüm videolardan aynı sayıda frame almak için döngü
frames = []

for video_path in video_paths:
    cap = cv2.VideoCapture(video_path)
    selected_frames = []
    for i in range(min_frame_count):
        ret, frame = cap.read()
        if ret:
            frame = cv2.resize(frame, (28,28))
            selected_frames.append(frame)
    frames.append(np.array(selected_frames))

# Tüm videolardaki ortak frame sayısına sahip bir matris oluşturma
video_array = np.stack(frames)

# Video dizisinin şeklini yazdırma
print(video_array.shape)

(2000, 36, 28, 28, 3)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(video_array, labels, test_size=0.25)

In [8]:
X_train = X_train/255
X_test = X_test/255

In [9]:
len(y_train)

1500

In [10]:
len(X_test)

500

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)

In [12]:
len(X_val)

150

In [13]:
X_val = X_val/255

In [14]:
@tf.function
def preprocess(frames: tf.Tensor, label: tf.Tensor):
    """Preprocess the frames tensors and parse the labels."""
    # Preprocess images
    frames = tf.image.convert_image_dtype(
        frames[
            ..., tf.newaxis
        ],  # The new axis is to help for further processing with Conv3D layers
        tf.float32,
    )
    # Parse label
    label = tf.cast(label, tf.float32)
    return frames, label


def prepare_dataloader(
    videos: np.ndarray,
    labels: np.ndarray,
    loader_type: str = "train",
    batch_size: int = BATCH_SIZE,
):
    """Utility function to prepare the dataloader."""
    dataset = tf.data.Dataset.from_tensor_slices((videos, labels))

    if loader_type == "train":
        dataset = dataset.shuffle(BATCH_SIZE * 2)

    dataloader = (
        dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )
    return dataloader


trainloader = prepare_dataloader(X_train, y_train, "train")
validloader = prepare_dataloader(X_val, y_val, "valid")
testloader = prepare_dataloader(X_test, y_test, "test")

In [15]:
testloader

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 36, 28, 28, 3, 1), dtype=tf.float32, name=None), TensorSpec(shape=(None, 2), dtype=tf.float32, name=None))>

In [16]:
# TUBELET EMBEDDING

class TubeletEmbedding(layers.Layer):
    def __init__(self, embed_dim, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.projection = layers.Conv3D(
            filters=embed_dim,
            kernel_size=patch_size,
            strides=patch_size,
            padding="VALID",
        )
        self.flatten = layers.Reshape(target_shape=(-1, embed_dim))

    def call(self, videos):
        projected_patches = self.projection(videos)
        flattened_patches = self.flatten(projected_patches)
        return flattened_patches


In [17]:
# POSITIONAL 

class PositionalEncoder(layers.Layer):
    def __init__(self, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim

    def build(self, input_shape):
        _, num_tokens, _ = input_shape
        self.position_embedding = layers.Embedding(
            input_dim=num_tokens, output_dim=self.embed_dim
        )
        self.positions = tf.range(start=0, limit=num_tokens, delta=1)

    def call(self, encoded_tokens):
        # Encode the positions and add it to the encoded tokens
        encoded_positions = self.position_embedding(self.positions)
        encoded_tokens = encoded_tokens + encoded_positions
        return encoded_tokens

In [18]:
def create_vivit_classifier(
    tubelet_embedder,
    positional_encoder,
    input_shape=INPUT_SHAPE,
    transformer_layers=NUM_LAYERS,
    num_heads=NUM_HEADS,
    embed_dim=PROJECTION_DIM,
    layer_norm_eps=LAYER_NORM_EPS,
    num_classes=NUM_CLASSES,
):
    # Get the input layer
    inputs = layers.Input(shape=input_shape)
    # Create patches.
    patches = tubelet_embedder(inputs)
    # Encode patches.
    encoded_patches = positional_encoder(patches)

    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization and MHSA
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim // num_heads, dropout=0.1
        )(x1, x1)

        # Skip connection
        x2 = layers.Add()([attention_output, encoded_patches])

        # Layer Normalization and MLP
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        x3 = keras.Sequential(
            [
                layers.Dense(units=embed_dim * 4, activation=tf.nn.gelu),
                layers.Dense(units=embed_dim, activation=tf.nn.gelu),
            ]
        )(x3)

        # Skip connection
        encoded_patches = layers.Add()([x3, x2])

    # Layer normalization and Global average pooling.
    representation = layers.LayerNormalization(epsilon=layer_norm_eps)(encoded_patches)
    representation = layers.GlobalAvgPool1D()(representation)

    # Classify outputs.
    outputs = layers.Dense(units=num_classes, activation="softmax")(representation)

    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

In [19]:
def run_experiment():
    # Initialize model
    model = create_vivit_classifier(
        tubelet_embedder=TubeletEmbedding(
            embed_dim=PROJECTION_DIM, patch_size=PATCH_SIZE
        ),
        positional_encoder=PositionalEncoder(embed_dim=PROJECTION_DIM),
    )

    # Compile the model with the optimizer, loss function
    # and the metrics.
    optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=['accuracy'
        ],
    )

    # Train the model.
    _ = model.fit(trainloader, epochs=EPOCHS, validation_data=validloader)


    return model


model = run_experiment()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [20]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 36, 28, 28,  0           []                               
                                 3)]                                                              
                                                                                                  
 tubelet_embedding (TubeletEmbe  (None, 36, 128)     196736      ['input_1[0][0]']                
 dding)                                                                                           
                                                                                                  
 positional_encoder (Positional  (None, 36, 128)     4608        ['tubelet_embedding[0][0]']      
 Encoder)                                                                                     

In [21]:
import numpy as np

# test verilerinizi hazırlayın
x_test = X_test

# tahminleri yapın
y_pred = model.predict(x_test)

# tahmin sonuçlarını kontrol edin
print(y_pred)  # (num_samples, num_classes)


[[2.69670831e-03 9.97303247e-01]
 [9.99168515e-01 8.31523445e-04]
 [1.05719780e-08 1.00000000e+00]
 [8.21080505e-07 9.99999166e-01]
 [9.99998212e-01 1.84591852e-06]
 [1.04945047e-05 9.99989510e-01]
 [6.91660702e-08 9.99999881e-01]
 [1.08192175e-08 1.00000000e+00]
 [3.93956618e-08 1.00000000e+00]
 [5.40579720e-07 9.99999404e-01]
 [9.99996543e-01 3.46069555e-06]
 [1.00000000e+00 7.24425364e-09]
 [6.22022152e-01 3.77977848e-01]
 [1.22987993e-08 1.00000000e+00]
 [1.25653719e-06 9.99998689e-01]
 [1.00000000e+00 5.29912256e-08]
 [9.99999881e-01 1.45588842e-07]
 [9.99991536e-01 8.49507251e-06]
 [1.00000000e+00 1.52039448e-08]
 [1.60093039e-01 8.39906991e-01]
 [9.99933720e-01 6.62864986e-05]
 [3.98516469e-03 9.96014833e-01]
 [1.00000000e+00 1.89737293e-08]
 [1.30095596e-05 9.99987006e-01]
 [8.44363630e-01 1.55636430e-01]
 [5.97558085e-07 9.99999404e-01]
 [1.00000000e+00 1.04922684e-08]
 [1.00000000e+00 2.04176267e-08]
 [9.99999166e-01 7.99895361e-07]
 [1.66090715e-08 1.00000000e+00]
 [5.733505

In [22]:
predictions = y_pred > 0.5

In [23]:
predictions

array([[False,  True],
       [ True, False],
       [False,  True],
       [False,  True],
       [ True, False],
       [False,  True],
       [False,  True],
       [False,  True],
       [False,  True],
       [False,  True],
       [ True, False],
       [ True, False],
       [ True, False],
       [False,  True],
       [False,  True],
       [ True, False],
       [ True, False],
       [ True, False],
       [ True, False],
       [False,  True],
       [ True, False],
       [False,  True],
       [ True, False],
       [False,  True],
       [ True, False],
       [False,  True],
       [ True, False],
       [ True, False],
       [ True, False],
       [False,  True],
       [False,  True],
       [False,  True],
       [False,  True],
       [ True, False],
       [ True, False],
       [ True, False],
       [ True, False],
       [ True, False],
       [ True, False],
       [ True, False],
       [ True, False],
       [ True, False],
       [False,  True],
       [Fal

In [24]:
from sklearn.metrics import classification_report

print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.78      0.84      0.81       261
           1       0.81      0.74      0.78       239

   micro avg       0.79      0.79      0.79       500
   macro avg       0.80      0.79      0.79       500
weighted avg       0.80      0.79      0.79       500
 samples avg       0.79      0.79      0.79       500



In [25]:
y_test

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.