# Sports Action Recognition Using CNN-RNN Architecture with Transfer Learning on `UCF101 10 Sports actions`

## Dependencies

In [None]:
import os
import re
import time
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, top_k_accuracy_score, classification_report, confusion_matrix

import imageio
import cv2
from IPython.display import Image

import keras
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.metrics import top_k_categorical_accuracy

import pprint

import kagglehub

import wandb
from wandb.integration.keras import WandbCallback

## Dataset Loading

Download latest version of the ucf101-action-recognition dataset from Kaggle

In [None]:
path = kagglehub.dataset_download("matthewjansen/ucf101-action-recognition")

In [None]:
print("Path to dataset files: \n", path)
print("\nFiles in dataset directory:\n", os.listdir(path))

**Class Definition**

In [None]:
sports_actions = [
    "SkyDiving",
    "Biking",
    "HorseRace",
    "Surfing",
    "TennisSwing",
    "Punch",
    "Basketball",
    "JumpRope",
    "Archery",
    "Skiing",
]

Utility to load a dataset split, keep only the selected sports classes, and build the full video paths

In [None]:
def load_dataset(dataset_type):
    """Load one dataset split and keep only the selected sports classes.

    Reads `<path>/<dataset_type>.csv` (where `path` is the module-level
    dataset directory) and drops every row whose `label` is not listed in
    the module-level `sports_actions`.

    Args:
        dataset_type: Name of the split; used as the CSV file stem.

    Returns:
        A DataFrame with the columns `label`, `video_name`, `rel_path` and
        `video_path`. `video_path` is `path` directly concatenated with the
        CSV's `clip_path` value (no separator is inserted).
    """
    csv_file = os.path.join(path, f"{dataset_type}.csv")
    all_clips = pd.read_csv(csv_file)

    # Boolean row selector: True where the clip belongs to a wanted class
    is_selected = all_clips["label"].isin(sports_actions)
    selected = all_clips[is_selected]

    columns = {
        "label": selected["label"],
        "video_name": selected["clip_name"],
        "rel_path": selected["clip_path"],
        "video_path": selected["clip_path"].apply(lambda x: f"{path}{x}"),
    }
    return pd.DataFrame(columns)

Load the train, validation, and test datasets

In [None]:
train_df = load_dataset("train")
val_df = load_dataset("val")
test_df = load_dataset("test")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for validation: {len(val_df)}")
print(f"Total videos for testing: {len(test_df)}")

In [None]:
print("Number of unique classes in training set: ", len(train_df["label"].unique()))
print("Number of unique classes in validation set: ", len(val_df["label"].unique()))
print("Number of unique classes in test set: ", len(test_df["label"].unique()))

print("\nLabels: \n", train_df["label"].unique())

In [None]:
train_df.sample(10)

## Configs & Hyperparameters

In [None]:
MODEL_NAME = "cnn_rnn_ucf101_10c_tl"
MODEL_ROOT_PATH = "../models"

IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 100
LEARNING_RATE = 1e-4

MAX_SEQ_LENGTH = 20 # TODO: Use 150 for final test
NUM_FEATURES = 2048

HYPERPARAMETER_TUNING_ENABLED = True
TUNING_EPISODES = 40

## Video Data Preprocessing

#### Review video category distribution

In [None]:
def review_class_distribution(dataset, dataset_name):
    """Count how many rows of `dataset` belong to each class label.

    Args:
        dataset: DataFrame with a `label` column.
        dataset_name: Accepted for call-site readability; not used here.

    Returns:
        The result of `value_counts()` on the `label` column.
    """
    return dataset["label"].value_counts()

# Get class distributions for each dataset
train_class_counts = review_class_distribution(train_df, "Train")
val_class_counts = review_class_distribution(val_df, "Validation")
test_class_counts = review_class_distribution(test_df, "Test")

# Create DataFrame for distribution and calculate average
distribution_df = pd.DataFrame({
    "Train": train_class_counts,
    "Validation": val_class_counts,
    "Test": test_class_counts
}).fillna(0)

distribution_df["Average"] = distribution_df.mean(axis=1).round().astype(int)
print("Combined average number of videos per class:")
print(distribution_df)

# Plot the distribution
plot_distribution_df = distribution_df.drop(columns="Average")
plot_distribution_df.plot(kind="bar", figsize=(10, 5))
plt.title("Class Distribution Comparison Across Train, Validation, and Test Sets")
plt.xlabel("Class Labels")
plt.ylabel("Number of Videos")
plt.legend(title="Dataset")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


#### Review Video frame distribution 

Count the number of frames for each video

In [None]:
def count_frames_per_video(video_paths):
    """Count the readable frames of each video by decoding it to the end.

    Args:
        video_paths: Iterable of video file paths.

    Returns:
        A list with one frame count per input path, in input order. A video
        whose capture does not open contributes a count of 0.
    """

    def _count_frames(single_path):
        # Read frame after frame until the capture reports no more data.
        capture = cv2.VideoCapture(single_path)
        total = 0
        while capture.isOpened():
            grabbed, _ = capture.read()
            if not grabbed:
                break
            total += 1
        capture.release()
        return total

    return [_count_frames(single_path) for single_path in video_paths]

In [None]:
def visualize_frame_distribution(frame_counts):
    """Show a violin plot of the per-video frame counts.

    Args:
        frame_counts: Sequence of frame counts, one per video; passed to
            seaborn as the x-axis data.
    """
    plt.figure(figsize=(8, 5))
    sns.violinplot(x=frame_counts)
    plt.title("Violin Plot of Frame Counts per Video")
    plt.xlabel("Number of Frames")
    plt.show()

In [None]:
frame_counts = count_frames_per_video(train_df["video_path"].values)

Standard deviation of the frame counts

In [None]:
np.std(frame_counts)

In [None]:
visualize_frame_distribution(frame_counts)

### Video preprocessing utilities

Function to crop the video frames to a centered square without distorting their content

In [None]:
def crop_center_square(frame):
    """Cut the largest centered square out of an image array.

    Args:
        frame: Array whose first two axes are height and width.

    Returns:
        A slice of `frame` whose height and width both equal the smaller of
        the two input dimensions; any further axes are left untouched.
    """
    height, width = frame.shape[:2]
    side = min(height, width)

    # Offsets of the square's top-left corner relative to the full frame
    top = height // 2 - side // 2
    left = width // 2 - side // 2

    return frame[top : top + side, left : left + side]

Crop, resize, and reorder color channels

In [None]:
def load_and_preprocess_video(video_path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    """Decode a video into an array of square, resized, RGB frames.

    Each decoded frame is center-cropped to a square, resized to `resize`,
    and has its channel order flipped from OpenCV's BGR to RGB.

    Args:
        video_path: Path of the video file to read.
        max_frames: Stop after this many frames. With the default of 0 the
            limit is never hit, so the whole video is read.
        resize: Target size handed to `cv2.resize`.

    Returns:
        `np.array` built from the list of processed frames (empty when no
        frame could be read).
    """
    capture = cv2.VideoCapture(video_path)
    processed = []
    try:
        ok, raw = capture.read()
        while ok:
            square = crop_center_square(raw)
            scaled = cv2.resize(square, resize)
            processed.append(scaled[:, :, [2, 1, 0]])  # BGR -> RGB

            if len(processed) == max_frames:
                break
            ok, raw = capture.read()
    finally:
        # Always free the capture handle, even if preprocessing raised
        capture.release()

    return np.array(processed)

### Feature Extraction

Use a pre-trained network (the InceptionV3 model pretrained on the ImageNet-1k dataset) to extract meaningful features from the video frames.

In [None]:
def build_feature_extractor():
    """Build a frame-level feature extractor around ImageNet InceptionV3.

    The returned model takes `(IMG_SIZE, IMG_SIZE, 3)` images, applies the
    InceptionV3 `preprocess_input` function, and runs the headless network
    (`include_top=False`) with average pooling.

    Returns:
        A `keras.Model` named "feature_extractor".
    """
    frame_shape = (IMG_SIZE, IMG_SIZE, 3)

    backbone = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=frame_shape,
    )

    frame_input = keras.Input(frame_shape)
    scaled = keras.applications.inception_v3.preprocess_input(frame_input)

    return keras.Model(frame_input, backbone(scaled), name="feature_extractor")

In [None]:
feature_extractor = build_feature_extractor()

Encode the class labels as integers using the Keras StringLookup layer

In [None]:
np.unique(train_df["label"])

In [None]:
label_processor = keras.layers.StringLookup(num_oov_indices=0, vocabulary=sports_actions)

class_vocab = label_processor.get_vocabulary()

print(f"Vocabulary: {class_vocab}")
print(f"Number of classes: {len(class_vocab)}")

label_processor

Extract frame_features, frame_masks and labels
- `frame_features` will contain extracted features per frame
- `frame_masks` will contain booleans denoting if a timestep/frame is padded or not

Helper function to create masks and features for a single video

In [None]:
def create_video_mask_and_features(frames):
    """Encode one video's frames into padded features plus a validity mask.

    At most the first `MAX_SEQ_LENGTH` frames are passed through the
    module-level `feature_extractor`; remaining timesteps stay zero-padded.

    Args:
        frames: Array of preprocessed frames for a single video, with the
            frame index on the first axis.

    Returns:
        A tuple `(features, mask)`:
        - `features`: float32 array of shape
          `(1, MAX_SEQ_LENGTH, NUM_FEATURES)`.
        - `mask`: bool array of shape `(1, MAX_SEQ_LENGTH)`, True for
          timesteps that hold a real frame and False for padding.
    """
    video_length = min(MAX_SEQ_LENGTH, frames.shape[0])

    mask = np.zeros((1, MAX_SEQ_LENGTH), dtype="bool")
    features = np.zeros((1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    # TODO: Check effect of normalisation and standardisation

    # Guard the empty case: a video that yielded no frames keeps all-zero
    # features and an all-False mask.
    if video_length > 0:
        # One batched call over all used frames instead of one `predict`
        # call per frame, which pays the per-call overhead every time.
        features[0, :video_length] = feature_extractor.predict(
            frames[:video_length], verbose=0
        )
        mask[0, :video_length] = True  # Set mask for valid frames

    return features, mask

Main function for video feature extraction

In [None]:
def extract_features_and_masks(df, dataset_type):
    """Turn every video of a dataset split into padded features and masks.

    Args:
        df: DataFrame with `video_path` and `label` columns.
        dataset_type: Human-readable split name; only used in the timing
            message printed at the end.

    Returns:
        `((frame_features, frame_masks), labels)` where
        - `frame_features` is float32 of shape
          `(len(df), MAX_SEQ_LENGTH, NUM_FEATURES)`,
        - `frame_masks` is bool of shape `(len(df), MAX_SEQ_LENGTH)`,
        - `labels` is the NumPy output of `label_processor` applied to the
          label column reshaped into a column vector.
    """
    start_time = time.time()
    num_samples = len(df)
    video_paths = df["video_path"].values.tolist()
    labels = keras.ops.convert_to_numpy(label_processor(df["label"].values[..., None]))

    frame_masks = np.zeros((num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros((num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for idx, video_path in enumerate(video_paths):
        # Only the first MAX_SEQ_LENGTH frames are ever encoded, so stop
        # decoding there instead of loading the whole video into memory.
        frames = load_and_preprocess_video(video_path, max_frames=MAX_SEQ_LENGTH)
        features, mask = create_video_mask_and_features(frames)
        frame_features[idx] = features
        frame_masks[idx] = mask

    # Report the elapsed time in seconds, or in minutes once it reaches 60 s
    elapsed_time = time.time() - start_time
    time_unit = "seconds" if elapsed_time < 60 else "minutes"
    time_value = elapsed_time if elapsed_time < 60 else elapsed_time / 60
    print(f"Processed {num_samples} {dataset_type} videos in {time_value:.2f} {time_unit}")

    return (frame_features, frame_masks), labels

Extract the frame features, feature masks and labels for the `train` set

In [None]:
train_data, train_labels = extract_features_and_masks(train_df, "Train")

Extract the frame features, feature masks and labels for the `validation` set

In [None]:
val_data, val_labels = extract_features_and_masks(val_df, "Validation")

Extract the frame features, feature masks and labels for the `test` set

In [None]:
test_data, test_labels = extract_features_and_masks(test_df, "Test")

In [None]:
total_samples, frame_count, num_of_features = train_data[0].shape

print(f"""Frame features in train set: {train_data[0].shape} 
    → {total_samples} samples
    → {frame_count} frames per video
    → {num_of_features} features per frame
""")

total_samples, mask_count = train_data[1].shape
print(f"""Frame masks in train set: {train_data[1].shape} 
    → {total_samples} samples
    → {mask_count} masks per video
""")

print(f"Frame features in validation set: {val_data[0].shape}")
print(f"Frame masks in validation set: {val_data[1].shape}")

print(f"Frame features in test set: {test_data[0].shape}")
print(f"Frame masks in test set: {test_data[1].shape}")

## Weights & Biases Sweep Configuration

In [None]:
if HYPERPARAMETER_TUNING_ENABLED:
    wandb.login()

Weights & Biases Sweep Configuration

In [None]:
sweep_config = {
    "method": "bayes",  # Efficient hyperparameter search using Bayesian Optimization
    "metric": {"name": "val_accuracy", "goal": "maximize"},
    "parameters": {
        # Search space for learning rate
        "learning_rate": {"values": [1e-4, 1e-3, 1e-2]},
        # Batch sizes to test
        "batch_size": {"values": [32, 64, 128]},
        # Number of units in GRU layers
        "gru_units": {"values": [8, 12, 16]},
        # Dropout rates for layers
        "dropout_rate": {"values": [0.3, 0.4, 0.5]},
        # Different optimizers to test
        "optimizer": {"values": ["adam", "sgd", "rmsprop"]},
    },
}

Initializing the Sweep

In [None]:
if HYPERPARAMETER_TUNING_ENABLED:
    sweep_id = wandb.sweep(sweep_config, project=MODEL_NAME)

## RNN GRU Model Architecture

Recurrent Neural Network architecture with GRU (Gated Recurrent Unit) layers

In [None]:
def build_rnn_sequence_model(config):
    """Build and compile the GRU sequence classifier.

    The network takes two inputs, per-frame features of shape
    `(MAX_SEQ_LENGTH, NUM_FEATURES)` and a boolean mask of shape
    `(MAX_SEQ_LENGTH,)`. It stacks a bidirectional GRU, a second GRU, and
    two Dense/BatchNorm/Dropout blocks, and ends in a softmax over
    `len(class_vocab)` classes.

    Args:
        config: Mapping-like object read via `.get`. Keys used: `gru_units`,
            `dropout_rate`, `optimizer`, `learning_rate`. Each lookup has
            its own fallback value when the key is absent.

    Returns:
        The compiled `keras.Model` (sparse categorical cross-entropy loss,
        accuracy metric).
    """

    def regularizers():
        # Build fresh regularizer objects for each layer that asks for them
        return {
            "kernel_regularizer": keras.regularizers.L1(0.01),
            "activity_regularizer": keras.regularizers.L2(0.01),
        }

    def dense_block(tensor, units, dropout_default):
        # Dense(relu) -> BatchNormalization -> Dropout
        tensor = keras.layers.Dense(units, activation="relu", **regularizers())(tensor)
        tensor = keras.layers.BatchNormalization()(tensor)
        return keras.layers.Dropout(config.get("dropout_rate", dropout_default))(tensor)

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Recurrent stack: bidirectional GRU over the sequence, then a GRU that
    # collapses it to a single vector
    first_gru = keras.layers.GRU(
        config.get("gru_units", 12),
        return_sequences=True,
        recurrent_dropout=config.get("dropout_rate", 0.3),
        **regularizers(),
    )
    hidden = keras.layers.Bidirectional(first_gru)(frame_features_input, mask=mask_input)

    second_gru = keras.layers.GRU(
        config.get("gru_units", 8),
        recurrent_dropout=config.get("dropout_rate", 0.4),
        **regularizers(),
    )
    hidden = second_gru(hidden)
    hidden = keras.layers.Dropout(config.get("dropout_rate", 0.5))(hidden)

    # Classification head
    hidden = dense_block(hidden, 32, 0.4)
    hidden = dense_block(hidden, 64, 0.4)
    output = keras.layers.Dense(len(class_vocab), activation="softmax")(hidden)

    rnn_model = keras.Model([frame_features_input, mask_input], output)

    # Configure optimizer: known names get an explicit learning rate, any
    # other value is handed to `compile` unchanged
    optimizer_choice = config.get("optimizer", "adam")
    learning_rate = config.get("learning_rate", 1e-3)
    optimizer_classes = {
        "adam": keras.optimizers.Adam,
        "sgd": keras.optimizers.SGD,
        "rmsprop": keras.optimizers.RMSprop,
    }
    if optimizer_choice in optimizer_classes:
        optimizer = optimizer_classes[optimizer_choice](learning_rate=learning_rate)
    else:
        optimizer = optimizer_choice

    rnn_model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=optimizer,
        metrics=["accuracy"],
    )

    return rnn_model

In [None]:
def get_model_path():
    """Derive a timestamped version tag and the paths used to store a model.

    Returns:
        `(model_version, model_base_path, model_path)`:
        - `model_version`: "v" followed by the current time as
          `YYYYmmdd_HHMMSS`.
        - `model_base_path`: `MODEL_ROOT_PATH/<model_version>`.
        - `model_path`: `<model_base_path>/<MODEL_NAME>.keras`.

    Only builds and prints the paths; nothing is created on disk here.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_version = f"v{timestamp}"
    model_base_path = os.path.join(MODEL_ROOT_PATH, model_version)
    model_path = os.path.join(model_base_path, f"{MODEL_NAME}.keras")

    print(f"Model will be saved to: '{model_path}'")

    return model_version, model_base_path, model_path

In [None]:
model_version, model_base_path, model_path = get_model_path()

Build the model

In [None]:
def build_model():
    """Pick a hyperparameter config and build the sequence model from it.

    When `HYPERPARAMETER_TUNING_ENABLED` is true, a W&B run is started and
    `wandb.config` is used; otherwise a fixed dictionary of values is used.

    Returns:
        `(model, config)`: the model from `build_rnn_sequence_model` and the
        config object it was built with.
    """
    if not HYPERPARAMETER_TUNING_ENABLED:
        config = dict(
            learning_rate=1e-3,
            batch_size=32,
            gru_units=12,
            dropout_rate=0.4,
            optimizer="adam",
        )
    else:
        wandb.init()
        config = wandb.config

    return build_rnn_sequence_model(config), config

In [None]:
rnn_seq_model, config = build_model()

In [None]:
rnn_seq_model.summary()

## Model Training

Utility to run the experiment

In [None]:
def train_and_evaluate_model(config=None):
    """Train the module-level `rnn_seq_model`, then evaluate it on the test set.

    Training uses `train_data`/`train_labels` with `val_data`/`val_labels`
    for validation. The best checkpoint (lowest `val_loss`) is written to
    `model_path` and reloaded before the test evaluation.

    Args:
        config: Mapping-like object read via `.get`; only `batch_size` is
            used (fallback 64). `None` means "use the fallback".

    Returns:
        `(rnn_seq_model, history)`: the trained model and the Keras History
        object returned by `fit`.
    """
    # The signature allows None, so make `.get` below safe in that case
    if config is None:
        config = {}

    checkpoint = ModelCheckpoint(
        model_path,
        monitor="val_loss",
        mode="min",  # Save the model when the loss decreases (when model improves)
        save_weights_only=False,
        save_best_only=True,
        verbose=1,
    )

    # TODO: Tune this
    early_stopping = EarlyStopping(
        monitor="val_loss",
        patience=5,
        min_delta=0.001,
        restore_best_weights=True,
        verbose=1,
    )

    callbacks = [checkpoint, early_stopping]
    if HYPERPARAMETER_TUNING_ENABLED:
        callbacks.append(WandbCallback())

    # Train the model
    history = rnn_seq_model.fit(
        [train_data[0], train_data[1]],
        train_labels,
        validation_data=([val_data[0], val_data[1]], val_labels),
        batch_size=config.get("batch_size", 64),
        epochs=EPOCHS,
        callbacks=callbacks,
    )

    # Load the best weights after training
    rnn_seq_model.load_weights(model_path)

    # Evaluate the model on the test set
    print("\nEvaluating the model...")
    _, accuracy = rnn_seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    # Only touch W&B when a run was started, i.e. when tuning is enabled;
    # `config` is also non-None in the non-tuning path, so it cannot be used
    # to tell the two apart.
    if HYPERPARAMETER_TUNING_ENABLED:
        val_losses = history.history["val_loss"]
        best_epoch = val_losses.index(min(val_losses))
        # Log scalar summaries under their own names. Logging the whole
        # history lists under "val_accuracy" would reuse the sweep's metric
        # name for a non-scalar value.
        wandb.log({
            "test_accuracy": accuracy,
            "best_val_loss": val_losses[best_epoch],
            "best_val_accuracy": history.history["val_accuracy"][best_epoch],
        })

        # Finish W&B run
        wandb.finish()

    return rnn_seq_model, history

In [None]:
train_and_evaluate_model(config)

In [None]:
# def start_training():
#     if HYPERPARAMETER_TUNING_ENABLED:
#         return wandb.agent(sweep_id, train_and_evaluate_model, count=5)
#     else:
#         return train_and_evaluate_model(config)

Run the experiment

In [None]:
# sequence_model, history = train_and_evaluate_model(rnn_seq_model)
model, history = start_training()

In [None]:
# `model_path` is the `.keras` checkpoint file itself, so the diagram is
# written next to it in the version directory instead of "inside" it.
os.makedirs(model_base_path, exist_ok=True)
keras.utils.plot_model(
    model,
    to_file=os.path.join(model_base_path, "model.png"),
    show_shapes=True,
    show_layer_names=True,
    show_layer_activations=True,
)

In [None]:
if os.path.exists(model_path):
    print(f"Model saved at: {model_path}")
    model_size = os.path.getsize(model_path) / (1024 * 1024)
    print(f"Model size: {model_size:.2f} MB")
else:
    print("Model file not found.")

Visualise the training and validation loss and accuracy

In [None]:
def visualize_training_metrics(history, val_metrics_path):
    """Plot training vs. validation loss and accuracy side by side.

    The figure is saved to `val_metrics_path` and then shown.

    Args:
        history: Object whose `.history` dict holds the per-epoch series
            `loss`, `val_loss`, `accuracy` and `val_accuracy`.
        val_metrics_path: Destination passed to `plt.savefig`.
    """
    recorded = history.history
    panels = [
        ("Loss", "loss", "val_loss"),
        ("Accuracy", "accuracy", "val_accuracy"),
    ]
    # Resolve every series before any figure is created
    curves = [
        (name, recorded[train_key], recorded[val_key])
        for name, train_key, val_key in panels
    ]

    plt.figure(figsize=(12, 6))

    # One subplot per metric: loss on the left, accuracy on the right
    for position, (name, train_curve, val_curve) in enumerate(curves, start=1):
        plt.subplot(1, 2, position)
        plt.plot(train_curve, label=f"Training {name}")
        plt.plot(val_curve, label=f"Validation {name}")
        plt.title(f"Training and Validation {name}")
        plt.xlabel("Epochs")
        plt.ylabel(name)
        plt.legend()

    plt.savefig(val_metrics_path)

    plt.show()

In [None]:
def get_model_image_path(metric_type):
    """Return the file path under `model_base_path` for a given plot type.

    Args:
        metric_type: Either "Loss" or "Confusion Matrix".

    Returns:
        `model_base_path` joined with the image file name for that plot.

    Raises:
        ValueError: If `metric_type` is not one of the supported names.
    """
    image_names = {
        "Loss": "loss.png",
        "Confusion Matrix": "confusion_matrix.png",
    }
    # Fail with a clear message instead of passing None to os.path.join
    if metric_type not in image_names:
        raise ValueError(
            f"Unsupported metric_type {metric_type!r}; expected one of {sorted(image_names)}"
        )

    image_path = os.path.join(model_base_path, image_names[metric_type])
    # Printed before the caller actually writes the file
    print(f'{metric_type} plot saved at: {image_path}')

    return image_path

In [None]:
image_path = get_model_image_path("Loss")
visualize_training_metrics(history, image_path)

## Model Evaluation and Testing

In [None]:
test_features, test_masks = test_data

#### Evaluate on the entire dataset

In [None]:
loss, accuracy = sequence_model.evaluate([test_features, test_masks], test_labels, batch_size=32)

print(f"Test Loss: {loss :.4f}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

#### Evaluation with single sample

Function to prepare the test video

In [None]:
def prepare_single_video(frames):
    """Encode a single video's frames into padded features and a mask.

    Each of the first `MAX_SEQ_LENGTH` frames is passed on its own through
    the module-level `feature_extractor`.

    Args:
        frames: Array of frames for one video, frame index on the first axis.

    Returns:
        `(frame_features, frame_mask)` with shapes
        `(1, MAX_SEQ_LENGTH, NUM_FEATURES)` (float32) and
        `(1, MAX_SEQ_LENGTH)` (bool).
    """
    frame_mask = np.zeros((1, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros((1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    usable = min(MAX_SEQ_LENGTH, frames.shape[0])
    for step in range(usable):
        single_frame = frames[None, step, :]  # keep a leading batch axis of 1
        frame_features[0, step, :] = feature_extractor.predict(single_frame)
    frame_mask[0, :usable] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask

Make predictions on a single video

In [None]:
def sequence_prediction(seq_model, frames, true_label):
    """Classify one video and print its five most probable classes.

    Args:
        seq_model: Model whose `predict` accepts `[features, mask]`.
        frames: Preprocessed frames of the video.
        true_label: Ground-truth class name; must be in `class_vocab`.

    Returns:
        `(y_true, y_pred, predicted_label)` where `y_true` and `y_pred` are
        one-hot vectors of length `len(class_vocab)` marking the true and
        the predicted class, and `predicted_label` is the predicted class
        name.
    """
    frame_features, frame_mask = create_video_mask_and_features(frames)
    probabilities = seq_model.predict([frame_features, frame_mask])[0]

    # Class indices ordered from most to least probable
    ranked = np.argsort(probabilities)[::-1]
    print("\nTop-5 predicted actions:")
    for class_idx in ranked[:5]:
        print(f"  - {class_vocab[class_idx]}: {probabilities[class_idx] * 100:5.2f}%")

    # Top-1 prediction
    predicted_index = np.argmax(probabilities)
    predicted_label = class_vocab[predicted_index]

    def one_hot(position):
        # Vector of zeros with a single 1 at `position`
        vector = np.zeros(len(class_vocab))
        vector[position] = 1
        return vector

    y_true = one_hot(class_vocab.index(true_label))
    y_pred = one_hot(predicted_index)

    return y_true, y_pred, predicted_label

Display predicted image as GIF

In [None]:
def display_as_gif(images):
    """Write the frames to an animated GIF and return it for display.

    Args:
        images: Array of frames; cast to `uint8` before being written.

    Returns:
        An IPython `Image` pointing at the written GIF file.
    """
    gif_path = "../data/animation.gif"
    imageio.mimsave(gif_path, images.astype(np.uint8), duration=100)
    return Image(gif_path)

Choosing a random video to use for prediction

In [None]:
random_index = np.random.randint(len(test_df))

# Get the test video path
test_video = test_df["video_path"].values[random_index]

# Get the true label of the test video
true_label_index = test_labels.tolist()[random_index][0]
true_label = class_vocab[true_label_index]

test_video_frames = load_and_preprocess_video(test_video)

print(f"Test video path: {test_video}")
print(f"Label: {true_label}")

print(f"""
Test video frames shape:
  - {test_video_frames.shape[0]} frames
  - {test_video_frames.shape[1]} pixels (height) x {test_video_frames.shape[2]} pixels (width)
  - {test_video_frames.shape[3]} color channels
""")

display_as_gif(test_video_frames[:MAX_SEQ_LENGTH])

Run the prediction on the test video

In [None]:
y_true, y_pred, predicted_label = sequence_prediction(sequence_model, test_video_frames, true_label)

In [None]:
print(f"\nTrue label: {true_label}")
print(f"Predicted label: {predicted_label}")

print(f"y_true: {y_true}")
print(f"y_pred: {y_pred}")

#### Evaluation on the entire test set

Utility to display evaluation Metrics

In [None]:
def display_evaluation_metrics(y_true, y_pred, predictions, target_classes, k=2):
    """Print top-1 accuracy, top-k accuracy and a classification report.

    Args:
        y_true: True class indices; flattened to 1-D before use.
        y_pred: Predicted class indices.
        predictions: Per-class scores, one column per class.
        target_classes: Class names used as row labels in the report.
            Assumes class index `i` corresponds to `target_classes[i]` and
            to column `i` of `predictions`.
        k: Number of top-scoring classes counted as a hit for the top-k
            accuracy (default 2).
    """
    y_true = np.ravel(y_true)
    # Pass the full label set explicitly so the metrics still work when a
    # class never occurs in `y_true`.
    class_ids = np.arange(len(target_classes))

    accuracy = accuracy_score(y_true, y_pred)
    print(f"Top-1 Accuracy: {accuracy * 100:.2f}%")

    top_k_acc = top_k_accuracy_score(y_true, predictions, k=k, labels=class_ids)
    print(f"Top-{k} Accuracy: {top_k_acc * 100:.2f}%")

    report = classification_report(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=target_classes,
        zero_division=0,
    )
    print("\nClassification Report:\n", report)

In [None]:
predictions = sequence_model.predict([test_features, test_masks], batch_size=32)
predicted_classes = np.argmax(predictions, axis=1)

In [None]:
display_evaluation_metrics(test_labels, predicted_classes, predictions, class_vocab)

In [None]:
def display_confusion_matrix(y_true, y_pred, target_classes, plot_path, show_plot=False):
    """Draw the confusion matrix as a heatmap and save it to `plot_path`.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        target_classes: Names used as tick labels on both axes.
        plot_path: Destination passed to `plt.savefig`.
        show_plot: If true, show the figure; otherwise print the raw matrix.
    """
    conf_matrix = confusion_matrix(y_true, y_pred)

    heatmap_options = dict(
        fmt="d",
        annot=True,
        cmap="Blues",
        cbar=True,
        xticklabels=target_classes,
        yticklabels=target_classes,
    )

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, **heatmap_options)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")

    plt.savefig(plot_path)

    if not show_plot:
        print(conf_matrix)
    else:
        plt.show()

    # Release the figure whether or not it was shown
    plt.close()

In [None]:
true_labels = np.array(test_labels).flatten()
cm_plot_path = get_model_image_path("Confusion Matrix")
display_confusion_matrix(true_labels, predicted_classes, class_vocab, cm_plot_path, show_plot=False)

## References

> K. Soomro, A. R. Zamir, and M. Shah, "UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild," arXiv, 2012. [Online]. Available: https://arxiv.org/abs/1212.0402

> A. Arnab, M. Dehghani, G. Heigold, C. Sun, M. Lučić, and C. Schmid, “ViViT: A Video Vision Transformer,” arXiv.org, Mar. 29, 2021. https://arxiv.org/abs/2103.15691

> X. Liu, Y. Shen, J. Liu, J. Yang, P. Xiong, and F. Lin, “Parallel Spatial–Temporal Self-Attention CNN-Based Motor Imagery Classification for BCI,” Frontiers in Neuroscience, vol. 14, Dec. 2020, doi: 10.3389/fnins.2020.587520.

> C. Szegedy, V. Vanhoucke, S. Ioffe, J. Shlens, and Z. Wojna, "Rethinking the Inception Architecture for Computer Vision," arXiv preprint arXiv:1512.00567, 2015.

> Singh, S., Dewangan, S., Krishna, G., Tyagi, V., & Reddy, S. (2022). Video vision transformers for violence detection. arXiv. https://doi.org/10.48550/arXiv.2209.03561