In [None]:
import os
import time
import copy
from enum import Enum

import cv2
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve
from torchinfo import summary

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import MultiStepLR

from local_landmark import LocalLandmark

In [None]:
# Seed NumPy and PyTorch (CPU and every CUDA device) with the same value.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

# Number of frames per sequence = sampling rate * window duration.
# NOTE: `sequence_length` is reassigned from X_train.shape[1] in the model-definition cell.
SAMPLING_FPS = 10
SAMPLING_SECONDS = 1
sequence_length = int(SAMPLING_SECONDS*SAMPLING_FPS)

# Root folder of the recorded sequences; one sub-folder per lower-cased Action name is read by the loading cell.
DATA_FOLDER = "sample_data"

# NOTE(review): these are only referenced by a commented-out feature in get_selected_frame_data;
# presumably the resolution of the recording camera - confirm.
IMAGE_WIDTH = 640
IMAGE_HEIGHT = 480

# Pairs of hand-landmark indices that draw_landmarks joins with a line segment.
HAND_CONNECTIONS = ((0, 1), (0, 5), (9, 13), (13, 17), (5, 9), (0, 17), (1, 2), (2, 3), (3, 4), (5, 6), (6, 7), (7, 8),
                    (9, 10), (10, 11), (11, 12), (13, 14), (14, 15), (15, 16), (17, 18), (18, 19), (19, 20))

class Action(Enum):
    """Action classes recognised by the model.

    The member value is used as the integer class label, and the lower-cased
    member name is used as the data sub-folder name and as the display label.
    """
    IDLE = 0
    PICK = 1
    PLACE = 2
    SCREW_WRENCH = 3

# Human-readable names of the object classes.
# NOTE(review): presumably keyed by the `label_id` stored in the per-frame objects arrays - confirm against the recorder.
OBJECT_NAMES = {
    0: "small_screw",
    1: "big_screw",
    2: "small_wrench",
    3: "big_wrench",
    4: "cap",
    5: "barrel",
    6: "piston",
    7: "support",
    8: "air_connector",
    9: "nut"
}

# Sizes of the one-hot encodings used for the action labels and for the held-object feature.
num_classes_actions = len(Action)
num_classes_objects = len(OBJECT_NAMES) + 1 # Last class for empty hand

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU; the model and all tensors are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
def get_landmarks_from_flattened_array(flattened_landmarks):
    """Decode a flat landmark vector into right-hand and left-hand landmark lists.

    The vector is consumed as consecutive groups of 4 values, each of which is
    passed to ``LocalLandmark.from_np_array``. The first 21 groups form the
    right hand and the following 21 groups form the left hand.

    Args:
        flattened_landmarks: Sliceable 1-D sequence of landmark values
            (assumed to hold at least 2*21*4 = 168 entries - TODO confirm).

    Returns:
        Tuple ``(right_hand_landmarks, left_hand_landmarks)``, two lists of 21
        landmarks each.
    """
    values_per_landmark = 4
    landmarks_per_hand = 21

    def decode_hand(first_landmark_index):
        # Decode `landmarks_per_hand` consecutive landmarks starting at the given landmark index
        hand = []
        for landmark_index in range(first_landmark_index, first_landmark_index + landmarks_per_hand):
            start = landmark_index*values_per_landmark
            hand.append(LocalLandmark.from_np_array(flattened_landmarks[start:start + values_per_landmark]))
        return hand

    right_hand_landmarks = decode_hand(0)
    left_hand_landmarks = decode_hand(landmarks_per_hand)

    return right_hand_landmarks, left_hand_landmarks

def one_hot_encode_class(class_index, num_classes):
    """Return the one-hot encoding of a class index.

    Args:
        class_index: Class to encode. Integral-valued floats (for example a
            label read from a floating-point array) are accepted and cast to int.
        num_classes: Length of the returned vector.

    Returns:
        1-D integer NumPy array of length ``num_classes`` with a single 1 at
        position ``class_index``.

    Raises:
        ValueError: If ``class_index`` is not an integral value.
        IndexError: If ``class_index`` is outside ``[0, num_classes)``.
    """
    index = int(class_index)
    if index != class_index:
        raise ValueError(f"class_index must be an integral value, got {class_index!r}")
    # Reject negative indices explicitly: NumPy would otherwise wrap them around and
    # silently encode a different class
    if not 0 <= index < num_classes:
        raise IndexError(f"class_index {index} is out of range for {num_classes} classes")

    one_hot_vector = np.zeros(num_classes, dtype=int)
    one_hot_vector[index] = 1
    return one_hot_vector

def get_selected_frame_data(frame_landmarks_data, frame_objects_data):
    """Build the feature vector of a single frame.

    Only the right hand is used. For every right-hand landmark five values are
    emitted, in this order: ``x``, ``y`` (absolute coordinates), ``x - wrist.x``,
    ``y - wrist.y`` (coordinates relative to landmark 0, the wrist) and ``z``.

    When ``frame_objects_data`` is given, a one-hot vector of length
    ``num_classes_objects`` is appended. It encodes the label of the first object
    whose ``in_hand`` field is 0 (held in the right hand), or the last class
    (empty hand) when there is no such object.

    Args:
        frame_landmarks_data: Flat landmark array of the frame, decoded with
            ``get_landmarks_from_flattened_array``.
        frame_objects_data: Iterable of rows
            ``(label_id, in_hand, x1, y1, x2, y2)``, or ``None`` to skip the
            object feature.

    Returns:
        A list of landmark features when ``frame_objects_data`` is ``None``;
        otherwise a 1-D NumPy array holding the landmark features followed by
        the object one-hot vector.
    """
    # The left hand is decoded but deliberately ignored
    right_hand_landmarks, _ = get_landmarks_from_flattened_array(frame_landmarks_data)
    wrist_landmark = right_hand_landmarks[0]

    selected_frame_data = []
    for lm in right_hand_landmarks:
        selected_frame_data.extend((
            # Absolute coordinates (with respect to camera frame)
            lm.x,
            lm.y,
            # Relative coordinates (relative to the wrist landmark)
            lm.x - wrist_landmark.x,
            lm.y - wrist_landmark.y,
            lm.z,
        ))

    if frame_objects_data is None:
        return selected_frame_data

    held_object_class = num_classes_objects - 1 # Empty hand unless an object is found below
    for frame_object_data in frame_objects_data:
        label_id, in_hand, _x1, _y1, _x2, _y2 = frame_object_data
        if in_hand == 0: # Only objects held in the right hand are considered
            # Rows may come from a floating-point array, and a float cannot be used as an index
            held_object_class = int(label_id)
            break

    return np.concatenate((selected_frame_data, one_hot_encode_class(held_object_class, num_classes_objects)), axis=0)

In [None]:
def to_categorical(y, num_classes=None):
    """One-hot encode a sequence of integer class labels.

    Args:
        y: Sequence of class indices (converted to an integer array).
        num_classes: Number of columns of the result. When falsy (``None`` or 0)
            it is inferred as ``max(y) + 1``.

    Returns:
        Integer array of shape ``(len(y), num_classes)`` with a single 1 per row.
    """
    class_indices = np.array(y, dtype=int)
    n_columns = num_classes if num_classes else np.max(class_indices) + 1
    n_rows = len(class_indices)

    one_hot = np.zeros((n_rows, n_columns), dtype=int)
    one_hot[np.arange(n_rows), class_indices] = 1
    return one_hot

def train_val_test_split(X_data, y_data, val_size=0.1, test_size=0.1, shuffle=True, random_state=None):
    """Split samples and labels into train, validation and test subsets.

    Args:
        X_data: Samples, indexed along the first dimension.
        y_data: Labels; must have the same length as ``X_data``.
        val_size: Fraction of the samples assigned to the validation set.
        test_size: Fraction of the samples assigned to the test set.
        shuffle: Whether to permute the samples before splitting.
        random_state: Seed of the permutation. When given, a private generator is
            used so the global NumPy random state is left untouched; when ``None``
            the global NumPy generator is used.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``. The test set
        receives whatever remains after the train and validation slices.

    Raises:
        AssertionError: If the lengths differ or ``val_size + test_size >= 1.0``.
    """
    assert len(X_data) == len(y_data), "The first dimension of X_data and y_data must coincide"
    assert val_size + test_size < 1.0, "The sum of val_size and test_size must be less than 1.0"

    n_samples = len(X_data)

    if shuffle:
        # A RandomState seeded with `random_state` yields the same permutation as seeding the
        # global generator would, without resetting the randomness of unrelated code
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        indices = np.arange(n_samples)
        rng.shuffle(indices)
        # np.asarray makes fancy indexing work for plain Python sequences as well
        X_data = np.asarray(X_data)[indices]
        y_data = np.asarray(y_data)[indices]

    train_size = 1.0 - val_size - test_size
    train_end = int(train_size*n_samples)
    val_end = train_end + int(val_size*n_samples)

    X_train = X_data[:train_end]
    y_train = y_data[:train_end]
    X_val = X_data[train_end:val_end]
    y_val = y_data[train_end:val_end]
    X_test = X_data[val_end:]
    y_test = y_data[val_end:]

    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
# Load every recorded sequence: DATA_FOLDER/<action>/<sequence>/{landmarks,objects}/frame_<i>.npy
landmarks_data = []
objects_data = []
labels = []
for action in list(Action):
    action_name = action.name.lower()
    print("Loading sequences data for action:", action_name)
    action_folder = os.path.join(DATA_FOLDER, action_name)
    # os.listdir returns entries in arbitrary order: sort them so the sample order (and therefore
    # the seeded train/val/test split) is reproducible, and skip stray files such as .DS_Store
    sequence_data_folders = sorted(
        entry for entry in os.listdir(action_folder)
        if os.path.isdir(os.path.join(action_folder, entry))
    )
    for sequence_data_folder in tqdm(sequence_data_folders):
        sequence_data_landmarks_folder = os.path.join(action_folder, sequence_data_folder, "landmarks")
        sequence_data_objects_folder = os.path.join(action_folder, sequence_data_folder, "objects")
        sequence_landmarks_data = []
        sequence_objects_data = []
        for frame_index in range(sequence_length):
            frame_landmarks_data = np.load(os.path.join(sequence_data_landmarks_folder, f"frame_{frame_index}.npy"))
            sequence_landmarks_data.append(frame_landmarks_data)
            frame_objects_data = np.load(os.path.join(sequence_data_objects_folder, f"frame_{frame_index}.npy"))
            sequence_objects_data.append(frame_objects_data)

        landmarks_data.append(sequence_landmarks_data)
        objects_data.append(sequence_objects_data)
        # The enum value is the integer class label of every sequence in this folder
        labels.append(action.value)

In [None]:
# Select actions data NO objects
# Landmark-only features: one feature vector per frame, one list of frames per sequence.
# NOTE: the "WITH objects" cell assigns the same variable, so whichever cell runs last wins.
selected_actions_data = []
for sequence_landmarks_data in landmarks_data:
    selected_actions_data.append([
        get_selected_frame_data(frame_landmarks_data, None)
        for frame_landmarks_data in sequence_landmarks_data
    ])

In [None]:
# Select actions data WITH objects
# Landmark features followed by the one-hot encoding of the object held in the right hand.
# NOTE: this overwrites the result of the "NO objects" cell when both cells are run.
selected_actions_data = []
for sequence_index, sequence_landmarks_data in enumerate(landmarks_data):
    sequence_objects_data = objects_data[sequence_index]
    selected_actions_data.append([
        get_selected_frame_data(frame_landmarks_data, sequence_objects_data[frame_index])
        for frame_index, frame_landmarks_data in enumerate(sequence_landmarks_data)
    ])

In [None]:
# Stack the per-sequence feature lists into a single array and one-hot encode the action labels.
# NOTE(review): X is expected to be (n_sequences, sequence_length, n_features) - confirm from the printed shape.
X_data = np.array(selected_actions_data)
y_data = to_categorical(labels, num_classes=num_classes_actions)
print("X shape:", X_data.shape)
print("y shape:", y_data.shape)

In [None]:
# 80% train / 10% validation / 10% test, shuffled with the notebook-wide seed.
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X_data, y_data, val_size=0.1, test_size=0.1, random_state=RANDOM_SEED)
print("X shape (train, validation, test):", X_train.shape, X_val.shape, X_test.shape)
print("y shape (train, validation, test):", y_train.shape, y_val.shape, y_test.shape)

In [None]:
class LSTMClassifier(nn.Module):
    """Stacked LSTM followed by dropout and a linear layer that outputs class logits.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        dropout: Dropout probability, used both between LSTM layers and before the linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a batch-first input ``(batch, time, features)`` to logits ``(batch, num_classes)``."""
        batch_size = x.size(0)
        # Explicit zero initial hidden and cell states, created on the input's device
        h0 = torch.zeros([self.num_layers, batch_size, self.hidden_size], device=x.device)
        c0 = torch.zeros([self.num_layers, batch_size, self.hidden_size], device=x.device)

        sequence_features, _ = self.lstm(x, (h0, c0))
        # Only the top-layer output of the last time step is classified
        last_step_features = sequence_features[:, -1, :]
        return self.fc(self.dropout(last_step_features))

In [None]:
class ComplexLSTMClassifier(nn.Module):
    """Stacked LSTM followed by a three-layer fully connected head that outputs class logits.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        dropout: Dropout probability, used between LSTM layers and after the first dense layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc1 = nn.Linear(hidden_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a batch-first input ``(batch, time, features)`` to logits ``(batch, num_classes)``."""
        batch_size = x.size(0)
        # Explicit zero initial hidden and cell states, created on the input's device
        h0 = torch.zeros([self.num_layers, batch_size, self.hidden_size], device=x.device)
        c0 = torch.zeros([self.num_layers, batch_size, self.hidden_size], device=x.device)

        sequence_features, _ = self.lstm(x, (h0, c0))
        # Head: dense + ReLU, dropout, dense + ReLU, dense (logits), applied to the last time step
        hidden = F.relu(self.fc1(sequence_features[:, -1, :]))
        hidden = F.relu(self.fc2(self.dropout(hidden)))
        return self.fc3(hidden)

In [None]:
class BiLSTMClassifier(nn.Module):
    """Stacked bidirectional LSTM followed by dropout and a linear layer that outputs class logits.

    The classifier input is the concatenation of the final state of each
    direction: the forward direction's output at the last time step and the
    backward direction's output at the first time step.

    NOTE: weights trained with the previous readout (``out[:, -1, :]``) load
    without error but were optimised for a different feature and should be retrained.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state (per direction).
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        dropout: Dropout probability, used both between LSTM layers and before the linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.2):
        super(BiLSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size*2, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a batch-first input ``(batch, time, features)`` to logits ``(batch, num_classes)``."""
        # One initial state per layer and per direction
        h0 = torch.zeros(self.num_layers*2, x.shape[0], self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers*2, x.shape[0], self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        # The output concatenates [forward, backward] features at every time step. The backward
        # direction runs from the last frame to the first, so its state at the last time step has
        # seen a single frame; its summary of the whole sequence is found at time step 0 instead.
        forward_summary = out[:, -1, :self.hidden_size]
        backward_summary = out[:, 0, self.hidden_size:]
        out = self.dropout(torch.cat((forward_summary, backward_summary), dim=1))
        out = self.fc(out)
        return out

In [None]:
class Conv1DNetClassifier(nn.Module):
    """Temporal 1-D convolutional classifier that outputs class logits.

    Three convolutions over the time axis (kernel sizes 3, 2, 2, with a max
    pooling of size 2 after the first one), followed by an average over the
    remaining time steps, dropout and a linear layer.

    Args:
        input_size: Number of features per time step (input channels of the first convolution).
        hidden_size: Number of channels of every convolution.
        num_classes: Number of output logits.
        dropout: Dropout probability applied before the linear layer.
    """

    def __init__(self, input_size, hidden_size, num_classes, dropout=0.2):
        super().__init__()
        self.conv1d_1 = nn.Conv1d(in_channels=input_size, out_channels=hidden_size, kernel_size=3)
        self.conv1d_2 = nn.Conv1d(in_channels=hidden_size, out_channels=hidden_size, kernel_size=2)
        self.conv1d_3 = nn.Conv1d(in_channels=hidden_size, out_channels=hidden_size, kernel_size=2)

        self.maxpool1d = nn.MaxPool1d(kernel_size=2)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a batch-first input ``(batch, time, features)`` to logits ``(batch, num_classes)``."""
        # Conv1d expects (batch, channels, time)
        features = x.permute(0, 2, 1)
        features = self.maxpool1d(F.relu(self.conv1d_1(features)))
        features = F.relu(self.conv1d_2(features))
        features = F.relu(self.conv1d_3(features))
        # Global average pooling over the time axis
        pooled = features.mean(dim=2)
        return self.fc(self.dropout(pooled))

In [None]:
class LSTMObjectsClassifier(nn.Module):
    """LSTM over the landmark features, fused with the time-averaged object features.

    The input feature axis is split in two: the first ``landmarks_size``
    features feed the LSTM, the remaining ones (the one-hot object encoding)
    are averaged over time and concatenated with a 20-dimensional embedding of
    the last LSTM output before the final dense layers.

    Args:
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        dropout: Dropout probability, used between LSTM layers and on the last LSTM output.
        landmarks_size: Number of leading landmark features per time step
            (default 105 = 21 landmarks x 5 values).
        objects_size: Number of trailing object features per time step
            (default 11, the size of the object one-hot vector).
    """

    def __init__(self, hidden_size, num_layers, num_classes, dropout=0.2, landmarks_size=105, objects_size=11):
        super(LSTMObjectsClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.landmarks_size = landmarks_size

        landmarks_embedding_size = 20

        self.lstm = nn.LSTM(landmarks_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, landmarks_embedding_size)
        # Fusion layer: landmark embedding concatenated with the averaged object features
        self.fc_extra1 = nn.Linear(landmarks_embedding_size + objects_size, 32)
        self.fc_extra2 = nn.Linear(32, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a batch-first input ``(batch, time, landmarks_size + objects_size)`` to logits."""
        landmarks_data = x[:, :, :self.landmarks_size]
        objects_data = x[:, :, self.landmarks_size:]

        h0 = torch.zeros(self.num_layers, x.shape[0], self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.shape[0], self.hidden_size).to(x.device)
        out, _ = self.lstm(landmarks_data, (h0, c0))
        out = self.dropout(out[:, -1, :])
        out = F.relu(self.fc(out))
        # Averaging the one-hot vectors over time gives the fraction of frames each object was held
        out = F.relu(self.fc_extra1(torch.cat((out, torch.mean(objects_data, dim=1)), dim=1)))
        out = self.fc_extra2(out)
        return out

In [None]:
# Model definition
# The feature count and the sequence length are read from the training array, so they follow
# whichever feature-selection cell (with or without objects) produced X_data.
input_size = X_train.shape[2]
sequence_length = X_train.shape[1]
initial_learning_rate = 0.001
num_layers = 3
hidden_size = 128
num_classes = y_train.shape[1] # num_classes_actions

# Exactly one architecture is active; uncomment one of the alternatives to try it.
# LSTMObjectsClassifier takes no input_size: it splits the features at a fixed landmark count,
# so it needs the features built WITH objects.
model = LSTMClassifier(input_size, hidden_size, num_layers, num_classes).to(device)
# model = ComplexLSTMClassifier(input_size, hidden_size, num_layers, num_classes).to(device)
# model = BiLSTMClassifier(input_size, hidden_size, num_layers, num_classes).to(device)
# model = Conv1DNetClassifier(input_size, hidden_size, num_classes).to(device)
# model = LSTMObjectsClassifier(hidden_size, num_layers, num_classes).to(device)

# Loss function and optimizer
# The learning rate is multiplied by 0.1 after epochs 40 and 80.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=initial_learning_rate)
scheduler = MultiStepLR(optimizer, milestones=[40, 80], gamma=0.1)

# Layer-by-layer summary for a dummy batch of 16 sequences.
summary(model, input_size=(16, sequence_length, input_size))

In [None]:
# Convert every split to float32 tensors created directly on the target device.
# The targets stay one-hot (float), which the loss function receives as class probabilities.
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32, device=device)
    for features in (X_train, X_val, X_test)
)

y_train_tensor, y_val_tensor, y_test_tensor = (
    torch.tensor(targets, dtype=torch.float32, device=device)
    for targets in (y_train, y_val, y_test)
)

In [None]:
# Training loop
epochs = 100
batch_size = 16

# Early stopping parameters
early_stopping_enabled = True
patience = 15 # Epochs without improvement tolerated before stopping
min_delta_improvement = 0.001 # Minimum decrease of the validation loss that counts as an improvement
restore_best_weights = True

# float("inf") instead of np.Inf: the np.Inf alias was removed in NumPy 2.0
best_loss = float("inf")
counter = 0
best_epoch = 0

if early_stopping_enabled and restore_best_weights:
    best_model_weights = copy.deepcopy(model.state_dict())

train_losses = []
val_losses = []

train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=batch_size, shuffle=True)

for epoch in range(epochs):
    model.train()
    train_loss = 0.0

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Learning rate scheduler step
    scheduler.step()

    # Validation loss (whole validation set in a single batch), kept as a plain float so that
    # the early-stopping bookkeeping does not hold on to device tensors
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_loss = criterion(val_outputs, y_val_tensor).item()

    epoch_train_loss = train_loss/len(train_loader) # Mean of the per-batch losses
    train_losses.append(epoch_train_loss)
    val_losses.append(val_loss)

    # The learning rate shown is the one that will be used in the next epoch (the scheduler has already stepped)
    print(f"Epoch [{epoch + 1}/{epochs}], Train Loss: {epoch_train_loss:.4f}, Validation Loss: {val_loss:.4f}, lr: {optimizer.param_groups[0]['lr']}")

    # Early stopping logic
    if early_stopping_enabled:
        if val_loss < best_loss - min_delta_improvement:
            best_loss = val_loss
            counter = 0
            best_epoch = epoch + 1
            if restore_best_weights:
                best_model_weights = copy.deepcopy(model.state_dict())
        else:
            counter += 1

        if counter >= patience:
            restore_best_weights_message = f" Best weights restored to epoch {best_epoch}." if restore_best_weights else ""
            print(f"Early stopping at epoch {epoch + 1} with no improvement in validation loss.{restore_best_weights_message}")
            break

# Restore best weights (also when training ran for all the epochs without stopping early)
if early_stopping_enabled and restore_best_weights:
    model.load_state_dict(best_model_weights)

In [None]:
# Save model state and optimizer state
# The checkpoint is written to the working directory. When best-weight restoring is enabled the
# model weights are those of the best epoch, while the optimizer state is that of the last epoch run.
model_checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
torch.save(model_checkpoint, "model_ar.pth")

In [None]:
# Load model state and optimizer state
# map_location remaps the stored tensors to the current device, so a checkpoint saved on a GPU
# can also be loaded on a CPU-only machine. `model` and `optimizer` must already have been
# created with the same architecture and settings used when the checkpoint was saved.
model_checkpoint = torch.load("model_ar.pth", map_location=device)
model.load_state_dict(model_checkpoint["state_dict"])
optimizer.load_state_dict(model_checkpoint["optimizer"])

In [None]:
# Save model TorchScript
model_scripted = torch.jit.script(model)
scripted_model_path = os.path.join("weights", "model_ar.pt")
# The output folder is not created anywhere else in the notebook: make sure it exists before saving
os.makedirs(os.path.dirname(scripted_model_path), exist_ok=True)
model_scripted.save(scripted_model_path)

In [None]:
# Learning curves
# Per-epoch training and validation loss; the y axis is clipped to a fixed range.
_, loss_axes = plt.subplots(figsize=(12, 6))
for loss_history, curve_label in ((train_losses, "Training Loss"), (val_losses, "Validation Loss")):
    loss_axes.plot(loss_history, label=curve_label)
loss_axes.set_xlabel("Epochs")
loss_axes.set_ylabel("Loss (Cross-Entropy)")
loss_axes.set_ylim([-0.05, 1.4])
loss_axes.legend()
loss_axes.set_title("Learning Curves")
plt.show()

In [None]:
def plot_confusion_matrix(cm, class_names, normalize=False):
    """Plot a confusion matrix as a heatmap with true classes on the x axis.

    Args:
        cm: Square confusion matrix with true classes on the rows and predicted
            classes on the columns.
        class_names: Tick labels, one per class, in matrix order.
        normalize: When True, every row (true class) is divided by its total so
            the cells show fractions. Rows without samples are shown as zeros.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples has a zero total: leave its row at 0 instead of producing NaN
        cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
    plt.figure(figsize=(10, 8))
    # The matrix is transposed so that the x axis is the true class and the y axis the prediction
    sns.heatmap(np.transpose(cm), annot=True, fmt='.2f' if normalize else 'd', cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.xlabel("True")
    plt.ylabel("Predicted")
    plt.title("Confusion Matrix")
    plt.show()

def evaluate_performance_metrics(y_true, outputs, labels):
    """Print classification metrics and plot the normalized confusion matrix.

    Args:
        y_true: 1-D array of true class indices.
        outputs: Tensor of per-class scores, one row per sample and one column
            per class; the prediction is the arg-max of each row.
        labels: Class names; ``labels[i]`` is the name of class index ``i``.
    """
    preds = torch.argmax(outputs, dim=1).cpu().numpy()
    scores = outputs.cpu().numpy()
    class_indices = np.arange(len(labels))

    accuracy = accuracy_score(y_true, preds)
    precision = precision_score(y_true, preds, average="macro")
    recall = recall_score(y_true, preds, average="macro")
    f1 = f1_score(y_true, preds, average="macro")
    # Passing the full list of classes keeps the matrix aligned with `labels` even when a class
    # appears neither in y_true nor in the predictions (likely with small evaluation sets)
    conf_matrix = confusion_matrix(y_true, preds, labels=class_indices)
    try:
        roc_auc = roc_auc_score(y_true, scores, multi_class="ovr")
    except ValueError as error:
        # Raised, for example, when a class has no samples in y_true: report it and keep going
        # so that the remaining metrics and the confusion matrix are still shown
        print(f"ROC-AUC could not be computed: {error}")
        roc_auc = float("nan")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")
    # print(f"Confusion Matrix:\n{conf_matrix}")
    plot_confusion_matrix(conf_matrix, labels, normalize=True)

def ROC_curve(y_true, outputs, labels):
    """Plot the one-vs-rest ROC curve of every class on a single figure.

    Args:
        y_true: 1-D array of true class indices.
        outputs: Tensor of per-class scores, one row per sample and one column per class.
        labels: Class names used in the legend; ``labels[i]`` names class index ``i``.
    """
    n_classes = outputs.size(1)

    # One (false positive rates, true positive rates, area) triple per class, computed
    # by treating "is class i" as the positive label
    curves = []
    for class_index in range(n_classes):
        is_class = y_true == class_index
        class_scores = outputs[:, class_index].cpu().numpy()
        false_positive_rates, true_positive_rates, _ = roc_curve(is_class, class_scores)
        area = roc_auc_score(is_class, class_scores)
        curves.append((false_positive_rates, true_positive_rates, area))

    plt.figure()
    for class_index, (false_positive_rates, true_positive_rates, area) in enumerate(curves):
        plt.plot(false_positive_rates, true_positive_rates, lw=2, label=f"ROC curve of action '{labels[class_index]}' (area = {area:.2f})")

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], "k--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) Curve")
    plt.legend(loc="lower left", bbox_to_anchor=(0.4, 0.0))
    plt.gca().set_aspect("equal", adjustable="box")
    plt.show()

In [None]:
# Class probabilities of the validation and test sets (softmax over the logits), computed in
# evaluation mode and without tracking gradients
model.eval()
with torch.no_grad():
    val_logits = model(X_val_tensor)
    test_logits = model(X_test_tensor)
    val_outputs = F.softmax(val_logits, dim=1)
    test_outputs = F.softmax(test_logits, dim=1)

# True class indices recovered from the one-hot targets
val_true = y_val_tensor.argmax(dim=1).cpu().numpy()
test_true = y_test_tensor.argmax(dim=1).cpu().numpy()

In [None]:
# Validation set performance metrics
action_labels = [member.name.lower() for member in Action]
evaluate_performance_metrics(val_true, val_outputs, action_labels)

In [None]:
# Test set performance metrics
action_labels = [member.name.lower() for member in Action]
evaluate_performance_metrics(test_true, test_outputs, action_labels)

In [None]:
# Validation set ROC curve
action_labels = [member.name.lower() for member in Action]
ROC_curve(val_true, val_outputs, action_labels)

In [None]:
# Test set ROC curve
action_labels = [member.name.lower() for member in Action]
ROC_curve(test_true, test_outputs, action_labels)

In [None]:
def draw_landmarks(frame, landmarks, side, connections):
    """Draw the landmarks of one hand and the segments connecting them onto ``frame`` (in place).

    Nothing is drawn when every landmark reports ``is_empty()`` or when ``side``
    is neither 0 nor 1.

    Args:
        frame: Image array to draw on; its first two dimensions are height and width.
        landmarks: Landmarks of one hand, exposing ``x``, ``y`` and ``is_empty()``.
            The coordinates are scaled by the image size (presumably normalized
            to [0, 1] - TODO confirm).
        side: 0 = right, 1 = left; selects the colour of the landmark dots.
        connections: Pairs of landmark indices to join with a white line.
    """
    if all(lm.is_empty() for lm in landmarks):
        return

    # Colour tuples are passed to OpenCV as they are
    if side == 0:
        landmarks_color = (0, 0, 255)
    elif side == 1:
        landmarks_color = (255, 0, 0)
    else:
        return

    image_height, image_width = frame.shape[:2]

    def to_pixel(lm):
        # Scale the landmark coordinates to integer pixel positions
        return int(lm.x*image_width), int(lm.y*image_height)

    for lm in landmarks:
        cv2.circle(frame, to_pixel(lm), 2, landmarks_color, 2)

    for connection in connections:
        start_index, end_index = connection[0], connection[1]
        # Connections whose end index is not available are skipped
        if end_index < len(landmarks):
            cv2.line(frame, to_pixel(landmarks[start_index]), to_pixel(landmarks[end_index]), (255, 255, 255), 2)

def get_blank_frame_with_landmarks(right_hand_landmarks, left_hand_landmarks, frame_counter=None, width=640, height=480):
    """Render both hands on a black image.

    Args:
        right_hand_landmarks: Landmarks drawn as side 0 (right).
        left_hand_landmarks: Landmarks drawn as side 1 (left).
        frame_counter: Optional value written in the top-left corner of the image.
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        ``uint8`` array of shape ``(height, width, 3)`` with the drawing.
    """
    canvas = np.zeros((height, width, 3), dtype=np.uint8)

    # Side index 0 is the right hand, 1 the left hand
    for side, hand_landmarks in enumerate((right_hand_landmarks, left_hand_landmarks)):
        draw_landmarks(canvas, hand_landmarks, side=side, connections=HAND_CONNECTIONS)

    if frame_counter is None:
        return canvas

    cv2.putText(canvas, str(frame_counter), (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    return canvas

def play_video_from_sequence_data(sequence_data):
    """Play a sequence of landmark frames in an OpenCV window at ``SAMPLING_FPS``.

    Every frame is decoded into right-hand and left-hand landmarks and drawn on
    a blank image together with its index. Playback stops early when 'q' is
    pressed. The window is closed when playback ends, also if a frame fails to
    decode or playback is interrupted.

    Args:
        sequence_data: Iterable of flat landmark arrays, one per frame.
    """
    frame_interval = 1.0/SAMPLING_FPS

    try:
        for frame_index, frame_data in enumerate(sequence_data):
            frame_display_time = time.time()

            right_hand_landmarks, left_hand_landmarks = get_landmarks_from_flattened_array(frame_data)
            cv2.imshow("Video from sequence data", get_blank_frame_with_landmarks(right_hand_landmarks, left_hand_landmarks, frame_counter=frame_index))

            # Sleep for whatever is left of the frame interval after decoding and drawing
            time_taken = time.time() - frame_display_time
            time_to_wait = frame_interval - time_taken
            if time_to_wait > 0:
                time.sleep(time_to_wait)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always release the window, otherwise an exception would leave it open
        cv2.destroyAllWindows()

In [None]:
sequence_data_path = os.path.join(DATA_FOLDER, Action.PLACE.name.lower(), "sequence_37")

sequence_landmarks_data = []
for frame_index in range(sequence_length):
    frame_landmarks_data = np.load(os.path.join(sequence_data_path, "landmarks", f"frame_{frame_index}.npy"))
    sequence_landmarks_data.append(frame_landmarks_data)

# Video visualization
play_video_from_sequence_data(sequence_landmarks_data)

# Action recognition
selected_sequence_data = []
for frame_landmarks_data in sequence_landmarks_data:
    selected_frame_data = get_selected_frame_data(frame_landmarks_data, None)
    selected_sequence_data.append(selected_frame_data)

# The features must match the ones the model was trained on. When the model input is wider than
# the landmark-only features (training data selected WITH objects), rebuild the features with the
# objects detected in every frame of this sequence.
if input_size != len(selected_sequence_data[0]):
    selected_sequence_data = []
    for frame_index, frame_landmarks_data in enumerate(sequence_landmarks_data):
        frame_objects_data = np.load(os.path.join(sequence_data_path, "objects", f"frame_{frame_index}.npy"))
        selected_sequence_data.append(get_selected_frame_data(frame_landmarks_data, frame_objects_data))

X = torch.tensor(np.array(selected_sequence_data), dtype=torch.float32).to(device)
X = torch.unsqueeze(X, axis=0) # Add the batch dimension
model.eval()
with torch.no_grad():
    probabilities = F.softmax(model(X)[0], dim=0)
print("Probabilities:")
for action_index, action_prob in enumerate(probabilities):
    print(f"- {Action(action_index).name.lower()}: {(action_prob*100):.2f} %")
action_recognition_prediction = Action(torch.argmax(probabilities).item()).name.lower()
print("Action recognized:", action_recognition_prediction)