In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import ViTModel

2024-06-01 15:45:31.590728: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-01 15:45:31.590818: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-01 15:45:31.733944: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Define paths and actions
actions = np.array(['MoveRight', 'MoveLeft', 'ThumbsUp', 'ThumbsDown', 'Stop', 'Circle', 'FistClose', 'Victory'])
data_path = "/kaggle/input/hand-gesture-recogniton/Action_Recognition" 
no_sequences = 100
sequence_length = 60
features = 1662  # Adjust this based on the actual features per frame
num_classes = len(actions)

In [3]:
print(f"Actions: {actions}")
print(f"Number of classes: {num_classes}")
print(f"Data path: {data_path}")

Actions: ['MoveRight' 'MoveLeft' 'ThumbsUp' 'ThumbsDown' 'Stop' 'Circle'
 'FistClose' 'Victory']
Number of classes: 8
Data path: /kaggle/input/hand-gesture-recogniton/Action_Recognition


In [4]:
# Map each action to an integer
label_map = {label: num for num, label in enumerate(actions)}
print("Label map:", label_map)

Label map: {'MoveRight': 0, 'MoveLeft': 1, 'ThumbsUp': 2, 'ThumbsDown': 3, 'Stop': 4, 'Circle': 5, 'FistClose': 6, 'Victory': 7}


In [5]:
import os
# Load data
sequences, labels = [], []
for action in actions:
    for sequence in range(no_sequences):
        window = []
        for frame_num in range(sequence_length):
            frame_path = os.path.join(data_path, action, str(sequence), f"{frame_num}.npy")
            res = np.load(frame_path)
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])
    print(f"Loaded data for action: {action}")

Loaded data for action: MoveRight
Loaded data for action: MoveLeft
Loaded data for action: ThumbsUp
Loaded data for action: ThumbsDown
Loaded data for action: Stop
Loaded data for action: Circle
Loaded data for action: FistClose
Loaded data for action: Victory


In [6]:
# Convert labels to one-hot encoding and lists to numpy arrays
labels = to_categorical(labels, num_classes=num_classes)
sequences = np.array(sequences)

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.05, random_state=42)
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

X_train shape: (760, 60, 1662), X_test shape: (40, 60, 1662)
y_train shape: (760, 8), y_test shape: (40, 8)


In [13]:
def transformer_model(input_shape, num_classes):
    inputs = layers.Input(shape=input_shape)
    # Add positional encoding
    position_encoding = layers.Embedding(input_dim=input_shape[0], output_dim=input_shape[1])(tf.range(start=0, limit=input_shape[0], delta=1))
    encoded_inputs = inputs + position_encoding

    # Transformer Encoder layers
    for _ in range(4):
        # Multi-head self-attention
        attention_output = layers.MultiHeadAttention(num_heads=8, key_dim=64)(encoded_inputs, encoded_inputs)
        # Add & Norm
        attention_output = layers.Add()([encoded_inputs, attention_output])
        attention_output = layers.LayerNormalization(epsilon=1e-6)(attention_output)
        # Feed forward
        feed_forward = layers.Dense(units=512, activation='relu')(attention_output)
        feed_forward = layers.Dense(units=input_shape[-1])(feed_forward)
        # Add & Norm
        encoded_inputs = layers.Add()([attention_output, feed_forward])
        encoded_inputs = layers.LayerNormalization(epsilon=1e-6)(encoded_inputs)

    # Global average pooling
    pooled_output = layers.GlobalAveragePooling1D()(encoded_inputs)
    # Output layer
    outputs = layers.Dense(units=8, activation='softmax')(pooled_output)

    model = Model(inputs=inputs, outputs=outputs)
    return model


In [14]:
# Instantiate the model
model = transformer_model(input_shape=X_train.shape[1:], num_classes=len(np.unique(y_train)))

In [15]:
# Compile the model
optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['categorical_accuracy'])
print("Model compilation complete.")

Model compilation complete.


In [16]:
# Callbacks
tb_callback = TensorBoard(log_dir='./logs', update_freq='batch')
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
print("Callbacks have been set.")

Callbacks have been set.


In [17]:
# Train the model
print("Starting model training...")
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=100, callbacks=[early_stopping, tb_callback])
print("Model training complete.")

Starting model training...
Epoch 1/100


I0000 00:00:1717257536.007914     151 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1717257536.035740     151 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 392ms/step - categorical_accuracy: 0.1305 - loss: 4.1478

W0000 00:00:1717257545.279389     149 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1717257546.982352     149 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1717257548.192506     152 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 525ms/step - categorical_accuracy: 0.1321 - loss: 4.1172 - val_categorical_accuracy: 0.2250 - val_loss: 2.3050
Epoch 2/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 78ms/step - categorical_accuracy: 0.2688 - loss: 1.9658 - val_categorical_accuracy: 0.2750 - val_loss: 1.5214
Epoch 3/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 78ms/step - categorical_accuracy: 0.4414 - loss: 1.5305 - val_categorical_accuracy: 0.5750 - val_loss: 1.2028
Epoch 4/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 79ms/step - categorical_accuracy: 0.5512 - loss: 1.2760 - val_categorical_accuracy: 0.6250 - val_loss: 0.9363
Epoch 5/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 79ms/step - categorical_accuracy: 0.5941 - loss: 1.1418 - val_categorical_accuracy: 0.6500 - val_loss: 0.8172
Epoch 6/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 80ms

In [18]:
# Prediction
print("Starting prediction...")
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)
print("Prediction complete.")

Starting prediction...
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m1s[0m 1s/step

W0000 00:00:1717257684.506632     152 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step
Prediction complete.


W0000 00:00:1717257685.838859     152 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


In [19]:
# Evaluation
accuracy = accuracy_score(y_true, y_pred_classes)
f1 = f1_score(y_true, y_pred_classes, average='weighted')
print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')

Accuracy: 0.975
F1 Score: 0.9756535947712418


In [20]:
# Selecting a few samples (e.g., 5 samples) from the test set for prediction
num_samples_to_predict = 5
sample_indices = np.random.choice(X_test.shape[0], num_samples_to_predict, replace=False)  # Randomly pick indices without replacement
sample_data = X_test[sample_indices]
sample_labels = y_test[sample_indices]

In [21]:
# Make predictions
sample_predictions = model.predict(sample_data)
sample_pred_classes = np.argmax(sample_predictions, axis=1)
sample_true_classes = np.argmax(sample_labels, axis=1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step


W0000 00:00:1717257702.609980     150 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


In [22]:
# Print the results
print("Sample predictions (class indices):", sample_pred_classes)
print("Actual labels (class indices):", sample_true_classes)

Sample predictions (class indices): [3 2 4 2 7]
Actual labels (class indices): [3 2 4 2 7]


In [23]:
# Map indices back to action names for clearer interpretation
predicted_actions = [actions[idx] for idx in sample_pred_classes]
actual_actions = [actions[idx] for idx in sample_true_classes]

In [24]:
# Selecting a few samples (e.g., 5 samples) from the test set for prediction
num_samples_to_predict = 8
sample_indices = np.random.choice(X_test.shape[0], num_samples_to_predict, replace=False)  # Randomly pick indices without replacement
sample_data = X_test[sample_indices]
sample_labels = y_test[sample_indices]

In [25]:
# Make predictions
sample_predictions = model.predict(sample_data)
sample_pred_classes = np.argmax(sample_predictions, axis=1)
sample_true_classes = np.argmax(sample_labels, axis=1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step


In [26]:
# Print the results
print("Sample predictions (class indices):", sample_pred_classes)
print("Actual labels (class indices):", sample_true_classes)

Sample predictions (class indices): [3 0 3 5 6 6 3 0]
Actual labels (class indices): [3 0 3 5 6 6 3 0]


In [27]:
# Print the results
print("Sample predictions (class indices):", sample_pred_classes)
print("Actual labels (class indices):", sample_true_classes)

Sample predictions (class indices): [3 0 3 5 6 6 3 0]
Actual labels (class indices): [3 0 3 5 6 6 3 0]


In [28]:
print("Predicted actions:", predicted_actions)
print("Actual actions:", actual_actions)

Predicted actions: ['ThumbsDown', 'ThumbsUp', 'Stop', 'ThumbsUp', 'Victory']
Actual actions: ['ThumbsDown', 'ThumbsUp', 'Stop', 'ThumbsUp', 'Victory']
