In [None]:
import os, random, math

random.seed(42)

data_dir = "./data/asl_alphabet_train/asl_alphabet_train/" 

all_classes = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]

exclude_classes = {"nothing"}
classes = sorted([c for c in all_classes if c.lower() not in exclude_classes])

print("Classes used:", classes)
print("Number of classes:", len(classes)) 

train_files = []
val_files = []
test_files = []

for cls in classes:
    class_dir = os.path.join(data_dir, cls)
    files = [os.path.join(class_dir, f) for f in os.listdir(class_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    random.shuffle(files)
    n_total = len(files)
    n_train = math.floor(0.8 * n_total)   
    n_val = math.floor(0.1 * n_total)    

    train_list = files[:n_train]
    val_list = files[n_train:n_train + n_val]
    test_list = files[n_train + n_val:]
    train_files += [(fp, cls) for fp in train_list]
    val_files   += [(fp, cls) for fp in val_list]
    test_files  += [(fp, cls) for fp in test_list]

random.shuffle(train_files)
random.shuffle(val_files)
random.shuffle(test_files)

print(f"Total training samples: {len(train_files)}")
print(f"Total validation samples: {len(val_files)}")
print(f"Total test samples: {len(test_files)}")


Classes used: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'del', 'space']
Number of classes: 28
Total training samples: 67200
Total validation samples: 8400
Total test samples: 8400


In [None]:
import mediapipe as mp
import cv2
import numpy as np

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True,  
                       max_num_hands=1,        
                       min_detection_confidence=0.5)

def extract_landmarks(image_path):
    image = cv2.imread(image_path)
    if image is None:
        return None  
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(image_rgb)
    if results.multi_hand_landmarks:
        hand_landmarks = results.multi_hand_landmarks[0]
        landmark_coords = []
        for lm in hand_landmarks.landmark:
            landmark_coords.append([lm.x, lm.y, lm.z])
        return np.array(landmark_coords, dtype=np.float32)
    else:
        return None

X_train, y_train = [], []
X_val, y_val = [], []
X_test, y_test = [], []

class_to_idx = {cls: idx for idx, cls in enumerate(classes)}
print("Class to index mapping:", class_to_idx)

from tqdm import tqdm

print("Processing training set...")
for filepath, cls in tqdm(train_files, desc="Train", unit="img"):
    landmarks = extract_landmarks(filepath)
    if landmarks is not None:
        X_train.append(landmarks)
        y_train.append(class_to_idx[cls])

print("Processing validation set...")
for filepath, cls in tqdm(val_files, desc="Val", unit="img"):
    landmarks = extract_landmarks(filepath)
    if landmarks is not None:
        X_val.append(landmarks)
        y_val.append(class_to_idx[cls])

print("Processing test set...")
for filepath, cls in tqdm(test_files, desc="Test", unit="img"):
    landmarks = extract_landmarks(filepath)
    if landmarks is not None:
        X_test.append(landmarks)
        y_test.append(class_to_idx[cls])

X_train = np.array(X_train)  
X_val   = np.array(X_val)  
X_test  = np.array(X_test)   
y_train = np.array(y_train)
y_val   = np.array(y_val)
y_test  = np.array(y_test)

print("Extracted landmarks for training set:", X_train.shape)
print("Example of landmarks for one image (first training sample):")
print(X_train[0])

# Convert to NumPy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)

# Save to disk
np.save("X_train.npy", X_train)
np.save("y_train.npy", y_train)
print("Saved training set to disk.")

X_val = np.array(X_val)
y_val = np.array(y_val)
np.save("X_val.npy", X_val)
np.save("y_val.npy", y_val)
print("Saved validation set to disk.")

X_test = np.array(X_test)
y_test = np.array(y_test)
np.save("X_test.npy", X_test)
np.save("y_test.npy", y_test)
print("Saved test set to disk.")

I0000 00:00:1746787693.496938  854308 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1746787693.549218  854395 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 545.23.06), renderer: NVIDIA GeForce RTX 2080 Ti/PCIe/SSE2


Class to index mapping: {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9, 'K': 10, 'L': 11, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 16, 'R': 17, 'S': 18, 'T': 19, 'U': 20, 'V': 21, 'W': 22, 'X': 23, 'Y': 24, 'Z': 25, 'del': 26, 'space': 27}
Processing training set...


Train:   0%|          | 0/67200 [00:00<?, ?img/s]INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1746787693.602369  854347 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1746787693.631705  854377 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1746787693.644005  854361 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.
Train: 100%|██████████| 67200/67200 [48:20<00:00, 23.16img/s]


Processing validation set...


Val: 100%|██████████| 8400/8400 [06:04<00:00, 23.04img/s]


Processing test set...


Test: 100%|██████████| 8400/8400 [06:03<00:00, 23.10img/s]


Extracted landmarks for training set: (49915, 21, 3)
Example of landmarks for one image (first training sample):
[[ 6.70948148e-01  4.44872350e-01  4.01725231e-08]
 [ 6.58323467e-01  4.50201988e-01 -1.04909234e-01]
 [ 5.98267555e-01  4.43777174e-01 -1.45214453e-01]
 [ 5.23130357e-01  4.57256496e-01 -1.67215347e-01]
 [ 4.56149369e-01  4.56595361e-01 -1.86277658e-01]
 [ 5.85710466e-01  2.89745212e-01 -1.10409737e-01]
 [ 4.92681563e-01  2.27920890e-01 -1.55761063e-01]
 [ 4.24179524e-01  1.95349589e-01 -1.84811428e-01]
 [ 3.68203580e-01  1.77432194e-01 -2.01348335e-01]
 [ 5.50077617e-01  3.00024092e-01 -7.32422173e-02]
 [ 4.47823346e-01  2.90391743e-01 -1.33259296e-01]
 [ 4.22418714e-01  3.55146646e-01 -1.60876855e-01]
 [ 4.28664953e-01  4.07414615e-01 -1.66381136e-01]
 [ 5.21646440e-01  3.19321662e-01 -4.44201268e-02]
 [ 4.30239826e-01  3.04076105e-01 -1.04157694e-01]
 [ 4.16438460e-01  3.65684658e-01 -1.23959482e-01]
 [ 4.27218378e-01  4.09238696e-01 -1.20582022e-01]
 [ 5.04539371e-01  3

In [3]:
random_test_array = np.random.rand(21, 3).astype(np.float32)
np.save("random_test_array.npy", random_test_array)
print("Saved random test array to disk.")

Saved random test array to disk.


In [None]:
import torch
import torch.nn as nn


X_train_tensor = torch.tensor(X_train).permute(0, 2, 1)  
X_val_tensor   = torch.tensor(X_val).permute(0, 2, 1)    
X_test_tensor  = torch.tensor(X_test).permute(0, 2, 1)   
y_train_tensor = torch.tensor(y_train).long()
y_val_tensor   = torch.tensor(y_val).long()
y_test_tensor  = torch.tensor(y_test).long()

from torch.utils.data import TensorDataset, DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset   = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset  = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class ASLClassifier(nn.Module):
    def __init__(self):
        super(ASLClassifier, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=3, out_channels=32, kernel_size=3)   
        self.pool1 = nn.MaxPool1d(kernel_size=2)  
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3)  
        self.pool2 = nn.MaxPool1d(kernel_size=2)  
        self.fc1 = nn.Linear(64 * 3, 128) 
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, len(classes)) 
        
    def forward(self, x):
        x = nn.functional.relu(self.conv1(x))
        x = self.pool1(x)
        x = nn.functional.relu(self.conv2(x))
        x = self.pool2(x)
        x = x.view(x.size(0), -1)  
        x = nn.functional.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)  
        return x

model = ASLClassifier()
print(model)


ASLClassifier(
  (conv1): Conv1d(3, 32, kernel_size=(3,), stride=(1,))
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(32, 64, kernel_size=(3,), stride=(1,))
  (pool2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=192, out_features=128, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=128, out_features=28, bias=True)
)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(1, num_epochs+1):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)            
        loss = criterion(outputs, labels) 
        loss.backward()                 
        optimizer.step()               
        
        running_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
    train_loss = running_loss / total
    train_acc = correct / total
    
    model.eval()
    val_correct = 0
    val_total = 0
    val_loss_sum = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss_sum += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)
    val_loss = val_loss_sum / val_total
    val_acc = val_correct / val_total
    
    print(f"Epoch {epoch}/{num_epochs}: "
          f"Train Loss = {train_loss:.4f}, Train Acc = {train_acc:.4f} | "
          f"Val Loss = {val_loss:.4f}, Val Acc = {val_acc:.4f}")


Using device: cuda
Epoch 1/10: Train Loss = 1.8466, Train Acc = 0.4361 | Val Loss = 0.6125, Val Acc = 0.8482
Epoch 2/10: Train Loss = 0.6069, Train Acc = 0.8265 | Val Loss = 0.3550, Val Acc = 0.9271
Epoch 3/10: Train Loss = 0.4346, Train Acc = 0.8836 | Val Loss = 0.2661, Val Acc = 0.9497
Epoch 4/10: Train Loss = 0.3550, Train Acc = 0.9086 | Val Loss = 0.2484, Val Acc = 0.9492
Epoch 5/10: Train Loss = 0.3124, Train Acc = 0.9190 | Val Loss = 0.1868, Val Acc = 0.9623
Epoch 6/10: Train Loss = 0.2738, Train Acc = 0.9284 | Val Loss = 0.1678, Val Acc = 0.9656
Epoch 7/10: Train Loss = 0.2506, Train Acc = 0.9363 | Val Loss = 0.1529, Val Acc = 0.9620
Epoch 8/10: Train Loss = 0.2290, Train Acc = 0.9398 | Val Loss = 0.1404, Val Acc = 0.9691
Epoch 9/10: Train Loss = 0.2108, Train Acc = 0.9449 | Val Loss = 0.1296, Val Acc = 0.9688
Epoch 10/10: Train Loss = 0.1962, Train Acc = 0.9485 | Val Loss = 0.1156, Val Acc = 0.9715


In [None]:
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        test_correct += (predicted == labels).sum().item()
        test_total += labels.size(0)
test_acc = test_correct / test_total
print(f"Test Accuracy = {test_acc:.4f}")


Test Accuracy = 0.9704


In [None]:
model_path = "asl_classifier_model.pth"
torch.save(model.state_dict(), model_path)
print("Model saved to", model_path)


Model saved to asl_classifier_model.pth
