# 1. Imports

In [None]:
import os
import cv2
import numpy as np
import zipfile
from io import BytesIO
import matplotlib.pyplot as plt
import random
import mediapipe as mp

# 2. Read Images from the dataset folder

In [None]:
# Folder that is scanned for the zipped frame archives.
base_path = "./Frames/"

# Archive members whose name contains any of these letters are skipped
# by the loader.
exclude_letters = ['X', 'J', 'Z']

def load_and_label_images_from_zip(base_dir=None, excluded_letters=None):
    """Load, mirror and label every image stored in the dataset zip files.

    Every ``.zip`` file directly inside ``base_dir`` is opened. Each member
    whose name contains none of ``excluded_letters`` is decoded with OpenCV,
    flipped horizontally and labelled with the second ``-``-separated field
    of its name. The (image, label) pairs are shuffled together so that
    training batches are not ordered by archive.

    Members that cannot yield a sample are skipped instead of aborting the
    whole load: directory entries, empty members, names without a ``-``
    separated label field, and data OpenCV cannot decode.

    Args:
        base_dir: Folder to scan for archives. Defaults to the module-level
            ``base_path``.
        excluded_letters: Substrings that cause a member to be skipped.
            Defaults to the module-level ``exclude_letters``.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays with one entry per
        loaded image. Both arrays are empty when nothing could be loaded.
    """
    if base_dir is None:
        base_dir = base_path
    if excluded_letters is None:
        excluded_letters = exclude_letters

    samples = []

    # Sorted so the pre-shuffle order does not depend on the file system.
    for zip_file in sorted(os.listdir(base_dir)):
        if not zip_file.endswith('.zip'):
            continue

        zip_path = os.path.join(base_dir, zip_file)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for image_file in zip_ref.namelist():
                if image_file.endswith('/'):
                    continue  # directory entry, carries no image data
                if any(letter in image_file for letter in excluded_letters):
                    continue

                name_parts = image_file.split('-')
                if len(name_parts) < 2:
                    continue  # no label field in the file name

                image_array = np.frombuffer(zip_ref.read(image_file), np.uint8)
                if image_array.size == 0:
                    continue  # imdecode rejects empty buffers

                image = cv2.imdecode(image_array, cv2.IMREAD_UNCHANGED)
                if image is None:
                    continue  # not a decodable image

                samples.append((cv2.flip(image, 1), name_parts[1]))

    if not samples:
        # zip(*[]) cannot be unpacked into two names; return empty arrays.
        return np.array([]), np.array([])

    # Shuffle images and labels together to keep each pair aligned.
    random.shuffle(samples)
    images, labels = zip(*samples)

    return np.array(images), np.array(labels)

# 3. Apply Hand Landmarks 

In [None]:
# Short aliases for the MediaPipe solution modules.
mp_hands = mp.solutions.hands
mp_drawing_styles = mp.solutions.drawing_styles
mp_drawing = mp.solutions.drawing_utils

def apply_landmarks():
    """Detect hand landmarks in every dataset image and save them to CSV.

    Loads the labelled images with ``load_and_label_images_from_zip``, runs
    MediaPipe Hands on each one in static-image mode and writes one row per
    detected hand to ``landmarks.csv``: the label followed by the x, y and z
    values of the 21 landmarks. Images in which no hand is detected produce
    no row.
    """
    # Local import: csv is not among the imports at the top of this file.
    import csv

    # The loader is defined in this file; there is no ``imgreader`` module
    # in scope here.
    images, labels = load_and_label_images_from_zip()

    header = ['label']
    for i in range(21):
        header += [f'x{i}', f'y{i}', f'z{i}']

    with open('landmarks.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)

        with mp_hands.Hands(
            static_image_mode=True,
            max_num_hands=2,
            min_detection_confidence=0.5) as hands:

            for image, label in zip(images, labels):
                # The images are decoded with IMREAD_UNCHANGED, so the
                # channel count depends on the file; pick the matching
                # conversion to the RGB input passed to hands.process.
                if image.ndim == 2:
                    rgb_image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
                elif image.shape[2] == 4:
                    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGRA2RGB)
                else:
                    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

                results = hands.process(rgb_image)
                if not results.multi_hand_landmarks:
                    continue

                for hand_landmarks in results.multi_hand_landmarks:
                    # Row layout: label, then x, y, z of each landmark.
                    landmarks = [label]
                    for lm in hand_landmarks.landmark:
                        landmarks.extend([lm.x, lm.y, lm.z])
                    writer.writerow(landmarks)


# 4. Data Preprocessing and Augmentation

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.transform import Rotation as R

def augment_landmarks(landmarks, angles, scales):
    """Generate rotated and scaled copies of one set of landmarks.

    For every angle the points are rotated by that angle (in degrees) about
    the x, y and z axes using the ``'xyz'`` Euler sequence, and each rotated
    set is then multiplied by every scale factor.

    Args:
        landmarks: Array-like of 3-D points, typically of shape ``(n, 3)``.
            Values are converted to ``float`` first, so object-dtype input
            (for example a slice of a mixed-type pandas row) is accepted.
        angles: Iterable of rotation angles in degrees.
        scales: Iterable of scale factors.

    Returns:
        A list of flattened 1-D arrays, one per (angle, scale) combination,
        ordered by angle first and scale second.
    """
    points = np.asarray(landmarks, dtype=float)
    augmented_data = []

    for angle in angles:
        # The rotation depends only on the angle, so build and apply it once
        # per angle rather than once per (angle, scale) pair.
        rotation = R.from_euler('xyz', [angle, angle, angle], degrees=True)
        rotated_landmarks = rotation.apply(points)

        for scale in scales:
            augmented_data.append((rotated_landmarks * scale).flatten())

    return augmented_data

# Load the landmark rows: a 'label' column followed by the coordinates.
df = pd.read_csv('landmarks.csv')

# Define angles (degrees) and scales for augmentation
angles = np.arange(0, 360, 36)
scales = np.arange(0.8, 1.2, 0.1)

augmented_data = []

# Apply augmentation to each row
for _, row in df.iterrows():
    # Extract label and landmarks; every column after the first is a
    # coordinate, regrouped here into (x, y, z) triples.
    label = row['label']
    landmarks = row.iloc[1:].to_numpy(dtype=float).reshape(-1, 3)

    # Augment landmarks
    for augmented_set in augment_landmarks(landmarks, angles, scales):
        # Build the row as a plain list so the coordinates stay numeric;
        # np.hstack with a string label would turn every value into a string.
        augmented_data.append([label, *augmented_set])

# Create a new DataFrame with the same columns as the input
augmented_df = pd.DataFrame(augmented_data, columns=df.columns)

# Save the augmented data
augmented_df.to_csv('augmented_landmarks.csv', index=False)

# 5. Train and Evaluate Neural Network

In [None]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
import joblib

# Load the augmented landmark data
df = pd.read_csv('./augmented_landmarks.csv')

# Split the data into features and labels
X = df.drop('label', axis=1)
y = df['label']

# Normalize the features
# NOTE(review): the scaler is fitted on all rows before the folds are split,
# and augmented copies of one source sample can fall into both the train and
# the test part of a fold, so the per-fold accuracies may be optimistic.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Encode the string labels as integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Size of the softmax output layer; constant across folds.
num_classes = y.nunique()

k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

best_accuracy = 0
best_model = None

# Cross-validation loop
for fold, (train_index, test_index) in enumerate(kf.split(X_scaled), start=1):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]

    # Define the model architecture (a fresh model for every fold)
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(num_classes, activation='softmax')  # one output per class
    ])

    # Compile the model
    model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    history = model.fit(X_train, y_train, epochs=30, validation_split=0.2, batch_size=16)

    # Evaluate the model on the held-out fold
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f'Fold {fold} Test accuracy: {test_accuracy}')

    # Keep the best model so far; the ``is None`` check guarantees a model is
    # kept even if every fold scores 0.
    if best_model is None or test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        best_model = model

best_model.save('neuralnet_kfoldtest.h5')

# Summarise the model that was actually saved. summary() prints by itself
# and returns None, so it is not wrapped in print().
best_model.summary()

# Save the scaler and label encoder needed at inference time
joblib.dump(scaler, 'scaler.save')
joblib.dump(label_encoder, 'label_encoder.save')

# 6. Create a Real-Time Interpreter Based on the Trained Model

In [None]:
from tabnanny import verbose
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
from collections import deque

# Prediction smoothing: the main loop only stores predictions whose
# confidence exceeds CONFIDENCE_THRESHOLD and keeps at most MAX_PREDICTIONS
# of them.
MAX_PREDICTIONS = 10
CONFIDENCE_THRESHOLD = 0.6
recent_predictions = deque(maxlen=MAX_PREDICTIONS)

# MediaPipe hand detector and drawing helper.
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils
hands = mp_hands.Hands()

# Trained classifier.
# NOTE(review): the training cell saves 'neuralnet_kfoldtest.h5'; confirm
# that 'neuralnet_097.h5' is the intended model file.
model = load_model('neuralnet_097.h5')

# Default camera (device index 0).
cap = cv2.VideoCapture(0)

# Feature scaler and label encoder loaded from disk.
scaler = joblib.load('scaler.save')
label_encoder = joblib.load('label_encoder.save')

def process_landmarks(landmarks):
    """Turn one detected hand into a scaled feature row for the classifier.

    Args:
        landmarks: Object exposing a ``landmark`` iterable whose items have
            ``x``, ``y`` and ``z`` attributes.

    Returns:
        The output of ``scaler.transform`` for a single-row batch holding the
        flattened ``[x0, y0, z0, x1, y1, z1, ...]`` coordinates.
    """
    coordinates = []
    for point in landmarks.landmark:
        coordinates.extend((point.x, point.y, point.z))

    feature_vector = np.array(coordinates)
    # The scaler works on 2-D input, so wrap the vector as a one-row batch.
    return scaler.transform([feature_vector])

# Register the filter once; re-adding it on every frame is redundant work.
warnings.filterwarnings('ignore', category=UserWarning)

try:
    # Main loop: grab a frame, detect hands, classify, draw, show.
    while cap.isOpened():
        success, image = cap.read()
        if not success:
            print("Ignoring empty camera frame.")
            continue

        # Mirror the frame and convert to RGB for hands.process; convert
        # back to BGR afterwards for drawing and display with OpenCV.
        image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
        results = hands.process(image)
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                mp_draw.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                processed_landmarks = process_landmarks(hand_landmarks)

                prediction = model.predict(processed_landmarks, verbose=0)
                predicted_class = np.argmax(prediction, axis=1)
                confidence = np.max(prediction)

                # Only confident predictions take part in the vote.
                if confidence > CONFIDENCE_THRESHOLD:
                    recent_predictions.append(predicted_class[0])

                # Use highest vote for prediction over last few frames;
                # nothing is shown until the window is full.
                if len(recent_predictions) >= MAX_PREDICTIONS:
                    most_common = np.bincount(np.array(recent_predictions)).argmax()
                    gesture_name = label_encoder.inverse_transform([most_common])[0]
                    cv2.putText(image, f'Gesture: {gesture_name}', (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        cv2.imshow('MediaPipe Hands', image)
        # Exit when Esc (key code 27) is pressed.
        if cv2.waitKey(5) & 0xFF == 27:
            break
finally:
    # Release the camera, the MediaPipe detector and the windows even if the
    # loop is interrupted or raises.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()