In [4]:
import cv2 as cv
import mediapipe as mp
import time
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import os
from matplotlib import pyplot as plt

print(cv.__version__)
print(np.__version__)

4.10.0
1.26.4


In [92]:
mp_hands = mp.solutions.hands.Hands()

In [93]:
img = cv.imread("Fred.jpg")
if mp_hands.process(image=img).multi_hand_landmarks:
    print("True")
else:
    print("False")

False


In [94]:
img = cv.imread("Hand.jpg")
if mp_hands.process(image=img).multi_hand_landmarks:
    print("True")
else:
    print("False")

True


In [95]:
vid = cv.VideoCapture(0)

while(True):
    ret, frame = vid.read()
    cv.imshow('Video', frame)
    if cv.waitKey(1) == ord('q'):
        break
vid.release()
cv.destroyAllWindows()


In [77]:
### Capture Images

# Define the labels
Labels = ["Hallo", "Gut", "Schlecht"]
label_map = {"Hallo": 0, "Gut": 1, "Schlecht": 2}
inv_label_map = {v: k for k, v in label_map.items()}
# Define the capture time
waitTime = 8
holdTime = 1

# Initialize an array to store face landmarks
face_landmarks_array = []

# Set VideoCapture to default camera
vid = cv.VideoCapture(0)

# Get the starting time
start = time.time()


# For each label
for label in Labels:
    # Create a directory for this label in the Dataset directory
    os.makedirs(f'./Dataset/{label}', exist_ok=True)
    while(True):
        ret, frame = vid.read()
        # Calculate the countdown time
        countdown = int(waitTime - (time.time() - start))
        # Add the countdown time to the frame
        cv.putText(frame, "Record for "+ label +" begin in " + str(countdown), (50, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv.LINE_AA)
        cv.imshow('Video', frame)
    
        if cv.waitKey(1) == ord('q') or time.time() - start > waitTime:
            break
    
    start = time.time()


    # Frame counter 
    frame_num = 0
    
    while (True):
        ret, frame = vid.read()
        if ret:
            # Convert the image from BGR to RGB
            image = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
    
            cv.imwrite(f'./Dataset/{label}/frame_{frame_num}.jpg', frame)
            frame_num += 1
        
            # Calculate the countdown time
            countdown = int(holdTime - (time.time() - start))
            # Add the countdown time to the frame
            cv.putText(frame, "Holde Pose " + str(countdown), (50, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv.LINE_AA)
    
            cv.imshow('Video', frame)
    
            # Break the loop if 'q' is pressed or 5 seconds have passed
            if cv.waitKey(1) == ord('q') or time.time() - start > holdTime:
                break
        else:
            print("Video stream not available.")
            break

vid.release()
cv.destroyAllWindows()

In [99]:
# Initialize MediaPipe holistic
mp_holistic = mp.solutions.holistic.Holistic()

def extract_landmarks(frame):
    image = cv.cvtColor(frame, cv.COLOR_BGR2RGB)

    # Process the image
    results = mp_holistic.process(image)

    # Get landmarks from results if available, otherwise set to zeros
    face_landmarks = np.array([[landmark.x, landmark.y, landmark.z] for landmark in
                               results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(
        (468 * 3,))
    pose_landmarks = np.array([[landmark.x, landmark.y, landmark.z] for landmark in
                               results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(
        (33 * 3,))
    left_hand_landmarks = np.array([[landmark.x, landmark.y, landmark.z] for landmark in
                                    results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(
        (21 * 3,))
    right_hand_landmarks = np.array([[landmark.x, landmark.y, landmark.z] for landmark in
                                     results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(
        (21 * 3,))

    # Concatenate all landmarks into a single array
    all_landmarks = np.concatenate((face_landmarks, pose_landmarks, left_hand_landmarks, right_hand_landmarks))
    return all_landmarks

mp_holistic.close()

In [100]:
mp_holistic = mp.solutions.holistic.Holistic()

# Initialize a list to hold dataset
dataset = []

# For every labeled folder in the Dataset directory
for label in os.listdir('./Dataset/'):
    print("Checking: " + label)
    if os.path.isdir(f'./Dataset/{label}'):
        # For every image file in the folder
        for filename in os.listdir(f'./Dataset/{label}'):

            # Read the image file
            image = cv.imread(f'./Dataset/{label}/{filename}')

            all_landmarks = extract_landmarks(image)
                
            dataset.append((torch.tensor(all_landmarks), label))

mp_holistic.close()

Checking: Gut
Checking: Hallo
Checking: Schlecht


In [81]:
InFeatures=1629 
OutClasses=3

class LandmarksClassifier(nn.Module):
    def __init__(self, in_feat=InFeatures, hiddenlayer=50, num_classes=OutClasses):  # Start with three classes and adjust as your project evolves.
        super().__init__()
        self.layer1 = nn.Linear(in_feat, hiddenlayer)
        self.layer2 = nn.Linear(hiddenlayer, hiddenlayer)
        self.layer3 = nn.Linear(hiddenlayer, num_classes)

    def forward(self, x):
        print(x.shape)
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)  # No need activation here as normally the loss function will take care of it
        return x

In [82]:
batch_size = 128

landmarks_list = [item[0] for item in dataset]
labels_list = [item[1] for item in dataset]

landmarks_tensor = torch.stack(landmarks_list)
labels_list = [label_map[item[1]] for item in dataset]
labels_tensor = torch.tensor(labels_list)

tensor_dataset = TensorDataset(landmarks_tensor, labels_tensor)
data_loader = DataLoader(tensor_dataset, batch_size=batch_size, shuffle=True)

In [83]:
lr = 0.0005
epochs = 200

In [84]:
# Setup the model, criterion, and optimizer
model = LandmarksClassifier(InFeatures, hiddenlayer=50, num_classes=len(Labels))
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# Number of epochs
num_epochs = 25

# Start training loop
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(data_loader):
        inputs = inputs.float()
        labels = labels.float()  # change it back to float
        labels_one_hot = F.one_hot(labels.to(torch.int64), num_classes=3)  # use labels as int64 for one-hot function
        outputs = model(inputs)
        loss = criterion(outputs, labels_one_hot.float())

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Calculate accuracy
        _, predicted = torch.max(outputs.data, 1)
        correct = (predicted == labels).sum().item()

        if (i + 1) % 100 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {correct / len(labels):.2f}')

print('Finished Training')

torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
torch.Size([121, 1629])
Finished Training


In [85]:
def process_landmarks(landmarks):
    # Converts landmarks into a tensor and adds another dimension to fit 
    # the (batch_size, num_features). 
    landmarks = torch.tensor(landmarks)
    landmarks = landmarks.unsqueeze_(0).float()  # convert to (1, num_features)
    return landmarks

def predict_landmark_class(landmarks, model):
    # Use your classifier model to predict the landmark class.
    output = model(landmarks)
    _, pred = torch.max(output, 1)  # get the index of the max log-probability
    return pred.item()

In [89]:
mp_holistic = mp.solutions.holistic.Holistic()
model.eval()

vid = cv.VideoCapture(0)

while (True):
    ret, frame = vid.read()
    landmarks = extract_landmarks(frame)
    landmarks = process_landmarks(landmarks)
    prediction = predict_landmark_class(landmarks, model)
    
    cv.putText(frame, "Predicted class is " + str(inv_label_map[prediction]), (50, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv.LINE_AA)

    cv.imshow('Video', frame)

    if cv.waitKey(1) == ord('q'):
        break

vid.release()
cv.destroyAllWindows()
mp_holistic.close()

torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size([1, 1629])
torch.Size