In [1]:
import cv2
import mediapipe as mp
import numpy as np
import os

mp_pose = mp.solutions.pose
pose = mp_pose.Pose()
mp_drawing = mp.solutions.drawing_utils

video_path = r"C:\Users\dogat\Desktop\DL_Vidoes" # Update this path
keypoints_data = []

for file in os.listdir(video_path):
    if file.endswith(".mp4"):
        cap = cv2.VideoCapture(os.path.join(video_path, file))

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = pose.process(frame_rgb)

            if results.pose_landmarks:
                keypoints = [ 
                    (lm.x, lm.y, lm.z) for lm in results.pose_landmarks.landmark
                ]
                keypoints_data.append(keypoints)

        cap.release()

keypoints_data = np.array(keypoints_data)
np.save("pose_data.npy", keypoints_data)  # Save extracted features


In [28]:
def label_squat(landmarks):
    hip_y = landmarks[mp_pose.PoseLandmark.LEFT_HIP.value][1]  # Y-coordinate of the left hip
    return "DOWN" if hip_y > 0.6 else "UP"  # Adjust threshold based on video


In [34]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

X_train = np.load("pose_data.npy")  # Load extracted features
y_train = np.array([label_squat(frame) for frame in X_train])  # Convert labels to array

# Convert labels to numeric values (0 = DOWN, 1 = UP)
y_train = np.array([0 if label == "DOWN" else 1 for label in y_train])

# Build LSTM model
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(33, 3)),  # 33 keypoints with (x, y, z)
    LSTM(32),
    Dense(16, activation="relu"),
    Dense(1, activation="sigmoid")  # Binary classification (UP or DOWN)
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Save the model
model.save("squat_classifier2.h5")


Epoch 1/10
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.6876 - loss: 0.6353
Epoch 2/10
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.8177 - loss: 0.4292
Epoch 3/10
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.8616 - loss: 0.3234
Epoch 4/10
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.8581 - loss: 0.3255
Epoch 5/10
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.8978 - loss: 0.2576
Epoch 6/10
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.8854 - loss: 0.2692
Epoch 7/10
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9051 - loss: 0.2300
Epoch 8/10
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 49ms/step - accuracy: 0.9164 - loss: 0.2140
Epoch 9/10
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━



In [36]:
import cv2
import mediapipe as mp
import os
import numpy as np
import tensorflow as tf

class poseDetector():
    def __init__(self, mode=False, upBody=False, smooth=True, detectionCon=True, trackCon=0.5):
        self.mpDraw = mp.solutions.drawing_utils
        self.mpPose = mp.solutions.pose
        self.pose = self.mpPose.Pose(mode, upBody, smooth, detectionCon, trackCon)

    def findPose(self, img, draw=True):
        imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        self.results = self.pose.process(imgRGB)  # Use the pose object to process the frame
        if self.results.pose_landmarks and draw:
            self.mpDraw.draw_landmarks(img, self.results.pose_landmarks, self.mpPose.POSE_CONNECTIONS)
        return img

    def findPosition(self, img):
        lmList = []
        if self.results.pose_landmarks:
            for id, lm in enumerate(self.results.pose_landmarks.landmark):
                h, w, _ = img.shape
                cx, cy = int(lm.x * w), int(lm.y * h)
                lmList.append([id, cx, cy, lm.x, lm.y, lm.z])
        return lmList

def classify_frame(frame, model, pose, squat_threshold=0.5):
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = pose.pose.process(frame_rgb)  # Fix here: Access `pose.process`

    if results.pose_landmarks:
        # Extract all 33 keypoints (x, y, z)
        keypoints = np.array([[lm.x, lm.y, lm.z] for lm in results.pose_landmarks.landmark])
        keypoints = np.expand_dims(keypoints, axis=0)  # Reshape to (1, 33, 3)

        # Make prediction
        prediction = model.predict(keypoints)[0][0]
        print(f"Model prediction: {prediction}, Threshold: {squat_threshold}")

        label = "UP" if prediction > squat_threshold else "DOWN"
    else:
        label = "UNKNOWN"

    return label, results.pose_landmarks

# Load model
model = tf.keras.models.load_model("squat_classifier2.h5")

# Open the video
video_path = r"C:\Users\dogat\Desktop\DL_Vidoes\fortSquat1.mov"  # Change file name if needed
cap = cv2.VideoCapture(video_path)

# Initialize pose detector
pose = poseDetector()

# Initialize frame predictions for smoothing (e.g., last 5 frames)
frame_predictions = []

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Get classification label and pose landmarks
    label, landmarks = classify_frame(frame, model, pose)

    # Append the current prediction (1 for "UP", 0 for "DOWN") to the list
    frame_predictions.append(1 if label == "UP" else 0)

    # Keep only the last 5 predictions (e.g., smoothing over 5 frames)
    if len(frame_predictions) > 5:
        frame_predictions.pop(0)

    # Apply majority voting or moving average for smoothing
    if np.mean(frame_predictions) > 0.5:
        final_label = "UP"
    else:
        final_label = "DOWN"

    # If landmarks exist, draw them
    if landmarks:
        mp.solutions.drawing_utils.draw_landmarks(frame, landmarks, mp.solutions.pose.POSE_CONNECTIONS)

    # Overlay final label on frame
    cv2.putText(frame, final_label, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    # Display the frame with keypoints and label
    cv2.imshow("Pose Estimation with Label", frame)

    # Exit loop if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 332ms/step
Model prediction: 0.9965253472328186, Threshold: 0.5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Model prediction: 0.9927159547805786, Threshold: 0.5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Model prediction: 0.9894998073577881, Threshold: 0.5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Model prediction: 0.9848856925964355, Threshold: 0.5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Model prediction: 0.9908884763717651, Threshold: 0.5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Model prediction: 0.9899758100509644, Threshold: 0.5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Model prediction: 0.9895860552787781, Threshold: 0.5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Model prediction: 0.9908852577209473, Threshold: 0.5