In [40]:
import os
import cv2
import mediapipe as mp
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models

In [41]:
# initialize mediapipe pose estimation model
mp_pose = mp.solutions.pose
# pose = mp_pose.Pose()
pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5)

In [42]:
# get keypoints from image with mediapipe
def extract_keypoints(image):
    # convert image to RGB
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = pose.process(rgb_image)

    # if no landmarks (joints) return zeros
    if not results.pose_landmarks:
        return np.zeros(33*3)
    
    # get keypoints in (x,y,z) coordinates format
    keypoints = []
    for landmark in results.pose_landmarks.landmark:
        keypoints.append([landmark.x, landmark.y, landmark.z])
    return np.array(keypoints).flatten()


In [43]:
# load images and their labels
def load_images(folder):
    data = []
    labels = []
    #big3 = ["bench press","squat", "deadlift" ]
    big3 = ["push up","barbell biceps curl", "squat" ]
    # go through each folder
    for exercise in os.listdir(folder):
        if exercise in big3:
            exercise_folder = os.path.join(folder,exercise)
            if os.path.isdir(exercise_folder):
                # go through each image in folder
                for img_file in os.listdir(exercise_folder):
                    img_path = os.path.join(exercise_folder, img_file)

                    # read image
                    image = cv2.imread(img_path)
                    if image is not None:
                        keypoints = extract_keypoints(image)
                        data.append(keypoints)
                        labels.append(exercise)
    return np.array(data), np.array(labels)

In [44]:
# load images from workout data folder
base_folder = "./workout_data"
data,labels = load_images(base_folder)

In [45]:
# encode exercise names into numerical format
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

In [46]:
# split data into training and test sets
from sklearn.model_selection import train_test_split
trainData, testData, trainLabel, testLabel = train_test_split(data, labels_encoded, test_size=0.2, random_state=42)

In [47]:
# build neural network to classify exercises based on keypoints
model = models.Sequential([
    layers.Dense(128, activation="relu", input_shape=(trainData.shape[1],)),
    layers.Dense(64, activation="relu"),
    layers.Dense(len(np.unique(labels_encoded)), activation="softmax")
])

# def create_exercise_recognition_model(num_exercises, sequence_length=30):
#     input_layer = tf.keras.Input(shape=(sequence_length, 33, 3))  # Sequence of 33 pose landmarks, 3 coordinates each
    
#     x = tf.keras.layers.TimeDistributed(tf.keras.layers.Flatten())(input_layer)
#     x = tf.keras.layers.LSTM(128, return_sequences=True)(x)
#     x = tf.keras.layers.LSTM(64)(x)
#     x = tf.keras.layers.Dense(32, activation='relu')(x)
#     output = tf.keras.layers.Dense(num_exercises, activation='softmax')(x)

#     model = tf.keras.Model(inputs=input_layer, outputs=output)
#     return model

# # Create and compile the model
# num_exercises = 5  # Adjust based on the number of exercises you want to recognize
# sequence_length = 30  # Adjust based on your video length and frame rate
# model = create_exercise_recognition_model(num_exercises, sequence_length)


# compile model
model.compile(optimizer='adam', loss="sparse_categorical_crossentropy", metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [48]:
model.fit(trainData, trainLabel, epochs=5, validation_data=(testData, testLabel))
# model.fit(trainData, trainLabel, epochs=5, validation_data=(testData, testLabel), batch_size=32)


Epoch 1/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6556 - loss: 0.8413 - val_accuracy: 0.8024 - val_loss: 0.4909
Epoch 2/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8170 - loss: 0.4441 - val_accuracy: 0.8415 - val_loss: 0.4040
Epoch 3/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8593 - loss: 0.3674 - val_accuracy: 0.8317 - val_loss: 0.3953
Epoch 4/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8608 - loss: 0.3586 - val_accuracy: 0.8488 - val_loss: 0.3380
Epoch 5/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9084 - loss: 0.2635 - val_accuracy: 0.9049 - val_loss: 0.2664


<keras.src.callbacks.history.History at 0x26192604d10>

In [49]:
loss, accuracy = model.evaluate(testData, testLabel)
print(f"test accuracy: {accuracy}")

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9030 - loss: 0.2927 
test accuracy: 0.9048780202865601


In [50]:
# use trained model to predict on new images
def classify_image(image_path):
    image = cv2.imread(image_path)
    keypoints = extract_keypoints(image)
    keypoints = np.expand_dims(keypoints, axis=0)
    prediction = model.predict(keypoints)
    predicted_class = label_encoder.inverse_transform([np.argmax(prediction)])
    return predicted_class[0]


In [51]:
def classify_video(video_path):
    vid = cv2.VideoCapture(video_path)
    if not vid.isOpened():
        print("Error opening video file")
        return
    dict1 = {}
    while vid.isOpened():
        ret, frame = vid.read()
        if not ret:
            break
        keypoints = extract_keypoints(frame)
        keypoints = np.expand_dims(keypoints, axis=0)
        prediction = model.predict(keypoints)
        predicted_class = label_encoder.inverse_transform([np.argmax(prediction)])
        if predicted_class[0] not in dict1:
            dict1[predicted_class[0]] =1
        else:
            dict1[predicted_class[0]] +=1
        print(predicted_class)
        cv2.putText(frame, f'Predicted: {predicted_class[0]}', (10, 30), 
            cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
        
        cv2.imshow("Video classification", frame)

        # break loop on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    vid.release()
    cv2.destroyAllWindows()
    # return predicted_class[0]
    maxv = None  # Initialize maxv to negative infinity
    curv = None  
    for v in dict1:
        if  maxv == None or dict1[v] > dict1[maxv]:
            cur1 = dict1[v]
            curv = v
            max1 = cur1
            maxv = curv
    return maxv


In [52]:
import warnings

# Suppress specific UserWarning from protobuf
warnings.filterwarnings("ignore", category=UserWarning, message=r'SymbolDatabase.GetPrototype\(\) is deprecated')


In [53]:
path_test_img = "./test_data/nickside_pushup.MOV"
# predicted_exercise = classify_image(path_test_img)
predicted_exercise = classify_video(path_test_img)
print(f"predicted exercise: {predicted_exercise}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
['barbell biceps curl']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
['push up']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
['push up']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
['push up']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
['push up']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
['push up']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
['push up']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
['push up']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
['push up']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
['push up']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
['push up']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18