# In [1]:
import mediapipe as mp
import cv2
import numpy as np
from turtle import right

# In [6]:
# Fallback centroid that body_tracking() uses when a frame has no pose landmarks.
# NOTE(review): nothing visible in this file assigns it at module level, so it
# appears to stay None for the whole run — confirm.
global_boundingbox_coord = None
# Fallback shoulder-to-hip distance that body_tracking() uses when a frame has
# no pose landmarks. Same caveat as above: no visible module-level assignment.
global_distance = None
# When True, coordinate_recalculation() takes the absolute pixel offset from the
# box's left/top edge (centroid minus pixel radius) and gives it the sign of the
# raw pixel coordinate; when False it uses the plain signed offset.
xy_negative_reflection = True
# When False, coordinate_recalculation() scales landmark.z by
# min(image height, image width) / (2 * px_radius); when True, landmark.z is
# passed through unchanged.
unnormalized_z = False

def normalize(holistic, mp_holistic, image):
    """Run the holistic model on one image and return its normalized coordinates.

    Args:
        holistic: Object exposing ``process(image)`` whose result carries
            ``pose_landmarks``, ``face_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks``.
        mp_holistic: Unused; kept so existing callers do not break.
        image: Image array handed to ``holistic.process`` (``capture`` passes
            an RGB frame).

    Returns:
        The flat coordinate list produced by ``extract_data``: mouth, left
        hand, right hand and pose values, three numbers per landmark.
    """
    result = holistic.process(image)
    params = {
        "pose_landmarks": result.pose_landmarks,
        "image": image,
        "result": result,
    }
    # body_tracking adds centroid/radius entries that extract_data relies on.
    return extract_data(body_tracking(params))

def extract_data(params):
    """Build the flat coordinate list for one processed frame.

    Reads ``result``, ``pose_landmarks``, ``shoulders_centroid``,
    ``hips_centroid``, ``image`` and ``radius`` from ``params`` and writes
    ``px_radius`` and ``px_radius_multiplier`` back into it.

    Returns:
        Mouth, left-hand, right-hand and pose coordinates concatenated in
        that order.
    """
    detection = params["result"]
    frame = params["image"]
    frame_h, frame_w = frame.shape[0], frame.shape[1]

    # Work on copies because pixel_coordinate_convertion mutates its argument.
    # NOTE(review): only x and y are converted to pixels, so the distance below
    # still includes the unconverted z component — confirm this is intended.
    shoulders_px = pixel_coordinate_convertion(
        params["shoulders_centroid"].copy(), frame_w, frame_h
    )
    hips_px = pixel_coordinate_convertion(
        params["hips_centroid"].copy(), frame_w, frame_h
    )
    torso_px = int(euclidean(shoulders_px, hips_px))
    params["px_radius"] = torso_px

    normalized_radius = params["radius"]
    params["px_radius_multiplier"] = (
        0 if normalized_radius == 0 else torso_px / normalized_radius
    )

    sources = (
        (detection.face_landmarks, "mouth"),
        (detection.left_hand_landmarks, "hand"),
        (detection.right_hand_landmarks, "hand"),
        (params["pose_landmarks"], "pose"),
    )
    collected = []
    for group, kind in sources:
        collected.extend(landmarks_data(group, params, kind))
    return collected

def landmarks_data(landmarks, params, key):
    """Flatten one landmark group into ``[x0, y0, z0, x1, y1, z1, ...]``.

    Args:
        landmarks: Object with a ``landmark`` sequence, or a falsy value when
            the group was not detected.
        params: Frame parameters forwarded to ``coordinate_recalculation``.
        key: ``"mouth"`` (only the ``mouth_indices()`` entries are used),
            ``"pose"`` (entries 0..32) or ``"hand"``. With landmarks present,
            any other key behaves like ``"hand"`` and uses every entry.

    Returns:
        Recalculated coordinates, or a zero-filled list of the expected length
        (21 hand, 33 pose, ``len(mouth_indices())`` mouth landmarks, three
        values each) when ``landmarks`` is missing.

    Raises:
        ValueError: If ``landmarks`` is missing and ``key`` is not one of the
            known groups, because the placeholder length is then undefined.
    """
    if not landmarks:
        if key == "hand":
            vertex_num = 21
        elif key == "pose":
            vertex_num = 33
        elif key == "mouth":
            vertex_num = len(mouth_indices())
        else:
            raise ValueError(f"unknown landmark group: {key!r}")
        return [0] * (vertex_num * 3)

    if key == "mouth":
        selected = [landmarks.landmark[i] for i in mouth_indices()]
    elif key == "pose":
        selected = [landmarks.landmark[i] for i in range(33)]
    else:
        selected = landmarks.landmark

    coordinates = []
    for landmark in selected:
        x, y, z = coordinate_recalculation(landmark, params)
        coordinates.extend((x, y, z))
    return coordinates

def coordinate_recalculation(landmark, params):
    """Re-express one landmark relative to the body-centred square box.

    The box is centred on ``params["centroid"]`` (converted to pixels) and has
    a side of ``2 * params["px_radius"]`` pixels. x and y become offsets from
    the box's left/top edge divided by that side length.

    Args:
        landmark: Object with ``x``, ``y`` and ``z`` attributes.
        params: Must contain ``px_radius``, ``centroid`` and ``image``.

    Returns:
        ``[x, y, z]``. When ``px_radius`` is 0 the box is degenerate and
        ``[0, 0, 0]`` is returned (the same placeholder used for missing
        landmarks) instead of dividing by zero.
    """
    px_radius = params["px_radius"]
    if px_radius == 0:
        # Shoulder and hip centroids collapsed to the same pixel (or no pose
        # was found): there is no scale to normalize against.
        return [0, 0, 0]

    centroid = params["centroid"]
    original_h = params["image"].shape[0]
    original_w = params["image"].shape[1]
    px_centroid_x, px_centroid_y = pixel_coordinate_convertion(
        [centroid[0], centroid[1]], original_w, original_h
    )
    left = px_centroid_x - px_radius
    top = px_centroid_y - px_radius
    px_x, px_y = pixel_coordinate_convertion(
        [landmark.x, landmark.y], original_w, original_h
    )
    diameter = 2 * px_radius

    if xy_negative_reflection:
        # Offset magnitude from the box edge, carrying the sign of the raw
        # pixel coordinate (negative only when the landmark is off-image).
        normalized_px_x = abs(left - px_x) if px_x >= 0 else -abs(left - px_x)
        normalized_px_y = abs(top - px_y) if px_y >= 0 else -abs(top - px_y)
    else:
        normalized_px_x = px_x - left
        normalized_px_y = px_y - top

    if unnormalized_z:
        z_axis = landmark.z
    else:
        # Scale z by the ratio of the image's shorter side to the box side.
        z_axis = landmark.z * (np.min([original_h, original_w]) / diameter)

    return [normalized_px_x / diameter, normalized_px_y / diameter, z_axis]

def mouth_indices():
    """Return the face-landmark indices that make up the "mouth" subset.

    A fresh list is built on every call, so callers may modify the result.
    """
    return [
        0, 13, 14, 17, 37, 39, 40, 61, 78, 80,
        81, 82, 84, 87, 88, 91, 95, 146, 178, 181,
        185, 191, 267, 269, 270, 291, 308, 310, 311, 312,
        314, 317, 318, 321, 324, 375, 402, 405, 409, 415,
    ]

def body_tracking(params):
    """Add the body centroid, radius and shoulder/hip centroids to ``params``.

    With a pose present, the centroid is the mean of landmarks 0, 11, 12, 23
    and 24, and ``find_distance`` fills in the radius and both centroids.
    Without a pose, the module-level ``global_boundingbox_coord`` and
    ``global_distance`` are used when set; otherwise everything ends up zero.

    NOTE(review): no visible code assigns those two module globals, so the
    cached-value branches look unreachable — confirm before relying on them.

    Returns:
        The same ``params`` dict, updated in place.
    """
    pose = params["pose_landmarks"]

    if pose is not None:
        params["centroid"] = find_body_centroid(pose, [0, 11, 12, 23, 24])
    elif global_boundingbox_coord is not None:
        params["centroid"] = global_boundingbox_coord
    else:
        params["centroid"] = [0, 0, 0]

    if pose is None and global_distance is not None:
        params["radius"] = global_distance
        params["shoulders_centroid"] = [0, 0, 0]
        params["hips_centroid"] = [0, 0, 0]
        return params

    # find_distance also handles pose=None, yielding zero centroids and radius.
    return find_distance(pose, params)

def find_body_centroid(landmarks, indices):
    """Average the x, y and z of the selected landmarks.

    Args:
        landmarks: Object with a ``landmark`` sequence whose items expose
            ``x``, ``y`` and ``z``; a falsy value means "no detection".
        indices: Positions in ``landmarks.landmark`` to average.

    Returns:
        ``(mean_x, mean_y, mean_z)``, or ``(0, 0, 0)`` when ``landmarks`` is
        falsy. This function does not touch the module-level
        ``global_boundingbox_coord``.
    """
    if not landmarks:
        return 0, 0, 0

    points = [landmarks.landmark[i] for i in indices]
    return (
        np.average([p.x for p in points]),
        np.average([p.y for p in points]),
        np.average([p.z for p in points]),
    )

def find_distance(landmarks, params):
    """Store the shoulder and hip centroids and their distance in ``params``.

    The shoulder centroid averages landmarks 11 and 12, the hip centroid
    landmarks 23 and 24. Their Euclidean distance is written to
    ``params["radius"]``; the centroids go to ``params["shoulders_centroid"]``
    and ``params["hips_centroid"]`` as NumPy arrays.

    Returns:
        The same ``params`` dict, updated in place.
    """
    shoulders, hips = (
        np.array(find_body_centroid(landmarks, pair))
        for pair in ([11, 12], [23, 24])
    )
    params["radius"] = euclidean(shoulders, hips)
    params["shoulders_centroid"] = shoulders
    params["hips_centroid"] = hips
    return params

def euclidean(a, b):
    """Return the Euclidean distance between two points.

    Args:
        a: Array-like coordinates (NumPy array, list or tuple).
        b: Array-like coordinates of the same shape as ``a``.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    # Coerce first so plain lists/tuples work; `list - list` raises TypeError.
    delta = np.asarray(a) - np.asarray(b)
    return np.sqrt(np.sum(np.square(delta)))

def pixel_coordinate_convertion(coordinates, w, h):
    """Scale the first two entries of ``coordinates`` to integer pixels.

    ``coordinates[0]`` is multiplied by ``w`` and ``coordinates[1]`` by ``h``,
    each truncated with ``int``. The sequence is modified in place and also
    returned; any further entries are left untouched.
    """
    coordinates[0], coordinates[1] = (
        int(coordinates[0] * w),
        int(coordinates[1] * h),
    )
    return coordinates

def draw_landmarks(image, params):
    """Draw a landmark list and its connections via ``params["mp_drawing"]``.

    Args:
        image: Image passed to ``mp_drawing.draw_landmarks``.
        params: Dict with ``mp_drawing`` (the drawing helper), ``landmarks``
            and ``connections``.

    Returns:
        The ``image`` object that was passed in.
    """
    drawer, points, edges = (
        params["mp_drawing"],
        params["landmarks"],
        params["connections"],
    )
    point_spec = drawer.DrawingSpec(color=[255, 0, 0], thickness=2, circle_radius=2)
    edge_spec = drawer.DrawingSpec(color=[0, 255, 0], thickness=1, circle_radius=2)
    drawer.draw_landmarks(
        image=image,
        landmark_list=points,
        connections=edges,
        landmark_drawing_spec=point_spec,
        connection_drawing_spec=edge_spec,
    )
    return image

def capture(path):
    """Extract normalized landmark coordinates from every frame of a video.

    Args:
        path: Video source passed to ``cv2.VideoCapture``.

    Returns:
        A list with one dict per frame read, each holding ``frame_index`` and
        a ``skeleton`` list whose single entry stores the coordinates under
        ``"pose"``. The list is empty when no frame could be read.
    """
    cap = cv2.VideoCapture(path)
    mp_holistic = mp.solutions.holistic
    frame_datas = []
    frame_index = 0
    try:
        with mp_holistic.Holistic() as holistic:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                # OpenCV delivers BGR frames; convert before running the model.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                coordinates = normalize(holistic, mp_holistic, image)
                frame_datas.append(
                    {
                        "frame_index": frame_index,
                        "skeleton": [
                            {
                                "pose": coordinates
                            }
                        ],
                    }
                )
                # NOTE(review): no window is created in this function, so
                # waitKey probably never sees a key press — confirm whether
                # the "q" abort is still needed.
                key = cv2.waitKey(1)
                frame_index += 1
                if key == ord("q"):
                    break
    finally:
        # Release the capture even if processing a frame raised.
        cap.release()
        cv2.destroyAllWindows()

    return frame_datas

# In [7]:
import glob
import os
import json
import time
# Batch export: run capture() on every ./raw_video_dataset/<class>/*.mp4 and
# write one JSON file per video that produced at least one frame.
data_save_path = "./image_crop_dynamic_bounding_box"
os.makedirs(data_save_path, exist_ok=True)

start = time.time()
label_index = 0
bad_video = []  # videos from which capture() returned no frames
# glob returns entries in arbitrary order; sort so that label_index and the
# per-class file numbering are reproducible across runs and machines.
for i in sorted(glob.glob("./raw_video_dataset/*")):
    print(label_index)
    vid_dir = os.path.join(i, "*.mp4")
    file_index = 0
    for j in sorted(glob.glob(vid_dir)):
        class_name = os.path.basename(os.path.split(j)[0])
        file_name = os.path.basename(os.path.splitext(j)[0])
        save_path = os.path.join(data_save_path, class_name+"_"+str(file_index)+".json")
        keypoints_data = capture(j)
        if not keypoints_data:
            # NOTE(review): "label_index" holds the file name here, not the
            # numeric label — confirm this is what consumers expect.
            bad = {
                "label_index": file_name,
                "label": class_name,
            }
            bad_video.append(bad)
            # file_index is not advanced, so saved files stay consecutively numbered.
            continue
        data = {
            "label_index": label_index,
            "label": class_name,
            "data": keypoints_data
        }

        with open(save_path, "w") as fp:
            json.dump(data,fp)
        file_index += 1
    label_index += 1
finish = time.time()
time_elapsed = finish - start