In [1]:
import copy
import cv2
import h5py
import imageio
import math
import matlab.engine
import numpy as np
import os
import pickle
import skimage
import skimage.io
import skimage.transform
from functools import lru_cache
from tqdm import tqdm as tqdm
from vectormath import Vector2, Vector3

In [2]:
# NOTE: pickle can execute arbitrary code while loading; only load files from a trusted source.
# Load the table of depth-bin limits. The slices below read entries 0..64,
# so the table is presumably 65 limits delimiting 64 bins -- TODO confirm.
with open('z_limits.dat', 'rb') as z_limits_file:
    z_limits = pickle.load(z_limits_file)
z_centers = (z_limits[1:65] + z_limits[0:64]) / 2  # midpoint between consecutive limits
z_delta = z_limits[32]  # offset subtracted before binning in the z-index formula used later
z_depth = z_limits[1] - z_limits[0]  # distance between the first two limits

# Reference annotations that the converted data is compared against.
with open("train.bin", 'rb') as anno_file:
    anno = pickle.load(anno_file)

In [3]:
# Quick look at the depth-bin table: the first two limits, the first two
# centers, then the offset and the bin width (one printed line each).
for shown in ((z_limits[0], z_limits[1]), (z_centers[0], z_centers[1]), (z_delta,), (z_depth,)):
    print(*shown)

-1005.18590298 -974.153937906
-989.669920444 -958.637955369
-12.1630205748
31.0319650752


In [4]:
# Location of the Human3.6M release whose MATLAB scripts are put on the MATLAB path.
root = 'D:/data/Human3.6M/Release-v1.1/'

# Every directory below (and including) `root`, except paths containing '.git'.
script_paths = [entry[0] for entry in os.walk(root) if '.git' not in entry[0]]

# Extra directories to add to the MATLAB path (none at the moment).
additional_script_paths = []

# Training subjects followed by validation subjects.
subjects = [1, 5, 6, 7, 8] + [9, 11]

In [5]:
# Start a MATLAB session and register every collected directory on its search path.
core = matlab.engine.start_matlab()
for script_path in [*script_paths, *additional_script_paths]:
    core.addpath(script_path)

In [6]:
# Populate the MATLAB workspace with the objects the helper functions read back.
# NOTE(review): the H36M* classes come from the MATLAB scripts put on the path;
# their behaviour is not visible from this notebook.
core.workspace['DB'] = core.H36MDataBase.instance()  # passed to getNumFrames in get_max_frame
core.workspace['feature_RGB'] = core.H36MRGBVideoFeature()  # used by get_RGB and get_video_name
core.workspace['feature_BB'] = core.H36MMyBBMask()  # used by get_center_scale
core.workspace['feature_BG'] = core.H36MMyBGMask()  # not referenced by the code visible in this notebook
core.workspace['features'] = [
    core.H36MPose2DPositionsFeature(),  # get_pose reshapes this feature's result to 32 x 2
    core.H36MPose3DPositionsFeature('Monocular', True),  # get_pose reshapes this feature's result to 32 x 3
]

In [7]:
def valid_sequence(subject, action, sub_action, camera):
    """Tell whether the four ids address a sequence this notebook accepts.

    Accepted values: subject in {1, 5, 6, 7, 8, 9, 11}, action in 1..16,
    sub_action in 1..2 and camera in 1..4 (all bounds inclusive).
    """
    if subject not in (1, 5, 6, 7, 8, 9, 11):
        return False
    if not (1 <= action <= 16):
        return False
    if not (1 <= sub_action <= 2):
        return False
    return 1 <= camera <= 4

In [8]:
def get_max_frame(subject, action, sub_action):
    """Return, as a Python int, the frame count MATLAB reports for one sub-action.

    The count is looked up through `getNumFrames` on the database object
    stored in the MATLAB workspace under 'DB'; it does not take a camera id.
    """
    database = core.workspace['DB']
    frame_count = core.getNumFrames(database, subject, action, sub_action)
    return int(frame_count)

In [9]:
def get_sequence(subject, action, sub_action, camera):
    """Create an H36MSequence in the MATLAB workspace and return it.

    The object is stored under the workspace name 'sequence' (overwriting any
    previous one) because other helpers refer to that name inside
    ``core.eval`` strings (get_pose sets ``sequence.IdxFrames``). The value
    returned is read back from the workspace, not the local construction result.

    NOTE(review): the meaning of the trailing -1 argument is defined by the
    MATLAB H36MSequence class, which is not visible here -- confirm.
    """
    core.workspace['sequence'] = core.H36MSequence(subject, action, sub_action, camera, -1)
    return core.workspace['sequence']

In [10]:
def get_intrinsics(subject, action, sub_action, camera):
    """Read the camera attributes f, c, k and p of one sequence from MATLAB.

    The camera object is stored in the MATLAB workspace as 'camera'; each
    attribute is then evaluated there and its first row is returned.

    Returns:
        The tuple (f, c, k, p), in the form expected by `project`.

    Raises:
        IndexError: if the ids do not pass `valid_sequence`.
    """
    if not valid_sequence(subject, action, sub_action, camera):
        raise IndexError()

    core.workspace['camera'] = core.getCamera(get_sequence(subject, action, sub_action, camera))

    intrinsics = []
    for attribute in ('f', 'c', 'k', 'p'):
        intrinsics.append(core.eval('camera.%s' % attribute)[0])

    f, c, k, p = intrinsics
    return f, c, k, p

In [11]:
def get_RGB(subject, action, sub_action, camera, frame):
    """Fetch one video frame through MATLAB together with the video's name.

    Args:
        subject, action, sub_action, camera: sequence ids (see `valid_sequence`).
        frame: 1-based frame number, at most `get_max_frame(...)`.

    Returns:
        (image, video_name): `image` is a float64 array whose axes follow the
        order of the MATLAB array's size; `video_name` is the value of
        ``metadata.Reader.VideoName`` in the MATLAB workspace.

    Raises:
        IndexError: for ids rejected by `valid_sequence` or a frame out of range.
    """
    if not valid_sequence(subject, action, sub_action, camera):
        raise IndexError()

    max_frame = get_max_frame(subject, action, sub_action)
    if not (1 <= frame <= max_frame):
        raise IndexError()

    sequence = get_sequence(subject, action, sub_action, camera)
    core.workspace['metadata'] = core.serializer(core.workspace['feature_RGB'], sequence)

    image = core.getFrame(core.workspace['metadata'], core.double(frame))
    # Reshaping with the dimensions reversed and then reversing the axes reads
    # the flat buffer in column-major order.
    # `float` replaces the `np.float` alias (removed in NumPy 1.24; same float64
    # dtype), and the shape is passed positionally because the `newshape=`
    # keyword of np.reshape is deprecated in recent NumPy.
    size = image._size
    image = np.reshape(np.asarray(image._data, dtype=float), (size[2], size[1], size[0])).transpose(2, 1, 0)

    video_name = core.eval('metadata.Reader.VideoName')

    return image, video_name

In [12]:
def get_video_name(subject, action, sub_action, camera):
    """Return ``metadata.Reader.VideoName`` for the RGB video of one sequence.

    Side effect: the MATLAB workspace variables 'sequence' and 'metadata' are
    overwritten.

    Raises:
        IndexError: if the ids do not pass `valid_sequence`.
    """
    if not valid_sequence(subject, action, sub_action, camera):
        raise IndexError()

    seq = get_sequence(subject, action, sub_action, camera)
    rgb_feature = core.workspace['feature_RGB']
    core.workspace['metadata'] = core.serializer(rgb_feature, seq)

    return core.eval('metadata.Reader.VideoName')

In [13]:
def get_pose(subject, action, sub_action, camera, frame):
    """Compute the 2D and 3D pose of a single frame through MATLAB.

    The workspace variable 'sequence' is restricted to `frame` (via
    ``sequence.IdxFrames``) before the features stored in the workspace
    under 'features' are computed.

    Returns:
        (pose_2d, pose_3d): arrays of shape (32, 2) and (32, 3).

    Raises:
        IndexError: for ids rejected by `valid_sequence` or a frame outside
            1..get_max_frame(...).
    """
    if not valid_sequence(subject, action, sub_action, camera):
        raise IndexError()

    max_frame = get_max_frame(subject, action, sub_action)
    if not (1 <= frame <= max_frame):
        raise IndexError()

    sequence = get_sequence(subject, action, sub_action, camera)
    core.eval('sequence.IdxFrames = %d;' % frame, nargout=0)

    computed = core.H36MComputeFeatures(sequence, core.workspace['features'])

    pose_2d = np.asarray(computed[0]).reshape(32, 2)
    pose_3d = np.asarray(computed[1]).reshape(32, 3)
    return pose_2d, pose_3d

In [14]:
def get_center_scale(subject, action, sub_action, camera, frame):
    """Derive the crop center and scale of one frame from its bounding-box mask.

    The mask is fetched through MATLAB (feature 'feature_BB'). The first and
    last non-zero pixels in row-major order are taken as the upper-left and
    bottom-right corners, which is exact when the mask is a filled
    axis-aligned rectangle.

    Args:
        subject, action, sub_action, camera: sequence ids (see `valid_sequence`).
        frame: 1-based frame number, at most `get_max_frame(...)`.

    Returns:
        (center, scale): `center` is the Vector2 midpoint of the two corners
        (x = column, y = row); `scale` is the longer box side divided by 200.

    Raises:
        IndexError: for ids rejected by `valid_sequence`, a frame out of
            range, or a mask without any non-zero pixel.
    """
    if not valid_sequence(subject, action, sub_action, camera):
        raise IndexError()

    max_frame = get_max_frame(subject, action, sub_action)
    if not (1 <= frame <= max_frame):
        raise IndexError()

    sequence = get_sequence(subject, action, sub_action, camera)
    core.workspace['metadata'] = core.serializer(core.workspace['feature_BB'], sequence)

    mask = core.getFrame(core.workspace['metadata'], core.double(frame))
    # `float` replaces the `np.float` alias (removed in NumPy 1.24; same float64
    # dtype), and the shape is passed positionally because the `newshape=`
    # keyword of np.reshape is deprecated in recent NumPy.
    mask = np.reshape(np.asarray(mask._data, dtype=float), (mask._size[1], mask._size[0])).transpose(1, 0)

    # Flat (row-major) indices of all non-zero mask pixels.
    flatten = mask.flatten()
    flatten = np.nonzero(flatten)[0]
    ul, br = [flatten[where] for where in [0, -1]]
    # Convert the flat indices back to (x, y) = (column, row).
    ul = Vector2(ul % mask.shape[1], ul // mask.shape[1])
    br = Vector2(br % mask.shape[1], br // mask.shape[1])

    center = (ul + br) / 2
    height = (br - ul).y
    width  = (br - ul).x
    scale = max(height, width) / 200

    return center, scale

In [15]:
def get_center_scale_directly(video_name, frame, data_root='D:/data/Human3.6M/downloaded/'):
    """Derive the crop center and scale of one frame straight from the mask file.

    Same computation as `get_center_scale`, but the bounding-box mask is read
    from the HDF5 ``.mat`` file on disk instead of going through MATLAB.

    Args:
        video_name: video path as returned by `get_RGB` / `get_video_name`;
            the subject, action and camera parts are parsed out of it. The
            action name is used unmodified (no space replacement) in the path.
        frame: 1-based frame number, as in the other helpers.
        data_root: directory that contains the per-subject folders.

    Returns:
        (center, scale): `center` is the Vector2 midpoint of the box corners
        (x = column, y = row); `scale` is the longer box side divided by 200.

    Raises:
        IndexError: if `frame` is smaller than 1, or the mask has no non-zero pixel.
    """
    if frame < 1:
        # Without this guard frame - 1 would become a negative index.
        raise IndexError()

    sub = video_name.split('/')[-3].split('\\')[0]
    act, cam = video_name.split('/')[-1].split('.mp4')[0].split('.')

    bb_path = os.path.join(data_root, sub, 'MySegmentsMat', 'ground_truth_bb', '%s.%s.mat' % (act, cam))
    with h5py.File(bb_path, 'r') as file:
        # 'Masks' is indexed from 0 while `frame` is 1-based, hence frame - 1
        # (the same convention the conversion loop uses).
        mask = np.asarray(file[file['Masks'][frame - 1][0]]).transpose(1, 0)

    # Flat (row-major) indices of all non-zero mask pixels.
    flatten = mask.flatten()
    flatten = np.nonzero(flatten)[0]
    ul, br = [flatten[where] for where in [0, -1]]
    # Convert the flat indices back to (x, y) = (column, row).
    ul = Vector2(ul % mask.shape[1], ul // mask.shape[1])
    br = Vector2(br % mask.shape[1], br // mask.shape[1])

    center = (ul + br) / 2
    height = (br - ul).y
    width  = (br - ul).x
    scale = max(height, width) / 200

    return center, scale

In [16]:
def project(keypoints, f, c, k, p):
    """Project 3D points to 2D with radial and tangential distortion.

    Args:
        keypoints: array of shape (N, 3); each row is divided by its third
            component before distortion is applied.
        f: per-axis factor multiplied onto the distorted coordinates.
        c: per-axis offset added after scaling.
        k: three coefficients weighting r^2, r^4 and r^6.
        p: two tangential coefficients.

    Returns:
        Array of shape (N, 2).
    """
    points = keypoints.transpose(1, 0)  # (3, N)
    normalized = np.divide(points[0:2, :], points[2, :])
    r2 = np.power(normalized[0, :], 2) + np.power(normalized[1, :], 2)

    # Radial term: 1 + k0*r^2 + k1*r^4 + k2*r^6.
    radius_powers = np.asarray([r2, np.power(r2, 2), np.power(r2, 3)])
    radial = 1 + np.dot(k, radius_powers)

    tangential = p[0] * normalized[1, :] + p[1] * normalized[0, :]
    distortion = radial + tangential

    # Additive shift: row 0 uses p[1], row 1 uses p[0], both times r^2.
    shift = np.outer(np.asarray([p[1], p[0]]), r2)

    distorted = (normalized * distortion + shift).transpose(1, 0)  # (N, 2)
    return f * distorted + c

In [17]:
def crop_image(image, center, scale, rotate, resolution):
    """Cut a square box around `center` out of `image` and return it at `resolution` x `resolution`.

    The box side is ``200 * scale`` pixels. Parts of the box that fall outside
    the image are left at zero.

    Args:
        image: array of shape (height, width, channel).
        center: box center; copied into a Vector2 (x, y), the caller's object is not modified.
        scale: box side divided by 200.
        rotate: angle handed to ``skimage.transform.rotate``; 0 skips the rotation.
        resolution: side length of the returned image in pixels.

    Returns:
        The cropped (and optionally rotated) image.

    Raises:
        ValueError: if the box is so large that the pre-shrunk image would have
            a longer side of less than 2 pixels.
    """
    center = Vector2(center)  # work on a copy so the in-place division below stays local
    height, width, channel = image.shape
    crop_ratio = 200 * scale / resolution  # box side relative to the output side
    if crop_ratio >= 2:  # box is at least twice as large as the output
        # Shrink the whole image first so that the box becomes `resolution` pixels wide.
        height = math.floor(height / crop_ratio)
        width = math.floor(width / crop_ratio)

        if max([height, width]) < 2:
            # Zoomed out so much that the image is now a single pixel or less
            raise ValueError("Width or height is invalid!")

        image = skimage.transform.resize(image, (height, width), mode='constant')
#         image = image.resize(image, (height, width), mode='constant')
        # Keep center and scale consistent with the shrunk image.
        center /= crop_ratio
        scale /= crop_ratio

    # Upper-left and bottom-right box corners in integer pixel coordinates.
    ul = (center - 200 * scale / 2).astype(int)
    br = (center + 200 * scale / 2).astype(int)  # Vector2

    if crop_ratio >= 2:  # make the box exactly resolution x resolution (br = ul + resolution)
        br -= (br - ul - resolution)

    # Margin added around the box before rotating: box diagonal minus half the box width, rounded up.
    pad_length = math.ceil((ul - br).length - (br.x - ul.x) / 2)

    if rotate != 0:
        ul -= pad_length
        br += pad_length

    # src: part of the box that lies inside the image, as [y0, y1, x0, x1] in image coordinates.
    # dst: the same region expressed in coordinates of the new (box-sized) image.
    src = [max(0, ul.y), min(height, br.y), max(0, ul.x), min(width, br.x)]
    dst = [max(0, -ul.y), min(height, br.y) - ul.y, max(0, -ul.x), min(width, br.x) - ul.x]

    new_image = np.zeros([br.y - ul.y, br.x - ul.x, channel], dtype=np.float32)
    new_image[dst[0]:dst[1], dst[2]:dst[3], :] = image[src[0]:src[1], src[2]:src[3], :]

    if rotate != 0:
        new_image = skimage.transform.rotate(new_image, rotate)
        new_height, new_width, _ = new_image.shape
        # Remove the margin that was added before the rotation.
        new_image = new_image[pad_length:new_height - pad_length, pad_length:new_width - pad_length, :]

    if crop_ratio < 2:
        # The image was not pre-shrunk, so bring the crop to the output size now.
        new_image = skimage.transform.resize(new_image, (resolution, resolution), mode='constant')
#         new_image = Image.resize(new_image, (resolution, resolution), mode='constant')

    return new_image


In [18]:
# subject = 1
# action = 2
# sub_action = 1
# camera = 1
# frame = 1

# pelvis = [1]
# left_leg = [7, 8, 9]
# right_leg = [2, 3, 4]
# spine = [13, 14, 15, 16]
# left_arm = [18, 19, 20]
# right_arm = [26, 27, 28]
# keypoints = pelvis + left_leg + right_leg + spine + left_arm + right_arm

# image, image_name = get_RGB(subject, action, sub_action, camera, frame) # RGB image
# center, scale = get_center_scale(subject, action, sub_action, camera, frame) # center, scale
# in_image_space, in_camera_space = get_pose(subject, action, sub_action, camera, frame) # part, S
# f, c, k, p = get_intrinsics(subject, action, sub_action, camera)

# z = in_camera_space[:, -1]
# z_center = z[0]
# z_index = (z - z_center - z_delta)/(z_depth) + 33
# z_index = np.floor(z_index).astype(int) # zidx

In [19]:
# imageio.imwrite('original.jpg', crop_image(image, center, scale, 0, 256))

In [20]:
# dir_center, dir_scale = get_center_scale_directly(image_name, frame)
# imageio.imwrite('directly.jpg', crop_image(image, dir_center, dir_scale, 0, 256))

In [21]:
# print(anno['image'][0])

# sub = image_name.split('/')[-3].split('\\')[0]
# act, cam = image_name.split('/')[-1].split('.mp4')[0].split('.')
# act = act.replace(' ', '_')
# image_name = '%s_%s.%s_%06d' % (sub, act, cam, frame)
# print(image_name)

In [22]:
# print(anno['S'][0])

# print(np.reshape([in_camera_space[idx-1] for idx in keypoints], (-1, 3)))

In [23]:
# print(anno['part'][0])
# print(np.reshape([in_image_space[idx-1] for idx in keypoints], (-1, 2)))

In [24]:
# print(anno['scale'][0])
# print(scale)

In [25]:
# print(anno['center'][0])
# print(center)

In [26]:
# print(anno['zind'][0])
# print(np.reshape([z_index[idx-1] for idx in keypoints], (-1)))

In [28]:
# Convert the training subjects into the layout of the reference annotations
# (keys 'S', 'part', 'center', 'scale', 'image', 'zind') and pickle the result.

# 1-based joint ids (out of the 32 joints returned by get_pose), grouped by body part.
pelvis = [1]
left_leg = [7, 8, 9]
right_leg = [2, 3, 4]
spine = [13, 14, 15, 16]
left_arm = [18, 19, 20]
right_arm = [26, 27, 28]
keypoints = pelvis + left_leg + right_leg + spine + left_arm + right_arm

converted = dict()
converted['S'] = list()       # 3D joints from get_pose (second return value), one (17, 3) array per frame
converted['part'] = list()    # 2D joints from get_pose (first return value), one (17, 2) array per frame
converted['center'] = list()  # bounding-box center per frame
converted['scale'] = list()   # bounding-box scale per frame
converted['image'] = list()   # image file name per frame
converted['zind'] = list()    # depth-bin index of every joint per frame

frame_step = 5  # only every 5th frame is converted
data_root = 'D:/data/Human3.6M/downloaded/'

# First pass: enumerate the sequences once and count the frames to convert,
# so the progress bar gets an exact total.
sequences = list()
total = 0

for subject in [1, 5, 6, 7, 8, ]:
    for action in range(2, 16 + 1):
        for sub_action in [1, 2]:
            # get_max_frame takes no camera id, so query MATLAB once per sub-action.
            max_frame = get_max_frame(subject, action, sub_action)

            for camera in [1, 2, 3, 4]:

                # Data corrupted. (Never true for the subjects listed above; kept
                # so the subject list can be changed safely.)
                if subject == 11 and action == 2 and sub_action == 2 and camera == 1:
                    continue

                sequences.append((subject, action, sub_action, camera, max_frame))
                # Exactly the number of iterations of the frame loop below
                # (max_frame // frame_step would undercount by up to one per sequence).
                total = total + len(range(1, max_frame + 1, frame_step))


with tqdm(total=total) as progress:

    for subject, action, sub_action, camera, max_frame in sequences:

        progress.set_description('subject(%d) action(%d-%d) camera(%d)' % (subject, action, sub_action, camera))

        video_name = get_video_name(subject, action, sub_action, camera)
        sub = video_name.split('/')[-3].split('\\')[0]
        act, cam = video_name.split('/')[-1].split('.mp4')[0].split('.')

        # The mask file path uses the action name as-is ...
        bb_path = os.path.join(data_root, sub, 'MySegmentsMat', 'ground_truth_bb', '%s.%s.mat' % (act, cam))

        # ... while the image names use underscores instead of spaces.
        act = act.replace(' ', '_')
        video_name = '%s_%s.%s' % (sub, act, cam)

        with h5py.File(bb_path, 'r') as file:

            for frame in range(1, max_frame + 1, frame_step):
                # 'Masks' is indexed from 0, frames are counted from 1.
                mask = np.asarray(file[file['Masks'][frame - 1][0]]).transpose(1, 0)

                # First / last non-zero pixel in row-major order -> box corners as (x, y).
                flatten = mask.flatten()
                flatten = np.nonzero(flatten)[0]
                ul, br = [flatten[where] for where in [0, -1]]
                ul = Vector2(ul % mask.shape[1], ul // mask.shape[1])
                br = Vector2(br % mask.shape[1], br // mask.shape[1])

                center = (ul + br) / 2 # center
                height = (br - ul).y
                width  = (br - ul).x
                scale = max(height, width) / 200 # scale

                in_image_space, in_camera_space = get_pose(subject, action, sub_action, camera, frame) # part, S

                # Depth of every joint relative to the first joint, mapped to a bin index.
                z = in_camera_space[:, -1]
                z_center = z[0]
                z_index = (z - z_center - z_delta)/(z_depth) + 33
                z_index = np.floor(z_index).astype(int) # zidx

                # Abort on the first joint whose bin index leaves 1..64.
                for idx in keypoints:
                    if not (1 <= z_index[idx-1] <= 64):
                        print(subject, action, sub_action, camera, frame)
                        raise Exception('zind out of range!')

                converted['S'].append(np.reshape([in_camera_space[idx-1] for idx in keypoints], (-1, 3)))
                converted['part'].append(np.reshape([in_image_space[idx-1] for idx in keypoints], (-1, 2)))
                converted['center'].append(center)
                converted['scale'].append(scale)
                converted['image'].append('%s_%06d.jpg' % (video_name, frame))
                converted['zind'].append(np.reshape([z_index[idx-1] for idx in keypoints], (-1)))

                progress.update(1)

# Close the output file deterministically so the pickle is fully flushed to disk.
with open('converted_train.bin', 'wb') as converted_file:
    pickle.dump(converted, converted_file)

subject(6) action(3-2) camera(3):  44%|█████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                       | 136919/311724 [11:05:44<14:09:57,  3.43it/s]


TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [36]:
# Debug cell: redo the conversion steps for one single frame
# (subject 6, action 15, sub-action 1, camera 4, frame 391) and print the
# intermediate values so they can be compared with the reference annotations.
subject = 6
action = 15
sub_action = 1
camera = 4

# Same joint selection as in the conversion cell (1-based joint ids).
pelvis = [1]
left_leg = [7, 8, 9]
right_leg = [2, 3, 4]
spine = [13, 14, 15, 16]
left_arm = [18, 19, 20]
right_arm = [26, 27, 28]
keypoints = pelvis + left_leg + right_leg + spine + left_arm + right_arm

max_frame = get_max_frame(subject, action, sub_action)

video_name = get_video_name(subject, action, sub_action, camera)
sub = video_name.split('/')[-3].split('\\')[0]
act, cam = video_name.split('/')[-1].split('.mp4')[0].split('.')

data_root = 'D:/data/Human3.6M/downloaded/'
# The mask file path is built before spaces in the action name are replaced.
bb_path = os.path.join(data_root, sub, 'MySegmentsMat', 'ground_truth_bb', '%s.%s.mat' % (act, cam))

act = act.replace(' ', '_')
video_name = '%s_%s.%s' % (sub, act, cam)

print(video_name)

with h5py.File(bb_path, 'r') as file:

    frame = 391
    
    # 'Masks' is indexed from 0, frames are counted from 1.
    mask = np.asarray(file[file['Masks'][frame-1][0]]).transpose(1, 0)

    # First / last non-zero pixel in row-major order -> box corners as (x, y).
    flatten = mask.flatten()
    flatten = np.nonzero(flatten)[0]
    ul, br = [flatten[where] for where in [0, -1]]
    ul = Vector2(ul % mask.shape[1], ul // mask.shape[1])
    br = Vector2(br % mask.shape[1], br // mask.shape[1])

    center = (ul + br) / 2 # center
    height = (br - ul).y
    width  = (br - ul).x
    scale = max(height, width) / 200 # scale

    # center, scale = get_center_scale(subject, action, sub_action, camera, frame) # center, scale
    in_image_space, in_camera_space = get_pose(subject, action, sub_action, camera, frame) # part, S

    # Depth of every joint relative to the first joint, mapped to a bin index.
    z = in_camera_space[:, -1]
    z_center = z[0]
    z_index = (z - z_center - z_delta)/(z_depth) + 33
    z_index = np.floor(z_index).astype(int) # zidx
            
    # Selected 3D joints and their bin indices; the conversion cell requires every index to be in 1..64.
    print(np.reshape([in_camera_space[idx-1] for idx in keypoints], (-1, 3)))
    print(np.reshape([z_index[idx-1] for idx in keypoints], (-1)))
    
    # From here on the MATLAB-based helpers are used: `center` and `scale`
    # computed from the HDF5 mask above are overwritten by get_center_scale,
    # and get_pose is called a second time with the same arguments.
    image, image_name = get_RGB(subject, action, sub_action, camera, frame) # RGB image
    center, scale = get_center_scale(subject, action, sub_action, camera, frame) # center, scale
    in_image_space, in_camera_space = get_pose(subject, action, sub_action, camera, frame) # part, S
    f, c, k, p = get_intrinsics(subject, action, sub_action, camera)
    
    # Write the 256 x 256 crop of this frame for visual inspection.
    imageio.imwrite('original.jpg', crop_image(image, center, scale, 0, 256))

#     converted ['S'].append(np.reshape([in_camera_space[idx-1] for idx in keypoints], (-1, 3)))
#     converted ['part'].append(np.reshape([in_image_space[idx-1] for idx in keypoints], (-1, 2)))
#     converted ['center'].append(center)
#     converted ['scale'].append(scale)
#     converted ['image'].append('%s_%06d.jpg' % (video_name, frame))
#     converted ['zind'].append(np.reshape([z_index[idx-1] for idx in keypoints], (-1)))

S6_WalkDog_1.60457274
[[  236.1051178   -672.7512207   4491.43652344]
 [  370.77529907  -691.55163574  4534.43164062]
 [  386.10348511  -206.30427551  4566.74902344]
 [  313.61047363   177.60742188  4812.37744141]
 [  101.43293762  -653.95056152  4448.44091797]
 [   72.15738678  -270.58151245  4746.63964844]
 [   38.75497055    26.1042099   5098.54638672]
 [  290.51269531  -823.11938477  4283.61962891]
 [  408.41387939  -978.88031006  4112.03076172]
 [  428.20654297 -1004.84918213  3997.18334961]
 [  411.77511597 -1111.13061523  4037.91723633]
 [  454.07437134 -1018.47338867  4248.63378906]
 [  300.04336548 -1181.57250977  4449.33056641]
 [  239.09413147 -1318.15759277  4659.44824219]
 [  316.32446289  -906.9989624   4018.94335938]
 [  312.87930298  -803.78192139  3736.20654297]
 [  369.99295044  -776.5869751   3486.16992188]]
[33 34 35 43 32 41 52 26 21 17 18 25 32 38 18  9  0]




In [45]:
# Position of 'S6_WalkDog_1.60457274_000391.jpg' in the reference annotations,
# found once with the lookup below and then hard-coded.
# for index in range(len(anno['image'])):
#     if anno['image'][index] == 'S6_WalkDog_1.60457274_000391.jpg':
#         print(index)
#         break
index = 169790

# Last joint relative to the first joint, minus z_delta -- reference annotation first ...
reference_pose = anno['S'][index]
reference_offset = reference_pose[-1] - reference_pose[0] - z_delta
print(reference_offset)

# ... then the same quantity from the pose computed in this notebook.
temp = np.asarray([in_camera_space[idx-1] for idx in keypoints]).reshape(-1, 3)
print(temp[-1] - temp[0] - z_delta)

# Bin index the reference values map to.
print(reference_offset // z_depth + 33)

[ 146.03289332  -91.61660087 -993.02288241]
[ 146.05085322  -91.67273382 -993.10358099]
[ 37.  30.   1.]
