In [None]:
"""
merge openface, openpose, resnet, and label data per frame
 * need openface, openpose, resnet data
 * output folder: ./proc_data/Merged_VA, ./proc_data/Merged_EXP
   * create "Training", "Validation" sub-folder in output folder
 * filename of openface: <vidoe name>.csv
 * filename of openpose: <vidoe name>_openpose.csv
 * filename of resnet: <vidoe name>_resnet50.h5
"""

import numpy as np
import pandas as pd
import glob
import os
import pathlib
from pathlib import Path

In [None]:
# root folder: the current working directory
dir_submit = str(Path().resolve())
# all input and output data lives in "base_data", next to the working directory
dir_base = os.path.join(str(Path().resolve().parent), "base_data")

# NOTE: every directory string below ends with a path separator (the trailing
# "" passed to os.path.join), because later code builds file paths by plain
# string concatenation, e.g. dir_op + name + "_openpose.csv".

# openface, openpose, resnet folder
dir_of = os.path.join(dir_base, "OpenFace", "")
dir_op = os.path.join(dir_base, "OpenPose", "")
dir_rn = os.path.join(dir_base, "Resnet", "")

# VA, EXP label folder
dir_lva = os.path.join(dir_base, "annotations", "VA_Set", "")
dir_lexp = os.path.join(dir_base, "annotations", "EXPR_Set", "")

# VA, EXP output dataset folder
dir_out_va_train = os.path.join(dir_base, "Merged_with_resnet", "Merged_VA", "Training", "")
dir_out_va_val = os.path.join(dir_base, "Merged_with_resnet", "Merged_VA", "Validation", "")
dir_out_exp_train = os.path.join(dir_base, "Merged_with_resnet", "Merged_EXP", "Training", "")
dir_out_exp_val = os.path.join(dir_base, "Merged_with_resnet", "Merged_EXP", "Validation", "")

# standardization parameter folder
dir_norm_param = os.path.join(dir_base, "Merged_with_resnet", "Norm", "")

# create every output folder that does not exist yet
# (exist_ok=True replaces the separate "is it a directory?" check)
for _dir_to_create in (dir_out_va_train, dir_out_va_val,
                       dir_out_exp_train, dir_out_exp_val,
                       dir_norm_param):
    os.makedirs(_dir_to_create, exist_ok=True)

# standardize data or not
flag_norm = True

# base names of videos to exclude (multi-person videos)
file_exc = ["6-30-1920x1080", "10-60-1280x720", "30-30-1920x1080", "46-30-484x360", "52-30-1280x720",
            "130-25-1280x720", "135-24-1920x1080", "video2", "video5", "video49", "video55", "video59", "video74"
           ]


def _find_files(pattern, suffixes=("",)):
    """Return the sorted paths matching *pattern*, minus the excluded videos.

    A file is dropped when its base name without extension equals an entry
    of ``file_exc`` followed by one of *suffixes*.

    The result is built as a new list. Removing items from a list while
    iterating over that same list skips the element after each removal,
    which lets adjacent excluded files slip through.
    """
    excluded = {base + suffix for base in file_exc for suffix in suffixes}
    return [
        path for path in sorted(glob.glob(pattern))
        if os.path.splitext(os.path.basename(path))[0] not in excluded
    ]


# label files are excluded under the plain name and its "_right"/"_left" variants
_label_suffixes = ("", "_right", "_left")

# openface file search pattern
file_of = os.path.join(dir_of, "*.csv")
files_of = _find_files(file_of)
print("file number of openface: {0}".format(len(files_of)))

# openpose file search pattern
file_op = os.path.join(dir_op, "*.csv")
files_op = _find_files(file_op, ("_openpose",))
print("file number of openpose: {0}".format(len(files_op)))

# resnet file search pattern
file_rn = os.path.join(dir_rn, "*.h5")
files_rn = _find_files(file_rn, ("_resnet50",))
print("file number of resnet: {0}".format(len(files_rn)))

# VA label file search pattern *Train*
file_lva_train = os.path.join(dir_lva, "Training_Set", "*.txt")
files_lva_train = _find_files(file_lva_train, _label_suffixes)
print("file number of VA label (Train): {0}".format(len(files_lva_train)))

# VA label file search pattern *Validation*
file_lva_val = os.path.join(dir_lva, "Validation_Set", "*.txt")
files_lva_val = _find_files(file_lva_val, _label_suffixes)
print("file number of VA label (Validation): {0}".format(len(files_lva_val)))

# EXP label file search pattern *Train*
file_lexp_train = os.path.join(dir_lexp, "Training_Set", "*.txt")
files_lexp_train = _find_files(file_lexp_train, _label_suffixes)
print("file number of EXP label (Train): {0}".format(len(files_lexp_train)))

# EXP label file search pattern *Validation*
file_lexp_val = os.path.join(dir_lexp, "Validation_Set", "*.txt")
files_lexp_val = _find_files(file_lexp_val, _label_suffixes)
print("file number of EXP label (Validation): {0}".format(len(files_lexp_val)))



In [None]:
# compute the standardization parameters of a data set
def get_standardize_param(in_data):
    """Return ``(mean, std)`` of *in_data*, as given by its ``mean()`` and ``std()`` methods."""
    return in_data.mean(), in_data.std()


In [None]:
# calculate standardization parameters from csv files (ex. openface files) and save them
# the save paths are .h5 files
def standardize_data(files_data, out_path_mean, out_path_std):
    """Compute per-column mean and std over all rows of several csv files.

    The rows of every file are stacked by position; the column names are
    taken from the first file. Files without data rows are skipped. The
    resulting mean and std are written to HDF5 under the key ``"key"``.

    Args:
        files_data: sequence of csv file paths; must not be empty.
        out_path_mean: path of the .h5 file receiving the mean values.
        out_path_std: path of the .h5 file receiving the std values.

    Returns:
        ``(pmean, pstd)`` as returned by ``get_standardize_param``.

    Raises:
        ValueError: if *files_data* is empty, or if the files do not all
            have the same number of columns.
    """
    max_count = len(files_data)
    if max_count < 1:
        raise ValueError("files_data is empty: no file to compute parameters from")

    data_columns = None
    chunks = []
    total_rows = 0

    for count, path in enumerate(files_data, start=1):
        data = pd.read_csv(path)
        if data_columns is None:
            # column names always come from the first file
            data_columns = data.columns
        if len(data) < 1:
            continue

        np_data_tmp = data.values
        # Collect the pieces and concatenate once at the end; growing the
        # array with np.append would copy all previous rows for every file.
        chunks.append(np_data_tmp)
        total_rows += np_data_tmp.shape[0]

        log = "{0}/{1}, data shape: {2}, sum: {3}".format(
            count, max_count, np_data_tmp.shape, (total_rows, len(data_columns)))
        print(log)

    if chunks:
        np_data = np.concatenate(chunks, axis=0)
    else:
        # every file was empty: keep a (0, n_columns) table
        np_data = np.empty((0, len(data_columns)))
    data = pd.DataFrame(np_data, columns=data_columns)

    log = "all loaded, data shape: {0}".format(np_data.shape)
    print(log)

    pmean, pstd = get_standardize_param(data)

    pmean.to_hdf(out_path_mean, key="key", mode="w", complevel=5, complib="lzo")
    pstd.to_hdf(out_path_std, key="key", mode="w", complevel=5, complib="lzo")

    return pmean, pstd


In [None]:
# calculate standardization parameters from resnet .h5 files and save them
# the save paths are .h5 files
def standardize_data_rn(files_data, out_path_mean, out_path_std, n_cols=201):
    """Compute per-column mean and std over all rows of several HDF5 files.

    Each file is read with ``pd.read_hdf``, its index is reset and only the
    first *n_cols* columns are kept. The rows of every file are stacked by
    position; the column names are taken from the first file. Files without
    rows are skipped. The resulting mean and std are written to HDF5 under
    the key ``"key"``.

    Args:
        files_data: sequence of .h5 file paths; must not be empty.
        out_path_mean: path of the .h5 file receiving the mean values.
        out_path_std: path of the .h5 file receiving the std values.
        n_cols: number of leading columns to use (default 201).
            NOTE(review): presumably "frame" plus 200 feature columns;
            confirm against the resnet files.

    Returns:
        ``(pmean, pstd)`` as returned by ``get_standardize_param``.

    Raises:
        ValueError: if *files_data* is empty, or if the files do not all
            have the same number of columns.
    """
    max_count = len(files_data)
    if max_count < 1:
        raise ValueError("files_data is empty: no file to compute parameters from")

    data_columns = None
    chunks = []
    total_rows = 0

    for count, path in enumerate(files_data, start=1):
        data = pd.read_hdf(path)
        data = data.reset_index(drop=True).iloc[:, 0:n_cols]
        if data_columns is None:
            # column names always come from the first file
            data_columns = data.columns
        if len(data) < 1:
            continue

        np_data_tmp = data.values
        # Collect the pieces and concatenate once at the end; growing the
        # array with np.append would copy all previous rows for every file.
        chunks.append(np_data_tmp)
        total_rows += np_data_tmp.shape[0]

        log = "{0}/{1}, data shape: {2}, sum: {3}".format(
            count, max_count, np_data_tmp.shape, (total_rows, len(data_columns)))
        print(log)

    if chunks:
        np_data = np.concatenate(chunks, axis=0)
    else:
        # every file was empty: keep a (0, n_columns) table
        np_data = np.empty((0, len(data_columns)))
    data = pd.DataFrame(np_data, columns=data_columns)

    log = "all loaded, data shape: {0}".format(np_data.shape)
    print(log)

    pmean, pstd = get_standardize_param(data)

    pmean.to_hdf(out_path_mean, key="key", mode="w", complevel=5, complib="lzo")
    pstd.to_hdf(out_path_std, key="key", mode="w", complevel=5, complib="lzo")

    return pmean, pstd


In [None]:
# if flag_norm is set, compute and save the standardization parameters of
# openface, openpose and resnet, then merge them into one table
if flag_norm:
    # openface standardization
    mean_data_of, std_data_of = standardize_data(files_of, dir_norm_param + "raw_mean_of.h5",
                                                 dir_norm_param + "raw_std_of.h5")
    # openpose standardization
    mean_data_op, std_data_op = standardize_data(files_op, dir_norm_param + "raw_mean_op.h5",
                                                 dir_norm_param + "raw_std_op.h5")
    # resnet standardization
    mean_data_rn, std_data_rn = standardize_data_rn(files_rn, dir_norm_param + "raw_mean_rn.h5",
                                                    dir_norm_param + "raw_std_rn.h5")
    # merge in the order openface, openpose, resnet.
    # pd.concat is used because Series.append was removed in pandas 2.0.
    # reset_index() moves the column names into the first column, so the
    # parameter values end up in the second column (position 1).
    mean_data = pd.concat([mean_data_of, mean_data_op, mean_data_rn]).reset_index()
    std_data = pd.concat([std_data_of, std_data_op, std_data_rn]).reset_index()
else:
    # placeholders when standardization is disabled
    mean_data = 1
    std_data = 1

In [None]:
# load the saved standardization parameters
# openface standardization
mean_data_of = pd.read_hdf(dir_norm_param + "raw_mean_of.h5", key="key")
std_data_of = pd.read_hdf(dir_norm_param + "raw_std_of.h5", key="key")
# openpose standardization
mean_data_op = pd.read_hdf(dir_norm_param + "raw_mean_op.h5", key="key")
std_data_op = pd.read_hdf(dir_norm_param + "raw_std_op.h5", key="key")
# resnet standardization
mean_data_rn = pd.read_hdf(dir_norm_param + "raw_mean_rn.h5", key="key")
std_data_rn = pd.read_hdf(dir_norm_param + "raw_std_rn.h5", key="key")

# merge in the order openface, openpose, resnet.
# pd.concat is used because Series.append was removed in pandas 2.0.
# reset_index() moves the column names into the first column, so the
# parameter values end up in the second column (position 1).
mean_data = pd.concat([mean_data_of, mean_data_op, mean_data_rn]).reset_index()
std_data = pd.concat([std_data_of, std_data_op, std_data_rn]).reset_index()

In [None]:
# merge openface, openpose, resnet and label
def merge_of_data_label(files_of, dir_op, dir_rn, files_label, dir_out, param_mean, param_std, flag_norm, str_type):
    """Join OpenFace, OpenPose and ResNet data per frame, add labels, and save.

    For every label file whose base name (without extension) equals the base
    name of an OpenFace file, the matching ``<name>_openpose.csv`` and
    ``<name>_resnet50.h5`` files are read, joined to the OpenFace rows on the
    frame number, optionally standardized, left-merged with the labels on
    ``"frame"`` and written to ``dir_out + <name> + ".h5"``. Label files
    without a matching OpenFace file are skipped silently.

    Args:
        files_of: list of OpenFace csv paths.
        dir_op: OpenPose folder; must end with a path separator because file
            names are appended by string concatenation.
        dir_rn: ResNet folder; same trailing-separator requirement.
        files_label: list of label file paths.
        dir_out: output folder; same trailing-separator requirement.
        param_mean: table whose column at position 1 holds one mean per
            merged column position (only read when *flag_norm* is truthy).
        param_std: same layout as *param_mean*, holding std values.
        flag_norm: standardize the selected columns when truthy.
        str_type: unused; kept for interface compatibility.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    if len(files_of) < 1:
        print("openface files are not found")
        return

    # Map base name -> OpenFace path once instead of rescanning files_of for
    # every label file. setdefault keeps the first path for a duplicated
    # name, the same file a front-to-back scan would find.
    of_by_name = {}
    for path_of in files_of:
        of_by_name.setdefault(os.path.splitext(os.path.basename(path_of))[0], path_of)

    count = 1
    max_count = len(files_label)

    for f_lv in files_label:
        # get label name and look up the OpenFace file with the same name
        name_lv = os.path.splitext(os.path.basename(f_lv))[0]
        path_of = of_by_name.get(name_lv)
        if path_of is None:
            continue
        name_of = name_lv

        # set save file name
        file_out = dir_out + name_of + ".h5"

        # read label; the frame number is the 1-based row position
        data_lv = pd.read_csv(f_lv)
        data_lv['frame'] = data_lv.index+1

        # openface: delete duplicated frames, index by "frame" (column kept)
        data_of = pd.read_csv(path_of)
        data_of = data_of.drop_duplicates(["frame"])
        data_of = data_of.set_index("frame", drop=False)

        # openpose: frame number is "Unnamed: 0" + 1, used as index
        f_op = dir_op + name_of + "_openpose.csv"
        data_op = pd.read_csv(f_op)
        data_op["frame"] = data_op["Unnamed: 0"]+1
        data_op = data_op.set_index("frame")

        # resnet: first 201 columns, indexed by its own "frame" column
        f_rn = dir_rn + name_of + "_resnet50.h5"
        data_rn = pd.read_hdf(f_rn).iloc[:,0:201]
        data_rn = data_rn.set_index("frame")

        # join openpose and resnet onto the openface frames; frames missing
        # in either source are filled with 0
        data_tmp = data_of.join(data_op)
        data_tmp = data_tmp.join(data_rn)
        data_tmp = data_tmp.fillna(0)
        data_tmp = data_tmp[data_tmp["frame"]>0]
        data_tmp = data_tmp.reset_index(drop=True)

        if flag_norm:
            # Standardize by column position: positions 5-35 and every
            # position from 56 on; positions 0-4 and 36-55 stay untouched.
            # The parameter for a column is read from the same row position
            # of param_mean / param_std.
            # NOTE(review): the resnet "frame" column is moved to the index
            # here; if the parameter tables still contain an entry for it,
            # the resnet positions would be shifted by one. Confirm against
            # the data.
            for col in range(len(data_tmp.columns)):
                if (5 <= col <= 35) or (col >= 56):
                    data_tmp.iloc[:,col] = (data_tmp.iloc[:,col] - param_mean.iloc[col,1]) / param_std.iloc[col,1]

        # merge data and label based on "frame"
        data_merge = data_tmp.merge(data_lv, on='frame', how='left')

        # names recovered from the openpose / resnet file names, for the log
        name_op = os.path.splitext(os.path.basename(f_op))[0].replace("_openpose", "")
        name_rn = os.path.splitext(os.path.basename(f_rn))[0].replace("_resnet50", "")

        # save merged file
        data_merge.to_hdf(file_out, key="key", mode="w", complevel=5, complib="lzo")
        log = "{0}/{1}: {2}, {3}, {4}".format(count, max_count, name_of, name_op, name_rn)
        print(log)
        count = count + 1

    log = "** finished **"
    print(log)


In [None]:

# create and save the merged data sets, in this order:
# "VA Training", "VA Validation", "EXP Training", "EXP Validation"
for label_files, out_dir, label_type in (
    (files_lva_train, dir_out_va_train, "VA"),
    (files_lva_val, dir_out_va_val, "VA"),
    (files_lexp_train, dir_out_exp_train, "EXP"),
    (files_lexp_val, dir_out_exp_val, "EXP"),
):
    merge_of_data_label(files_of, dir_op, dir_rn, label_files, out_dir,
                        mean_data, std_data, flag_norm, label_type)
