In [None]:
'''
Save rolled (windowed) data including OpenFace, OpenPose, and label columns.
 * rolling windows: 1 s, 3 s, 6 s, 12 s (30, 90, 180, 360 frames)
 * resampling: 1-frame shift (overlapping windows)
 * creates the "Validation_Frame" subfolder in each output folder
'''
#from numba import jit
import numpy as np
import pandas as pd
import glob
import os
from statistics import mean, median,variance,stdev
import math
import scipy.stats as stats
import pathlib
from pathlib import Path

In [None]:
# Root folders: the notebook's working directory and its sibling "base_data".
dir_submit = str(Path().resolve())
dir_base = os.path.join(str(Path().resolve().parent), "base_data")

# Merged input folders (va, exp : validation).
# Paths are built with os.path.join so the notebook is not tied to the Windows
# "\\" separator. The trailing "" keeps a trailing separator on each folder,
# because later code concatenates file names directly onto these strings.
dir_data_va_val = os.path.join(dir_base, "Merged_with_resnet", "Merged_VA", "Validation", "")
dir_data_exp_val = os.path.join(dir_base, "Merged_with_resnet", "Merged_EXP", "Validation", "")

# Output folders for the rolled data; created on demand.
dir_out_va_val = os.path.join(dir_base, "Merged_with_resnet", "Merged_VA_roll", "Validation_Frame", "")
os.makedirs(dir_out_va_val, exist_ok=True)

dir_out_exp_val = os.path.join(dir_base, "Merged_with_resnet", "Merged_EXP_roll", "Validation_Frame", "")
os.makedirs(dir_out_exp_val, exist_ok=True)


# Sorted list of file paths: VA validation data
file_data_va_val = dir_data_va_val + "*.h5"
files_data_va_val = sorted(glob.glob(file_data_va_val))
log = "file number of files_data_va_val: {0}".format(len(files_data_va_val))
print(log)


# Sorted list of file paths: EXP validation data
file_data_exp_val = dir_data_exp_val + "*.h5"
files_data_exp_val = sorted(glob.glob(file_data_exp_val))
log = "file number of files_data_exp_val: {0}".format(len(files_data_exp_val))
print(log)

# Rolling window lengths and shifts, both in frames. The order matters:
# downstream code saves the four results as the 1 s, 6 s, 12 s and 3 s files.
time_list =  [30, 180, 360, 90]  #frame
time_shift = [ 1,   1,   1,  1]  #frame
# Rows whose confidence value is below this threshold are discarded before rolling.
Th_conf = 0.7


In [None]:
# Create rolled data (1 s, 6 s, 12 s and 3 s windows). *str_type: "EXP" or "VA"

# Column layout the rolling code relies on (positional, taken from the indices
# used throughout this cell).
_COL_FRAME = 0          # frame number; output rows are stamped 1..max frame
_COL_TIMESTAMP = 2      # timestamp; x axis of the slope fit
_COL_CONFIDENCE = 3     # confidence value compared against th_conf
_COL_LABEL_1 = 331      # EXP: expression label / VA: first label column
_COL_LABEL_2 = 332      # VA only: second label column
# NOTE(review): 331/332 presumably hold valence and arousal for "VA" -- confirm
# against the merged-file schema.
_FRAME_RATE = 30        # frames per second used to synthesise a timestamp


def _window_slopes(win_rows, len_col):
    """Return the per-column linear slope of ``win_rows`` against the timestamp column.

    A column gets slope 0 when the window has fewer than two rows, when fewer
    than two non-NaN samples are available, or when the fit itself fails.
    """
    slopes = np.zeros(len_col)
    if len(win_rows) < 2:
        return slopes

    x_arr = win_rows[:, _COL_TIMESTAMP].ravel()
    if np.count_nonzero(~np.isnan(x_arr)) < 2:
        return slopes

    for col in range(len_col):
        y_arr = win_rows[:, col].ravel()
        if np.count_nonzero(~np.isnan(y_arr)) < 2:
            continue
        try:
            slopes[col] = np.polyfit(x_arr, y_arr, 1)[0]
        except Exception:
            # Best effort, as before: a failed fit yields slope 0. Unlike a bare
            # ``except`` this does not swallow KeyboardInterrupt/SystemExit.
            slopes[col] = 0
    return slopes


def _assign_frame_labels(data_mean, frame_rows, str_type):
    """Overwrite the label column(s) of ``data_mean`` with the labels of one frame.

    ``frame_rows`` holds the filtered rows whose frame number equals the output
    frame. Labels are NaN when that frame did not survive filtering.
    """
    if str_type == "EXP":
        if len(frame_rows) < 1:
            data_mean[_COL_LABEL_1] = np.nan
        else:
            try:
                # Truncated mean of the frame's label values (not a mode).
                data_mean[_COL_LABEL_1] = int(frame_rows[:, _COL_LABEL_1].mean())
            except (ValueError, OverflowError):
                # int() of NaN / inf: mark the label as invalid.
                data_mean[_COL_LABEL_1] = -1
    else:
        if len(frame_rows) < 1:
            data_mean[_COL_LABEL_1] = np.nan
            data_mean[_COL_LABEL_2] = np.nan
        else:
            data_mean[_COL_LABEL_1] = frame_rows[:, _COL_LABEL_1].mean()
            data_mean[_COL_LABEL_2] = frame_rows[:, _COL_LABEL_2].mean()


def _roll_one_window(data, max_count, window, shift, str_type, len_col):
    """Build the (max_count, 4 * len_col) mean|std|range|slope matrix for one window size."""
    frames = data[:, _COL_FRAME]
    rows = []
    for count in range(max_count):
        # NOTE(review): the window is centred on ``count`` while the output row
        # is stamped ``count + 1``; kept exactly as in the original code.
        start = count * shift - window * 0.5
        stop = start + window
        win_rows = data[(frames >= start) & (frames < stop)]

        if len(win_rows) < 1:
            # Empty window: everything is NaN except the frame number and a
            # timestamp synthesised from the frame rate.
            data_mean = np.full(len_col, np.nan)
            data_mean[_COL_FRAME] = count + 1
            data_mean[_COL_TIMESTAMP] = count / _FRAME_RATE
            data_std = np.full(len_col, np.nan)
            data_range = np.full(len_col, np.nan)
            data_slope = np.full(len_col, np.nan)
        else:
            data_mean = win_rows.mean(axis=0)
            data_mean[_COL_FRAME] = count + 1
            data_std = win_rows.std(axis=0)
            data_range = win_rows.max(axis=0) - win_rows.min(axis=0)
            data_slope = _window_slopes(win_rows, len_col)
            _assign_frame_labels(data_mean, data[frames == count + 1], str_type)

        rows.append(np.concatenate((data_mean, data_std, data_range, data_slope)))

    # Build the matrix once instead of growing it with np.append per frame;
    # reshape keeps the (0, 4 * len_col) shape when there are no rows.
    return np.array(rows, dtype=float).reshape(-1, 4 * len_col)


def data_roll_np(in_data_np, th_conf, str_type, windows=None, shifts=None):
    """Compute rolling-window statistics for every frame of one recording.

    Rows containing NaN and rows whose confidence column is below ``th_conf``
    are removed first. (Previously the confidence filter was applied to the
    unfiltered input, silently discarding the NaN filter.) For every frame
    ``1..max frame`` and every window size, the mean, standard deviation,
    range (max - min) and slope against the timestamp column are computed per
    column and concatenated into one output row.

    Parameters
    ----------
    in_data_np : 2-D numpy array, one row per frame, columns laid out as
        described by the ``_COL_*`` constants above.
    th_conf : minimum confidence a row needs to be kept.
    str_type : "EXP" writes a truncated-integer label into column 331; any
        other value (e.g. "VA") writes the per-frame means of columns 331 and 332.
    windows, shifts : optional sequences of window lengths / shifts in frames.
        They default to the module-level ``time_list`` / ``time_shift``.

    Returns
    -------
    tuple of 2-D arrays, one per window, in the order of ``windows`` (by
    default 1 s, 6 s, 12 s, 3 s). Each has ``4 * n_columns`` columns. When no
    row survives filtering, each array is a single all-zero row.
    """
    if windows is None:
        windows = time_list
    if shifts is None:
        shifts = time_shift
    if len(windows) != len(shifts):
        raise ValueError("windows and shifts must have the same length")

    # shape[1] (rather than indexing row 0) also works for a zero-row input.
    len_col = in_data_np.shape[1]
    len_col_out = len_col * 4

    # Delete rows containing NaN, then cut rows whose confidence is below th_conf.
    data = in_data_np[~np.isnan(in_data_np).any(axis=1), :]
    data = data[data[:, _COL_CONFIDENCE] >= th_conf]

    if len(data) < 1:
        # Nothing usable: keep the historical single dummy (all-zero) row.
        return tuple(np.zeros((1, len_col_out)) for _ in windows)

    # One output row per frame number up to the largest surviving frame.
    max_count = int(data[:, _COL_FRAME].max())

    results = tuple(
        _roll_one_window(data, max_count, window, shift, str_type, len_col)
        for window, shift in zip(windows, shifts)
    )

    # With the default windows this prints "shape 1s: ..., shape 6s: ...,
    # shape 12s: ..., shape 3s: ...", as before.
    log = ", ".join(
        "shape {0:g}s: {1}".format(window / _FRAME_RATE, rolled.shape)
        for window, rolled in zip(windows, results)
    )
    print(log)

    return results

In [None]:
# Sequential run [data rolling]: process the input files one after another
def seq_data_rolling(files_data, dir_out, th_conf, str_type):
    """Roll every file in ``files_data`` and save one HDF5 file per window size.

    Each input is read with ``pd.read_hdf(path, 'key')``, passed through
    ``data_roll_np`` and written to ``dir_out`` as ``<name>_01s.h5``,
    ``<name>_06s.h5``, ``<name>_12s.h5`` and ``<name>_03s.h5``. ``dir_out`` is
    concatenated directly with the file name, so it must end with a path
    separator. For ``str_type == "EXP"`` the six unused expression columns are
    dropped before rolling. Progress is printed after every file.
    """
    total = len(files_data)
    # Seconds used in the output file names, in the order data_roll_np returns them.
    window_seconds = (1, 6, 12, 3)

    for index, path in enumerate(files_data, start=1):
        name_data = os.path.splitext(os.path.basename(path))[0]

        # Read the merged data into a DataFrame with string column names
        frame = pd.read_hdf(path, 'key')
        frame.columns = frame.columns.astype(str)

        # For "EXP", delete the NaN label columns
        if str_type == "EXP":
            frame = frame.drop(
                ["Anger","Disgust","Fear","Happiness","Sadness","Surprise"],
                axis=1,
            )

        # Output column names: average, std, range and slope of every input column
        base_cols = frame.columns
        col_list = base_cols + "-avg"
        for suffix in ("-std", "-range", "-slope"):
            col_list = col_list.append(base_cols + suffix)

        # Rolling statistics on the raw numpy values
        rolled_1s, rolled_6s, rolled_12s, rolled_3s = data_roll_np(
            frame.values, th_conf, str_type
        )

        # Convert every result back to a DataFrame before anything is written
        tables = [
            pd.DataFrame(rolled, columns=col_list)
            for rolled in (rolled_1s, rolled_6s, rolled_12s, rolled_3s)
        ]

        # Save the windowed data, one compressed HDF5 file per window size
        for seconds, table in zip(window_seconds, tables):
            file_out = dir_out + name_data + '_' + str(seconds).zfill(2) + 's.h5'
            table.to_hdf(file_out, "key", mode="w", complevel=5, complib="lzo")

        print("{0}/{1}, {2}".format(index, total, name_data))

    print('**** finished ****')


In [None]:
# Roll and resample data: VA validation
seq_data_rolling(
    files_data=files_data_va_val,
    dir_out=dir_out_va_val,
    th_conf=Th_conf,
    str_type="VA",
)

# Roll and resample data: EXP validation
seq_data_rolling(
    files_data=files_data_exp_val,
    dir_out=dir_out_exp_val,
    th_conf=Th_conf,
    str_type="EXP",
)