In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Folder that holds the gesture recordings, relative to this notebook.
data_dir = '../data/GestureData/'
# CSV file with the manually annotated raw labels.
file_name = 'Circle_V01_Pascal_Raw_labels.csv'

In [3]:
# Load the raw label table; the path is the folder immediately followed by the file name.
df = pd.read_csv(f"{data_dir}{file_name}")

In [4]:
# Show the raw label table as loaded from the CSV.
df

Unnamed: 0,from,to,label,real_start,real_end,diff
0,,,1,18.7,20.45,-0.25
1,,,1,20.4,22.4,0.0
2,,,1,22.35,24.5,0.15
3,,,1,26.65,28.55,-0.1
4,,,1,30.7,32.6,-0.1
5,,,1,34.65,36.35,-0.3
6,,,1,38.65,40.55,-0.1
7,,,1,42.6,44.5,-0.1
8,,,1,46.5,48.4,-0.1
9,,,1,50.7,52.5,-0.2


In [5]:
class LabelGenerator:
    """Turn manually annotated gesture intervals into per-frame labels and training windows.

    Typical call order::

        gen = LabelGenerator(data, raw_labels, ms_per_frame)
        gen.fit_slack(...)          # builds the acceptable "from"/"to" window per movement
        gen.set_cutoff()            # optional: details on movements exceeding max_delta
        gen.set_labels()            # one label per frame
        gen.extract_input_data()    # X, y, feature_names, final_time

    Public attributes:
        label_df   -- one row per movement: label, real_start, real_end (milliseconds) and,
                      after fit_slack, "from", "to" and "ignore". Any frame whose time lies
                      in [from, to] is an acceptable movement endpoint.
        label_info -- how the slacks were derived (diff, indicator, l_slack, u_slack,
                      min_range_ind); mainly useful for debugging.
    """

    # Theoretical duration of one movement in milliseconds. It is the reference for the
    # per-movement "diff" and determines the length of the training windows.
    NOMINAL_MOVEMENT_MS = 2000

    def __init__(self, data, raw_labels, ms_per_frame, max_delta = 500):
        """Store the inputs and convert the annotated start/end times to milliseconds.

        Parameters:
            data         -- DataFrame with one row per frame. Frame times are derived from
                            its index as (index + 1) * ms_per_frame, so a default
                            0..n-1 integer index is expected.
            raw_labels   -- DataFrame with the columns "label", "real_start", "real_end".
                            Start/end are multiplied by 1000 to obtain milliseconds
                            (i.e. they are expected in seconds).
            ms_per_frame -- duration of one frame in milliseconds.
            max_delta    -- movements whose duration deviates from NOMINAL_MOVEMENT_MS by
                            this many milliseconds or more are flagged "ignore".
        """
        # stores the original data and the used framerate.
        self.data = data
        self.raw_labels = raw_labels
        self.ms_per_frame = ms_per_frame

        # label_df is an independent copy so that raw_labels is never modified.
        label_df = raw_labels[["label", "real_start", "real_end"]].copy()
        for column in ("real_start", "real_end"):
            # round before casting so that e.g. 18.7 * 1000 = 18700.000000000004 -> 18700
            label_df[column] = np.round(label_df[column] * 1000, 0).astype("int32")
        self.label_df = label_df

        # creates a Dataframe to store the used slacks for each labeled sample
        self.label_info = pd.DataFrame(
            columns=["diff", "indicator", "l_slack", "u_slack", "min_range_ind"]
        )

        # default variables
        self.symmetric_slack = 0
        self.min_range = 200
        self.max_delta = max_delta

        # state flags checked by the set_*/get_* methods
        self.is_fitted = False
        self.has_cutoff = False
        self.is_labeled = False

    def fit_slack(self, symmetric_slack = None, min_range = None):
        """Compute the acceptable endpoint window ("from"/"to") for every movement.

        Parameters:
            symmetric_slack -- new value for self.symmetric_slack; None keeps the current one.
            min_range       -- new value for self.min_range (minimum window length in ms);
                               None keeps the current one.

        Side effects: adds/overwrites the columns "from", "to" and "ignore" in label_df,
        rebuilds label_info and sets is_fitted.
        """
        self.__check_variable("symmetric_slack", symmetric_slack)
        self.__check_variable("min_range", min_range)

        # deviation of the observed movement duration from the nominal duration
        duration = self.label_df["real_end"] - self.label_df["real_start"]
        diff = duration - self.NOMINAL_MOVEMENT_MS

        indicator, lower_slack, upper_slack, min_range_ind = self.__calc_slack(diff)

        self.label_df["from"] = (self.label_df["real_end"] + lower_slack).astype("int32")
        self.label_df["to"] = (self.label_df["real_end"] + upper_slack).astype("int32")
        self.label_df["ignore"] = diff.abs() >= self.max_delta

        self.label_info = pd.DataFrame(
            {
                "diff": diff,
                "indicator": indicator,
                "l_slack": lower_slack,
                "u_slack": upper_slack,
                "min_range_ind": min_range_ind,
            },
            columns=["diff", "indicator", "l_slack", "u_slack", "min_range_ind"],
        )

        self.is_fitted = True

    def __calc_slack(self, diff):
        """Return (indicator, lower_slack, upper_slack, min_range_ind) for the given diffs.

        The window initially spans from the real end of the movement to the real end
        shifted by diff: the lower bound is min(diff, 0), the upper bound is max(diff, 0).
        Both sides are widened by (symmetric_slack - min_range) when that is positive.
        Windows still shorter than min_range are then widened by half of the missing
        length on each side (integer floor division).
        """
        indicator = diff >= 0

        # extra padding only applies when symmetric_slack exceeds min_range
        delta = self.symmetric_slack - self.min_range
        padding = delta if delta > 0 else 0

        lower_slack = diff.clip(upper=0) - padding
        upper_slack = diff.clip(lower=0) + padding

        # widen windows that are shorter than min_range
        range_delta = self.min_range - (upper_slack - lower_slack)
        min_range_ind = range_delta > 0
        widen = range_delta // 2 * min_range_ind

        lower_slack = (lower_slack - widen).astype("int32")
        upper_slack = (upper_slack + widen).astype("int32")

        return indicator, lower_slack, upper_slack, min_range_ind

    def __check_variable(self, identifier, value):
        """Resolve one of the tunables symmetric_slack, min_range or max_delta.

        None means "not provided": the currently stored value is returned. Any other
        value (including 0) is stored on the instance and returned. Unknown identifiers
        are passed through unchanged.
        """
        if identifier not in ("symmetric_slack", "min_range", "max_delta"):
            return value

        # compare against None explicitly: 0 is a legitimate setting, not "missing"
        if value is None:
            return getattr(self, identifier)

        setattr(self, identifier, value)
        return value

    def set_cutoff(self):
        """Build the cutoff Dataframe for all movements whose |diff| >= max_delta.

        For each such movement it holds real_start/real_end plus the enclosing frame
        indices (start rounded down, end rounded up) and the corresponding frame times.
        USAGE: any movement in the cutoff Dataframe will not yield any labeled data.

        Raises ValueError when fit_slack has not been called yet.
        """
        if not self.is_fitted:
            raise ValueError("You have to fit the slack before you can set a cutoff")

        exceeds = self.label_info["diff"].abs() >= self.max_delta
        # .copy() so that the new columns are never written into a view of label_df
        cutoff = self.label_df.loc[exceeds, ["real_start", "real_end"]].copy()

        cutoff["start_idx"] = (cutoff["real_start"] // self.ms_per_frame).astype("int64")
        cutoff["start_calc"] = cutoff["start_idx"] * self.ms_per_frame
        cutoff["end_idx"] = np.ceil(cutoff["real_end"] / self.ms_per_frame).astype("int64")
        cutoff["end_calc"] = cutoff["end_idx"] * self.ms_per_frame

        self.__cutoff_df = cutoff
        self.has_cutoff = True

    def get_cutoff(self):
        """Return the cutoff Dataframe; raises ValueError before set_cutoff was called."""
        if not self.has_cutoff:
            raise ValueError("You have to set the cutoff with the set_cutoff method")
        return self.__cutoff_df

    def set_labels(self):
        """Create the per-frame labeled data (a copy of data plus "label" and "time").

        "time" is (index + 1) * ms_per_frame. A frame gets the label of a movement when
        its time lies inside that movement's [from, to] window and the movement is not
        flagged "ignore"; all other frames get label 0. If windows overlap, the movement
        that comes later in label_df wins.

        Raises ValueError when fit_slack has not been called yet.
        """
        if not self.is_fitted:
            raise ValueError("You have to fit the slack before you can set the labels")

        frame_time = (self.data.index.values + 1) * self.ms_per_frame
        frame_label = np.zeros(len(frame_time), dtype="int32")

        usable = self.label_df.loc[~self.label_df["ignore"], ["from", "to", "label"]]
        for window_start, window_end, label in usable.itertuples(index=False, name=None):
            inside = (frame_time >= window_start) & (frame_time <= window_end)
            frame_label[inside] = label

        labeled = self.data.copy()
        labeled["label"] = frame_label
        labeled["time"] = np.round(frame_time, 0).astype("int32")

        self.__labeled_data = labeled
        self.is_labeled = True

    def get_labeled_data(self):
        """Return the per-frame labeled data; raises ValueError before set_labels was called."""
        if not self.is_labeled:
            raise ValueError("You have to set the labels with the set_labels-method")
        return self.__labeled_data

    def extract_input_data(self):
        """Build sliding-window training data and store it on the instance.

        Sets:
            X             -- array [sample size] x [timesteps per sample] x [number of features]
            y             -- float vector of labels with length [sample size]; the label of a
                             sample is the label of the last frame in its window
            feature_names -- names of the columns of the labeled data used as features
            final_time    -- time (ms) of the last frame of every sample

        Raises ValueError when set_labels has not been called yet or when the data has
        too few frames to form a window.
        """
        if not self.is_labeled:
            raise ValueError("You have to set the labels with the set_labels-method")

        labeled = self.__labeled_data

        # number of frames needed to span one nominal movement
        steps = int(self.NOMINAL_MOVEMENT_MS // self.ms_per_frame) + 1
        n_samples = labeled.shape[0] - steps + 1
        if n_samples < 0:
            raise ValueError(
                "The data has too few frames to build a window of %d timesteps" % steps
            )

        self.feature_names = labeled.columns.drop(['label', 'time'])
        features = labeled.loc[:, self.feature_names].values

        # every sample ends at frame position (steps - 1 + sample number)
        self.final_time = labeled["time"].values[steps - 1:]
        self.y = labeled["label"].values[steps - 1:].astype("float64")

        self.X = np.zeros((n_samples, steps, len(self.feature_names)))
        for sample in range(n_samples):
            self.X[sample] = features[sample:sample + steps, :]
    
        

In [6]:
lgen = LabelGenerator(
    # Dummy input: 540 frames with 8 zero-valued features each.
    # In real use this is your wireframe data from posenet.
    data=pd.DataFrame(np.zeros((540, 8))),

    # Manually labeled "raw" gesture data with the real beginning and real end of each movement.
    raw_labels=df[["real_start", "real_end", "label"]],

    # Frame duration in milliseconds. An instance only works with the frame rate
    # it was created with.
    ms_per_frame=130,

    # Maximum acceptable deviation of the movement length from the theoretical length (2000 ms).
    # Movements that deviate by max_delta or more are flagged "ignore"; their frames keep label 0.
    max_delta=400,
)

# Fit the slacks; the acceptable range will be 400 ms long.
lgen.fit_slack(symmetric_slack=0, min_range=400)

# Build the cutoff Dataframe (accessible through get_cutoff). It provides additional
# information about movements with a delta of max_delta or more.
lgen.set_cutoff()

# Create the labeled data set; it can be retrieved with the get_labeled_data method.
lgen.set_labels()

# Build the 3D training data. Afterwards the instance exposes X, y, feature_names and final_time:
#   X             --> array with dimensions [sample size] x [timesteps per sample] x [number of features]
#   y             --> vector of labels with length [sample size]
#   feature_names --> names of the associated feature columns in X
#   final_time    --> milliseconds associated with each entry of the first dimension of X
lgen.extract_input_data()

In [7]:
# Label table in milliseconds, including the fitted "from"/"to" window and the "ignore" flag.
lgen.label_df

Unnamed: 0,label,real_start,real_end,from,to,ignore
0,1,18700,20450,20125,20525,False
1,1,20400,22400,22200,22600,False
2,1,22350,24500,24375,24775,False
3,1,26650,28550,28300,28700,False
4,1,30700,32600,32350,32750,False
5,1,34650,36350,36000,36400,False
6,1,38650,40550,40300,40700,False
7,1,42600,44500,44250,44650,False
8,1,46500,48400,48150,48550,False
9,1,50700,52500,52200,52600,False


In [8]:
# Debug view: how the lower/upper slack of each movement was derived.
lgen.label_info

Unnamed: 0,diff,indicator,l_slack,u_slack,min_range_ind
0,-250,False,-325,75,True
1,0,True,-200,200,True
2,150,True,-125,275,True
3,-100,False,-250,150,True
4,-100,False,-250,150,True
5,-300,False,-350,50,True
6,-100,False,-250,150,True
7,-100,False,-250,150,True
8,-100,False,-250,150,True
9,-200,False,-300,100,True


In [9]:
# First rows of the per-frame data with the added "label" and "time" columns.
lgen.get_labeled_data().head()

Unnamed: 0,0,1,2,3,4,5,6,7,label,time
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,130
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,260
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,390
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,520
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,650


In [10]:
# Number of training samples and the shape of the 3D input array.
print(len(lgen.y))
print(lgen.X.shape)
print()

# Inspect a single sample: its label, followed by its window of frames.
i = 157
print(lgen.y[i], '\n', lgen.X[i])

525
(525, 16, 8)

1.0 
 [[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]


In [11]:
# Times (ms) of the last two training samples.
lgen.final_time[-2:]

array([70070, 70200])

In [12]:
# Last two frames of the labeled data, for comparison with the final_time values.
lgen.get_labeled_data().tail(2)

Unnamed: 0,0,1,2,3,4,5,6,7,label,time
538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,70070
539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,70200
