In [1]:
#!pip install pyarrow
#!pip install tensorflow

In [2]:
import numpy as np
import pandas as pd
import pyarrow
from tqdm import tqdm
import matplotlib.pyplot as plt

In [3]:
df = pd.read_parquet('dataset.parquet', engine='pyarrow')

In [4]:
nan_mask = df.drop(['Participant', 'Exercise', 'Set', 'Camera','time(s)'], axis = 1).isna().all(axis = 1)
# Get the indices of the rows with NaN values
nan_indices = [i for i, has_nan in enumerate(nan_mask) if has_nan]

th = 30
current_sequence_length = 1
ids = []

for i in range(1, len(nan_indices)):
    if nan_indices[i] == nan_indices[i - 1] + 1:
        current_sequence_length += 1
    else:
        if current_sequence_length > th:
            ids.append(np.arange(nan_indices[i- current_sequence_length], nan_indices[i]+1))
        current_sequence_length = 1
ids = np.concatenate(ids)

In [95]:
ids

array([  40005,   40006,   40007, ..., 2166420, 2166421, 2166422])

In [31]:
#Interpolating linearly for the missing frames
df = df.interpolate(method='linear', axis=0, limit_direction='both')

In [None]:
df.to_parquet('prep_dataset.parquet', index=True)

In [2]:
df = pd.read_parquet('prep_dataset.parquet', engine='pyarrow')

In [1]:
display(df[1828:1831])

NameError: ignored

In [28]:
BLOCK_LENGTH = 20

params_columns = ['Participant', 'Exercise', 'Set', 'Camera']
df['SequenceLength'] = df.groupby(params_columns)['time(s)'].transform('count')

# Initialize lists to store padded arrays
padded_params_list = []
padded_sequences_list = []

# Get the total number of unique groups
total_groups = 2100

# Create a tqdm object
pbar = tqdm(total=total_groups, desc="Processing", position=0, leave=True)

# Iterate over the groups with tqdm to display a progress bar
for idx, (sequence_id, group) in enumerate(df.groupby(params_columns)):
    sequence_data = group.iloc[:, 4:-1].values
    sequence_length = len(sequence_data)
    pad_width = ((0, ((sequence_length // BLOCK_LENGTH) + 1) * BLOCK_LENGTH - sequence_length), (0, 0))
    padded_sequence = np.pad(sequence_data, pad_width, mode='constant', constant_values=0).astype('float32')
    # Pad the parameters by repeating the same values for each row in the padded sequence
    repeated_params = np.tile(sequence_id, (len(padded_sequence), 1))
    
    # Append the padded arrays to the lists
    padded_params_list.append(repeated_params)
    padded_sequences_list.append(padded_sequence)

    # Update the tqdm progress bar
    pbar.update(1)

# Close the tqdm progress bar after completion
pbar.close()


Processing: 100%|██████████| 2100/2100 [00:32<00:00, 64.58it/s] 


In [26]:
padded_params = np.vstack(padded_params_list)
#del padded_params_list
padded_sequences = np.vstack(padded_sequences_list).astype('float32')
#del padded_sequences_list
print(padded_sequences.shape)
#del padded_sequences
#del padded_params


(2232400, 100)


In [29]:
str_df = pd.DataFrame(padded_params, columns = params_columns)
float_df = pd.DataFrame(padded_sequences, columns = df.columns[4:-1])
padded_df = pd.concat([str_df,float_df], axis = 1)
#del data

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2210672 entries, 0 to 734
Columns: 105 entries, Participant to SequenceLength
dtypes: float32(99), float64(1), int64(1), object(4)
memory usage: 952.9+ MB


In [30]:
padded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2232400 entries, 0 to 2232399
Columns: 104 entries, Participant to right_wrist_z
dtypes: float32(100), object(4)
memory usage: 919.7+ MB


In [31]:
display(padded_df)

Unnamed: 0,Participant,Exercise,Set,Camera,time(s),left_ankle_x,left_ankle_y,left_ankle_z,left_ear_x,left_ear_y,...,right_pinky_z,right_shoulder_x,right_shoulder_y,right_shoulder_z,right_thumb_x,right_thumb_y,right_thumb_z,right_wrist_x,right_wrist_y,right_wrist_z
0,P04,Abduction,A,Frontal_Low,0.000000,0.032965,-0.474184,0.624312,-0.070598,0.542229,...,-0.672386,-0.069615,0.441862,-0.438722,-0.049521,0.955902,-0.631346,-0.057846,0.942763,-0.628748
1,P04,Abduction,A,Frontal_Low,0.033333,0.043956,-0.442556,0.637613,-0.059279,0.543494,...,-0.674930,-0.066681,0.441969,-0.440480,-0.042736,0.959293,-0.633053,-0.050225,0.945640,-0.630666
2,P04,Abduction,A,Frontal_Low,0.066667,0.032360,-0.456428,0.619597,-0.062057,0.545033,...,-0.675053,-0.068457,0.442443,-0.441442,-0.042656,0.959001,-0.633970,-0.050077,0.945382,-0.631667
3,P04,Abduction,A,Frontal_Low,0.100000,0.022615,-0.472325,0.627187,-0.061786,0.542426,...,-0.734107,-0.062215,0.423310,-0.462091,-0.044001,0.939052,-0.687761,-0.051261,0.924206,-0.685207
4,P04,Abduction,A,Frontal_Low,0.133333,0.010121,-0.498533,0.502819,-0.063879,0.551627,...,-0.609257,-0.041220,0.455357,-0.418660,-0.030477,0.960790,-0.572801,-0.033562,0.948232,-0.573078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2232395,P28,Stretch,E,Side_Top,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2232396,P28,Stretch,E,Side_Top,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2232397,P28,Stretch,E,Side_Top,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2232398,P28,Stretch,E,Side_Top,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [32]:
padded_df.to_parquet('block_dataset.parquet', index=True)