In [39]:
import pandas as pd
import numpy as np
import os

In [2]:
# Root directory holding one numbered sub-directory per recorded dataset.
# Set the EAGLESENSE_DATA_DIR environment variable to point the notebook at
# another location; without it the original path is used unchanged.
DATA_INPUT_DIR = os.environ.get(
    "EAGLESENSE_DATA_DIR",
    "D:\\p_eaglesense\\eaglesense\\data\\topviewkinect",
)

In [3]:
# The combined tables are written to a "v2" folder inside the input directory.
DATA_OUTPUT_DIR = os.path.join(DATA_INPUT_DIR, "v2")
# exist_ok=True creates the folder when missing and is a no-op otherwise, so
# the cell can be re-run without a separate existence check.
os.makedirs(DATA_OUTPUT_DIR, exist_ok=True)

### features and labels

In [69]:
# Dataset ids are the immediate sub-directories of the input directory whose
# names consist of digits only; they are returned in ascending numeric order.
data_dirs = sorted(
    int(name)
    for name in next(os.walk(DATA_INPUT_DIR))[1]
    if name.isdigit()
)
data_dirs

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [70]:
# Columns removed from each dataset's feature table before it is exported.
ignored_X_cols = [
    "frame_id",
    "skeleton_id",
    "x",
    "y",
    "z",
]
# Columns removed from each dataset's label table before it is exported.
ignored_y_cols = [
    "frame_id",
    "skeleton_id",
    "orientation",
    "orientation_accurate",
]

In [72]:
# Paths of the combined feature (X) and label (y) tables.
all_X_csv = "{d}/{tag}_X.csv".format(tag="v1", d=DATA_OUTPUT_DIR)
all_y_csv = "{d}/{tag}_y.csv".format(tag="v1", d=DATA_OUTPUT_DIR)

# Create both files, or empty them if they already exist.
for csv_path in (all_X_csv, all_y_csv):
    with open(csv_path, "w"):
        pass

In [73]:
# Only the first dataset written emits the CSV column header.
write_header = True

# "w" mode makes this cell safe to re-run (no second copy is appended to the
# combined tables), newline="" is what pandas asks of a file object handed to
# to_csv, and the with-block closes both files even if a dataset fails.
with open(all_X_csv, "w", newline="") as all_X_file, \
        open(all_y_csv, "w", newline="") as all_y_file:
    for dataset_id in data_dirs:
        # read data
        X_csv = "{d}/{dataset}/features.csv".format(d=DATA_INPUT_DIR, dataset=dataset_id)
        X_df = pd.read_csv(X_csv, dtype=np.float64)
        y_csv = "{d}/{dataset}/labels.csv".format(d=DATA_INPUT_DIR, dataset=dataset_id)
        y_df = pd.read_csv(y_csv, dtype=np.float64)

        # get single skeleton activities: skeleton 0 with an activity label in 0..5
        y_df = y_df.loc[y_df["skeleton_id"] == 0]
        y_df = y_df.loc[y_df["activity"].isin(list(range(0, 6)))]

        # get common frame indices
        # NOTE(review): X is matched on frame_id only, while y was additionally
        # restricted to skeleton_id == 0. If features.csv can hold more than one
        # skeleton per frame, X and y would no longer line up row-for-row -- confirm.
        X_df = X_df.loc[X_df["frame_id"].isin(y_df["frame_id"].values)]
        y_df = y_df.loc[y_df["frame_id"].isin(X_df["frame_id"].values)]

        # drop unnecessary columns
        X_df = X_df.drop(labels=ignored_X_cols, axis=1)
        y_df = y_df.drop(labels=ignored_y_cols, axis=1)

        # add dataset id
        X_df["dataset_id"] = dataset_id
        y_df["dataset_id"] = dataset_id

        # save to csv (features as floats, labels as integers)
        X_df = X_df.astype("float64")
        X_df.to_csv(all_X_file, header=write_header, index=False)
        y_df = y_df.astype("int")
        y_df.to_csv(all_y_file, header=write_header, index=False)

        write_header = False

        print(dataset_id, "Done!")

1 Done!
2 Done!
3 Done!
4 Done!
5 Done!
6 Done!
7 Done!
8 Done!
9 Done!
10 Done!
11 Done!
12 Done!
13 Done!
14 Done!
15 Done!
16 Done!
17 Done!
18 Done!
19 Done!
20 Done!


### overview

In [74]:
# Reload the combined tables that were just written.
X_all_df = pd.read_csv(all_X_csv, dtype=np.float64)
# np.int was only an alias of the builtin int and has been removed from NumPy;
# the builtin selects the same dtype on every NumPy version.
y_all_df = pd.read_csv(all_y_csv, dtype=int)

In [76]:
# (rows, columns) of the combined feature table
X_all_df.shape

(92053, 73)

In [77]:
# (rows, columns) of the combined label table
y_all_df.shape

(92053, 2)

###  data sampling

In [None]:
def sample_test_split(features_df, labels_df, train_test_ratio, seed):
    """Split samples into train and test sets, stratified by subject and activity.

    Every (subject, activity) group contributes ``int(group_size * train_test_ratio)``
    randomly chosen rows to the training set and its remaining rows to the test set.

    Parameters
    ----------
    features_df : pandas.DataFrame
        Feature table with a "subject" column; every other column is treated as
        a feature. Must share its index with ``labels_df``.
    labels_df : pandas.DataFrame
        Label table with "subject" and "activity" columns.
    train_test_ratio : float
        Fraction of each (subject, activity) group assigned to training; the
        resulting count is truncated towards zero.
    seed : int
        ``random_state`` passed to ``DataFrame.sample`` for every group.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
        Feature arrays have one column per feature, label arrays have a single
        column. Rows are grouped by subject, then by activity, both ascending.
    """
    # Every column of features_df except the "subject" key is a feature.
    num_features = features_df.shape[1] - 1

    # Each accumulator starts with an empty, correctly shaped array so that the
    # final stack also works when no rows are selected and so that the feature
    # arrays are promoted to float64.
    X_train_parts = [np.empty((0, num_features), dtype=np.float64)]
    y_train_parts = [np.empty((0, 1), dtype=np.int32)]
    X_test_parts = [np.empty((0, num_features), dtype=np.float64)]
    y_test_parts = [np.empty((0, 1), dtype=np.int32)]

    # Subjects and activities are read from the data itself (sorted for a
    # deterministic row order) instead of from notebook-level globals.
    for subject_id in np.unique(labels_df["subject"].values):
        subject_features = features_df[features_df["subject"] == subject_id]
        subject_features = subject_features.drop(["subject"], axis=1)
        subject_labels = labels_df.loc[labels_df["subject"] == subject_id, ["activity"]]

        for activity_id in np.unique(subject_labels["activity"].values):
            activity_labels = subject_labels[subject_labels["activity"] == activity_id]
            num_train = int(len(activity_labels) * train_test_ratio)

            train_index = activity_labels.sample(
                n=num_train, replace=False, random_state=seed
            ).index
            # Rows that were not drawn for training, kept in their original order.
            test_index = activity_labels.index[~activity_labels.index.isin(train_index)]

            # .loc is the label-based replacement for the removed .ix indexer.
            X_train_parts.append(subject_features.loc[train_index].values)
            y_train_parts.append(activity_labels.loc[train_index].values)
            X_test_parts.append(subject_features.loc[test_index].values)
            y_test_parts.append(activity_labels.loc[test_index].values)

    # Stack once at the end rather than re-copying the arrays on every iteration.
    X_train = np.vstack(X_train_parts)
    y_train = np.vstack(y_train_parts)
    X_test = np.vstack(X_test_parts)
    y_test = np.vstack(y_test_parts)

    return X_train, y_train, X_test, y_test