In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

# Load and compare all available datasets

## Data_1: HARSense

In [None]:
# load dataset in the predefined format
# a.k.a: concat(X_train,y_train), labels in last column (name = 'activity')
# NOTE: if we have labeled data from multiple subjects we will keep subject id in the 'subject' column
def load_data1():
    # load dataset
    data1_path = "/Users/admin/Desktop/thesis/dataset/Data_1_HARSense_Statistical_Human_Activity_Recognition/HARSense/All Users Combined.csv"
    df = pd.read_csv(data1_path)
    return df

In [None]:
df = load_data1()
df.head()

## Data_2: HAR_using_smartphones

In [None]:
# Comments:
# 1. this data stores info of training subject 0-30. Thus we have a real fed dataset.We will skip this for the time being
# 2. keep data_1 as reference in order for code to be cross compatible => a.k.a convert every data to data_1 format
# 3. to many features we may use PCA or something

In [None]:
# Data2 = HAR using smartphone
# return data2 in df format in the data1 way
# TODO: add subject attributes
def load_data2():
    # load data
    data2_path = "/Users/admin/Desktop/thesis/dataset/Data_2_human_activity_recognition_using_smartphones/"
    X_train_df = pd.read_fwf(data2_path + "UCI HAR Dataset/train/" + "X_train.txt",header=None)
    y_train_df = pd.read_fwf(data2_path + "UCI HAR Dataset/train/" + "y_train.txt",header=None)
    X_test_df = pd.read_fwf(data2_path + "UCI HAR Dataset/test/" + "X_test.txt",header=None)
    y_test_df = pd.read_fwf(data2_path + "UCI HAR Dataset/test/" + "y_test.txt",header=None)
    
    # hardwire labels
    labels = {0:"walking", 1:"walking_upstairs", 2:"walking_downstairs", 3:"sitting", 4:"standing", 5:"laying"}
    
    # label conversion: indices to text
    y_train_df = y_train_df.replace(labels)
    y_test_df = y_test_df.replace(labels)
    
    # concat results
    Xy_train_df = pd.concat((X_train_df,y_train_df),axis=1)
    Xy_train_df.columns = [*Xy_train_df.columns[:-1], "activity"] # set labels name to activity
    Xy_test_df = pd.concat((X_test_df,y_test_df),axis=1)
    Xy_test_df.columns = [*Xy_test_df.columns[:-1], "activity"] # set labels name to activity
    
    df = pd.concat((Xy_train_df,Xy_test_df),axis=0)
    return df

In [None]:
df2 = load_data2()
df2.head()

## Data3: Pamap2

In [3]:
# drop column 0 : timstamp
# drop column 2 : heart rate attribute 90% missing : Note -> if we want heart rate.we have the extra 10% for testing
# column 1 : activity
# df.shape = (2872533, 54) : very large
# we may want to filter the sheer volume of data
# add subject_id tag

In [2]:
def load_data3(n_subjects=4): # max = 9
    data3_path = "/Users/admin/Desktop/thesis/dataset/Data_3_pamap2_physical_activity_monitoring/PAMAP2_Dataset/"
    data_type = "/Protocol/"
    
    subject_list = ["subject10" + str(idx) + ".txt" for idx in range(1,n_subjects + 1,1)]
    
    # read subjects data
    lines = []
    for subject in subject_list:
        with open(data3_path + data_type + subject)as f:
            lines.append(f.readlines())
            
    flat_lines = [item for sublist in lines for item in sublist]
    
    data = []
    # convert str to float
    for sample in flat_lines:
        sample_vec = sample.strip('\n').split()
        sample_vec = [float(x) for x in sample_vec] # str to float
        data.append(sample_vec)
    
        
    # hardwire labels
    labels = {1:"lying", 2:"sitting", 3:"standing", 4:"walking", 5:"running", 6:"cycling", \
          7:"nordic_walking", 8:"watching_TV", 9:"computer_work", 10:"car_driving", \
          11:"ascending_stairs", 12:"descending_stairs", 13:"vacuum_cleaning", \
          16:"vacumm_cleaning", 17:"ironing", 18:"folding_laundry", 19:"house_cleaning", \
          20:"playing_soccer", 24:"rope_jumping"}
        
    # convert to DataFrame
    df = pd.DataFrame(data)
    df.drop([0,2],axis=1,inplace=True) # drop timestamp and heart rate attribute
    df.columns = ["activity",*range(51)] # ["activity",*df.columns[:-1]] # dont change activity order (to avoid unecessary copy)
    ind = np.where(df['activity'].to_numpy() == 0.0)[0] # find zero lable indices
    df.drop(ind,inplace=True) # delete zero labels (transient activities)
    df["activity"].replace(labels,inplace=True)
    
    # remove samples that contain Nan values
    # we could augment data.But unnecessary 
    to_remove = np.zeros(df.shape[0],dtype=bool)
    for idx in range(df.shape[1] - 1):
        find_nans = df[idx].isna().to_numpy()
        to_remove = np.logical_or(to_remove,find_nans)
    ind = np.where(to_remove == True)[0]
        
    df.reset_index(drop=True,inplace=True) # reset indexing after shuffling
    df.drop(ind,axis=0,inplace=True)
    df.reset_index(drop=True,inplace=True) # just in case
    
    return df

In [3]:
df = load_data3(n_subjects=9) # filter subjects to reduce the sheer amount of data

In [88]:
df.head()

Unnamed: 0,activity,0,1,2,3,4,5,6,7,8,...,41,42,43,44,45,46,47,48,49,50
0,lying,30.375,2.2153,8.27915,5.58753,2.24689,8.55387,5.77143,-0.00475,0.037579,...,0.002908,-0.027714,0.001752,-61.1081,-36.8636,-58.3696,1.0,0.0,0.0,0.0
1,lying,30.375,2.29196,7.67288,5.74467,2.27373,8.14592,5.78739,-0.17171,0.025479,...,0.020882,0.000945,0.006007,-60.8916,-36.3197,-58.3656,1.0,0.0,0.0,0.0
2,lying,30.375,2.2909,7.1424,5.82342,2.26966,7.66268,5.78846,-0.238241,0.011214,...,-0.035392,-0.052422,-0.004882,-60.3407,-35.7842,-58.6119,1.0,0.0,0.0,0.0
3,lying,30.375,2.218,7.14365,5.8993,2.22177,7.25535,5.88,-0.192912,0.019053,...,-0.032514,-0.018844,0.02695,-60.7646,-37.1028,-57.8799,1.0,0.0,0.0,0.0
4,lying,30.375,2.30106,7.25857,6.09259,2.2072,7.24042,5.95555,-0.069961,-0.018328,...,0.001351,-0.048878,-0.006328,-60.204,-37.1225,-57.8847,1.0,0.0,0.0,0.0


## Data4: MHealth

In [None]:
# TODO: add subject attributes
def load_data4():
    data4_path = "/Users/admin/Desktop/thesis/dataset/Data_4_MHEALTHDATASET/"
    id_list = [*range(11)][1:]
    lines = []
    
    # scan every subject
    for idx in id_list:
        with open(data4_path + "mHealth_subject" + str(idx) + ".txt") as f:
            data = f.readlines()
            lines.append(data)
            
    # skip subject id for the time being : convert list of list to list
    flat_lines = [item for sublist in lines for item in sublist]
    
    data = []
    # convert str to float
    for sample in flat_lines:
        sample_vec = sample.strip('\n').split('\t')
        sample_vec = [float(x) for x in sample_vec] # str to float
        data.append(sample_vec)
    
    # convert to df
    df = pd.DataFrame(data)
    
    # hardwire labels
    labels = {0:"null", 1:"standing_still", 2:"sitting_and_relaxing", 3:"lying_down", 4:"walking", 5:"climbing_stairs", \
             6:"waist_bend_forward", 7:"frontal_elevation_of_arms", 8:"knees_bending", 9:"cycling", \
             10:"jogging", 11:"running", 12:"jummp_front_and_back"}
    
    df.columns = [*df.columns[:-1], "activity"] # set labels name to activity
    #df = df.replace(labels) # set labels
    df['activity'].replace(labels,inplace=True)
    
    # filter null indices: we may want to keep this if we want to classify null activities
    ind = np.where(df['activity'].to_numpy() == 'null')[0]
    df = df.drop(ind)
    
    return df

In [None]:
df = load_data4()

In [None]:
df.head()

## Data_5 ACC DATA (RAW)

In [38]:
data5_path = "/Users/admin/Desktop/thesis/dataset/Data_5_labeled-raw-accelerometry-data-captured-during-walking-stair-climbing-and-driving-1.0.0/"

In [16]:
# get data info
demograhpic_info = pd.read_csv(data5_path + "participant_demog.csv")
features_info = pd.read_csv(data5_path + "raw_accelerometry_data_dict.csv")

In [19]:
#demograhpic_info

In [23]:
features_info

Unnamed: 0,Variable name,Definition,Values
0,activity,Type of activity,1=walking; 2=descending stairs; 3=ascending st...
1,time_s,Time in seconds from device initiation,Measured in hundredths of seconds
2,lw_x,Left wrist x-axis,Acceleration measured with respect to Earth's ...
3,lw_y,Left wrist y-axis,Acceleration measured with respect to Earth's ...
4,lw_z,Left wrist z-axis,Acceleration measured with respect to Earth's ...
5,lh_x,Left hip x-axis,Acceleration measured with respect to Earth's ...
6,lh_y,Left hip y-axis,Acceleration measured with respect to Earth's ...
7,lh_z,Left hip z-axis,Acceleration measured with respect to Earth's ...
8,la_x,Left ankle x-axis,Acceleration measured with respect to Earth's ...
9,la_y,Left ankle y-axis,Acceleration measured with respect to Earth's ...


In [145]:
def load_data5():
    subject_path_list = os.listdir(data5_path + "raw_accelerometry_data") # concenrate every file in directory

    # correlate subjects id : (for Fed data creation)
    subject_id = {}
    df_list = []
    for idx,subject_path in enumerate(subject_path_list):
        subject_id[idx] = subject_path
        current_df = pd.read_csv(data5_path + "raw_accelerometry_data/" + subject_path)
        # append subject column
        subject_id_data = np.ones(current_df.shape[0],dtype=np.int32) * idx
        current_df.insert(1,"subject",subject_id_data)
        df_list.append(current_df)
        
    df = pd.concat(df_list,axis=0) # concat dfs

    labels = {1:"walking", 2:"descending_stairs", 3:"ascending_stairs", 4:"driving", 77:"clapping", 99:"non_study_activity"};    
    df.drop("time_s",axis=1,inplace=True) # drop timestamp
    df.reset_index(drop=True,inplace=True)
    # remove label 99 : null activity
    ind = np.where(df['activity'].to_numpy() == 99)[0]
    df.drop(ind,inplace=True)
    df['activity'].replace(labels,inplace=True) # replace label tags with names
    
    return df

In [146]:
df = load_data5()

In [147]:
df.head()

Unnamed: 0,activity,subject,lw_x,lw_y,lw_z,lh_x,lh_y,lh_z,la_x,la_y,la_z,ra_x,ra_y,ra_z
35216,clapping,0,0.094,0.703,-0.703,0.293,-0.926,-0.25,-0.152,-0.953,0.117,-0.156,0.992,0.082
35217,clapping,0,0.172,0.699,-0.73,0.293,-0.922,-0.25,-0.16,-0.953,0.113,-0.156,0.992,0.086
35218,clapping,0,0.258,0.695,-0.762,0.293,-0.922,-0.254,-0.16,-0.953,0.113,-0.156,1.0,0.078
35219,clapping,0,0.348,0.711,-0.793,0.289,-0.918,-0.254,-0.156,-0.957,0.113,-0.152,1.004,0.074
35220,clapping,0,0.438,0.734,-0.797,0.289,-0.922,-0.254,-0.156,-0.957,0.117,-0.156,1.008,0.078


## Data_6 MotionSense

In [2]:
data6_path = "/Users/admin/Desktop/thesis/dataset/Data_6_MotionSense/"

In [4]:
data_subjects_info = pd.read_csv(data6_path + "data_subjects_info.csv")
data_subjects_info

Unnamed: 0,code,weight,height,age,gender
0,1,102,188,46,1
1,2,72,180,28,1
2,3,48,161,28,0
3,4,90,176,31,1
4,5,48,164,23,0
5,6,76,180,28,1
6,7,62,175,30,0
7,8,52,161,24,0
8,9,93,190,32,1
9,10,72,164,31,0


In [5]:
# dataloading code from https://github.com/mmalekzadeh/sensplit/tree/master

def set_data_types(data_types=["userAcceleration"]):
    """
    Select the sensors and the mode to shape the final dataset.
    
    Args:
        data_types: A list of sensor data type from this list: [attitude, gravity, rotationRate, userAcceleration] 

    Returns:
        It returns a list of columns to use for creating time-series from files.
    """
    dt_list = []
    for t in data_types:
        if t != "attitude":
            dt_list.append([t+".x",t+".y",t+".z"])
        else:
            dt_list.append([t+".roll", t+".pitch", t+".yaw"])
    print(dt_list)
    return dt_list

def get_ds_infos():
    """
    Read the file includes data subject information.
    
    Data Columns:
    0: code [1-24]
    1: weight [kg]
    2: height [cm]
    3: age [years]
    4: gender [0:Female, 1:Male]
    
    Returns:
        A pandas DataFrame that contains inforamtion about data subjects' attributes 
    """ 
    data6_path = "/Users/admin/Desktop/thesis/dataset/Data_6_MotionSense/" # <- path here
    dss = pd.read_csv(data6_path + "data_subjects_info.csv")
    print("[INFO] -- Data subjects' information is imported.")
    
    return dss

def creat_time_series(folder_name, dt_list, act_labels, trial_codes, mode="mag", labeled=True):
    """
    Args:
        folder_name: one of 'A_DeviceMotion_data', 'B_Accelerometer_data', or C_Gyroscope_data
        dt_list: A list of columns that shows the type of data we want.
        act_labels: list of activites
        trial_codes: list of trials
        mode: It can be 'raw' which means you want raw data
        for every dimention of each data type,
        [attitude(roll, pitch, yaw); gravity(x, y, z); rotationRate(x, y, z); userAcceleration(x,y,z)].
        or it can be 'mag' which means you only want the magnitude for each data type: (x^2+y^2+z^2)^(1/2)
        labeled: True, if we want a labeld dataset. False, if we only want sensor values.

    Returns:
        It returns a time-series of sensor data.
    
    """
    num_data_cols = len(dt_list) if mode == "mag" else len(dt_list*3)

    if labeled:
        dataset = np.zeros((0,num_data_cols+7)) # "7" --> [act, code, weight, height, age, gender, trial] 
    else:
        dataset = np.zeros((0,num_data_cols))
        
    ds_list = get_ds_infos()
    
    print("[INFO] -- Creating Time-Series")
    for sub_id in ds_list["code"]:
        for act_id, act in enumerate(act_labels):
            for trial in trial_codes[act_id]:
                fname = folder_name+'/'+act+'_'+str(trial)+'/sub_'+str(int(sub_id))+'.csv'
                raw_data = pd.read_csv(fname)
                raw_data = raw_data.drop(['Unnamed: 0'], axis=1)
                vals = np.zeros((len(raw_data), num_data_cols))
                for x_id, axes in enumerate(dt_list):
                    if mode == "mag":
                        vals[:,x_id] = (raw_data[axes]**2).sum(axis=1)**0.5        
                    else:
                        vals[:,x_id*3:(x_id+1)*3] = raw_data[axes].values
                    vals = vals[:,:num_data_cols]
                if labeled:
                    lbls = np.array([[act_id,
                            sub_id-1,
                            ds_list["weight"][sub_id-1],
                            ds_list["height"][sub_id-1],
                            ds_list["age"][sub_id-1],
                            ds_list["gender"][sub_id-1],
                            trial          
                           ]]*len(raw_data), dtype=int)
                    vals = np.concatenate((vals, lbls), axis=1)
                dataset = np.append(dataset,vals, axis=0)
    cols = []
    for axes in dt_list:
        if mode == "raw":
            cols += axes
        else:
            cols += [str(axes[0][:-2])]
            
    if labeled:
        cols += ["act", "id", "weight", "height", "age", "gender", "trial"]
    
    dataset = pd.DataFrame(data=dataset, columns=cols)
    return dataset

In [39]:
def load_data6():
    # wrap function for creat_time_series
    ACT_LABELS = ["dws","ups", "wlk", "jog", "std", "sit"]
    TRIAL_CODES = {
        ACT_LABELS[0]:[1,2,11],
        ACT_LABELS[1]:[3,4,12],
        ACT_LABELS[2]:[7,8,15],
        ACT_LABELS[3]:[9,16],
        ACT_LABELS[4]:[6,14],
        ACT_LABELS[5]:[5,13]
    }
    
    folder_name = data6_path + "A_DeviceMotion_data/A_DeviceMotion_data" # <- path here
    sdt = ["attitude", "gravity", "rotationRate", "userAcceleration"]
    dt_list = set_data_types(sdt)
    act_labels = ACT_LABELS [0:6]
    trial_codes = [TRIAL_CODES[act] for act in act_labels]
    
    df = creat_time_series(folder_name, dt_list, act_labels, trial_codes, mode="raw", labeled=True)
    
    # modify existing format
    # 1. act -> activity
    # 2. id -> subject
    # 3. match activit_id with har name
    # 4. remove weight,height,age,gender,trial
    
    df.drop(["weight","height","age","gender","trial"],axis=1,inplace=True) # 4. drop timestamp
    # 1,2
    columns = [*df.columns]
    columns[-2] = 'activity'
    columns[-1] = 'subject'
    df.columns = columns
    # 3
    labels = {0:'downstairs', 1:'upstairs', 2:'walking', 3:'jogging', 4:'standing', 5:'sitting'}
    df['activity'].replace(labels,inplace=True) # replace label tags with names # -> find labels

    return df

In [40]:
df = load_data6()

[['attitude.roll', 'attitude.pitch', 'attitude.yaw'], ['gravity.x', 'gravity.y', 'gravity.z'], ['rotationRate.x', 'rotationRate.y', 'rotationRate.z'], ['userAcceleration.x', 'userAcceleration.y', 'userAcceleration.z']]
[INFO] -- Data subjects' information is imported.
[INFO] -- Creating Time-Series


In [41]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,activity,subject
0,1.528132,-0.733896,0.696372,0.741895,0.669768,-0.031672,0.316738,0.77818,1.082764,0.294894,-0.184493,0.377542,downstairs,0.0
1,1.527992,-0.716987,0.677762,0.753099,0.657116,-0.032255,0.842032,0.424446,0.643574,0.219405,0.035846,0.114866,downstairs,0.0
2,1.527765,-0.706999,0.670951,0.759611,0.649555,-0.032707,-0.138143,-0.040741,0.343563,0.010714,0.134701,-0.167808,downstairs,0.0
3,1.516768,-0.704678,0.675735,0.760709,0.647788,-0.04114,-0.025005,-1.048717,0.03586,-0.008389,0.136788,0.094958,downstairs,0.0
4,1.493941,-0.703918,0.672994,0.760062,0.64721,-0.05853,0.114253,-0.91289,0.047341,0.199441,0.353996,-0.044299,downstairs,0.0
