<h4>Identifying Individuals Based Off Of Their Accelerometer Data Using The WISDM Data Set</h4>


In [90]:
import numpy as np
import pandas as pd

In [102]:
columns = ['user','activity','timestamp', 'x-axis', 'y-axis', 'z-axis']

# save this for making a new data set if need be
# load in data set
df = pd.read_csv('WISDM_at_v2.0_raw.txt', header = None, names = columns)

# get rid of unnecessary characters
df['z-axis'] = df['z-axis'].str.rstrip(';')

# obtain only walking data since this has the most of any activity
walking = df.loc[df.activity == 'Walking']

In [103]:
# unique id's before getting rid of those without a sufficient amount of data
walking.user.unique()

array([1679,  599,  685,  669, 1277,  674,  594,  678,  648,  584,  582,
        636, 1758,  708,  711,  687,  563,  621,  623,  720, 1793,  568,
        640,  671,  694,  664,  585,  684, 1480,  655, 1603,  651,  579,
        613,  590,  639,  587,  719,  635, 1742,  624,  710,  676,  693,
        646,  713,  610,  615,  705,  653,  702,  604,  618,  654,  656,
        606,  998,  586, 1319,  668, 1768, 1100,  573, 1491,  712, 1518,
        730,  622,  647, 1727, 1477,  588,  634,  661,  686,  690,  709,
        663,  597, 1656,  630,  616,  691,  625,  612,  650,  658, 1750,
        598,  729,  714,  607,  628,  589,  728,  925,  593,  600,  637,
        641,  609,  633, 1783,  688,  605, 1247,  695,  595,  602,  194,
        583,  716,  727, 1797, 1320, 1676,  673, 1802,  611, 1117,  689,
       1774,  666,  617,  706,  627,  675, 1799, 1703,  670, 1759, 1554,
       1778, 1064, 1238,  726,  632,  608,  725,  614,  697, 1775,  592,
        723,  591, 1512, 1253,  580,  703,  722,  6

In [104]:
# obtain only users who have over ~15 minutes of data
for i in walking.user.unique():    
    if len(walking[walking.user == i]) < 19000:
        walking = walking[walking.user != i]

In [105]:
# There are 7 unique users
walking.user.unique()

array([ 585, 1750,  688,  675, 1238,  603,  679])

In [106]:
# get rid of unnecessary characters and rows with empty entries
walking = walking[walking.columns.unique()].replace(';', '')
walking = walking.dropna()

# update columns to be usable types
walking.timestamp = walking.timestamp.astype(int)
walking['x-axis'] = walking['x-axis'].astype(float)
walking['y-axis'] = walking['y-axis'].astype(float)
walking['z-axis'] = walking['z-axis'].astype(float)



In [107]:
# function to normalize time stamps starting at 0 milliseconds
def normalize_time_stamps(df):

    # iterate through all users
    for user in df.user.unique():

        # iterate through all activities
        for activity in df.activity.unique():

            key = (df.user == user) & (df.activity == activity)
            # obtain all rows for a particular user and an activity
            user_activity_to_process = df[key]
            
            # if there are no rows for this activity, drop the activity from that user
            if len(user_activity_to_process.timestamp) == 0:
                df[(df.user == user)] = df[(df.user == user) & (df.activity != activity)]
                break

            # otherwise normalize the time stamps to start at 0, and be in milliseconds?
            else:
                df.loc[key, 'timestamp'] -= user_activity_to_process.timestamp.iloc[0]
                df.loc[key, 'timestamp'] /= 1000
                print(df[key])
                            
    return df

In [108]:
# standardize accelerometer data 
#normalized = normalize_time_stamps(df)

# min max scaling

# standard scaler 

# perform windowing of the data for each user, and each activity

# translate each minute window into an individual row with a label (which is the user)

# add bias for each window (3 biases for each window) + bias so it will end up being a 4 x 1200 
# feature vector. Maybe 

# add row to matrix of all data for a particular activity

# decide on model, and train model to see what we can do

In [109]:
# obtain a data set with normalized time stamps
# this doesn't really matter for actual training, but it's nice to have clean timestamps
walking_norm = normalize_time_stamps(walking)
for user in walking_norm.user.unique():
    print(walking_norm.loc[walking_norm.user == user, 'timestamp'])

        user activity  timestamp    x-axis    y-axis     z-axis
316949   585  Walking       0.00  3.173541 -0.694638   9.152874
316950   585  Walking       0.05  3.146300 -0.340509   9.193735
316951   585  Walking       0.10  1.184970 -0.040861  10.841797
316952   585  Walking       0.15  2.792171  0.653777  10.310603
316953   585  Walking       0.20  2.220117  0.503953  10.106298
...      ...      ...        ...       ...       ...        ...
342233   585  Walking    1333.50 -0.463092  5.706926   8.349273
342234   585  Walking    1333.55 -1.607201  5.516241   8.621680
342235   585  Walking    1333.60 -1.307553  5.625204   8.539958
342236   585  Walking    1333.65 -1.334794  5.788648   9.003050
342237   585  Walking    1333.70 -1.225831  6.020194   8.471856

[21886 rows x 6 columns]
         user activity  timestamp    x-axis     y-axis     z-axis
1492383  1750  Walking      0.000 -3.823853  12.672226   0.331696
1492384  1750  Walking      0.050  4.261078   4.433762  10.710114
1492385 

In [118]:
# utilize 2 second windows with 50% overlap 
new_features = []
labels = []

# 20 HZ sampling means 20 samples every second. Therefore 2 seconds is 40 samples, .
step_size = 20
window_size = 40

for user in walking_norm.user.unique():
    user_walking_data = walking_norm[walking_norm.user == user]
    # go through ea
    for window_start in range(0, len(user_walking_data) - window_size, step_size):
        x = user_walking_data['x-axis'].values[window_start: window_start + window_size]
        y = user_walking_data['y-axis'].values[window_start: window_start + window_size]
        z = user_walking_data['z-axis'].values[window_start: window_start + window_size]
        new_features.append([x, y, z])

        # Label for a data window is the label that appears most commonly
        labels.append(np.full((1, window_size), user))

# Convert to numpy
new_features = np.asarray(new_features, dtype=np.float32).transpose(0, 2, 1)

40

In [None]:
# perform feature transformations
new_features = []
labels = []

# Slide a "SEGMENT_TIME_SIZE" wide window with a step size of "TIME_STEP"
for i in range(0, len(data) - SEGMENT_TIME_SIZE, TIME_STEP):
    x = data['x-axis'].values[i: i + SEGMENT_TIME_SIZE]
    y = data['y-axis'].values[i: i + SEGMENT_TIME_SIZE]
    z = data['z-axis'].values[i: i + SEGMENT_TIME_SIZE]
    data_convoluted.append([x, y, z])

    # Label for a data window is the label that appears most commonly
    label = stats.mode(data['activity'][i: i + SEGMENT_TIME_SIZE])[0][0]
    labels.append(label)

# Convert to numpy
data_convoluted = np.asarray(data_convoluted, dtype=np.float32).transpose(0, 2, 1)

<h4>Prepare Model</h4>

In [68]:
import torch 
import torchvision
import os

import random
import math
import torch.utils.data as tdata
import torch.optim as opt
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
plt.rcParams['figure.figsize'] = [9, 6]

In [69]:
n_epochs = 20
batch_size_train = 64
batch_size_test = 512
learning_rate = 0.001
logging_interval = 1
valid_percent = 0.2