In [1]:
import pandas as pd
import os
import random
import numpy as np
from scipy.stats import variation
from matplotlib import pyplot as plt
import pickle
%matplotlib inline 

In [2]:
## Notebook configuration: data locations, activity classes and epoch size
# Both directory strings keep their trailing separator because the cells below
# build file names by plain string concatenation (Windows-style paths).
rawdir = ".\\CSV\\"      # one sub-folder per activity with the recorded CSV files
filedir = ".\\data\\"    # destination for the combined files and the final dataset
# Number of consecutive rows that make up one epoch (one sample of the dataset).
epoch_length = 360
# Activity classes; each name is also the sub-folder name inside `rawdir`.
activities = [
    'cycling',
    'driving',
    'jogging',
    'sleeping',
    'walking',
]

In [3]:
# Make sure the output folder for the per-activity combined files exists.
# exist_ok=True makes the call a no-op when the folder is already there, so the
# cell can be re-run safely without a separate (race-prone) existence check.
os.makedirs(filedir+'combined_data', exist_ok=True)

In [4]:
## Data fixes
# Two recordings are repaired by hand before the generic cleaning pass:
#   * Josette's cycling recording is read from the static_cycling folder and
#     written, cleaned, into the cycling folder.
#   * marcia_jogging_clean.csv is re-exported as *_FIX_clean.csv and the original
#     is deleted (presumably it is a raw export that was given the "_clean"
#     suffix by mistake, so the cleaning pass would otherwise skip it).

def _export_clean(source_path, target_path):
    """Read a raw recording, drop the unused columns and write it to `target_path`.

    The first 100 rows of the raw file are skipped and the remaining columns are
    named explicitly; 'lumin' and 'button' are discarded before writing.
    """
    frame = pd.read_csv(
        source_path,
        skiprows=100,
        names=["datetime", "acc_x", "acc_y", "acc_z", "lumin", "button", "temperature"],
    )
    frame.drop(columns=['lumin', 'button']).to_csv(target_path, index=False)


_export_clean(".\\CSV\\static_cycling\\Josette_cycling.csv",
              ".\\CSV\\cycling\\Josette_cycling_FIX_clean.csv")

# The source file is deleted after conversion, so it only exists on the first
# run. Guarding on its presence keeps the cell re-runnable (Restart & Run All)
# instead of failing with FileNotFoundError the second time.
mislabelled_file = ".\\CSV\\jogging\\marcia_jogging_clean.csv"
if os.path.exists(mislabelled_file):
    _export_clean(mislabelled_file, ".\\CSV\\jogging\\marcia_jogging_FIX_clean.csv")
    os.remove(mislabelled_file)
    print("File Removed!")

File Removed!


In [5]:
# Clean every raw recording: skip its first 100 rows, assign the proper column
# names, drop the unused columns and save the result next to the original as
# <name>_clean.csv. Files that are not CSVs, or that already carry the
# "_clean" suffix, are left untouched.
for activity in activities:
    activity_dir = rawdir+activity
    for filename in os.listdir(activity_dir):
        is_csv = filename[-4:] == '.csv'
        already_clean = filename[-10:] == '_clean.csv'
        if not is_csv or already_clean:
            continue
        df = pd.read_csv(
            activity_dir+'\\'+filename,
            skiprows=100,
            names=["datetime", "acc_x", "acc_y", "acc_z", "lumin", "button", "temperature"],
        )
        df = df.drop(columns=['lumin', 'button'])
        df.to_csv(activity_dir+'\\'+filename[:-4]+"_clean.csv", index=False)

In [6]:
# Combine all cleaned recordings of an activity into one labelled CSV file.
# Each recording is truncated to a whole number of epochs first, so every block
# of `epoch_length` rows in the combined file comes from a single recording.
for activity in activities:
    activity_dir = rawdir+activity
    frames = []
    # os.listdir returns entries in arbitrary order; sorting fixes the row order
    # of the combined file so the later random sampling is reproducible.
    for filename in sorted(os.listdir(activity_dir)):
        if filename[-10:] != '_clean.csv':
            continue
        df = pd.read_csv(activity_dir+'\\'+filename)
        usable_rows = (df.shape[0] // epoch_length) * epoch_length
        # .assign returns a new frame, avoiding a column write on a slice.
        frames.append(
            df.iloc[:usable_rows][['acc_x', 'acc_y', 'acc_z']].assign(label=activity)
        )
    # DataFrame.append no longer exists in pandas 2.x and copied the whole frame
    # on every call; concatenate once instead. An activity without any cleaned
    # file still yields an (empty) output file, as before.
    main_df = pd.concat(frames) if frames else pd.DataFrame()
    main_df.to_csv(filedir+'combined_data'+"\\"+activity+"_combined.csv", index=False)

---

## Create Dataset

In [7]:
# List the per-activity combined files. os.listdir gives no ordering guarantee,
# and the list is indexed by position further down, so sort it explicitly.
combined = sorted(os.listdir(filedir+"combined_data\\"))
combined

['cycling_combined.csv',
 'driving_combined.csv',
 'jogging_combined.csv',
 'sleeping_combined.csv',
 'walking_combined.csv']

In [8]:
# Load each activity's combined file by its explicit name rather than by its
# position in a directory listing: the listing order is not guaranteed and any
# extra file in the folder would shift the indices and mix up the activities.
combined_dir = filedir+"combined_data\\"
cycling = pd.read_csv(combined_dir+"cycling_combined.csv")
driving = pd.read_csv(combined_dir+"driving_combined.csv")
jogging = pd.read_csv(combined_dir+"jogging_combined.csv")
sleeping = pd.read_csv(combined_dir+"sleeping_combined.csv")
walking = pd.read_csv(combined_dir+"walking_combined.csv")

In [9]:
# Report how much data is available per activity.
# NOTE(review): dividing by 3600 assumes 3600 rows correspond to one minute of
# recording — confirm against the device's sampling rate.
duration_report = (
    (cycling, "minutes of cycling"),
    (driving, "minutes of driving"),
    (jogging, "minutes of jogging"),
    (sleeping, "minutes of sleeping"),
    (walking, "minutes of walking"),
)
for frame, description in duration_report:
    print(frame.shape[0] / 3600, description)

171.1 minutes of cycling
230.6 minutes of driving
162.8 minutes of jogging
15477.1 minutes of sleeping
345.3 minutes of walking


In [10]:
# Number of epochs drawn per activity: the count of whole epochs available in
# the smallest activity, so every class contributes equally many samples.
# Uses `epoch_length` (not a literal 360) so it stays in sync with the config cell.
max_samples = min(
    frame.shape[0] // epoch_length
    for frame in (cycling, driving, jogging, sleeping, walking)
)
max_samples

1628

In [11]:
# Containers filled by the sampling loop below, one entry per sampled epoch:
#   arrays    - the raw acc_x/acc_y/acc_z window
#   labels    - the activity label of that window
#   array_svm - the list of summary statistics computed from that window
arrays, labels, array_svm = [], [], []

In [12]:
# Fix the state of Python's global `random` generator so the random.sample
# draws in the sampling loop below pick the same epochs on every full run.
random.seed(42)

In [13]:
# Build the dataset: draw `max_samples` random epochs from every combined file
# and keep each one twice - as the raw window (`arrays`) and as a vector of
# summary statistics (`array_svm`) - together with its activity label.

def _epoch_features(epoch):
    """Return the 40 summary statistics of one epoch as a flat list.

    Parameters
    ----------
    epoch : numpy array with one row per reading and the columns
        acc_x, acc_y, acc_z (in that order).

    Returns
    -------
    list of 40 numbers, ordered statistic by statistic and, within each
    statistic, channel by channel (acc_x, acc_y, acc_z, magnitude):
    mean, median, standard deviation, coefficient of variation, root mean
    square, 5th/25th/75th/95th percentile and signal power.
    """
    acc_x, acc_y, acc_z = (np.ascontiguousarray(epoch[:, axis]) for axis in range(3))
    # Euclidean norm of the three axes, computed per reading.
    magnitude = np.sqrt(acc_x**2 + acc_y**2 + acc_z**2)
    channels = (acc_x, acc_y, acc_z, magnitude)
    statistics = (
        np.mean,                                         ## MEAN
        np.median,                                       ## MEDIAN
        np.std,                                          ## STANDARD DEVIATION
        variation,                                       ## VARIATION (std / mean)
        lambda signal: np.sqrt(np.mean(signal**2)),      ## ROOT MEAN SQUARE
        lambda signal: np.percentile(signal, 5),         ## 5th PERCENTILE
        lambda signal: np.percentile(signal, 25),        ## 25th PERCENTILE
        lambda signal: np.percentile(signal, 75),        ## 75th PERCENTILE
        lambda signal: np.percentile(signal, 95),        ## 95th PERCENTILE
        lambda signal: np.sum(signal**2) / signal.size,  ## SIGNAL POWER
    )
    return [statistic(channel) for statistic in statistics for channel in channels]


# Start from a known state so that re-running this cell on its own rebuilds the
# same dataset instead of appending duplicates drawn from a different RNG state.
random.seed(42)
arrays, labels, array_svm = [], [], []

combined_dir = filedir+"combined_data\\"
# Sorted listing: os.listdir order is arbitrary, and the order in which files
# are visited determines both the sample order and which epochs are drawn.
for filename in sorted(os.listdir(combined_dir)):
    df = pd.read_csv(combined_dir+filename)
    activity = df['label'].iloc[0]   # every row of a combined file carries the same label
    for start in random.sample(range(0, df.shape[0], epoch_length), max_samples):
        # Slice the epoch once and reuse it for the raw window and all statistics.
        epoch = df.iloc[start:start+epoch_length][['acc_x', 'acc_y', 'acc_z']].to_numpy()
        arrays.append(epoch)
        labels.append(activity)
        array_svm.append(_epoch_features(epoch))

In [14]:
# Bundle the dataset into one dictionary:
#   "cnn_array" - stacked raw epochs (one acc_x/acc_y/acc_z window per sample)
#   "svm_array" - one row of summary statistics per sample
#   "label"     - the activity label of each sample, in the same order
cnn_array = np.array(arrays)
svm_array = np.array(array_svm)
data_dict = {
    "cnn_array": cnn_array,
    "svm_array": svm_array,
    "label": labels,
}

## Save Data to Disc

In [15]:
# Make sure the folder for the pickled dataset exists. exist_ok=True turns the
# call into a no-op when it is already there, so no separate check is needed.
os.makedirs(filedir+'data', exist_ok=True)

In [16]:
# Write the dataset to disk. The context manager closes the file even when
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open(filedir+"data\\data.pickle", "wb") as pickle_out:
    pickle.dump(data_dict, pickle_out)

In [17]:
# Read the dataset back for a round-trip check. The file was written by the
# cell above; never unpickle files from an untrusted source, since loading a
# pickle can execute arbitrary code.
with open(filedir+"data\\data.pickle", "rb") as pickle_in:
    data_dict2 = pickle.load(pickle_in)

In [18]:
## Check that the data loaded from disk is equal to the data that was saved

In [19]:
# Round-trip check: the raw-epoch array read back from the pickle must have the
# same shape and elements as the one that was written.
np.array_equal(data_dict["cnn_array"], data_dict2["cnn_array"])

True

In [20]:
# Round-trip check: the summary-statistics array read back from the pickle must
# have the same shape and elements as the one that was written.
np.array_equal(data_dict["svm_array"], data_dict2["svm_array"])

True