# Setup

Prerequisite: check the upper right hand corner of the UI to make sure your notebook kernel is conda_tensorflow2_p36

We start by:
- Specifying your S3 bucket name from the Stack Create section in CloudFormation.
- Importing the Python libraries we'll need to shape the data and upload it to the S3 bucket.

Run cells by clicking either (1) the play symbol that appears to the left of In[] when you hover over it, or (2) the 'Run cell' button in the toolbar above, or (3) using Control + Enter from your keyboard.

In [None]:
import numpy as np
import pandas as pd
from numpy import save
import boto3 
import os

# Placeholder: replace with the S3 bucket name from your CloudFormation stack.
data_bucket_name = 'YOUR-BUCKET-HERE'

# Step 1
Import the training and test CSV files from the notebook's local `data/` directory into Pandas DataFrames.

In [None]:
# Load the train/test time series and discard the 'Unnamed: 0' column
# (presumably a row index that was saved into the CSV files).
train_ts = pd.read_csv('data/train_ts.csv').drop(columns=['Unnamed: 0'])
test_ts = pd.read_csv('data/test_ts.csv').drop(columns=['Unnamed: 0'])

# Step 1b
Use the next cell to view the data for one activity (3 = jogging) recorded for one participant (id 0).
Activity Classes
0=downstairs; 1=upstairs; 2=walking; 3=jogging; 4=standing; 5=sitting

In [None]:
# Keep only the activity-3 rows that belong to participant id 0.
subset = train_ts[(train_ts['act'] == 3) & (train_ts['id'] == 0)]

# Draw one subplot per column, then show summary statistics as the cell output.
subset.plot(subplots=True, figsize=(20, 30))
subset.describe()

# Step 2
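Time series to sections (`ts_to_secs`) is a function that slices the Pandas DataFrame from the prior step into fixed-length sliding windows ("sections") and returns them as a 3-dimensional NumPy array, together with the activity label of each window.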

In [None]:
def ts_to_secs(dataset, w, s):
    """Slice a labelled time series into fixed-length sliding windows.

    The last three columns of ``dataset`` are excluded from the feature data,
    and the columns named ``"act"``, ``"id"`` and ``"trial"`` are read as
    labels. A window is kept only when its first and last rows carry the same
    ``id``, ``act`` and ``trial`` values, so windows that straddle a change of
    participant, activity or trial are discarded.

    Args:
        dataset: pandas DataFrame with one row per time point.
        w: window length, in rows.
        s: step, in rows, between the starts of consecutive windows.

    Returns:
        A tuple ``(secs_data, act_secs_labels)``. ``secs_data`` is a float
        array of shape ``(n_windows, n_features, w)``; ``act_secs_labels`` is
        a float array of shape ``(n_windows,)`` holding the ``act`` value of
        the first row of each kept window. Both are empty along the first
        axis when ``dataset`` has fewer than ``w`` rows.

    Raises:
        ValueError: if ``w`` or ``s`` is smaller than 1.
    """
    if w < 1 or s < 1:
        raise ValueError("window size w and step size s must be >= 1")

    ## Rows of the matrix are features, columns are time points.
    data = dataset[dataset.columns[:-3]].values.T
    act_labels = dataset["act"].values
    id_labels = dataset["id"].values
    trial_labels = dataset["trial"].values

    m, ttp = data.shape  # feature count, total time points

    ## Upper bound on the number of windows: every start i with i + w <= ttp.
    max_secs = max(0, (ttp - w) // s + 1)

    ## Pre-allocate for the upper bound; trimmed to the kept count on return.
    secs_data = np.zeros((max_secs, m, w))
    act_secs_labels = np.zeros(max_secs)

    k = 0
    ## The stop value is ttp - w + 1 so the final full window is included.
    for i in range(0, ttp - w + 1, s):
        last = i + w - 1
        if (
            id_labels[i] != id_labels[last]
            or act_labels[i] != act_labels[last]
            or trial_labels[i] != trial_labels[last]
        ):
            continue

        secs_data[k] = data[:, i:i + w]
        act_secs_labels[k] = int(act_labels[i])
        k += 1

    return secs_data[:k], act_secs_labels[:k]

# Step 3
This cell works as is.  You may change the w and s parameters.  w is the window length: how many consecutive time-series steps each training sample retains.  The data is captured at 50 Hz (samples per second); therefore w=128 is roughly 2.5 seconds of observation.  s is the step size: the number of time steps between the starts of consecutive windows, so with s=32 each window overlaps the previous one by 96 steps.  Note here 128 is a multiple of 32.

In [None]:
## Sliding-window length, in samples: each snapshot holds this many
## consecutive observations of every sensor. MotionSense is sampled at
## 50 Hz, so 50 samples correspond to one second.
w = 128
## Step, in samples, between the starts of consecutive snapshots. A smaller
## step yields more instances at a higher computational cost.
s = 32

train_data, act_train = ts_to_secs(train_ts.copy(), w, s)
test_data, act_test = ts_to_secs(test_ts.copy(), w, s)

# Step 4
Convert the activity train and test labels into one-hot encoded arrays.  Natively the labels take values 0-5, each serving as a class label.  One-hot encoding allows the predictions to carry a probability for each class.

In [None]:
from keras.utils import to_categorical

## The activity labels take values 0-5 (six classes). Fixing the class count
## keeps the train and test one-hot arrays the same width even if one split
## happens to be missing the highest-numbered class.
NUM_ACTIVITY_CLASSES = 6

act_train_labels = to_categorical(act_train, num_classes=NUM_ACTIVITY_CLASSES)
act_test_labels = to_categorical(act_test, num_classes=NUM_ACTIVITY_CLASSES)

## Append a trailing axis of length 1 (inserted at position 3) so the arrays
## have the extra dimension used for Convolution2D.
train_data = np.expand_dims(train_data, axis=3)
test_data = np.expand_dims(test_data, axis=3)

# Step 5
Shuffle the training data and labels together, so that each sample stays paired with its label.

In [None]:
from sklearn.utils import shuffle

# Shuffle samples and labels in one call so they are reordered together;
# random_state=0 fixes the seed.
train_data, act_train_labels = shuffle(
    train_data,
    act_train_labels,
    random_state=0,
)

In [None]:
# Show the shape of the shuffled one-hot training labels as the cell output.
act_train_labels.shape

# Step 6
Create a local directory to house the NumPy arrays and write them out as binary `.npy` files to local disk.

In [None]:
# Save the prepared arrays as .npy files in a local npydata/ directory.
# exist_ok=True lets this cell be re-run when the directory already exists.
os.makedirs('npydata', exist_ok=True)
save('npydata/train_data.npy', train_data)
save('npydata/train_labels.npy', act_train_labels)
save('npydata/test_data.npy', test_data)
save('npydata/test_labels.npy', act_test_labels)

# Step 7
Upload the npy arrays to S3 so SageMaker Training instances are able to read them.

In [None]:
# Upload each local .npy file to the bucket named by data_bucket_name,
# under a train/ or test/ key prefix.
s3_client = boto3.client('s3')
uploads = (
    ('npydata/train_data.npy', 'train/train_data.npy'),
    ('npydata/train_labels.npy', 'train/train_labels.npy'),
    ('npydata/test_data.npy', 'test/test_data.npy'),
    ('npydata/test_labels.npy', 'test/test_labels.npy'),
)
for local_path, object_key in uploads:
    response = s3_client.upload_file(local_path, data_bucket_name, object_key)

# Step 8
The next cell allows your S3 bucket name to be carried over into the training notebook.

In [None]:
# Persist data_bucket_name with IPython's %store magic so that another
# notebook can restore it (e.g. with `%store -r data_bucket_name`).
%store data_bucket_name