<a href="https://colab.research.google.com/github/colinjhorgan/MLTSAFinalProject/blob/main/Preprocessing/CNN_Preprocessing_Cart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports
---

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import pylab as plt
%pylab inline

from scipy.signal import resample_poly

from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import cross_val_score

# Mount Google Drive inside the Colab VM; the data cells below read from and
# write to paths under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

# NOTE(review): this silences *every* warning for the rest of the notebook,
# including NumPy's divide-by-zero / invalid-value RuntimeWarnings that the
# min-max normalization step could otherwise surface.
import warnings
warnings.filterwarnings('ignore')

## Import and Fix Data
---

In [None]:
# Load the phone accelerometer recordings from the mounted Google Drive.
# The Drive location is user-specific: edit `path` if the CSV lives elsewhere.
path = (
    '/content/gdrive/MyDrive/3. MSDS Courses/Machine Learning Time Series/'
    'MLTSA22 Project/Data/Phones_accelerometer.csv'
)
paccel = pd.read_csv(path, index_col=0)  # first CSV column becomes the index

# Preview the last rows as a quick sanity check of the load
paccel.tail(8)

In [None]:
# The activity label column `gt` contains NaN values; replace them with the
# explicit placeholder label 'no_task'. Only the `gt` column is filled.
paccel = paccel.fillna({'gt': 'no_task'})

## Resample Data
---

In [None]:
# Resample every (User, Model, Device, gt) trial to a fixed length of
# TARGET_LEN samples per axis, so that all trials can later be cut into the
# same number of windows.
# NOTE(review): the original comment described this as "resample to 100Hz".
# The code actually forces a fixed *length* regardless of each trial's
# duration, which equals 100 Hz only if every trial lasts ~150.5 s — TODO confirm.
GROUP_KEYS = ['User', 'Model', 'Device', 'gt']
TARGET_LEN = 15050  # samples per trial and axis after resampling


def resample_to_length(series, target_len=TARGET_LEN):
    """Resample one trial's samples for a single axis to `target_len` points.

    Parameters
    ----------
    series : pandas.Series
        Raw accelerometer samples of one axis for one group.
    target_len : int
        Number of samples in the output.

    Returns
    -------
    numpy.ndarray
        The polyphase-resampled signal. `resample_poly` is called with
        up=target_len and down=len(series), so the output has `target_len`
        samples.
    """
    values = series.values
    return resample_poly(values, target_len, values.shape[0])


# Group once and reuse the grouping for all three axes.
trials = paccel.reset_index().groupby(GROUP_KEYS)
x_fixed = trials['x'].apply(resample_to_length)
y_fixed = trials['y'].apply(resample_to_length)
z_fixed = trials['z'].apply(resample_to_length)

# Re-construct: one row per trial, columns 'x', 'y', 'z', each cell holding the
# resampled array. All three Series share the same group index, so a
# column-wise concat is equivalent to merging them on the index.
new_df = pd.concat([x_fixed, y_fixed, z_fixed], axis=1)

## Min-max Normalization and Windowing
---

In [None]:
# Min-max normalize each axis of every trial to [0, 1], then cut it into
# 100-sample windows with 50% overlap (stride 50). Each trial becomes a
# (300, 100, 3) array whose last axis holds the x, y and z channels.
# Assumes every series has at least 15050 samples, as produced by the
# resampling step; shorter series would yield ragged windows and make
# np.stack raise.

WINDOW_LEN = 100      # samples per window
WINDOW_STEP = 50      # stride between window starts (50% overlap)
WINDOW_STOP = 15000   # window starts are range(0, 15000, 50) -> 300 windows


def min_max_scale(series):
    """Linearly rescale a 1-D array so its minimum maps to 0 and its maximum to 1.

    A constant series has zero range; it is mapped to all zeros rather than
    dividing by zero, which would silently fill the image with NaN (warnings
    are globally suppressed in this notebook).
    """
    low = series.min()
    span = series.max() - low
    if span == 0:
        return np.zeros_like(series, dtype=float)
    return (series - low) / span


def window_stack(series):
    """Stack overlapping windows of `series` into a 2-D array, one window per row."""
    return np.stack([series[start:start + WINDOW_LEN]
                     for start in range(0, WINDOW_STOP, WINDOW_STEP)])


images = []
for x, y, z in zip(new_df['x'], new_df['y'], new_df['z']):
    # One (windows, WINDOW_LEN) plane per axis, stacked depth-wise as channels.
    channels = [window_stack(min_max_scale(axis)) for axis in (x, y, z)]
    images.append(np.dstack(channels))

im_arr = np.array(images)

In [None]:
# Axis layout: (num_trials, windows_per_trial, samples_per_window, channels),
# where the channels are the x, y and z accelerometer axes.
im_arr.shape

(485, 300, 100, 3)

## Saving the Data
---

In [None]:
# Persist the min-max normalized image array to Google Drive in NumPy's .npy
# format. The Drive location is user-specific: adjust it if needed.
output_path = (
    '/content/gdrive/MyDrive/3. MSDS Courses/Machine Learning Time Series/'
    'MLTSA22 Project/Data/image_arrayminmax.npy'
)
np.save(output_path, im_arr)