# Data Preparation
**Author:** [Divyanshu Raghuwanshi](https://www.linkedin.com/in/divyanshu-raghuwanshi-85037b160/)<br>
**Date created:** 2020/05/16<br>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy import stats

def removeNAN(X,y):
    """Drop every row of ``X`` (and the matching entry of ``y``) that contains a NaN.

    Each contiguous run of NaN rows is reported on stdout as an inclusive
    index range, using positions in the *input* arrays.

    Parameters
    ----------
    X : 2-D numeric array; ``np.isnan`` is applied element-wise and rows are
        tested along ``axis=1``.
    y : array indexable by the same boolean row mask as ``X``.

    Returns
    -------
    (X, y) with the NaN rows removed.
    """
    # True for each row that holds at least one NaN
    nan_rows = np.isnan(X).any(axis=1)

    # Pad the mask with False on both sides so that every run, including one
    # touching the first or last row, has both a rising and a falling edge.
    padded = np.concatenate(([False], np.asarray(nan_rows, dtype=bool), [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])

    # Edges alternate: run start, then first index after the run.
    for start, stop in zip(edges[::2], edges[1::2]):
        print('NAN values indices   :'+str(int(start))+" --- "+str(int(stop) - 1))

    keep = ~nan_rows
    return X[keep], y[keep]

def removeNULL(X,y):
    """Drop the rows whose label equals 0 (treated here as the "null" label).

    Parameters
    ----------
    X : array of samples, indexable by a boolean row mask.
    y : array of labels aligned with the rows of ``X``.

    Returns
    -------
    (X, y) restricted to the rows where ``y`` is not 0.
    """
    # Boolean mask of the rows to keep: everything that is not labelled 0
    labelled = ~(y == 0)
    return X[labelled], y[labelled]

def prepareData(X,y,W,s_W):
    """Clean, standardise and cut the data into fixed-length labelled windows.

    Steps, in order:
      1. keep columns 1, 2 and 3 of ``X``;
      2. drop rows containing NaN (``removeNAN``);
      3. drop rows whose label is 0 (``removeNULL``);
      4. z-score every remaining column over all rows;
      5. slide a window of ``W`` rows forward by ``s_W`` rows at a time and
         keep a window only if all of its rows share one label.

    Reference tables carried over from the original notes.

    RAW DATA columns:
         0 Speed rpm
         1 CVX_Ampl [µm s_pp]
         2 CVY_Ampl [µm s_pp]
         3 SVX [µm t_pp]
         4 SVY [µm t_pp]
         5 SVX_1X_Ampl [µm s_pp]
         6 SVX_1X_Phase [° lag]
         7 SVX_2X_Ampl [µm s_pp]
         8 SVX_2X_Phase [° lag]
         9 SVX_3X_Ampl [µm s_pp]
        10 SVX_3X_Phase [° lag]
        11 SVY_1X_Ampl [µm s_pp]
        12 SVY_1X_Phase [° lag]
        13 SVY_2X_Ampl [µm s_pp]
        14 SVY_2X_Phase [° lag]
        15 SVY_3X_Ampl [µm s_pp]
        16 SVY_3X_Phase [° lag]

    LABELS:
        0:'Healthy'
        1:'S_Imbalance'
        2:'C_Imbalance'
        3:'D_Imbalance'
        4:'Misalign'
        5:'Ped_loose'

    NOTE(review): the notebook output recorded after this function shows 16
    features per window, which does not match the 3-column slice below --
    confirm which columns are intended.
    NOTE(review): label 0 is listed above as 'Healthy' yet rows labelled 0
    are removed as "null" -- confirm this is intended.

    Parameters
    ----------
    X : 2-D numeric array, one row per time step.
    y : 1-D label array aligned with the rows of ``X``.
    W : window length in rows.
    s_W : stride between consecutive window starts, in rows; must be > 0.

    Returns
    -------
    (X, y) where ``X`` stacks the kept windows and ``y`` holds one label per
    window. Both are empty arrays when no window qualifies.

    Raises
    ------
    ValueError
        If ``s_W`` is not positive (the window would never advance).
    """
    if s_W <= 0:
        raise ValueError("s_W must be a positive number of rows, got " + str(s_W))

    # Column selection (see the review note in the docstring)
    X = X[:,1:4]
    # Remove unusable rows before computing statistics
    X,y = removeNAN(X,y)
    X,y = removeNULL(X,y)
    # Standardise each column over all remaining rows
    X = stats.zscore(X,axis = 0)

    X_data = []
    y_data = []
    # Window starts: 0, s_W, 2*s_W, ... while the full window still fits
    for start in range(0, X.shape[0] - W + 1, s_W):
        sample = X[start:start + W,:]
        label = y[start:start + W]
        # Keep only full-length windows whose rows all carry the same label
        if len(label) == W and len(set(label)) == 1:
            X_data.append(sample)
            y_data.append(label[0])

    return np.array(X_data), np.array(y_data)

def loadDataset(homePath,fileName):
    """Read a CSV file and split it into a feature matrix and a label vector.

    The first two CSV columns are discarded; of the remaining columns the
    last one is returned as the labels and all others as the features.

    Parameters
    ----------
    homePath : directory that contains the file (no trailing slash needed).
    fileName : name of the CSV file inside ``homePath``.

    Returns
    -------
    (X, y) : ``X`` is a 2-D array of feature columns, ``y`` a 1-D array
        taken from the last column.
    """
    # Use the homePath argument; the previous body read an unrelated global.
    filepath = homePath + '/' + fileName
    data = pd.read_csv(filepath)
    # Skip the two leading columns, then work on the raw ndarray
    data = data.iloc[:,2:].values
    X = data[:,:-1]
    y = data[:,-1]
    return X,y

def prepare_train_test(X,y):
    """Split the samples into an 80 % training part and a 20 % test part.

    The split uses a fixed ``random_state`` of 42, so it is reproducible.
    The shapes of the four resulting arrays are printed.

    Returns
    -------
    (trainX, trainy, testX, testy) -- note that features and labels of the
    same part are returned next to each other.
    """
    parts = train_test_split(X, y, test_size=0.20, random_state=42)
    # train_test_split yields X-train, X-test, y-train, y-test in that order
    trainX, testX, trainy, testy = parts
    print(trainX.shape, trainy.shape, testX.shape, testy.shape)
    return trainX, trainy, testX, testy

In [4]:
# Sliding-window length (W) and stride (s_W), both counted in rows.
W = 120
s_W = 60

# Absolute, machine-specific data directory; adjust before running elsewhere.
homepath = '/home/aneesh/Python_Codes/Data/SRF_Data/Summary_Data'
# NOTE: despite its name, `filepath` holds only the CSV file name.
filepath = 'MeggitSummaryCSV.csv'
# Load raw features/labels, build labelled windows, then split train/test.
X,y = loadDataset(homepath, filepath)
dataX, datay = prepareData(X,y,W,s_W)
trainX, trainy, testX, testy = prepare_train_test(dataX,datay)

(4338, 120, 16) (4338,) (1085, 120, 16) (1085,)


In [5]:
# Persist the four split arrays next to the source data; np.save appends
# the '.npy' extension to each of these file names.
np.save(homepath+'/Train_data',trainX)
np.save(homepath+'/Train_labels',trainy)
np.save(homepath+'/Test_data',testX)
np.save(homepath+'/Test_labels',testy)

In [6]:
# Read the saved test windows back from disk. This rebinds the global X,
# which previously held the raw feature matrix.
X = np.load(homepath+'/Test_data.npy')

In [89]:
# def load_dataset():
# #     X_U1R = np.load('User1_w512_s256_data.npy')
# #     X_U2R = np.load('User2_w512_s256_data.npy')
# #     X_U3R = np.load('User3_w512_s256_data.npy')

# #     y_U1R = np.load('User1_w512_s256_y.npy')
# #     y_U2R = np.load('User2_w512_s256_y.npy')
# #     y_U3R = np.load('User3_w512_s256_y.npy')
    
#     X_U1R = np.load('User1_w512_s512_data.npy')
#     X_U2R = np.load('User2_w512_s512_data.npy')
#     X_U3R = np.load('User3_w512_s512_data.npy')

#     y_U1R = np.load('User1_w512_s512_y.npy')
#     y_U2R = np.load('User2_w512_s512_y.npy')
#     y_U3R = np.load('User3_w512_s512_y.npy')
    
#     X=np.concatenate((X_U1R, X_U2R,X_U3R), axis=0)
#     y=np.concatenate((y_U1R, y_U2R,y_U3R), axis=0) 
    
#     y = y-1
#     return X, y

In [91]:
# # set parameters
# X,y = load_dataset()
# trainX, trainy, testX, testy = prepare_train_test(X,y)