## Setup

In [1]:
import numpy as np
import pandas as pd
#import keras
#from keras import layers
#from matplotlib import pyplot as plt
#import numpy as np
#import matplotlib.pyplot as plt
#from cycler import cycler
import scipy.special as sp
import os as os
#import pywt as py
#import statistics as st
import os as os
#import random
import multiprocessing
from joblib import Parallel, delayed
#import platform
from time import time as ti
from skimage.restoration import denoise_wavelet
#import tensorflow as tf
import pickle



In [2]:
# Root directory of the recordings: MakeDataframe reads `folder + file`, and
# makePickles writes its output under `folder + 'DayMovePickle/'`.
folder = '/sciclone/scr10/dchendrickson01/Recordings2/'

In [3]:
# Number of samples in each window produced by create_sequences (its default `time_steps`).
TIME_STEPS = 500
# Offset, in samples, between the starts of consecutive windows (default `skips`).
Skips = 5

## Load the data

We will use the [Numenta Anomaly Benchmark (NAB)](https://www.kaggle.com/boltzmannbrain/nab)
dataset. It provides artificial time-series data containing labeled anomalous
periods of behavior. The data are ordered, timestamped, single-valued metrics.

We will use the `art_daily_small_noise.csv` file for training and the
`art_daily_jumpsup.csv` file for testing. The simplicity of this dataset
allows us to demonstrate anomaly detection effectively.

**Note:** the code cells in this notebook do not read the NAB files; they load
the CSV recordings found under `folder`.

In [4]:
def RollingStdDev(RawData, SmoothData, RollSize = 25):
    """Root-mean-square deviation of RawData from SmoothData over a moving window.

    The first ``RollSize`` entries use a growing window ``[0, k]``; the
    remaining entries use fixed windows ``[k, k + RollSize)`` for
    ``k = 0 .. len(RawData) - RollSize - 2``.  For inputs longer than
    ``RollSize`` the result therefore holds ``len(RawData) - 1`` values.

    Parameters
    ----------
    RawData, SmoothData : sliceable numeric sequences supporting element-wise
        subtraction and multiplication (e.g. NumPy arrays or pandas Series).
    RollSize : int
        Window length in samples.

    Returns
    -------
    list
        One deviation value per window, in window order.
    """
    def _window_rms(lo, hi, divisor):
        # Residual energy of the slice, averaged over `divisor` samples.
        residual = RawData[lo:hi] - SmoothData[lo:hi]
        return np.sqrt(sum(residual * residual) / divisor)

    # Warm-up: window grows from one sample up to RollSize samples.
    deviations = [_window_rms(0, stop, stop) for stop in range(1, RollSize + 1)]

    # Steady state: fixed-width window sliding forward one sample at a time.
    for begin in range(len(RawData) - RollSize - 1):
        deviations.append(_window_rms(begin, begin + RollSize, RollSize))

    return deviations

def RollingSum(Data, Length = 100):
    """Moving sum of ``Data`` over a window of ``Length`` samples.

    The first ``Length`` entries are sums of the growing prefix ``Data[:k]``
    (k = 1 .. Length).  The following ``len(Data) - Length`` entries are sums
    of ``Data[k:k + Length]`` for k starting at 0, so the first of them
    repeats the last prefix sum.

    Returns a plain list of the sums.
    """
    # Growing-prefix phase.
    totals = [sum(Data[:stop]) for stop in range(1, Length + 1)]
    # Fixed-width phase.
    totals.extend(
        sum(Data[first:first + Length]) for first in range(len(Data) - Length)
    )
    return totals

def SquelchPattern(DataSet, StallRange = 5000, SquelchLevel = 0.02):
    """Build a 1/0 mask that is 0 where a window of DataSet averages below a level.

    For every start position ``k`` in ``range(len(DataSet) - 2 * StallRange)``
    the mean of ``DataSet[k:k + StallRange]`` is compared with
    ``SquelchLevel``; when it is lower, the mask entry at ``k + StallRange``
    is cleared.  All other entries stay 1.

    Returns a float NumPy array with the same length as ``DataSet``.
    """
    sample_count = len(DataSet)
    mask = np.ones(sample_count)

    window_start = 0
    stop_before = sample_count - 2 * StallRange
    while window_start < stop_before:
        window_end = window_start + StallRange
        if np.average(DataSet[window_start:window_end]) < SquelchLevel:
            # The flag is written at the sample just past the averaged window.
            mask[window_end] = 0
        window_start += 1

    return mask

def getVelocity(Acceleration, Timestamps = 0.003, Squelch = [], corrected = 0):
    """Integrate a mean-removed acceleration trace into a velocity trace.

    Parameters
    ----------
    Acceleration : 1-D sequence of numbers
        Acceleration samples.  The input is copied; the caller's data is not
        modified.
    Timestamps : scalar, length-1 sequence, or sequence as long as Acceleration
        * scalar / length 1: constant time step applied to every sample.
        * same length as ``Acceleration``: per-sample counter values.  Steps
          are the successive differences; a non-increasing step is treated as
          a counter wrap and has 10000 added.  Steps are then divided by
          10000, and the first sample is given a step of 1/10000.
    Squelch : sequence, optional
        Only read when ``corrected == 2``; must then be indexable for every
        sample.  A 0 at position j forces ``velocity[j]`` to 0.
    corrected : int
        0 - plain cumulative integration.
        1 - additionally subtract ``velocity[-1] / n * i`` from sample i.
        2 - reset the running velocity to 0 wherever ``Squelch`` is 0.

    Returns
    -------
    numpy.ndarray
        Velocity per sample, multiplied by 9.81 (presumably converting from
        g-seconds to m/s -- confirm against the sensor's units).

    Raises
    ------
    ValueError
        If ``Timestamps`` is neither a single value nor the same length as
        ``Acceleration``.
    """
    # Float copy: avoids shifting the caller's array/DataFrame column in place
    # and lets integer input be mean-centred without a casting error.
    accel = np.array(Acceleration, dtype=float)
    n = len(accel)
    if n == 0:
        return np.zeros(0, dtype=float)

    accel -= np.average(accel)

    # atleast_1d lets the scalar default (0.003) share the sequence code path.
    stamps = np.atleast_1d(np.asarray(Timestamps, dtype=float))
    if stamps.size == 1:
        dTime = np.full(n, stamps[0], dtype=float)
    elif stamps.size == n:
        steps = np.diff(stamps)
        # Non-increasing step => the counter wrapped; add one full period.
        steps = np.where(steps > 0, steps, steps + 10000.0)
        dTime = np.concatenate(([1.0], steps)) / 10000.0
    else:
        raise ValueError(
            "Timestamps must be a single value or have the same length as "
            "Acceleration (got %d timestamps for %d samples)" % (stamps.size, n)
        )

    increments = accel * dTime

    if corrected == 2:
        # Positional access regardless of any pandas index on Squelch.
        squelch = np.asarray(Squelch)
        velocity = np.zeros(n, dtype=float)
        velocity[0] = increments[0]
        for j in range(1, n):
            if squelch[j] == 0:
                velocity[j] = 0
            else:
                velocity[j] = velocity[j - 1] + increments[j]
    else:
        # Sequential running sum, identical to the explicit accumulation loop.
        velocity = np.cumsum(increments)

    if corrected == 1:
        # Linear de-trend using the final velocity as the accumulated drift.
        drift_per_sample = velocity[-1] / n
        velocity = velocity - drift_per_sample * np.arange(n)

    velocity *= 9.81

    return velocity

def MakeDTs(Seconds, Miliseconds):
    """Convert per-sample (second, sub-second counter) pairs into time steps.

    For each sample after the first, the step is the difference of
    consecutive ``Miliseconds`` values; when the ``Seconds`` value changed
    between the two samples, 1000 is added.  The first sample gets a step of
    1.  All steps are then divided by 10000.

    Parameters
    ----------
    Seconds : sequence
        Whole-second label per sample (only compared for equality).
    Miliseconds : sequence of numbers
        Sub-second counter per sample.

    Returns
    -------
    numpy.ndarray of float, same length as ``Miliseconds``.

    NOTE(review): the roll-over constant (1000) and the final divisor (10000)
    imply different counter resolutions -- confirm which one is intended.
    """
    frac = np.asarray(Miliseconds, dtype=float)
    dts = np.zeros(len(frac), dtype=float)
    if len(frac) == 0:
        return dts

    # Only the first len(frac) second labels are ever consulted.
    secs = np.asarray(Seconds)[:len(frac)]

    dts[0] = 1
    step = frac[1:] - frac[:-1]
    same_second = secs[1:] == secs[:-1]
    # Crossing into a new second: add one counter period to undo the roll-over.
    dts[1:] = np.where(same_second, step, step + 1000)
    dts /= 10000
    return dts


def split_list_by_ones(original_list, ones_list):
    """Split ``original_list`` into runs selected by a parallel flag list.

    Values whose flag is falsy are dropped; each maximal run of consecutive
    truthy flags becomes its own sub-list.  Pairing stops at the shorter of
    the two inputs.

    Example
    -------
    original_list = [1, 2, 3, 8, 7, 4, 5, 6, 4, 7, 8, 9]
    ones_list     = [1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1]
    returns         [[1, 2, 3, 8], [4, 5, 6], [8, 9]]

    (Originally drafted with Bing AI assistance from the example above.)
    """
    runs = []
    current_run = []

    for value, keep in zip(original_list, ones_list):
        if not keep:
            # A falsy flag closes whatever run is open.
            if current_run:
                runs.append(current_run)
                current_run = []
            continue
        current_run.append(value)

    # Flush a run that extends to the end of the input.
    if current_run:
        runs.append(current_run)

    return runs

In [5]:
def MakeDataframe(file, noise=False):
    """Load one recording CSV from ``folder`` and add an ``IsMoving`` flag column.

    Parameters
    ----------
    file : name of the CSV, appended to the module-level ``folder`` path.
    noise : when true, progress messages are printed after each stage.

    Returns
    -------
    The DataFrame indexed by ``timestamp`` with the columns Day, Second,
    FracSec, p, h, v, Sensor, timestamp, timestamps, SmoothV and IsMoving
    (``p``, ``h`` and ``v`` are mean-centred).
    """
    # Headerless file with ", " separators; malformed rows are dropped.
    dataset = pd.read_csv(folder+file, delimiter =", ", header=None, engine='python',on_bad_lines='skip')
    if noise:
        print("File Read")
    # Give the seven positional columns their names.
    dataset = dataset.rename(columns={0:"Day"})
    dataset = dataset.rename(columns={1:"Second"})
    dataset = dataset.rename(columns={2:"FracSec"})
    dataset = dataset.rename(columns={3:"p"})
    dataset = dataset.rename(columns={4:"h"})
    dataset = dataset.rename(columns={5:"v"})
    dataset = dataset.rename(columns={6:"Sensor"})

    # Zero-pad to fixed widths (6, 6 and 4 digits) so the three strings can be
    # concatenated and parsed with a single strptime format below.
    dataset[['Day','Second']] = dataset[['Day','Second']].apply(lambda x: x.astype(int).astype(str).str.zfill(6))
    dataset[['FracSec']] = dataset[['FracSec']].apply(lambda x: x.astype(int).astype(str).str.zfill(4))

    dataset["timestamp"] = pd.to_datetime(dataset.Day+dataset.Second+dataset.FracSec,format='%y%m%d%H%M%S%f')
    # Second copy of the timestamp kept as an ordinary column.
    dataset["timestamps"] = dataset["timestamp"]
    
    # Remove the per-file mean from each channel.
    dataset["p"] = dataset.p - np.average(dataset.p)
    dataset["h"] = dataset.h - np.average(dataset.h)
    dataset["v"] = dataset.v - np.average(dataset.v)
    #dataset["r"] = np.sqrt(dataset.p**2 + dataset.h**2 + dataset.v**2)

    dataset.index = dataset.timestamp

    # Only the v channel is wavelet-denoised; it drives the squelch below.
    # NOTE(review): rescale_sigma is passed as the string 'True' rather than
    # the boolean True -- confirm this is intended.
    #dataset["smoothP"] = denoise_wavelet(dataset.p, method='VisuShrink', mode='soft', wavelet_levels=3, wavelet='sym2', rescale_sigma='True')
    #dataset["SmoothH"] = denoise_wavelet(dataset.h, method='VisuShrink', mode='soft', wavelet_levels=3, wavelet='sym2', rescale_sigma='True')
    dataset["SmoothV"] = denoise_wavelet(dataset.v, method='VisuShrink', mode='soft', wavelet_levels=3, wavelet='sym2', rescale_sigma='True')

    if noise:
        print("Data Cleaned")
    
    # Rolling deviation of raw v from its smoothed version.  RollingStdDev
    # yields one value fewer than the input (for inputs longer than its
    # window), so a 0 is appended to restore the original length.
    StdDevsZ = RollingStdDev(dataset.v, dataset.SmoothV)
    StdDevsZ.append(0)
    StdDevsZ = np.asarray(StdDevsZ)
    SmoothDevZ = denoise_wavelet(StdDevsZ, method='VisuShrink', mode='soft', wavelet_levels=3, wavelet='sym2', rescale_sigma='True')

    # Histogram of the smoothed deviation in bins 0.005 wide, from 0 up to
    # just past the maximum.
    Max = np.max(SmoothDevZ)
    buckets = int(Max / 0.005) + 1
    bins = np.linspace(0,buckets*0.005,buckets+1)
    counts, bins = np.histogram(SmoothDevZ,bins=bins)

    # Find the first bin at which the cumulative count reaches half the samples.
    # NOTE(review): 0 doubles as the "not found yet" marker, so if bin 0
    # already holds at least half of the samples the next iteration overwrites
    # HalfWay with 1.
    CummCount = 0
    HalfWay = 0
    for i in range(len(counts)):
        CummCount += counts[i]
        if CummCount / len(SmoothDevZ) >= 0.5:
            if HalfWay == 0:
                HalfWay = i

    # The lower edge of that bin becomes the squelch threshold.
    SquelchLevel = bins[HalfWay] 
    # 1/0 mask from SquelchPattern using a 4000-sample averaging window.
    dataset["IsMoving"] = SquelchPattern(SmoothDevZ, 4000, SquelchLevel)
    if noise:
        print("Squelch Made")
    #dataset["velocity"] = getVelocity(dataset.p, dataset.FracSec, dataset.IsMoving, 2)
    #if noise:
    #    print("Velocity Calculated.  File done: ",file)
    return dataset

In [6]:
# Generate the overlapping training windows fed to the model.
def create_sequences(values, time_steps=TIME_STEPS, skips = Skips):
    """Cut ``values`` into full-length windows and stack them into one array.

    Window ``k`` is ``values[k*skips : k*skips + time_steps]``; the number of
    windows is ``int((len(values) - time_steps + skips) / skips)``.  The
    windows are combined with ``np.stack``, which needs at least one window.
    """
    window_count = int((len(values) - time_steps + skips) / skips)
    window_starts = (k * skips for k in range(window_count))
    return np.stack([values[begin : begin + time_steps] for begin in window_starts])

In [8]:
def _standardized_windows(samples, is_moving):
    """Keep the flagged runs of one channel, z-score them and cut them into windows."""
    # A single leading 0 sample precedes the concatenated runs; retained so the
    # generated windows line up with previously written pickles.
    kept = [0]
    for run in split_list_by_ones(samples, is_moving):
        kept += run
    kept = np.asarray(kept, dtype=float)
    return create_sequences((kept - np.average(kept)) / np.std(kept))


def makePickles(file, index,start=0):
    """Turn one recording CSV into a pickled array of training windows.

    Files whose name does not end in '.csv' are ignored.  For a CSV, the
    ``p``, ``h`` and ``v`` channels are reduced to the samples flagged by
    ``IsMoving``, standardised per channel, windowed with ``create_sequences``
    and concatenated per window as ``[p..., h..., v...]``.

    Parameters
    ----------
    file : str
        File name relative to the module-level ``folder``.
    index : int
        Sequence number used in the output name
        ``DayMovePickle/PickledPrep<index zero-padded to 5>.p``.
    start : float, optional
        Start time from ``ti()``; when non-zero an elapsed-time message is
        printed.

    Returns
    -------
    1 after a pickle has been written, otherwise None.
    """
    if not file.endswith('.csv'):
        return None

    recording = MakeDataframe(file,False)
    is_moving = recording.IsMoving
    channel_windows = [
        _standardized_windows(recording[channel], is_moving)
        for channel in ('p', 'h', 'v')
    ]
    del recording, is_moving

    # One row per window laid out as [p | h | v].  The singleton middle axis
    # reproduces the (N, 1, 3 * time_steps) shape that stacking flattened
    # np.matrix rows produced, so existing readers of the pickles still work.
    x_t1 = np.concatenate(channel_windows, axis=1)[:, np.newaxis, :]
    del channel_windows

    # Workers may race to create the directory; exist_ok makes that harmless.
    os.makedirs(folder+'DayMovePickle', exist_ok=True)
    # Context manager guarantees the handle is closed even if dumping fails.
    with open(folder+'DayMovePickle/PickledPrep'+str(index).zfill(5)+'.p','wb') as f:
        pickle.dump(x_t1,f)

    if start != 0:
        print('A file done in ', str(int((ti()-start)/600)*10))
    if index % 100 == 0:
        print('At least one process is at '+str(index))
    return 1


In [9]:
# Every entry in `folder` is listed here; makePickles itself skips names that
# do not end in '.csv'.
files = os.listdir(folder)

In [10]:
# Start time handed to every worker so it can print its elapsed time.
tic = ti()
# Process only the first five directory entries, using five joblib workers.
# Each element of Results is makePickles' return value (1, or None for a
# non-CSV entry).
Results = Parallel(n_jobs=5)(delayed(makePickles)(file, index,tic) for index, file in enumerate(files[:5]))



220919 recording1.csv
220919 recording2.csv
A file done in  60
