In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.special as sp
import os as os
import multiprocessing
from joblib import Parallel, delayed
from time import time as ti
from time import ctime as ct
from skimage.restoration import denoise_wavelet
import pickle
import CoreFunctions as cf
import sys
import random
import psutil
import gc

In [2]:
# Hide every CUDA device *before* TensorFlow is imported; the import is placed
# after the environment variable is set for that reason.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import tensorflow as tf
# Additionally give TensorFlow an empty list of visible GPU devices.
tf.config.set_visible_devices([], 'GPU')

2024-11-13 09:56:31.277595: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-13 09:56:31.291526: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-13 09:56:31.302106: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-13 09:56:31.305154: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-13 09:56:31.314783: I tensorflow/core/platform/cpu_feature_guar

In [3]:
# Root folder of the recordings. NOTE: the first assignment is immediately
# overwritten by the second, so only '/scratch/Recordings2/' is in effect.
DataFolder = '/sciclone/scr10/dchendrickson01/Recordings2/'
DataFolder = '/scratch/Recordings2/'
model_directory = '/scratch/models/stopped/'  # not referenced by the cells shown in this notebook section


TIME_STEPS = 1200  # default window length in create_sequences; slice width used by PrintWrap
Skips = 125  # default stride between windows in create_sequences
RollSize = 50  # NOTE(review): not read by the visible code; runFile sets its own local RollSize = 25

LastSuccesfull = 1  # subtracted from the batch count when reporting how many loops remain
DateString = '1011'  # suffix of the file list that is read: CurrentFileList<DateString>.text
MakeOnesOrZeros = 1  # 1 -> keep runs where IsMoving is 1 ('Moving' output); otherwise keep the IsMoving == 0 runs
RunParallel = 1  # 1 -> use joblib.Parallel; any other value processes serially
FilesPerRun = 15  # number of input files handled per batch (loop index j)
ConcurrentFiles = 5  # n_jobs passed to joblib.Parallel


# Wall-clock timestamps taken when this configuration cell runs.
tic = ti()
start = tic

MemoryProtection = True  # when True the main loop deletes large intermediates and prints RAM usage
noisy = True  # default value of the `verbose` parameters of the helper functions

In [4]:
%matplotlib inline

In [5]:
# Gate for the batch-processing cell: its loop only executes when this is True.
RunTwice = True

In [6]:
def RollingStdDevFaster(RawData, SmoothData, RollSize = 25):
    """Trailing-window RMS of the residual between a raw and a smoothed signal.

    For every sample ``j`` the squared residuals ``(RawData - SmoothData)**2``
    at positions ``j-RollSize .. j`` (clipped at the start of the signal, so
    the window holds at most ``RollSize + 1`` samples) are summed and divided
    by the number of *non-zero* squared residuals in that window; the square
    root of that quotient is returned.

    The previous implementation sliced the 1 x (n + RollSize) ``np.matrix``
    along its row axis (``StdDevs[:-RollSize]``) and therefore always returned
    an empty array.  It also materialised a (RollSize + 1) x n matrix; this
    version accumulates the same sums in O(n) memory.

    Parameters
    ----------
    RawData, SmoothData : array-like
        1-D sequences of equal length (compared position by position).
    RollSize : int, optional
        Number of preceding samples included in each window.

    Returns
    -------
    numpy.ndarray
        1-D float array with one value per input sample.  Entries whose
        window contains no non-zero residual are NaN (0 / 0).
    """
    residual = np.asarray(RawData, dtype=float) - np.asarray(SmoothData, dtype=float)
    squares = np.ravel(residual * residual)
    n = squares.size
    is_nonzero = squares != 0

    window_sum = np.zeros(n)
    window_count = np.zeros(n)
    # Add the signal shifted right by `shift` samples for every lag in the
    # window; samples shifted in from before the start contribute nothing.
    for shift in range(min(RollSize, n - 1), -1, -1):
        window_sum[shift:] += squares[:n - shift]
        window_count[shift:] += is_nonzero[:n - shift]

    # 0 / 0 deliberately yields NaN rather than raising or warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.sqrt(window_sum / window_count)

def SquelchPattern(DataSet, StallRange = 5000, SquelchLevel = 0.02, verbose = noisy):
    """Build a 1/0 mask that is 0 where the preceding window is "quiet".

    For each ``i`` in ``range(len(DataSet) - 2*StallRange)`` the mean of
    ``DataSet[i:i+StallRange]`` is compared with ``SquelchLevel``; when the
    mean is strictly below the level, position ``i + StallRange`` of the mask
    is set to 0.  All other positions (including the first ``StallRange`` and
    the last ``StallRange`` samples) stay 1.

    The window means are computed with a vectorised sliding-window view
    instead of one ``np.average`` call per sample.  A window containing NaN
    has a NaN mean, which never compares below the level, so that position
    stays 1.

    Parameters
    ----------
    DataSet : array-like
        1-D numeric sequence.
    StallRange : int, optional
        Window length in samples.  Values below 1, or signals shorter than
        ``2*StallRange + 1``, produce an all-ones mask.
    SquelchLevel : float, optional
        Threshold for the window mean.
    verbose : bool, optional
        When true, print the mask length.

    Returns
    -------
    numpy.ndarray
        Float array of ones and zeros with ``len(DataSet)`` entries.
    """
    n_samples = len(DataSet)
    SquelchSignal = np.ones(n_samples)
    if verbose:
        print(len(SquelchSignal))

    n_windows = n_samples - 2*StallRange
    if n_windows <= 0 or StallRange < 1:
        return SquelchSignal

    values = np.asarray(DataSet)
    # Window k covers values[k : k + StallRange]; only the first n_windows
    # windows are evaluated, hence the truncated input.
    windows = np.lib.stride_tricks.sliding_window_view(
        values[:n_windows + StallRange - 1], StallRange
    )
    quiet = windows.mean(axis=1) < SquelchLevel

    # Window k controls mask position k + StallRange (slice is a view, so the
    # boolean assignment writes through to SquelchSignal).
    SquelchSignal[StallRange:StallRange + n_windows][quiet] = 0
    return SquelchSignal

def split_list_by_ones(original_list, ones_list):
    """Split ``original_list`` into runs selected by truthy flags.

    The two sequences are walked in parallel.  Values whose flag is truthy are
    collected; every falsy flag ends the current run (if one is open) and the
    value at that position is dropped.

    Example::

        original_list = [1, 2, 3, 8, 7, 4, 5, 6, 4, 7, 8, 9]
        ones_list     = [1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1]
        result        = [[1, 2, 3, 8], [4, 5, 6], [8, 9]]

    Returns a list of lists; it is empty when no flag is truthy.  Iteration
    stops at the end of the shorter input (``zip`` semantics).
    """
    chunks = []
    pending = []

    for value, flag in zip(original_list, ones_list):
        if not flag:
            # A falsy flag closes whatever run is currently open.
            if pending:
                chunks.append(pending)
                pending = []
            continue
        pending.append(value)

    # Flush a run that reaches the end of the input.
    if pending:
        chunks.append(pending)

    return chunks

def split_list_by_zeros(original_list, ones_list):
    """Split ``original_list`` into runs selected by falsy flags.

    Counterpart of ``split_list_by_ones``: values whose flag in ``ones_list``
    is falsy are collected, and every truthy flag ends the current run (if one
    is open) while the value at that position is dropped.

    Example::

        original_list = [1, 2, 3, 8, 7, 4, 5, 6, 4, 7, 8, 9]
        ones_list     = [1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1]
        result        = [[7], [4, 7]]

    Returns a list of lists; it is empty when every flag is truthy.  Iteration
    stops at the end of the shorter input (``zip`` semantics).
    """
    chunks = []
    pending = []

    for value, flag in zip(original_list, ones_list):
        if flag:
            # A truthy flag closes whatever run is currently open.
            if pending:
                chunks.append(pending)
                pending = []
        else:
            pending.append(value)

    # Flush a run that reaches the end of the input.
    if pending:
        chunks.append(pending)

    return chunks

# Build fixed-length, overlapping windows for use in the model.
def create_sequences(values, time_steps=TIME_STEPS, skips = Skips):
    """Stack windows of ``time_steps`` samples taken every ``skips`` samples.

    Window ``k`` is ``values[k*skips : k*skips + time_steps]``; the number of
    windows is ``int((len(values) - time_steps + skips)/skips)``.

    Returns the windows stacked along a new first axis with ``np.stack``,
    which raises ``ValueError`` when the window count is zero.
    """
    window_count = int((len(values) - time_steps + skips)/skips)
    windows = [values[k*skips : (k*skips + time_steps)] for k in range(window_count)]
    return np.stack(windows)

In [7]:
def _trailing_residual_rms(raw, smooth, roll_size):
    """Trailing-window RMS of ``raw - smooth`` over ``roll_size + 1`` samples.

    Each output sample ``j`` is ``sqrt(sum / count)`` where ``sum`` adds the
    squared residuals at positions ``j-roll_size .. j`` (clipped at 0) and
    ``count`` is the number of those squared residuals that are non-zero.
    Windows without any non-zero residual give NaN.  The lags are accumulated
    from the largest shift down to zero, i.e. the same order in which the
    previous stacked-matrix implementation summed its rows, but without
    materialising the (roll_size + 1) x n matrix.
    """
    residual = np.asarray(raw, dtype=float) - np.asarray(smooth, dtype=float)
    squares = residual * residual
    n = squares.size
    nonzero = squares != 0

    total = np.zeros(n)
    count = np.zeros(n)
    for shift in range(min(roll_size, n - 1), -1, -1):
        total[shift:] += squares[:n - shift]
        count[shift:] += nonzero[:n - shift]

    with np.errstate(divide='ignore', invalid='ignore'):
        return np.sqrt(total / count)


def runFile(file, verbose = noisy, small = False, index=0, start=None):
    """Load one recording CSV and return its smoothed segments.

    Steps performed on ``DataFolder + file``:

    1. Read the header-less CSV and name columns 0-6
       ``Day, Second, FracSec, p, h, v, Sensor``.
    2. Mean-centre ``p``, ``h`` and ``v`` and wavelet-denoise each one into
       ``SmoothP``, ``SmoothH`` and ``SmoothV``.
    3. Compute a trailing RMS (25-sample look-back) of ``v - SmoothV`` and
       wavelet-denoise it; NaNs in the result are set to 0.
    4. Pick a squelch level from a 0.005-wide histogram of that envelope and
       build the ``IsMoving`` mask with ``SquelchPattern`` (window 4000).
    5. Split the three smoothed channels on that mask (runs of ones when
       ``MakeOnesOrZeros == 1``, runs of zeros otherwise).

    Parameters
    ----------
    file : str
        File name relative to ``DataFolder``.  Names not ending in ``.csv``
        are not read.
    verbose : bool, optional
        Print progress and timing information.
    small, index : optional
        Accepted for interface compatibility; not used by this function.
    start : float, optional
        Reference time (``time.time()`` value) for the timing printouts.
        Defaults to the moment of the call.  (The previous default
        ``start=ti()`` was evaluated once, when the function was defined.)

    Returns
    -------
    list
        One ``3 x n_i`` array (rows: SmoothP, SmoothH, SmoothV) per segment,
        or the sentinel ``['fail','fail']`` for a non-CSV name.
    """
    if start is None:
        start = ti()
    noise = verbose

    if file[-4:] != '.csv':
        # Sentinel kept as-is: callers iterate over the returned list.
        return ['fail','fail']

    dataset = pd.read_csv(DataFolder+file, delimiter =",", header=None, engine='python',on_bad_lines='skip')
    if noise:
        print("File Read", ti()-start)
    dataset = dataset.rename(columns={0:"Day", 1:"Second", 2:"FracSec", 3:"p", 4:"h", 5:"v", 6:"Sensor"})

    # Remove the mean of each channel before smoothing.
    for channel in ("p", "h", "v"):
        dataset[channel] = dataset[channel] - np.average(dataset[channel])

    # rescale_sigma is a boolean option; the string 'True' only worked because
    # any non-empty string is truthy.
    for channel, smooth_name in (("p", "SmoothP"), ("h", "SmoothH"), ("v", "SmoothV")):
        dataset[smooth_name] = denoise_wavelet(dataset[channel], method='VisuShrink', mode='soft', wavelet_levels=3, wavelet='sym2', rescale_sigma=True)

    if noise:
        print("Data Cleaned", ti()-start, len(dataset.p))

    # Local look-back of 25 samples (the module-level RollSize is not used here).
    StdDevsZ = _trailing_residual_rms(dataset.v, dataset.SmoothV, 25)

    if noise:
        print("Size StdDevsZ", ti()-start, np.shape(StdDevsZ))

    if noise:
        print("cleaned", ti()-start, np.shape(StdDevsZ))

    SmoothDevZ = denoise_wavelet(StdDevsZ, method='VisuShrink', mode='soft', wavelet='sym2', rescale_sigma=True)

    if noise:
        print("denoise 1", ti()-start, np.shape(StdDevsZ))

    if noise:
        print("denoise 2", ti()-start, np.shape(SmoothDevZ))

    SmoothDevZ[np.isnan(SmoothDevZ)]=0

    Max = np.max(SmoothDevZ)

    if noise:
        print("Max", ti()-start, np.shape(Max), Max)

    # Histogram of the envelope with 0.005-wide bins starting at 0.
    buckets = int(Max / 0.005) + 1
    bins = np.linspace(0,buckets*0.005,buckets+1)
    counts, bins = np.histogram(SmoothDevZ,bins=bins)

    # Lower edge of the bin in which the cumulative count first reaches half
    # of the samples.
    # NOTE(review): because 0 doubles as the "not found yet" marker, a median
    # that falls in bin 0 makes the next iteration select bin 1 instead.
    # Preserved on purpose so existing outputs do not change - confirm intent.
    CummCount = 0
    HalfWay = 0
    for i in range(len(counts)):
        CummCount += counts[i]
        if CummCount / len(SmoothDevZ) >= 0.5:
            if HalfWay == 0:
                HalfWay = i

    SquelchLevel = bins[HalfWay]
    if noise:
        print("SmoothDevz size", np.shape(SmoothDevZ))

    dataset["IsMoving"] = SquelchPattern(SmoothDevZ, 4000, SquelchLevel, verbose=noise)

    if noise:
        print("Squelch Made", ti()-start)

    if MakeOnesOrZeros == 1:
        splitter = split_list_by_ones
    else:
        splitter = split_list_by_zeros
    df_ps = splitter(dataset.SmoothP, dataset.IsMoving)
    df_hs = splitter(dataset.SmoothH, dataset.IsMoving)
    df_vs = splitter(dataset.SmoothV, dataset.IsMoving)

    # Release the full frame before building the per-segment matrices.
    del dataset

    MatsSmooth = [np.vstack(segment) for segment in zip(df_ps, df_hs, df_vs)]

    if verbose:
        print("Split by ones", ti()-start)

    if verbose:
        print('format changed', ti()-start, len(MatsSmooth))

    return MatsSmooth
        
 

In [8]:
def runWrapper(file_path, verbose=noisy, small=False, index=0, start=None):
    """Call ``runFile`` and turn any failure into a placeholder result.

    Parameters are forwarded positionally to ``runFile``.  ``start`` defaults
    to the time of the call (the previous default ``start=ti()`` was evaluated
    once, when the function was defined).

    On any ``Exception`` the error is reported on stderr, ``file_path`` is
    appended as one line to ``BadInputs.text`` in the working directory, and
    ``np.zeros((10, 10, 3))`` is returned in place of the segment list.
    """
    if start is None:
        start = ti()
    try:
        return runFile(file_path, verbose, small, index, start)
    except Exception as e:
        # Best-effort processing: keep going, but do not hide the reason.
        print(f'runFile failed for {file_path}: {e!r}', file=sys.stderr)
        with open('BadInputs.text', 'a') as bad_file:
            bad_file.write(file_path + '\n')
        return np.zeros((10, 10, 3))

In [9]:
def CleanNanInf(data):
    """Return a copy of ``data`` with NaN and infinite entries replaced.

    * NaN becomes 0.
    * +inf and -inf both become the largest finite value present after the
      NaN replacement (0 if the array holds no finite value at all).

    The previous implementation called ``np.nan_to_num`` first, which already
    converts +/-inf to the largest/smallest representable float, so the
    "maximum finite value" substitution never took effect and values around
    +/-1.8e308 were passed on.  It also raised on empty input.

    Parameters
    ----------
    data : array-like
        Numeric input; it is not modified.

    Returns
    -------
    numpy.ndarray
        Cleaned copy with the same shape and dtype as ``np.copy(data)``.
    """
    cleaned = np.copy(data)

    # Replace NaNs with 0.
    cleaned[np.isnan(cleaned)] = 0

    # Replace infinities with the largest remaining finite value.
    inf_mask = np.isinf(cleaned)
    if inf_mask.any():
        finite_values = cleaned[~inf_mask]
        cleaned[inf_mask] = finite_values.max() if finite_values.size else 0

    return cleaned

In [10]:
def PrintWrap(data):
    """Turn one segment matrix into a list of scaled float32 images.

    The input is cleaned with ``CleanNanInf`` and cut along axis 1 into
    ``int(width / TIME_STEPS)`` consecutive, non-overlapping slices of
    ``TIME_STEPS`` columns (a trailing remainder is dropped).  The first three
    rows of every slice are passed to ``cf.makeMPFast`` and the result is
    converted to ``float32`` and divided by 255.

    Returns a list with one array per slice (empty when the input is narrower
    than ``TIME_STEPS``).
    """
    cleaned = CleanNanInf(data)
    slice_count = int(np.shape(cleaned)[1]/TIME_STEPS)

    def render(k):
        # Columns [k*TIME_STEPS, (k+1)*TIME_STEPS) of the first three rows.
        window = cleaned[:3,k*TIME_STEPS:(k+1)*TIME_STEPS]
        image = cf.makeMPFast(window, wvt = 'sym4', scales = 32, spacer = 2, title = '')
        return image.astype(np.float32)/255.0

    return [render(k) for k in range(slice_count)]

In [11]:
# Read the list of input file names (one per line) and trim surrounding whitespace.
with open(f'CurrentFileList{DateString}.text','r') as file:
    files = [item.strip() for item in file.readlines()]

In [12]:
# Reference timestamp for the elapsed-minutes figures printed by the batch loop.
toc=ti()

In [13]:
# Write the file names to 'filelist.csv' in the working directory, one string per row.
np.savetxt('filelist.csv', files, '%s', delimiter=",")

In [None]:
if RunTwice:
    LoopsToGetAll = int(len(files)/FilesPerRun)-LastSuccesfull
    print(f'Additional Loops Needed: {LoopsToGetAll}, at current time {ct(ti())}')

    # Output-name prefix.  runFile splits on ones only when MakeOnesOrZeros == 1
    # and on zeros for every other value, so the label follows the same rule
    # (previously any value other than 0/1 left MoveStation undefined and the
    # batch failed with NameError only after all the work was done).
    MoveStation = 'Moving' if MakeOnesOrZeros == 1 else 'Stationary'

    # Batch indices to (re)process, handled in reverse order.
    ToDoList = [58,60,63,66]
    ToDoList = ToDoList[::-1]
    for j in ToDoList:
        # Slice instead of indexing so a final, partially filled batch does not
        # raise IndexError.
        BatchFiles = files[j*FilesPerRun:(j+1)*FilesPerRun]

        # --- 1. Load and segment every file of the batch -------------------
        if RunParallel ==1:
            AllDatas = Parallel(n_jobs=ConcurrentFiles,timeout=1800)(delayed(runWrapper)(name, False, False, 0, ti()) for name in BatchFiles)
        else:
            AllDatas = []
            for i, name in enumerate(BatchFiles):
                AllDatas.append(runWrapper(name, False, False, 0, ti()))
                print(f'Got data on {i} of {FilesPerRun} in {int((ti()-toc)/.6)/100} minutes, at current time {ct(ti())}.')

        # Flatten the per-file results into one list of segments.
        Mats = [Mat for fileResponse in AllDatas for Mat in fileResponse]

        if MemoryProtection:
            del AllDatas
            print('RAM after AllData:', psutil.virtual_memory()[2],int((ti()-toc)/.6)/100)

        # --- 2. Keep only segments wider than 1250 samples ------------------
        # Entries without a second dimension (e.g. failure sentinels) count as
        # width 1 and are dropped.
        Keeps = []
        for Mat in Mats:
            spm = np.shape(Mat)
            lenM = spm[1] if len(spm) > 1 else 1
            if lenM > 1250:
                Keeps.append(Mat)

        if MemoryProtection:
            del Mats

        # --- 3. Render every kept segment into images ----------------------
        if RunParallel ==1:
            AllPrints = Parallel(n_jobs=ConcurrentFiles)(delayed(PrintWrap)(Mat) for Mat in Keeps)
        else:
            AllPrints = []
            for i, Mat in enumerate(Keeps):
                AllPrints.append(PrintWrap(Mat))
                if i % 25 == 0:
                    print(f'Through {i} of {len(Keeps)} moves. In {int((ti()-toc)/.6)/100} minutes, at current time {ct(ti())}.')

        if MemoryProtection:
            del Keeps
            print('RAM after Keeps:', psutil.virtual_memory()[2],int((ti()-toc)/.6)/100)

        # Keep every second element along axis 1 of each image.
        Prints = [fprint[:, ::2, :] for group in AllPrints for fprint in group]

        if MemoryProtection:
            del AllPrints

        if not Prints:
            # np.stack raises on an empty list; nothing to save for this batch.
            print(f'No usable segments in batch {j}; nothing written.')
            continue

        random.shuffle(Prints)

        # Make sure every image is a float32 ndarray before stacking.
        for i, image in enumerate(Prints):
            if not isinstance(image, np.ndarray):
                Prints[i] = np.array(image, dtype=np.float32)
            elif image.dtype != np.float32:
                Prints[i] = image.astype(np.float32)

        # Stack the images into a single NumPy array
        prints_array = np.stack(Prints, axis=0)

        if MemoryProtection:
            del Prints
            print('RAM after Prints:', psutil.virtual_memory()[2],int((ti()-toc)/.6)/100)

        # --- 4. Convert to a tensor, truncating when RAM use exceeds 50 % ---
        # NOTE(review): the truncation silently drops the tail of the (already
        # shuffled) batch; the saved file name records the resulting count.
        memInUse = psutil.virtual_memory()[2]
        if memInUse > 50:
            print('Cant get all memory use too high')
            ExtraMem = memInUse - 50
            ExtraPercent = ExtraMem / memInUse
            cutPoint = len(prints_array) * (1-ExtraPercent)
            Cut = int(cutPoint) - 1
            trX = tf.convert_to_tensor(prints_array[:Cut])
        else:
            trX = tf.convert_to_tensor(prints_array)
        if MemoryProtection:
            del prints_array

        # --- 5. Persist the batch -----------------------------------------
        # The target folder DataFolder/MLPickles must already exist.
        with open(DataFolder + f'MLPickles/{MoveStation}Dataset_{str(j).zfill(4)}_{str(trX.shape[0]).zfill(6)}.p', 'wb') as handle:
            pickle.dump(trX, handle)

        if MemoryProtection:
            del trX
            gc.collect()

        print(f'{j} of {LoopsToGetAll+LastSuccesfull} in {int((ti()-toc)/.6)/100} minutes. Using { psutil.virtual_memory()[2]} of RAM, at current time {ct(ti())}')

Additional Loops Needed: 70, at current time Wed Nov 13 09:56:33 2024




RAM after AllData: 35.5 20.76
RAM after Keeps: 61.3 26.98
RAM after Prints: 69.1 27.96
Cant get all memory use too high
70 of 71 in 29.98 minutes. Using 65.6 of RAM, at current time Wed Nov 13 10:26:32 2024
RAM after AllData: 66.3 48.51


  cfX /= highest
  return new.astype(intype)


RAM after Keeps: 69.0 54.77
RAM after Prints: 79.4 55.53
Cant get all memory use too high
68 of 71 in 56.93 minutes. Using 68.9 of RAM, at current time Wed Nov 13 10:53:29 2024
RAM after AllData: 70.1 77.5


  cfX /= highest
  return new.astype(intype)


RAM after Keeps: 76.1 84.7
RAM after Prints: 88.9 85.85
Cant get all memory use too high
67 of 71 in 88.15 minutes. Using 75.7 of RAM, at current time Wed Nov 13 11:24:42 2024
RAM after AllData: 76.7 109.51
RAM after Keeps: 77.1 117.61
RAM after Prints: 85.8 126.66
Cant get all memory use too high
66 of 71 in 129.69 minutes. Using 73.4 of RAM, at current time Wed Nov 13 12:06:14 2024
