In [None]:
import os
import pandas as pd
import numpy as np
from scipy.fft import fft
from sklearn.preprocessing import StandardScaler


# Define the chunk size and the synthetic-anomaly settings
chunk_size = 10000  # process 10000 rows at a time
num_anomalies = 100  # upper bound on synthetic anomalies injected per chunk
max_deviation = 24  # largest offset added to a value when injecting an anomaly

# Read the data in chunks
datafolder = 'data/s3Files/'
# os.listdir returns entries in arbitrary order; sort so runs visit files consistently.
data_files = sorted(os.listdir(datafolder))

# Iterate over the files and process the data in chunks
for file in data_files:
    # Load the data chunk by chunk
    for chunk in pd.read_csv(os.path.join(datafolder, file), header=None, chunksize=chunk_size):
        # Capture the row-label range now: fit_transform() below returns a plain
        # ndarray by default, which has no .index to report from.
        first_row, last_row = chunk.index[0], chunk.index[-1]

        # The final chunk of a file can hold fewer rows than num_anomalies, and
        # sampling without replacement from a smaller population raises ValueError.
        n_inject = min(num_anomalies, len(chunk))
        anomaly_indices = np.random.choice(chunk.index, n_inject, replace=False)

        # Add a random offset to one randomly chosen column at each selected row
        for index in anomaly_indices:
            col = np.random.choice(chunk.columns)
            deviation = max_deviation * np.random.random()
            chunk.loc[index, col] += deviation

        scaler = StandardScaler()
        scaled = scaler.fit_transform(chunk)

        # Process the chunk data
        # NOTE(review): axis=1 transforms across the columns of each row, so the
        # peak index can never exceed (number of columns - 1) -- confirm whether a
        # transform along the rows (axis=0) was intended.
        fft_data = fft(scaled, axis=1)
        # Rank bins by magnitude; argmax on the raw complex values would order
        # them by real part first instead.
        peak_freqs = np.argmax(np.abs(fft_data), axis=1)
        mean_currents = np.mean(scaled, axis=1)  # not used below; kept for inspection
        mean_freq = np.mean(peak_freqs)
        std_freq = np.std(peak_freqs)
        thresh_freq = mean_freq + 3 * std_freq
        anomalies_freq = np.where(peak_freqs > thresh_freq)[0]

        # Check for anomalies and alert if detected
        if len(anomalies_freq) > 0:
            print(f"Anomaly detected in {file}: {len(anomalies_freq)} anomalies detected in chunk {first_row} - {last_row}")


In [2]:
import os
import pandas as pd
import numpy as np
from scipy.fft import fft
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the header-less CSV and discard columns 0 and 4 in one expression,
# then give the three remaining columns readable names.
df = pd.read_csv('data/s3Files/data10.txt', header=None, sep=',').drop(columns=[0, 4])
df.columns = ['current_1', 'current_2', 'current_3']

In [4]:
# Choose distinct row labels to corrupt, then add a large fixed offset to one
# randomly selected column in each of those rows.
num_anomalies = 10
anomaly_indices = np.random.choice(df.index, size=num_anomalies, replace=False)
for index in anomaly_indices:
    col = np.random.choice(df.columns)
    df.loc[index, col] += 50000

In [5]:
def get_ano(chunk):
    """Standardise ``chunk`` and report rows whose dominant FFT bin is unusually high.

    Every column is scaled with ``StandardScaler``, an FFT is taken along axis 1
    (across the columns of each row), and the index of the largest-magnitude bin
    is recorded per row. Rows whose index lies more than three standard
    deviations above the mean index are counted, and the count is printed when
    it is non-zero.

    Args:
        chunk: 2-D numeric table accepted by ``StandardScaler.fit_transform``
            (the notebook passes a DataFrame).

    Returns:
        None. The only effect is the printed message.
    """
    scaler = StandardScaler()
    scaled = scaler.fit_transform(chunk)

    # NOTE(review): axis=1 transforms across the columns of each row, so the
    # peak index can never exceed (number of columns - 1) -- confirm whether a
    # transform along the rows (axis=0) was intended.
    fft_data = fft(scaled, axis=1)
    # Rank bins by magnitude; argmax on the raw complex values would order them
    # by real part first instead.
    peak_freqs = np.argmax(np.abs(fft_data), axis=1)
    mean_freq = np.mean(peak_freqs)
    std_freq = np.std(peak_freqs)
    thresh_freq = mean_freq + 3 * std_freq
    anomalies_freq = np.where(peak_freqs > thresh_freq)[0]

    # Check for anomalies and alert if detected
    if len(anomalies_freq) > 0:
        print(f"Anomaly detected: {len(anomalies_freq)}")

In [6]:
# Run the frequency-based check on the current df; it prints only if rows are flagged.
get_ano(df)

In [38]:
# Read the header-less CSV and discard columns 0 and 4 in one expression,
# then give the three remaining columns readable names.
df = pd.read_csv('data/s3Files/data12.txt', header=None, sep=',').drop(columns=[0, 4])
df.columns = ['current_1', 'current_2', 'current_3']

In [39]:
# Per-column maxima, to see how large the readings get.
df.max()

current_1    2255
current_2    2453
current_3     846
dtype: int64

In [29]:
# Sample distinct row labels without replacement and, for each one, add a large
# fixed offset to a single randomly chosen column.
num_anomalies = 10
anomaly_indices = np.random.choice(df.index, size=num_anomalies, replace=False)
for index in anomaly_indices:
    col = np.random.choice(df.columns)
    df.loc[index, col] += 50000

In [48]:
# Display the frame as it currently stands.
df

Unnamed: 0,current_1,current_2,current_3
0,1961,2398,122
1,1955,2398,131
2,1948,2399,140
3,1942,2398,150
4,1938,2398,156
...,...,...,...
9995,2218,1902,493
9996,2215,1925,464
9997,2233,1927,481
9998,2220,1933,476


In [40]:
# Column-wise z-score computed with pandas instead of StandardScaler, so the
# result remains a DataFrame.
chunk = df.sub(df.mean()).div(df.std())

In [41]:
# NOTE(review): axis=1 transforms across the columns of each row, so the peak
# index can never exceed (number of columns - 1) -- confirm whether a transform
# along the rows (axis=0) was intended.
fft_data = fft(chunk.to_numpy(), axis=1)
# Rank bins by magnitude; argmax on the raw complex values would order them by
# real part first instead.
peak_freqs = np.argmax(np.abs(fft_data), axis=1)
mean_currents = np.mean(chunk, axis=1)  # not used in this cell
mean_freq = np.mean(peak_freqs)
std_freq = np.std(peak_freqs)
# Experimental threshold: the 3-sigma term is divided by the mean per-row
# standard deviation of the standardised values.
thresh_freq = mean_freq + 3 * std_freq / np.mean(chunk.std(axis=1))
anomalies_freq = np.where(peak_freqs > thresh_freq)[0]
anomalies_freq

array([], dtype=int64)

In [None]:
# Raw values for row labels 10 through 100 (.loc label slices include both ends).
df.loc[10:100].values

In [45]:
# Peak-bin index for positions 10..99 (positional slice, end-exclusive).
peak_freqs[10:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1], dtype=int64)

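Observation: `peak_freqs` can only take the values 0, 1 or 2 (the FFT is taken with `axis=1` over the three current columns, so each row has just three bins), while `thresh_freq` always comes out greater than 2 — so no row can ever exceed the threshold.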

array([], dtype=int64)

In [40]:
# Positions whose peak-bin index exceeds the threshold.
np.where(peak_freqs > thresh_freq)[0]

array([], dtype=int64)