In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [22]:

# Load the dataset into a pandas dataframe
df = pd.read_csv('data/s3Files/data12.txt',header=None,sep=',')
df.drop(columns=[0,4],inplace=True)
df.dropna(inplace=True)
df.columns = ['current_1', 'current_2', 'current_3']

In [24]:
# Create condition indicators based on MCSA and VI analysis techniques
df['BrokenRotorBars'] = abs(df['current_1']) - abs(df['current_2'])
df['BearingFault'] = abs(df['current_2']) - abs(df['current_3'])
df['Eccentricity'] = abs(df['current_3']) - abs(df['current_1'])


One common unsupervised learning technique for anomaly detection is clustering. We can use clustering algorithms to group similar data points together and identify any data points that are significantly different from the others as potential anomalies.
In this code, we load the current data into a Pandas dataframe, drop any NaN values, and then perform clustering using the KMeans algorithm with 10 clusters. We then use the distances between each data point and its cluster center to identify potential anomalies. Any data point whose distance to its cluster center is more than 3 standard deviations away from the mean distance is considered a potential anomaly.

The anomalies variable contains the data points that are identified as potential anomalies. These data points can then be further analyzed to determine if they are truly anomalous or if they are just noise in the data.

In [25]:
import numpy as np
# Generate random indices to modify
num_anomalies = 10
anomaly_indices = np.random.choice(df.index, num_anomalies, replace=False)

# Modify the values at the anomaly indices
max_deviation = 4
for index in anomaly_indices:
    row = df.loc[index]
    col = np.random.choice(df.columns)
    deviation = max_deviation * np.random.random()
    df.loc[index, col] = row[col] + deviation

In [28]:
df_scaled = (df - df.mean()) / df.std()

In [48]:

from sklearn.cluster import KMeans

# Perform clustering
kmeans = KMeans(n_clusters=10)
kmeans.fit(df_scaled)

# Get cluster labels and distances to cluster centers
labels = kmeans.labels_
distances = kmeans.transform(df_scaled)

# Identify potential anomalies
df_scaled[(distances > distances.mean() + 1.5 * distances.std()).any(axis=1)]


Unnamed: 0,current_1,current_2,current_3,BrokenRotorBars,BearingFault,Eccentricity
1634,-1.384284,-0.811185,1.362104,-0.326259,-1.318009,1.593228
2025,-1.37327,0.390772,1.387678,-1.002224,-0.783149,1.60599
2221,1.40765,0.875966,-1.410883,0.302765,1.381638,-1.63814
2612,1.402143,0.931101,-1.399923,0.268341,1.399382,-1.627931
2808,-1.378777,-0.904916,1.380371,-0.269928,-1.373776,1.603438
3199,-1.367763,0.308068,1.398639,-0.952153,-0.828777,1.611095
3592,-1.345736,-0.987619,1.394985,-0.204209,-1.421939,1.598333
3788,1.413157,-0.303937,-1.425497,0.975602,0.849313,-1.650902
3985,-1.37327,0.269473,1.405946,-0.933376,-0.851591,1.618753
4181,1.40765,-0.281883,-1.443764,0.959954,0.872127,-1.661112


Indices of anomalies dont match


In [47]:
anomaly_indices

array([8993, 2460, 3813, 9485, 7053, 3890, 1155, 6224, 8753, 3369],
      dtype=int64)

In [128]:
from sklearn.ensemble import IsolationForest

# Load the dataset into a pandas dataframe
df = pd.read_csv('data/s3Files/data12.txt', header=None, sep=',')
df.drop(columns=[0, 4], inplace=True)
df.dropna(inplace=True)
df.columns = ['current_1', 'current_2', 'current_3']

# Create condition indicators based on MCSA and VI analysis techniques
df['BrokenRotorBars'] = abs(df['current_1']) - abs(df['current_2'])
df['BearingFault'] = abs(df['current_2']) - abs(df['current_3'])
df['Eccentricity'] = abs(df['current_3']) - abs(df['current_1'])

# Generate random indices to modify
num_anomalies = 10
anomaly_indices = np.random.choice(df.index, num_anomalies, replace=False)

# Modify the values at the anomaly indices
max_deviation = 4
for index in anomaly_indices:
    row = df.loc[index]
    col = np.random.choice(df.columns)
    deviation = max_deviation * np.random.random()
    df.loc[index, col] = row[col] + deviation

# # Scale the data
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# df_scaled = scaler.fit_transform(df)

# Fit the isolation forest model
model = IsolationForest(n_estimators=200, contamination=0.001)
model.fit(df)

# Predict the anomalies
y_pred = model.predict(df)
indices = np.where(y_pred == -1)[0]





In [129]:
np.sort(anomaly_indices),np.sort(indices)

(array([1647, 2529, 3280, 3478, 3882, 6404, 6802, 7772, 9139, 9182],
       dtype=int64),
 array([2221, 2612, 4966, 5358, 6534, 6964, 7356, 7486, 7916, 8879],
       dtype=int64))

In [69]:
print(len(anomaly_indices))

10


 periodicity in the current readings can affect the model as it can create false positives for anomaly detection. To rectify this, we can use a technique called Fourier Transform to extract the frequency components of the data and remove the periodicity.

 In this code, we first apply Fourier Transform to the original data df to extract the frequency components. We then create a mask to remove the frequency components corresponding to the periodicity, and apply the mask to the frequency domain data. We then use the inverse Fourier Transform to obtain the filtered data df_filtered.

Finally, we create the condition indicators based on the filtered data and train the machine learning model on the filtered data.


In [162]:
import copy
# Load the dataset into a pandas dataframe
df = pd.read_csv('data/s3Files/data12.txt',header=None,sep=',')
df.drop(columns=[0,4],inplace=True)
df.dropna(inplace=True)
df.columns = ['current_1', 'current_2', 'current_3']

# Apply Fourier Transform to remove periodicity
from scipy.fft import fft, ifft


# Apply bandpass filter
df_fft = fft(df)
freqs = np.fft.fftfreq(len(df))


# Create mask to filter out frequencies in 0.1-0.9 Hz range
mask = np.where((freqs > 0.1) & (freqs < 0.9), 0, 1)
# mask = np.tile(mask, (1, 3)) # reshape mask to match df_fft shape

# df_fft_filtered = df_fft * mask.reshape(-1, 1)
# df_filtered = np.real(ifft(df_fft_filtered))

df_filtered = copy.deepcopy(df.loc[mask.astype(bool)])


# Generate random indices to modify
num_anomalies = 10
anomaly_indices = np.random.choice(df_filtered.index, num_anomalies, replace=False)
# Modify the values at the anomaly indices
max_deviation = 4
for index in anomaly_indices:
    row = df_filtered.loc[index]
    col = np.random.choice(df_filtered.columns)
    deviation = max_deviation * np.random.random()
    df_filtered.loc[index, col] = row[col] + deviation

# Create condition indicators based on MCSA and VI analysis techniques
df_filtered['BrokenRotorBars'] = abs(df_filtered['current_1']) - abs(df_filtered['current_2'])
df_filtered['BearingFault'] = abs(df_filtered['current_2']) - abs(df_filtered['current_3'])
df_filtered['Eccentricity'] = abs(df_filtered['current_3']) - abs(df_filtered['current_1'])
model = IsolationForest(n_estimators=100, contamination=0.002)
model.fit(df_filtered.to_numpy())

# Predict the anomalies
y_pred = model.predict(df_filtered.to_numpy())
indices = np.where(y_pred == -1)[0]
indices,anomaly_indices

(array([1359, 2535, 2731, 3161, 3200, 3487, 3721, 4543, 4763, 4880, 5019,
        5588], dtype=int64),
 array([9468,  218,  490, 7216, 9140, 9045, 8862, 8780, 5724,  266],
       dtype=int64))

In [165]:

sum(y_pred)

5977

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Load the dataset into a pandas dataframe
df = pd.read_csv('data/s3Files/data12.txt',header=None,sep=',')
df.drop(columns=[0,4],inplace=True)
df.dropna(inplace=True)
df.columns = ['current_1', 'current_2', 'current_3']

# Create condition indicators based on MCSA and VI analysis techniques
df['BrokenRotorBars'] = abs(df['current_1']) - abs(df['current_2'])
df['BearingFault'] = abs(df['current_2']) - abs(df['current_3'])
df['Eccentricity'] = abs(df['current_3']) - abs(df['current_1'])





In [12]:
df.columns

Index(['current_1', 'current_2', 'current_3', 'BrokenRotorBars',
       'BearingFault', 'Eccentricity'],
      dtype='object')

In [19]:

# Apply MinMax scaling to normalize the data
scaler = MinMaxScaler()
df_norm = scaler.fit_transform(df)

# Split the dataset into training and testing sets
train_size = int(len(df_norm) * 0.8)
test_size = len(df_norm) - train_size
train, test = df_norm[0:train_size,:], df_norm[train_size:len(df_norm),:]

In [21]:
test = pd.DataFrame(scaler.inverse_transform(test),columns=df.columns)

In [22]:
# Generate random indices to modify
num_anomalies = 10
anomaly_indices = np.random.choice(test.index, num_anomalies, replace=False)
# Modify the values at the anomaly indices
max_deviation = 4
for index in anomaly_indices:
    row = test.loc[index]
    col = np.random.choice(test.columns)
    deviation = max_deviation * np.random.random()
    test.loc[index, col] = row[col] + deviation

In [23]:
test = scaler.fit_transform(test)

In [24]:
# Function to reshape the data into timesteps for LSTM input
def create_dataset(X, look_back=1):
    X_data, Y_data = [], []
    for i in range(len(X)-look_back-1):
        a = X[i:(i+look_back), :]
        X_data.append(a)
        Y_data.append(X[i + look_back, :])
    return np.array(X_data), np.array(Y_data)

# Define the number of timesteps and features for LSTM input
timesteps = 50
features = 6

# Reshape the data into timesteps for LSTM input
X_train, y_train = create_dataset(train, timesteps)
X_test, y_test = create_dataset(test, timesteps)


In [25]:
# Define the LSTM model architecture
model = Sequential()
model.add(LSTM(50, input_shape=(timesteps, features)))
model.add(Dense(features))
model.compile(loss='mse', optimizer='adam')

In [26]:
# Train the LSTM model
model.fit(X_train, y_train, epochs=50, batch_size=128, verbose=1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2050c7f69d0>

In [27]:
# Use the trained LSTM model to predict the test data
y_pred = model.predict(X_test)



In [36]:
# Calculate the reconstruction error for each sample in the test data
recon_errors = np.linalg.norm(y_test - y_pred, axis=1)

# Define a threshold for anomaly detection
threshold = np.mean(recon_errors) + np.std(recon_errors) * 4

# Detect the anomalies
anomalies = np.where(recon_errors > threshold)[0]

# Print the indices of the anomalous samples
print(anomalies)

[  23   62  140  219  257  297  336  414  453  594  673  712  752  790
  829  830  890  929 1007 1086 1164 1242 1321 1341 1537 1577 1597 1676
 1716 1754 1833 1872]


In [33]:
np.sort(anomaly_indices)

array([ 313,  559,  627,  671,  775,  927, 1150, 1544, 1681, 1811],
      dtype=int64)

In [31]:
len(anomalies)

32

In [34]:
df.columns

Index(['current_1', 'current_2', 'current_3', 'BrokenRotorBars',
       'BearingFault', 'Eccentricity'],
      dtype='object')

In [None]:
model.predict()

In [168]:
import gc
gc.collect()

1438

In [171]:
from numba import cuda

cuda.select_device(0)
cuda.close()

In [21]:



# # Create features from the data that can be used to train a machine learning model
# features = ['current_1', 'current_2', 'current_3', 'BrokenRotorBars', 'BearingFault', 'Eccentricity']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(df[features], df['label'], test_size=0.2)

# # Train a machine learning model to predict future anomalies
# clf = RandomForestClassifier()
# clf.fit(X_train, y_train)

# # Predict on the testing set and evaluate the model's performance
# y_pred = clf.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print('Accuracy:', accuracy)


KeyError: 'label'