# Ottawa dataset processing

1. Extract the accelerometer channel that contains the anomalies
2. Resample from 42 kHz to 20 kHz
3. Add labels
4. Combine healthy and faulty data
5. Extract the same features as for the NASA data
6. Save the features to Ottawa_features.csv


In [1]:
import pandas as pd
from scipy.signal import butter, filtfilt, resample
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.stats import kurtosis, skew
import joblib

# Input recordings and output destination
healthy_file = "datat/H_H_8_1.csv"  # data without anomaly, 10 seconds of data, sampling rate of 42 kHz
faulty_file = "datat/F_B_8_1.csv"  # data with all anomaly, 10 seconds of data, sampling rate of 42 kHz
output_features = "Ottawa_features.csv"

# Load both recordings into DataFrames
healthy_data, faulty_data = (
    pd.read_csv(csv_path) for csv_path in (healthy_file, faulty_file)
)

# Keep only Accelerometer 1, the channel that contains the anomalies
healthy_channel, faulty_channel = (
    frame["Accelerometer 1 (m/s^2)"] for frame in (healthy_data, faulty_data)
)

In [2]:
# Define a low-pass filter
def low_pass_filter(signal, cutoff, fs, order=4):
    """Apply a zero-phase digital Butterworth low-pass filter to a signal.

    Parameters
    ----------
    signal : array-like
        Samples to filter.
    cutoff : float
        Cutoff frequency, in the same units as ``fs``.
    fs : float
        Sampling frequency of ``signal``.
    order : int, optional
        Order of the Butterworth design passed to ``butter`` (default 4).

    Returns
    -------
    numpy.ndarray
        The filtered signal, as returned by ``filtfilt`` (forward-backward
        filtering).
    """
    # butter() takes the cutoff as a fraction of the Nyquist frequency (fs / 2).
    wn = cutoff / (0.5 * fs)
    coefficients = butter(order, wn, btype='low', analog=False)
    return filtfilt(*coefficients, signal)

In [3]:
# Resampling and segmentation settings
sampling_rate_original = 42_000  # 42 kHz
sampling_rate_target = 20_000    # 20 kHz
cutoff_frequency = 10_000        # 10 kHz
window_size = 512                # Window size for segmentation
stride = 256                     # Stride for sliding window

# Low-pass both channels at the cutoff frequency before downsampling
healthy_filtered, faulty_filtered = (
    low_pass_filter(channel, cutoff_frequency, sampling_rate_original)
    for channel in (healthy_channel, faulty_channel)
)


In [4]:
# Resample each filtered signal to the target rate; the output length is the
# input length divided by the rate ratio, truncated to an integer.
downsample_factor = sampling_rate_original / sampling_rate_target
healthy_downsampled, faulty_downsampled = (
    resample(filtered, int(len(filtered) / downsample_factor))
    for filtered in (healthy_filtered, faulty_filtered)
)

In [5]:
# Sanity check: compare resampled lengths against target rate * 10
print(
    f"Healthy Downsampled: {len(healthy_downsampled)}, Expected: {sampling_rate_target * 10}",
    f"Faulty Downsampled: {len(faulty_downsampled)}, Expected: {sampling_rate_target * 10}",
    sep="\n",
)


Healthy Downsampled: 200000, Expected: 200000
Faulty Downsampled: 200000, Expected: 200000


In [6]:
# Segment the downsampled data with stride
def segment_signal_with_stride(signal, window_size, stride):
    """Split a sequence into fixed-length windows taken every ``stride`` samples.

    Only complete windows are produced: a trailing remainder shorter than
    ``window_size`` is dropped, and a signal shorter than ``window_size``
    yields an empty list.

    Parameters
    ----------
    signal : sequence
        Any sliceable sequence supporting ``len()``.
    window_size : int
        Number of samples per window.
    stride : int
        Offset between the starts of consecutive windows.

    Returns
    -------
    list
        Slices of ``signal``, each of length ``window_size``.
    """
    last_start = len(signal) - window_size
    segments = []
    for start in range(0, last_start + 1, stride):
        segments.append(signal[start:start + window_size])
    return segments

# Window both downsampled signals and stack the windows into NumPy arrays
healthy_segments, faulty_segments = (
    np.array(segment_signal_with_stride(downsampled, window_size, stride))
    for downsampled in (healthy_downsampled, faulty_downsampled)
)

# Extract features from each window
def extract_features(windows):
    """Compute ten summary statistics for every window.

    Parameters
    ----------
    windows : numpy.ndarray
        2-D array with one window per row; all statistics reduce along axis 1.

    Returns
    -------
    numpy.ndarray
        Array with one row per window and ten columns, in this order:
        mean, standard deviation, maximum, minimum, kurtosis, skewness,
        RMS, amplitude range (peak-to-peak), peak factor (maximum / RMS),
        and number of sign changes between consecutive samples.
    """
    # Quantities that feed more than one feature are computed once.
    peak = np.max(windows, axis=1)
    rms = np.sqrt(np.mean(windows**2, axis=1))

    # True wherever the sign of consecutive samples differs.
    sign_changes = np.diff(np.sign(windows), axis=1) != 0

    feature_columns = (
        np.mean(windows, axis=1),
        np.std(windows, axis=1),
        peak,
        np.min(windows, axis=1),
        kurtosis(windows, axis=1),
        skew(windows, axis=1),
        rms,
        np.ptp(windows, axis=1),
        peak / (rms + 1e-10),  # small epsilon guards against division by zero
        np.sum(sign_changes, axis=1),
    )
    return np.column_stack(feature_columns)


In [7]:
# Build the feature matrix: healthy windows first, then faulty windows
X_ottawa = np.vstack([
    extract_features(segments)
    for segments in (healthy_segments, faulty_segments)
])

In [8]:
# Fit a fresh StandardScaler on the Ottawa features, then standardize them
scaler_ottawa = StandardScaler().fit(X_ottawa)
X_ottawa_scaled = scaler_ottawa.transform(X_ottawa)

# Persist the fitted scaler so the same scaling can be reapplied later
joblib.dump(scaler_ottawa, "scaler_ottawa.pkl")
print("New Ottawa Scaler saved as 'scaler_ottawa.pkl'.")

New Ottawa Scaler saved as 'scaler_ottawa.pkl'.


In [9]:
# Labels in the same order as the feature rows: 0.0 for every healthy window,
# followed by 1.0 for every faulty window
y_combined = np.repeat([0.0, 1.0], [len(healthy_segments), len(faulty_segments)])

# Assemble the standardized features with named columns, append the label
# column, and write everything to CSV without the index
features_df = pd.DataFrame(
    data=X_ottawa_scaled,
    columns=[
        'Mean', 'Std', 'Max', 'Min', 'Kurtosis',
        'Skewness', 'RMS', 'AmplitudeRange', 'PeakFactor', 'ZeroCrossings',
    ],
)
features_df['AnomalyLabel'] = y_combined
features_df.to_csv(output_features, index=False)
print(f"features_ottawa saved as: {output_features}")

features_ottawa saved as: Ottawa_features.csv
