In [3]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.signal
import scipy.interpolate
import scipy.stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import os
import warnings

# Suppress warnings for a cleaner notebook
warnings.filterwarnings('ignore')

print("Libraries imported successfully.")

Libraries imported successfully.


In [4]:
# --- Signal Processing Constants ---
GEAR_RATIO = 5.095238095
SAMPLE_RATE_HZ = 93750.0
NYQUIST_HZ = SAMPLE_RATE_HZ / 2.0

# Bearing fault orders (cycles / revolution)
# [cage, ball, inner race, outer race]
FAULT_ORDERS_BPF = [0.43, 7.05, 10.78, 8.22]
FAULT_NAMES = ['Cage', 'Ball', 'InnerRace', 'OuterRace']

# --- Improved Design Constants ---

# 1. Multi-Band Analysis (instead of one fixed band)
# We will analyze these four bands to find the most sensitive one
BANDS_HZ = [
    (10000.0, 20000.0),
    (20000.0, 30000.0),
    (30000.0, 40000.0),
    (40000.0, 46000.0)  # Up to just below Nyquist
]

# 2. Rich Feature Extraction
HARMONICS_TO_EXTRACT = [1, 2, 3] # Extract 1x, 2x, and 3x harmonics
SEARCH_WIDTH_ORDER = 0.1 # Width to search around a peak

In [10]:
# --- File List ---
# Generate all 52 filenames
FILE_LIST = [f"file_{i:02d}.csv" for i in range(1, 54)]
print(f"File list generated, from {FILE_LIST[0]} to {FILE_LIST[-1]}")

File list generated, from file_01.csv to file_53.csv


In [11]:
def get_crest_factor(x):
    """Calculates the Crest Factor of a signal."""
    return np.max(np.abs(x)) / np.sqrt(np.mean(np.square(x)))

In [12]:
def extract_peak_amplitude(orders, spectrum_amplitude, target_order, search_width):
    """
    Finds the peak amplitude in the spectrum within a small window
    around the target_order.
    """
    order_min = target_order - search_width / 2.0
    order_max = target_order + search_width / 2.0

    band_indices = np.where((orders >= order_min) & (orders <= order_max))[0]

    if len(band_indices) > 0:
        return np.max(spectrum_amplitude[band_indices])
    else:
        return 0.0

In [13]:
def process_file_for_features(filename):
    """
    Loads a single file and performs the full end-to-end
    feature extraction pipeline.
    """
    # --- 1. Load Data ---
    df = pd.read_csv(filename)
    v_signal = df['v'].values
    zct_times = df['zct'].dropna().values

    if len(zct_times) < 2:
        raise ValueError("Insufficient tach data")

    # --- 2. Angular Resampling (as before) ---
    t_v_axis = np.arange(len(v_signal)) / SAMPLE_RATE_HZ

    # Create Phase-Time Map
    phi_at_zct = np.arange(len(zct_times)) * 2 * np.pi
    phase_interpolator = scipy.interpolate.interp1d(
        zct_times, phi_at_zct, kind='linear', fill_value="extrapolate"
    )
    phi_v_unwrapped = phase_interpolator(t_v_axis)

    # Create evenly spaced angular axis
    phi_regular = np.linspace(
        phi_v_unwrapped.min(),
        phi_v_unwrapped.max(),
        len(phi_v_unwrapped)
    )

    # Resample vibration signal v(t) -> v(phi)
    v_resampled = np.interp(phi_regular, phi_v_unwrapped, v_signal)

    # Calculate angular sampling rate (samples per revolution)
    total_revolutions = (phi_regular[-1] - phi_regular[0]) / (2 * np.pi)
    num_samples = len(phi_regular)
    samples_per_rev_avg = num_samples / total_revolutions
    nyquist_order = samples_per_rev_avg / 2.0

    # Get mean turbine speed (for converting Hz to Order)
    tach_periods = np.diff(zct_times)
    f_tach_inst = 1.0 / tach_periods
    mean_speed_hz = np.mean(f_tach_inst * GEAR_RATIO)

    # --- 3. Multi-Band Feature Extraction ---
    file_features = {} # Dictionary to store all features for this file

    for (band_low_hz, band_high_hz) in BANDS_HZ:

        band_name = f"band_{int(band_low_hz/1000)}k" # e.g., "band_10k"

        # Convert Hz band to Order band
        band_low_order = band_low_hz / mean_speed_hz
        band_high_order = band_high_hz / mean_speed_hz

        # Clamp to Nyquist
        if band_high_order >= nyquist_order:
            band_high_order = nyquist_order * 0.99

        # --- 3a. Filter & Envelope ---
        try:
            b, a = scipy.signal.butter(
                4, [band_low_order, band_high_order],
                btype='band', fs=samples_per_rev_avg
            )
            v_resampled_filtered = scipy.signal.lfilter(b, a, v_resampled)

            # Get envelope
            analytic_signal = scipy.signal.hilbert(v_resampled_filtered)
            envelope = np.abs(analytic_signal)

        except ValueError as e:
            # Filter creation might fail if bands are bad
            print(f"  Warning: Filter failed for {filename} {band_name}: {e}")
            envelope = np.zeros_like(v_resampled) # Use zero envelope


        # --- 3b. Envelope Time-Domain Stats ---
        file_features[f"kurtosis_{band_name}"] = scipy.stats.kurtosis(envelope, fisher=False)
        file_features[f"crest_factor_{band_name}"] = get_crest_factor(envelope)

        # --- 3c. Envelope Order Spectrum & Features ---
        envelope_demeaned = envelope - np.mean(envelope)

        # Get Amplitude Spectrum
        spectrum_amplitude = np.abs(np.fft.fft(envelope_demeaned))

        # Create Order axis
        order_axis = np.fft.fftfreq(num_samples) * samples_per_rev_avg

        # Keep only positive half
        positive_mask = order_axis >= 0
        orders = order_axis[positive_mask]
        spectrum_amplitude = spectrum_amplitude[positive_mask]

        # Extract peaks for all faults and harmonics
        for fault_name, fault_order in zip(FAULT_NAMES, FAULT_ORDERS_BPF):
            for h in HARMONICS_TO_EXTRACT:
                target_order = fault_order * h
                feature_name = f"{fault_name}_{h}x_{band_name}"

                peak_amp = extract_peak_amplitude(
                    orders, spectrum_amplitude, target_order, SEARCH_WIDTH_ORDER
                )
                file_features[feature_name] = peak_amp

    return file_features

In [14]:
# This cell will take a few minutes to run
all_features_list = []

print(f"Starting feature extraction for {len(FILE_LIST)} files...")

for filename in FILE_LIST:
    try:
        # Check if file exists. This assumes the files are in the
        # same directory as the notebook.
        if not os.path.exists(filename):
            print(f"  Skipping: {filename} not found.")
            continue

        print(f"Processing {filename}...")

        # Extract the feature vector
        features = process_file_for_features(filename)
        features['filename'] = filename
        all_features_list.append(features)

    except Exception as e:
        print(f"  *** ERROR processing {filename}: {e} ***")

print(f"\nSuccessfully processed {len(all_features_list)} files.")

# --- Create the main Feature DataFrame ---
feature_df = pd.DataFrame(all_features_list)
feature_df = feature_df.set_index('filename')

# Fill any NaNs (from failed filters, etc.) with 0
# A 0-amplitude peak is a "healthy" sign
feature_df = feature_df.fillna(0)

print("Feature Matrix created:")
feature_df.head()

Starting feature extraction for 53 files...
Processing file_01.csv...
Processing file_02.csv...
Processing file_03.csv...
Processing file_04.csv...
Processing file_05.csv...
Processing file_06.csv...
Processing file_07.csv...
Processing file_08.csv...
Processing file_09.csv...
Processing file_10.csv...
Processing file_11.csv...
Processing file_12.csv...
Processing file_13.csv...
Processing file_14.csv...
Processing file_15.csv...
Processing file_16.csv...
Processing file_17.csv...
Processing file_18.csv...
Processing file_19.csv...
Processing file_20.csv...
Processing file_21.csv...
Processing file_22.csv...
Processing file_23.csv...
Processing file_24.csv...
Processing file_25.csv...
Processing file_26.csv...
Processing file_27.csv...
Processing file_28.csv...
Processing file_29.csv...
Processing file_30.csv...
Processing file_31.csv...
Processing file_32.csv...
Processing file_33.csv...
Processing file_34.csv...
Processing file_35.csv...
Processing file_36.csv...
Processing file_37.c

Unnamed: 0_level_0,kurtosis_band_10k,crest_factor_band_10k,Cage_1x_band_10k,Cage_2x_band_10k,Cage_3x_band_10k,Ball_1x_band_10k,Ball_2x_band_10k,Ball_3x_band_10k,InnerRace_1x_band_10k,InnerRace_2x_band_10k,...,Cage_3x_band_40k,Ball_1x_band_40k,Ball_2x_band_40k,Ball_3x_band_40k,InnerRace_1x_band_40k,InnerRace_2x_band_40k,InnerRace_3x_band_40k,OuterRace_1x_band_40k,OuterRace_2x_band_40k,OuterRace_3x_band_40k
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
file_01.csv,3.184177,3.070182,12898.783298,15524.629285,14174.548928,10840.218951,6190.969926,4248.808599,20513.326535,4663.865049,...,10505.477195,5429.668471,1553.488071,358.496731,2719.138891,516.879727,159.53423,5201.266545,834.015001,211.978276
file_02.csv,3.172063,3.272234,11163.958733,10653.376547,14830.360406,17131.44182,8841.569051,10557.116702,11941.175792,4182.367252,...,9989.935855,4663.515708,1157.824782,466.236838,4017.941953,402.858302,157.841628,5165.24952,1037.799281,404.964536
file_03.csv,3.10951,3.208691,26417.979762,11709.052853,11082.027954,12815.856293,6598.650119,4372.753925,11500.308086,3096.728871,...,7960.554194,4141.439084,1015.417329,439.013949,2049.374569,444.19085,137.085461,3071.84732,751.771283,215.810316
file_04.csv,3.25428,3.249656,25610.44555,16253.508983,14084.605208,11326.750872,9914.278732,5634.739982,18063.908012,4406.128729,...,9333.872768,6155.258487,1413.2915,460.044324,4078.586666,471.346199,250.231546,6836.410004,985.89601,312.056355
file_05.csv,3.095714,3.01044,20979.137856,11383.097576,14553.89119,11284.305444,8221.521913,4924.598092,10885.878262,4799.042614,...,6957.341173,5411.694726,1089.299331,503.698732,3246.246928,363.440741,110.092127,7786.550931,773.655667,347.716267


In [15]:
print("Starting PCA for dimensionality reduction...")

# Separate features from index
feature_names = feature_df.columns
X = feature_df.values

# --- 1. Standardize features ---
# This is CRITICAL for PCA.
# It ensures all features (kurtosis, amplitudes)
# are weighted equally.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 2. Apply PCA ---
# We'll compute 5 components to inspect them,
# but we only care about PC1 for ranking.
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)

# --- 3. Analyze Explained Variance ---
print(f"Explained variance by component: {pca.explained_variance_ratio_}")
print(f"Total variance explained by 5 components: {np.sum(pca.explained_variance_ratio_):.2f}")

# We hope PC1 is very large (e.g., > 0.4 or 40%)
# as this implies a single, strong "axis" of variance,
# which we hypothesize is the degradation timeline.

# --- 4. Add PC1 Score to our DataFrame ---
feature_df['PC1_Score'] = X_pca[:, 0]

print("\nPCA calculation complete.")
feature_df[['PC1_Score']].head()

Starting PCA for dimensionality reduction...
Explained variance by component: [0.15274986 0.11919256 0.05748846 0.05257904 0.04463751]
Total variance explained by 5 components: 0.43

PCA calculation complete.


Unnamed: 0_level_0,PC1_Score
filename,Unnamed: 1_level_1
file_01.csv,0.88907
file_02.csv,3.481844
file_03.csv,-2.123504
file_04.csv,5.403396
file_05.csv,1.439912


In [16]:
#
# IMPORTANT: INTERPRETATION
#
# PCA is "blind." It finds the axis of max variance, but it doesn't
# know if a high score is "good" or "bad".
# The sign of the PC1 vector is arbitrary.
#
# We must check what PC1 correlates with. We *expect* degradation
# (high kurtosis, high fault amplitudes) to be our main axis.

print("Interpreting PC1...")

# Let's check PC1's correlation with the 20-30k Kurtosis
# (a strong degradation indicator)
try:
    kurtosis_corr = feature_df['PC1_Score'].corr(feature_df['kurtosis_band_20k'])
    print(f"Correlation between PC1 and Kurtosis (20k-30k band): {kurtosis_corr:.2f}")

    # If correlation is negative, it means a *low* PC1 score = *high* kurtosis
    # (i.e., high degradation).
    # If correlation is positive, a *high* PC1 score = *high* degradation.

    if kurtosis_corr < 0:
        # A low PC1 score means high degradation, so we sort in *ascending* order
        # to get the rank (1 = lowest PC1 = most degraded)
        # But we want rank 1 to be the *least* degraded, so we still rank
        # ascending=True
        print("PC1 is NEGATIVELY correlated with kurtosis.")
        print("A *lower* PC1 score means *more* degraded.")
        sort_ascending = True # Ranks [low...high] -> [1...52]
    else:
        # A high PC1 score means high degradation
        print("PC1 is POSITIVELY correlated with kurtosis.")
        print("A *higher* PC1 score means *more* degraded.")
        sort_ascending = True # Ranks [low...high] -> [1...52]

    # In both cases, .rank(ascending=True) will give rank 1 to the
    # file with the lowest PC1 score (which we assume is "healthiest").
    # If the correlation was negative, this is what we want.
    # If the correlation was positive, this is also what we want.

    # ***
    # Sanity Check: If your results look inverted, flip `sort_ascending` to False.
    # This is the single most important "human-in-the-loop" step.
    # For now, we assume Rank 1 = Lowest PC1 Score = Healthiest.
    # ***

    feature_df['prediction'] = feature_df['PC1_Score'].rank(ascending=sort_ascending).astype(int)

except Exception as e:
    print(f"Could not calculate correlation, using default ranking: {e}")
    # Default: assume Rank 1 = Lowest PC1 Score = Healthiest
    feature_df['prediction'] = feature_df['PC1_Score'].rank(ascending=True).astype(int)


# --- 5. Create Final Submission File ---
submission_df = feature_df.reset_index()[['filename', 'prediction']]
submission_df = submission_df.sort_values('prediction')

submission_df.to_csv('pca_ranked_submission.csv', index=False)

print("\nFinal ranking created:")
print(submission_df.head())

print("\n---")
print(submission_df.tail())
print("\n---")
print("Submission file 'pca_ranked_submission.csv' saved!")

Interpreting PC1...
Correlation between PC1 and Kurtosis (20k-30k band): -0.17
PC1 is NEGATIVELY correlated with kurtosis.
A *lower* PC1 score means *more* degraded.

Final ranking created:
       filename  prediction
5   file_06.csv           1
15  file_16.csv           2
21  file_22.csv           3
33  file_34.csv           4
46  file_47.csv           5

---
       filename  prediction
50  file_51.csv          49
41  file_42.csv          50
3   file_04.csv          51
22  file_23.csv          52
48  file_49.csv          53

---
Submission file 'pca_ranked_submission.csv' saved!
