Import

In [None]:
import os
import scipy.io as sio
from scipy.signal import decimate
import numpy as np
import pandas as pd

## Load Data
The KAIST Dataset (Jung2022) has 4 sensed variables: acoustic, current, temperature, and vibration. In this case, only vibration will be used.
- **Machine Conditions (Normal, BPFI, BPFO, Misalign, and Unbalance):** Fault types and normal operating condition
- **Motor load (0Nm, 2Nm, 4Nm):** Torque applied to the motor simulating load
- **Fault Severity:** Depending on the type of fault there are at least 3 severities. For example BPFI has a fault severity represented in the crack width (0.3mm, 1mm, 3mm)


In [26]:
# ------------------------------ KAIST Dataset ------------------------------
# Paths and parameters
vibration_mat_folder_kaist = '../../data/raw_kaist/vibration_mat_25.6kHz'
v_length_kaist = 1536000  # length of the smaller vector in the dataset
sampling_rate_kaist = 25600  # fixed at 25.6 kHz

# Class labels for different domains
class_labels_kaist0 = ['0Nm_Normal', '0Nm_BPFI_03', '0Nm_BPFO_03', '0Nm_Misalign_01', '0Nm_Unbalance_0583mg']
class_labels_kaist1 = ['2Nm_Normal', '2Nm_BPFI_03', '2Nm_BPFO_03', '2Nm_Misalign_01', '2Nm_Unbalance_0583mg']
class_labels_kaist2 = ['4Nm_Normal', '4Nm_BPFI_03', '4Nm_BPFO_03', '4Nm_Misalign_01', '4Nm_Unbalance_0583mg']

def load_domain_data_kaist(class_labels, domain_name):
    """Load vibration data for a domain, using CSV cache if available."""
    vibration_data = {}

    folder_path = f'../../data/flex-data/vibration{sampling_rate_kaist/1000}kHz_kaist_domain{domain_name}'
    os.makedirs(folder_path, exist_ok=True)

    print(f"Loading domain {domain_name}...")

    for label in class_labels:
        csv_path = f'{folder_path}/{label}.csv'
        mat_path = os.path.join(vibration_mat_folder_kaist, f"{label}.mat")

        # Try CSV first, fall back to .mat
        if os.path.exists(csv_path):
            print(f"  Loading {label} from CSV...")
            vibration_data[label] = pd.read_csv(csv_path).values
        elif os.path.exists(mat_path):
            print(f"  Loading {label} from .mat file...")
            mat_data = sio.loadmat(mat_path)
            vibration = mat_data['Signal']['y_values'][0][0][0][0][0][:v_length_kaist, :]
            vibration_data[label] = vibration

            # Save to CSV for next time
            print(f"  Saving {label} to CSV...")
            pd.DataFrame(vibration_data[label]).to_csv(csv_path, index=False)
        else:
            print(f"  Warning: {label} not found")

    print(f"Domain {domain_name} loaded: {len(vibration_data)} classes")
    return vibration_data

# Load all domains
vibration_data_kaist0 = load_domain_data_kaist(class_labels_kaist0, '0Nm')
vibration_data_kaist1 = load_domain_data_kaist(class_labels_kaist1, '2Nm')
vibration_data_kaist2 = load_domain_data_kaist(class_labels_kaist2, '4Nm')

vibration_data_kaist = {**vibration_data_kaist0, **vibration_data_kaist1, **vibration_data_kaist2}
class_labels_kaist = list(vibration_data_kaist.keys())


Loading domain 0...
  Loading 0Nm_Normal from .mat file...
  Saving 0Nm_Normal to CSV...
  Loading 0Nm_BPFI_03 from .mat file...
  Saving 0Nm_BPFI_03 to CSV...
  Loading 0Nm_BPFO_03 from .mat file...
  Saving 0Nm_BPFO_03 to CSV...
  Loading 0Nm_Misalign_01 from .mat file...
  Saving 0Nm_Misalign_01 to CSV...
  Loading 0Nm_Unbalance_0583mg from .mat file...
  Saving 0Nm_Unbalance_0583mg to CSV...
Domain 0 loaded: 5 classes
Loading domain 1...
  Loading 2Nm_Normal from .mat file...
  Saving 2Nm_Normal to CSV...
  Loading 2Nm_BPFI_03 from .mat file...
  Saving 2Nm_BPFI_03 to CSV...
  Loading 2Nm_BPFO_03 from .mat file...
  Saving 2Nm_BPFO_03 to CSV...
  Loading 2Nm_Misalign_01 from .mat file...
  Saving 2Nm_Misalign_01 to CSV...
  Loading 2Nm_Unbalance_0583mg from .mat file...
  Saving 2Nm_Unbalance_0583mg to CSV...
Domain 1 loaded: 5 classes
Loading domain 2...
  Loading 4Nm_Normal from .mat file...
  Saving 4Nm_Normal to CSV...
  Loading 4Nm_BPFI_03 from .mat file...
  Saving 4Nm_BPFI_0

In [None]:
# ------------------------------ CWRU Dataset ------------------------------
# Paths and parameters
vibration_mat_folder_cwru = '../../data/raw_cwru/vibration_mat'
v_length_cwru = 1536000 # Lenght of the smaller vector in the dataset
decimation_factor_cwru = 1
sampling_rate_cwru = 12000

# Class labels for different domains
class_labels_cwru0 = ['0Nm_Normal', '0Nm_BPFI_03', '0Nm_BPFO_03', '0Nm_Misalign_01', '0Nm_Unbalance_0583mg']
class_labels_cwru1 = ['2Nm_Normal', '2Nm_BPFI_03', '2Nm_BPFO_03', '2Nm_Misalign_01', '2Nm_Unbalance_0583mg']
class_labels_cwru2 = ['4Nm_Normal', '4Nm_BPFI_03', '4Nm_BPFO_03', '4Nm_Misalign_01', '4Nm_Unbalance_0583mg']

def load_domain_data_cwru(class_labels, domain_name):
    """Load vibration data for a domain, using CSV cache if available."""
    vibration_data = {}
    folder_path = f'../../data/flex-data/vibration{int(25600/decimation_factor_cwru)/1000}kHz_cwru_domain{domain_name}'
    os.makedirs(folder_path, exist_ok=True)
    
    print(f"Loading domain {domain_name}...")
    
    for label in class_labels:
        csv_path = f'{folder_path}/{label}.csv'
        mat_path = os.path.join(vibration_mat_folder, f"{label}.mat")
        
        # Try CSV first, fall back to .mat
        if os.path.exists(csv_path):
            print(f"  Loading {label} from CSV...")
            vibration_data[label] = pd.read_csv(csv_path).values
        elif os.path.exists(mat_path):
            print(f"  Loading {label} from .mat file...")
            mat_data = sio.loadmat(mat_path)
            vibration = mat_data['Signal']['y_values'][0][0][0][0][0][:v_length, :]
            vibration_data[label] = decimate(vibration[:v_length,:], decimation_factor_cwru, axis=0)
            
            # Save to CSV for next time
            print(f"  Saving {label} to CSV...")
            pd.DataFrame(vibration_data[label]).to_csv(csv_path, index=False)
        else:
            print(f"  Warning: {label} not found")
    
    print(f"Domain {domain_name} loaded: {len(vibration_data)} classes")
    return vibration_data

# Load all domains
vibration_data_cwru0 = load_domain_data_cwru(class_labels_cwru0, '0')
vibration_data_cwru1 = load_domain_data_cwru(class_labels_cwru1, '1')
vibration_data_cwru2 = load_domain_data_cwru(class_labels_cwru2, '2')

vibration_data_cwru = {**vibration_data_cwru0, **vibration_data_cwru1, **vibration_data_cwru2}
class_labels_cwru = list(vibration_data_cwru.keys())

In [3]:
# Python 3.x
# pip install scipy numpy pandas
import json
import re
import pathlib
import numpy as np
from fractions import Fraction
from scipy.io import loadmat
from scipy.signal import resample_poly

# ------- config -------
TARGET_FS = 25600  # Hz to match KAIST
OUT_DIR = pathlib.Path("../../data/out")

# Helpers to find time-domain arrays in CWRU .mat
CWRU_KEY_ORDER = [
    # common CWRU naming patterns
    r".*_DE_time$",   # drive end accel
    r".*_FE_time$",   # fan end accel
    r".*_BA_time$",   # base accel
    r".*_AE_time$",   # acoustic emission if present
    r".*DE$", r".*FE$", r".*BA$",
    r".*X\d?$", r".*Y\d?$",    # add patterns for KAIST accelerometers
]

def _flatten_1d(x):
    x = np.asarray(x)
    if x.ndim == 2 and 1 in x.shape:
        x = x.reshape(-1)
    return x.squeeze()

def _extract_channels(matdict):
    # collect all 1D numeric arrays
    candidates = {}
    for k, v in matdict.items():
        if k.startswith("__"):
            continue
        try:
            arr = _flatten_1d(v)
        except Exception:
            continue
        if np.issubdtype(arr.dtype, np.number) and arr.ndim == 1 and arr.size > 10:
            candidates[k] = arr

    # order channels by regex priority
    ordered = []
    used = set()
    for pat in CWRU_KEY_ORDER:
        rx = re.compile(pat, re.IGNORECASE)
        for k in list(candidates.keys()):
            if k in used:
                continue
            if rx.match(k):
                ordered.append(candidates[k])
                used.add(k)
    # add any leftover numeric vectors
    for k, arr in candidates.items():
        if k not in used:
            ordered.append(arr)

    return ordered[:4]  # up to four channels

def _infer_fs_from_mat(matdict, default_fs=12000):
    # try common field names
    for k in matdict.keys():
        if "fs" == k.lower() or "samplingrate" in k.lower():
            fs_val = np.asarray(matdict[k]).astype(float).squeeze()
            if fs_val.size >= 1:
                return float(fs_val.flat[0])
    # try to guess from CWRU naming like X097_DE_time -> 12 kHz or 48 kHz are common
    return float(default_fs)

def _stack_to_four(chans):
    """stack list of 1D arrays to shape [N,4], padding with zeros if fewer than 4."""
    maxlen = min([len(c) for c in chans]) if len(chans) > 0 else 0
    if len(chans) == 0 or maxlen == 0:
        return np.zeros((0, 4), dtype=np.float32)
    # align to shortest to keep synchronous
    chans = [c[:maxlen] for c in chans]
    X = np.zeros((maxlen, 4), dtype=np.float32)
    for i, c in enumerate(chans[:4]):
        X[:, i] = c.astype(np.float32)
    return X

def _resample_if_needed(X, fs_in, fs_out):
    fs_in = int(round(float(fs_in)))
    fs_out = int(round(float(fs_out)))
    if X.size == 0 or fs_in == fs_out:
        return X
    frac = Fraction(fs_out, fs_in)  # now both ints
    up, down = frac.numerator, frac.denominator
    Y = resample_poly(X, up, down, axis=0)
    return Y.astype(np.float32)

def convert_mat_to_csv(mat_path, label="Normal", load_nm=None, dataset_id="cwru",
                       or_loc=None, out_dir=OUT_DIR):
    mat_path = pathlib.Path(mat_path)
    md = loadmat(mat_path.as_posix(), squeeze_me=False, struct_as_record=False)

    chans = _extract_channels(md)
    fs_in = _infer_fs_from_mat(md, default_fs=12000.0)
    X = _stack_to_four(chans)
    X = _resample_if_needed(X, fs_in, TARGET_FS)

    # write CSV with no header, four columns: sensor1..sensor4
    csv_name = mat_path.stem + ".csv"
    csv_path = out_dir / csv_name
    np.savetxt(csv_path.as_posix(), X, delimiter=",", fmt="%.7f")

    # write sidecar metadata
    meta = {
        "source_file": mat_path.name,
        "dataset_id": dataset_id,
        "label": label,                 # e.g., normal, inner_race, ball, outer_race
        "outer_race_location": or_loc,  # e.g., "3", "6", "12" oclock if known
        "load_Nm": load_nm,             # if known
        "sampling_rate_hz": TARGET_FS,
        "channels": ["sensor1", "sensor2", "sensor3", "sensor4"],
        "notes": "headerless CSV for CNN. Values are vibration samples. Missing channels are zero padded.",
    }
    json_path = out_dir / (mat_path.stem + ".json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    return csv_path.as_posix(), json_path.as_posix()

CWRU Dataset

In [12]:

cwru_dir = pathlib.Path("../../data/vibrationCWRU")
cwru_dir.mkdir(parents=True, exist_ok=True)
convert_mat_to_csv("../../data/raw_cwru/vibration_mat/IR007_0.mat", label="inner_race", load_nm=0, dataset_id="cwru", out_dir=cwru_dir)
convert_mat_to_csv("../../data/raw_cwru/vibration_mat/B007_0.mat", label="ball", load_nm=0, dataset_id="cwru", out_dir=cwru_dir)
convert_mat_to_csv("../../data/raw_cwru/vibration_mat/Normal_0.mat", label="normal", load_nm=0, dataset_id="cwru", out_dir=cwru_dir)
convert_mat_to_csv("../../data/raw_cwru/vibration_mat/OR007@3_0.mat", label="outer_race", load_nm=0, dataset_id="cwru", out_dir=cwru_dir)
convert_mat_to_csv("../../data/raw_cwru/vibration_mat/OR007@6_0.mat", label="outer_race", load_nm=0, dataset_id="cwru", out_dir=cwru_dir)
convert_mat_to_csv("../../data/raw_cwru/vibration_mat/OR007@12_0.mat", label="outer_race", load_nm=0, dataset_id="cwru", out_dir=cwru_dir)

('../../data/vibrationCWRU/OR007@12_0.csv',
 '../../data/vibrationCWRU/OR007@12_0.json')

KAIST Dataset

In [13]:
# ------- config -------
kaist_dir = pathlib.Path("../../data/vibrationKAIST")
kaist_dir.mkdir(parents=True, exist_ok=True)

convert_mat_to_csv("../../data/raw_kaist/vibration_mat/0Nm_BPFI_03.mat", label="inner_race", load_nm=0, dataset_id="kaist", out_dir=kaist_dir)
convert_mat_to_csv("../../data/raw_kaist/vibration_mat/0Nm_BPFO_03.mat", label="outer_race", load_nm=0, dataset_id="kaist", out_dir=kaist_dir)
convert_mat_to_csv("../../data/raw_kaist/vibration_mat/0Nm_Misalign_01.mat", label="misalignment", load_nm=0, dataset_id="kaist", out_dir=kaist_dir)
convert_mat_to_csv("../../data/raw_kaist/vibration_mat/0Nm_Unbalance_0583mg.mat", label="unbalance", load_nm=0, dataset_id="kaist", out_dir=kaist_dir)
convert_mat_to_csv("../../data/raw_kaist/vibration_mat/0Nm_Normal.mat", label="normal", load_nm=0, dataset_id="kaist", out_dir=kaist_dir)

('../../data/vibrationKAIST/0Nm_Normal.csv',
 '../../data/vibrationKAIST/0Nm_Normal.json')

Manifest


In [19]:
import json, hashlib, pathlib
import pandas as pd
import numpy as np

# Define your two dataset folders
ROOTS = [
    pathlib.Path("../../data/vibrationCWRU"),
    pathlib.Path("../../data/vibrationKAIST")
]

MANIFEST_CSV = "../../data/manifest.csv"
TRAIN_CSV = "../../data/train_manifest.csv"
VAL_CSV   = "../../data/val_manifest.csv"
TEST_CSV  = "../../data/test_manifest.csv"

def infer_or_location_from_name(name: str):
    n = name.lower()
    if "@3" in n: return "3"
    if "@6" in n: return "6"
    if "@12" in n: return "12"
    return None

rows = []
for ROOT in ROOTS:
    for json_path in sorted(ROOT.glob("*.json")):
        meta = json.loads(json_path.read_text(encoding="utf-8"))
        csv_path = ROOT / (pathlib.Path(meta["source_file"]).stem + ".csv")
        if not csv_path.exists():
            print(f"skip no csv for {json_path.name}")
            continue

        # fill missing outer race location for CWRU
        if meta.get("dataset_id") == "cwru" and meta.get("label") == "outer_race":
            if not meta.get("outer_race_location"):
                meta["outer_race_location"] = infer_or_location_from_name(meta["source_file"])

        # quick integrity check
        sample = pd.read_csv(csv_path, header=None, nrows=5)
        assert sample.shape[1] == 4, f"{csv_path} must have 4 columns"
        assert np.isfinite(sample.values).all(), f"{csv_path} has non finite values"

        n_rows = sum(1 for _ in open(csv_path, "r", encoding="utf-8"))
        row = {
            "filepath": str(csv_path.resolve()),
            "dataset_id": meta.get("dataset_id"),
            "label": meta.get("label"),
            "load_Nm": meta.get("load_Nm"),
            "sampling_rate_hz": meta.get("sampling_rate_hz"),
            "outer_race_location": meta.get("outer_race_location"),
            "n_rows": n_rows,
            "n_channels": 4,
            "_source_json": json_path.name
        }
        rows.append(row)

manifest = pd.DataFrame(rows)
manifest.to_csv(MANIFEST_CSV, index=False)
print(f"wrote manifest with {len(manifest)} files")

# deterministic split
def stable_bin(path: str):
    h = hashlib.sha1(path.encode("utf-8")).hexdigest()
    return int(h[:8], 16) / 0xFFFFFFFF

manifest = manifest.sample(frac=1.0, random_state=13)
print("Columns in manifest:", manifest.columns.tolist())
print(manifest.head())
bins = manifest["filepath"].map(stable_bin)
manifest_train = manifest[bins < 0.80]
manifest_val   = manifest[(bins >= 0.80) & (bins < 0.90)]
manifest_test  = manifest[bins >= 0.90]

manifest_train.to_csv(TRAIN_CSV, index=False)
manifest_val.to_csv(VAL_CSV, index=False)
manifest_test.to_csv(TEST_CSV, index=False)

print(f"train {len(manifest_train)}  val {len(manifest_val)}  test {len(manifest_test)}")


EmptyDataError: No columns to parse from file

In [17]:
import json, hashlib, pathlib
import pandas as pd
import numpy as np

# Define your two dataset folders
ROOTS = [
    pathlib.Path("../../data/vibrationCWRU"),
    pathlib.Path("../../data/vibrationKAIST")
]

MANIFEST_CSV = "./manifest.csv"
TRAIN_CSV = "./train_manifest.csv"
VAL_CSV   = "./val_manifest.csv"
TEST_CSV  = "./test_manifest.csv"

def infer_or_location_from_name(name: str):
    n = name.lower()
    if "@3" in n: return "3"
    if "@6" in n: return "6"
    if "@12" in n: return "12"
    return None

rows = []
for ROOT in ROOTS:
    print(f"\nüîé Scanning folder: {ROOT.resolve()}")
    json_files = list(ROOT.glob("*.json"))
    print(f"   Found {len(json_files)} JSON files: {[f.name for f in json_files]}")

    for json_path in sorted(json_files):
        print(f"‚û°Ô∏è Processing {json_path.name}")
        try:
            meta = json.loads(json_path.read_text(encoding="utf-8"))
        except Exception as e:
            print(f"   ‚ùå Could not load {json_path.name}: {e}")
            continue

        # If meta is a list, print size and first element
        if isinstance(meta, list):
            print(f"   ‚ö†Ô∏è {json_path.name} contains a list with {len(meta)} entries (not a single dict)")
        elif isinstance(meta, dict):
            print(f"   ‚úÖ {json_path.name} is a dict with keys: {list(meta.keys())}")

        # If list, skip for now (because your code expects dict per file)
        if not isinstance(meta, dict):
            continue

        csv_path = ROOT / (pathlib.Path(meta["source_file"]).stem + ".csv")
        print(f"   Looking for CSV: {csv_path.name} ‚Üí exists={csv_path.exists()}")

        if not csv_path.exists():
            print(f"   ‚ùå skip no csv for {json_path.name}")
            continue

        # fill missing outer race location for CWRU
        if meta.get("dataset_id") == "cwru" and meta.get("label") == "outer_race":
            if not meta.get("outer_race_location"):
                meta["outer_race_location"] = infer_or_location_from_name(meta["source_file"])

        # quick integrity check
        try:
            sample = pd.read_csv(csv_path, header=None, nrows=5)
            assert sample.shape[1] == 4, f"{csv_path} must have 4 columns"
            assert np.isfinite(sample.values).all(), f"{csv_path} has non finite values"
        except Exception as e:
            print(f"   ‚ùå Integrity check failed for {csv_path.name}: {e}")
            continue

        n_rows = sum(1 for _ in open(csv_path, "r", encoding="utf-8"))
        row = {
            "filepath": str(csv_path.resolve()),
            "dataset_id": meta.get("dataset_id"),
            "label": meta.get("label"),
            "load_Nm": meta.get("load_Nm"),
            "sampling_rate_hz": meta.get("sampling_rate_hz"),
            "outer_race_location": meta.get("outer_race_location"),
            "n_rows": n_rows,
            "n_channels": 4,
            "_source_json": json_path.name
        }
        rows.append(row)

manifest = pd.DataFrame(rows)
manifest.to_csv(MANIFEST_CSV, index=False)
print(f"\n‚úÖ wrote manifest with {len(manifest)} files to {MANIFEST_CSV}")

# deterministic split
def stable_bin(path: str):
    h = hashlib.sha1(path.encode("utf-8")).hexdigest()
    return int(h[:8], 16) / 0xFFFFFFFF

if not manifest.empty:
    manifest = manifest.sample(frac=1.0, random_state=13)
    print("Columns in manifest:", manifest.columns.tolist())
    print(manifest.head())
    bins = manifest["filepath"].map(stable_bin)
    manifest_train = manifest[bins < 0.80]
    manifest_val   = manifest[(bins >= 0.80) & (bins < 0.90)]
    manifest_test  = manifest[bins >= 0.90]

    manifest_train.to_csv(TRAIN_CSV, index=False)
    manifest_val.to_csv(VAL_CSV, index=False)
    manifest_test.to_csv(TEST_CSV, index=False)

    print(f"train {len(manifest_train)}  val {len(manifest_val)}  test {len(manifest_test)}")
else:
    print("‚ö†Ô∏è Manifest is empty ‚Äî no rows collected.")



üîé Scanning folder: C:\Users\giljo\Desktop\Master\FlexUNS-Framework\data\vibrationCWRU
   Found 6 JSON files: ['B007_0.json', 'IR007_0.json', 'Normal_0.json', 'OR007@12_0.json', 'OR007@3_0.json', 'OR007@6_0.json']
‚û°Ô∏è Processing B007_0.json
   ‚úÖ B007_0.json is a dict with keys: ['source_file', 'dataset_id', 'label', 'outer_race_location', 'load_Nm', 'sampling_rate_hz', 'channels', 'notes']
   Looking for CSV: B007_0.csv ‚Üí exists=True
‚û°Ô∏è Processing IR007_0.json
   ‚úÖ IR007_0.json is a dict with keys: ['source_file', 'dataset_id', 'label', 'outer_race_location', 'load_Nm', 'sampling_rate_hz', 'channels', 'notes']
   Looking for CSV: IR007_0.csv ‚Üí exists=True
‚û°Ô∏è Processing Normal_0.json
   ‚úÖ Normal_0.json is a dict with keys: ['source_file', 'dataset_id', 'label', 'outer_race_location', 'load_Nm', 'sampling_rate_hz', 'channels', 'notes']
   Looking for CSV: Normal_0.csv ‚Üí exists=True
‚û°Ô∏è Processing OR007@12_0.json
   ‚úÖ OR007@12_0.json is a dict with keys: ['s