In [3]:
# Python 3.x
# pip install scipy numpy pandas
import json
import re
import pathlib
import numpy as np
from fractions import Fraction
from scipy.io import loadmat
from scipy.signal import resample_poly

# ------- config -------
TARGET_FS = 25600  # Hz to match KAIST
OUT_DIR = pathlib.Path("../../data/vibrationCWRU")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Helpers to find time-domain arrays in CWRU .mat
CWRU_KEY_ORDER = [
    # common CWRU naming patterns
    r".*_DE_time$",   # drive end accel
    r".*_FE_time$",   # fan end accel
    r".*_BA_time$",   # base accel
    r".*_AE_time$",   # acoustic emission if present
    r".*DE$", r".*FE$", r".*BA$",
]

def _flatten_1d(x):
    x = np.asarray(x)
    if x.ndim == 2 and 1 in x.shape:
        x = x.reshape(-1)
    return x.squeeze()

def _extract_channels(matdict):
    # collect all 1D numeric arrays
    candidates = {}
    for k, v in matdict.items():
        if k.startswith("__"):
            continue
        try:
            arr = _flatten_1d(v)
        except Exception:
            continue
        if np.issubdtype(arr.dtype, np.number) and arr.ndim == 1 and arr.size > 10:
            candidates[k] = arr

    # order channels by regex priority
    ordered = []
    used = set()
    for pat in CWRU_KEY_ORDER:
        rx = re.compile(pat, re.IGNORECASE)
        for k in list(candidates.keys()):
            if k in used:
                continue
            if rx.match(k):
                ordered.append(candidates[k])
                used.add(k)
    # add any leftover numeric vectors
    for k, arr in candidates.items():
        if k not in used:
            ordered.append(arr)

    return ordered[:4]  # up to four channels

def _infer_fs_from_mat(matdict, default_fs=12000):
    # try common field names
    for k in matdict.keys():
        if "fs" == k.lower() or "samplingrate" in k.lower():
            fs_val = np.asarray(matdict[k]).astype(float).squeeze()
            if fs_val.size >= 1:
                return float(fs_val.flat[0])
    # try to guess from CWRU naming like X097_DE_time -> 12 kHz or 48 kHz are common
    return float(default_fs)

def _stack_to_four(chans):
    """stack list of 1D arrays to shape [N,4], padding with zeros if fewer than 4."""
    maxlen = min([len(c) for c in chans]) if len(chans) > 0 else 0
    if len(chans) == 0 or maxlen == 0:
        return np.zeros((0, 4), dtype=np.float32)
    # align to shortest to keep synchronous
    chans = [c[:maxlen] for c in chans]
    X = np.zeros((maxlen, 4), dtype=np.float32)
    for i, c in enumerate(chans[:4]):
        X[:, i] = c.astype(np.float32)
    return X

def _resample_if_needed(X, fs_in, fs_out):
    fs_in = int(round(float(fs_in)))
    fs_out = int(round(float(fs_out)))
    if X.size == 0 or fs_in == fs_out:
        return X
    frac = Fraction(fs_out, fs_in)  # now both ints
    up, down = frac.numerator, frac.denominator
    Y = resample_poly(X, up, down, axis=0)
    return Y.astype(np.float32)

def convert_mat_to_csv(mat_path, label="Normal", load_nm=None, dataset_id="cwru",
                       or_loc=None, out_dir=OUT_DIR):
    mat_path = pathlib.Path(mat_path)
    md = loadmat(mat_path.as_posix(), squeeze_me=False, struct_as_record=False)

    chans = _extract_channels(md)
    fs_in = _infer_fs_from_mat(md, default_fs=12000.0)
    X = _stack_to_four(chans)
    X = _resample_if_needed(X, fs_in, TARGET_FS)

    # write CSV with no header, four columns: sensor1..sensor4
    csv_name = mat_path.stem + ".csv"
    csv_path = out_dir / csv_name
    np.savetxt(csv_path.as_posix(), X, delimiter=",", fmt="%.7f")

    # write sidecar metadata
    meta = {
        "source_file": mat_path.name,
        "dataset_id": dataset_id,
        "label": label,                 # e.g., normal, inner_race, ball, outer_race
        "outer_race_location": or_loc,  # e.g., "3", "6", "12" oclock if known
        "load_Nm": load_nm,             # if known
        "sampling_rate_hz": TARGET_FS,
        "channels": ["sensor1", "sensor2", "sensor3", "sensor4"],
        "notes": "headerless CSV for CNN. Values are vibration samples. Missing channels are zero padded.",
    }
    json_path = out_dir / (mat_path.stem + ".json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    return csv_path.as_posix(), json_path.as_posix()

# Example
# convert_mat_to_csv("./Normal_0.mat", label="normal", load_nm=0, dataset_id="cwru")
convert_mat_to_csv("../../data/raw_cwru/vibration_mat/IR007_0.mat", label="inner_race", load_nm=0, dataset_id="cwru")
convert_mat_to_csv("../../data/raw_cwru/vibration_mat/B007_0.mat", label="inner_race", load_nm=0, dataset_id="cwru")
convert_mat_to_csv("../../data/raw_cwru/vibration_mat/Normal_0.mat", label="inner_race", load_nm=0, dataset_id="cwru")
convert_mat_to_csv("../../data/raw_cwru/vibration_mat/OR007@3_0.mat", label="inner_race", load_nm=0, dataset_id="cwru")
convert_mat_to_csv("../../data/raw_cwru/vibration_mat/OR007@6_0.mat", label="inner_race", load_nm=0, dataset_id="cwru")
convert_mat_to_csv("../../data/raw_cwru/vibration_mat/OR007@12_0.mat", label="inner_race", load_nm=0, dataset_id="cwru")

('../../data/vibrationCWRU/OR007@12_0.csv',
 '../../data/vibrationCWRU/OR007@12_0.json')

Join JSON

In [5]:
import json
import pathlib

def join_jsons(folder, out_file="merged.json"):
    folder = pathlib.Path(folder)
    json_files = sorted(folder.glob("*.json"))

    all_data = []
    for jf in json_files:
        try:
            with open(jf, "r", encoding="utf-8") as f:
                data = json.load(f)
                # keep track of original file
                if isinstance(data, dict):
                    data["_source_file"] = jf.name
                all_data.append(data)
        except Exception as e:
            print(f"⚠️ Skipping {jf}: {e}")

    out_path = folder / out_file
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(all_data, f, indent=2)

    # delete original JSON files except the merged one
    for jf in json_files:
        try:
            if jf.name != out_file:
                jf.unlink()
        except Exception as e:
            print(f"⚠️ Could not delete {jf}: {e}")

    print(f"✅ Merged {len(all_data)} JSON files into {out_path} and deleted originals.")
    return out_path

# Merge all JSONs in ./out_csv into merged.json
join_jsons(OUT_DIR, out_file="cwru_metadata.json")


✅ Merged 7 JSON files into ..\..\data\vibrationCWRU\cwru_metadata.json and deleted originals.


WindowsPath('../../data/vibrationCWRU/cwru_metadata.json')