In [3]:
from __future__ import annotations

import re
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from scipy.io import loadmat


NORMAL_BASELINE_PAGE = "https://engineering.case.edu/bearingdatacenter/normal-baseline-data"
DRIVE_END_12K_PAGE    = "https://engineering.case.edu/bearingdatacenter/12k-drive-end-bearing-fault-data"

BASE = "https://engineering.case.edu"

def label_from_filename(stem: str):
    """
    Map a file stem to a bearing-condition label.

    Only the part of the stem before the first "__" is inspected, and the
    comparison is case-insensitive. Returns one of "normal", "inner_race",
    "outer_race", "ball" or "unknown".
    """
    head = stem.upper().partition("__")[0]

    # "NORMAL" anywhere in the head wins over every prefix rule.
    if "NORMAL" in head:
        return "normal"

    # Prefix rules are tried in order; the first match decides.
    for prefix, label in (("IR", "inner_race"), ("OR", "outer_race"), ("B", "ball")):
        if head.startswith(prefix):
            return label

    return "ball" if "BALL" in head else "unknown"


def scrape_mat_links(page_url: str) -> dict[str, str]:
    """
    Collect the .mat download links found on a CWRU data page.

    The page is fetched with a 30 second timeout and parsed with the
    built-in "html.parser". Every anchor whose href ends in
    "/sites/default/files/<digits>.mat" is kept and resolved against BASE.

    Returns: {link text (e.g. 'Normal_0', 'IR007_0', ...): download URL}.
    When an anchor has no text, the file name from its href is used as the
    key. A later anchor with the same key replaces the earlier entry.

    Raises RuntimeError if no matching link is found; HTTP error statuses
    are raised by raise_for_status().
    """
    response = requests.get(page_url, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()

    mat_href = re.compile(r"/sites/default/files/\d+\.mat$")
    links: dict[str, str] = {}

    for anchor in BeautifulSoup(response.text, "html.parser").find_all("a", href=True):
        href = anchor["href"].strip()
        if mat_href.search(href) is None:
            continue

        # Fall back to the file name when the anchor carries no visible text.
        label = (anchor.get_text() or "").strip() or Path(href).name
        links[label] = urljoin(BASE, href)

    if links:
        return links

    raise RuntimeError(
        "이 페이지에서 .mat 링크를 찾지 못했습니다. "
        "페이지 구조가 바뀌었을 수 있어요."
    )

def download(url: str, out_path: Path, chunk_size: int = 1 << 20) -> None:
    """
    Stream `url` into `out_path`, creating parent directories as needed.

    If `out_path` already exists and is non-empty the call returns without
    any network access. Otherwise the response body is written in chunks of
    `chunk_size` bytes to a sibling "<name>.part" file, which is renamed onto
    `out_path` only after the whole body has been written. An interrupted
    transfer therefore never leaves a truncated file that a later call would
    mistake for a finished download.

    HTTP error statuses are raised by raise_for_status().
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    if out_path.exists() and out_path.stat().st_size > 0:
        return  # already downloaded; skip

    tmp_path = out_path.with_name(out_path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=120, headers={"User-Agent": "Mozilla/5.0"}) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
        # Publish the file only once it is complete.
        tmp_path.replace(out_path)
    finally:
        # After a successful rename the temp file is gone; on failure this
        # removes the partial data.
        if tmp_path.exists():
            tmp_path.unlink()

import re
from pathlib import Path
import requests

def safe_name(s: str) -> str:
    """
    Return `s` stripped of surrounding whitespace, with every run of
    characters other than word characters, "-", "@" and "." replaced by a
    single underscore.
    """
    return re.sub(r"[^\w\-@.]+", "_", s.strip())

def download_with_label(links: dict[str, str], out_dir: str | Path):
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    saved = []
    for link_text, url in links.items():
        base = Path(url).name               
        fname = f"{safe_name(link_text)}__{base}" 
        dst = out_dir / fname

        if not dst.exists():
            r = requests.get(url, timeout=60, headers={"User-Agent": "Mozilla/5.0"})
            r.raise_for_status()
            dst.write_bytes(r.content)

        saved.append(dst)
    return saved


def download_many(links: dict[str, str], out_dir: Path) -> list[Path]:
    """
    Download every URL in `links` into `out_dir` under its original file name.

    One progress line is printed per entry before the transfer starts.
    Returns the destination paths in the iteration order of `links`.
    """
    downloaded = []
    for label, link in links.items():
        target = out_dir / Path(link).name
        print(f"↓ {label:15s} -> {target}")
        download(link, target)
        downloaded.append(target)
    return downloaded


def extract_cwru_signals(mat_path: Path) -> dict[str, object]:
    """
    Extract DE/FE/RPM from a CWRU .mat file (only the entries that exist).

    Variable names differ from file to file, so each entry is looked up by
    suffix; the first variable whose name ends with the suffix is used and
    a missing entry is reported as None.

    Returns a dict with "path" (the path as a string), "keys" (all variable
    names that do not start with "__"), "DE_time", "FE_time" and "RPM".
    """
    contents = loadmat(mat_path, squeeze_me=True, struct_as_record=False)
    var_names = [name for name in contents if not name.startswith("__")]

    def first_with_suffix(suffix: str):
        # First variable name ending with `suffix`, or None when there is none.
        match = next((name for name in var_names if name.endswith(suffix)), None)
        return None if match is None else contents[match]

    return {
        "path": str(mat_path),
        "keys": var_names,
        "DE_time": first_with_suffix("_DE_time"),
        "FE_time": first_with_suffix("_FE_time"),
        "RPM": first_with_suffix("RPM"),
    }


if __name__ == "__main__":
    # Everything is stored beneath this local directory.
    out_root = Path("./CWRU")

    # 1) Normal-baseline page: download every .mat file it links to.
    normal_links = scrape_mat_links(NORMAL_BASELINE_PAGE)
    normal_dir = out_root / "normal_baseline_labeled"
    normal_files = download_with_label(normal_links, normal_dir)

    # 2) 12k drive-end fault page: download only a subset of its links.
    fault_links_all = scrape_mat_links(DRIVE_END_12K_PAGE)

    # Keep link texts that start with one of these prefixes and end in "_0".
    wanted_prefixes = ("IR007_", "B007_", "OR007@6_")
    fault_links_sel = {
        k: v for k, v in fault_links_all.items()
        if k.startswith(wanted_prefixes) and k.endswith("_0")
    }
    if not fault_links_sel:
        # Without this check the sample lookup below fails with a bare
        # IndexError that gives no hint about the cause.
        raise RuntimeError(
            f"No link text on {DRIVE_END_12K_PAGE} starts with one of "
            f"{wanted_prefixes} and ends with '_0'."
        )

    fault_dir = out_root / "12k_drive_end_fault_subset_labeled"
    fault_files = download_with_label(fault_links_sel, fault_dir)

    # 3) Sanity check: load the first downloaded fault file and report its contents.
    sample = fault_files[0]
    sig = extract_cwru_signals(sample)
    print("\n[Sample]", sig["path"])
    print("  DE len =", None if sig["DE_time"] is None else len(sig["DE_time"]))
    print("  FE len =", None if sig["FE_time"] is None else len(sig["FE_time"]))
    print("  RPM    =", sig["RPM"])


[Sample] CWRU/12k_drive_end_fault_subset_labeled/IR007_0__105.mat
  DE len = 121265
  FE len = 121265
  RPM    = 1797


In [5]:
from pathlib import Path
from collections import Counter

# Count the labels derived from the file names in the downloaded fault subset.
folder = Path("./CWRU/12k_drive_end_fault_subset_labeled")

# Combine both spellings of the extension through a set: where glob matching
# is case-insensitive (e.g. on Windows) the two patterns return the same
# files, and plain list concatenation would count each of them twice.
files = sorted(set(folder.glob("*.mat")) | set(folder.glob("*.MAT")))
print("num files:", len(files))

labels = [label_from_filename(p.stem) for p in files]
print("label counts:", Counter(labels))

# Reuse the labels computed above instead of classifying every file again.
unknowns = [p.name for p, label in zip(files, labels) if label == "unknown"]
print("unknown examples:", unknowns[:20])

num files: 3
label counts: Counter({'ball': 1, 'inner_race': 1, 'outer_race': 1})
unknown examples: []
