In [1]:
"""
Feature generation for LSTM & Transformer models
- Input  : /kaggle/input/machine-learning-basic/event_01.csv ~ event_17.csv
- Output : 
    ./processed_features/X_lstm.npy               (N, seq_len, n_features)
    ./processed_features/y.npy                   (N,)
    ./processed_features/transformer_features.parquet  (N, n_tf_features + meta)
"""

import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import skew, kurtosis

# ==================== 기본 세팅 ====================

BASE_DIR = "/kaggle/input/machine-learning-basic"
OUTPUT_DIR = "./processed_features"
os.makedirs(OUTPUT_DIR, exist_ok=True)

SEGMENT_SIZE = 150_000          # 세그먼트 길이
DOWNSAMPLE_STEP = 100           # LSTM 시퀀스 다운샘플링 스텝 (150,000 -> 1,500)
ROLLING_WINDOWS = (100, 1000, 10000)  # 롤링 윈도우 크기들 (샘플 단위)

# ==================== 유틸 함수들 ====================

def segment_iterator(signal: np.ndarray, ttf: np.ndarray, segment_size: int):
    """
    1D 시계열을 segment_size 단위로 잘라서 반환하는 제너레이터.
    마지막에 남는 자투리는 버린다.
    """
    total_len = len(signal)
    n_segments = total_len // segment_size
    for i in range(n_segments):
        start = i * segment_size
        end = start + segment_size
        seg_sig = signal[start:end]
        seg_ttf = ttf[start:end]
        yield i, seg_sig, seg_ttf


def build_lstm_sequence(seg_signal: np.ndarray) -> np.ndarray:
    """
    하나의 세그먼트(길이 150,000)의 원신호에서
    LSTM용 시퀀스 피처 (1500, n_features)를 생성한다.
    - 원신호 다운샘플
    - 롤링 mean/std (다양한 window)
    - zero crossing 시퀀스
    - peak 시퀀스
    """
    # pandas Series로 감싸서 rolling, ewm 사용
    s = pd.Series(seg_signal.astype(np.float32))
    
    # 다운샘플 인덱스 (0, 100, 200, ..., 149900)
    ds_idx = np.arange(0, len(s), DOWNSAMPLE_STEP)
    
    # 1) 원신호 다운샘플
    raw_down = s.iloc[ds_idx].values.reshape(-1, 1)

    # 2) 롤링 mean/std
    rolling_feats = []
    for w in ROLLING_WINDOWS:
        roll_mean = (
            s.rolling(window=w, min_periods=1)
             .mean()
             .iloc[ds_idx]
             .values
             .astype(np.float32)
        )
        roll_std = (
            s.rolling(window=w, min_periods=1)
             .std(ddof=0)
             .fillna(0)
             .iloc[ds_idx]
             .values
             .astype(np.float32)
        )
        rolling_feats.append(roll_mean.reshape(-1, 1))
        rolling_feats.append(roll_std.reshape(-1, 1))

    # 3) zero crossing 시퀀스 (부호가 달라지는 지점)
    sign = np.sign(seg_signal)
    sign[sign == 0] = 1  # 0은 +1로 처리
    zc = np.zeros_like(seg_signal, dtype=np.float32)
    zc[1:] = (sign[1:] != sign[:-1]).astype(np.float32)
    zc_down = zc[ds_idx].reshape(-1, 1)

    # 4) peak 시퀀스 (양옆보다 큰 지점)
    # 간단한 1차 peak 탐지 (경계는 0으로)
    peaks = np.zeros_like(seg_signal, dtype=np.float32)
    mid = np.arange(1, len(seg_signal) - 1)
    cond = (seg_signal[mid] > seg_signal[mid - 1]) & (seg_signal[mid] > seg_signal[mid + 1])
    peaks[mid[cond]] = 1.0
    peaks_down = peaks[ds_idx].reshape(-1, 1)

    # 모든 피처 concat
    seq_features = np.concatenate(
        [raw_down] + rolling_feats + [zc_down, peaks_down],
        axis=1
    )
    # shape: (seq_len=1500, n_features)
    return seq_features.astype(np.float32)


def compute_spectral_features(seg_signal: np.ndarray, eps: float = 1e-12) -> dict:
    """
    FFT 기반 spectral 피처 계산 (Transformer용 전역 피처 일부)
    """
    # DC 제거를 위해 평균 0으로 정규화
    x = seg_signal.astype(np.float32)
    x = x - np.mean(x)

    # rFFT
    fft_vals = np.fft.rfft(x)
    mag = np.abs(fft_vals)
    power = mag ** 2

    # 주파수 축 (정규화된 인덱스 기반)
    freqs = np.fft.rfftfreq(len(x), d=1.0)

    # 총 에너지
    total_power = np.sum(power) + eps

    # Spectral centroid
    spectral_centroid = float(np.sum(freqs * power) / total_power)

    # Spectral rolloff (총 에너지의 85%가 되는 지점)
    cumulative_power = np.cumsum(power)
    rolloff_threshold = 0.85 * total_power
    rolloff_idx = np.searchsorted(cumulative_power, rolloff_threshold)
    spectral_rolloff = float(freqs[min(rolloff_idx, len(freqs) - 1)])

    # 몇 개 대역의 에너지 비율 (저/중/고)
    # 전체 구간을 3분할
    thirds = len(power) // 3
    if thirds > 0:
        low = float(np.sum(power[:thirds]) / total_power)
        mid = float(np.sum(power[thirds:2*thirds]) / total_power)
        high = float(np.sum(power[2*thirds:]) / total_power)
    else:
        low = mid = high = 0.0

    return {
        "spec_total_power": float(total_power),
        "spec_centroid": spectral_centroid,
        "spec_rolloff": spectral_rolloff,
        "spec_band_low": low,
        "spec_band_mid": mid,
        "spec_band_high": high,
    }


def compute_transformer_features(seg_signal: np.ndarray) -> dict:
    """
    하나의 세그먼트(150,000)에 대해 Transformer용 전역(summary) 피처 계산.
    - 기본 통계
    - 신호 기반 전체 지표
    - FFT 기반 spectral 피처
    - tsfresh 유사 전역 피처 몇 개
    """
    x = seg_signal.astype(np.float32)

    feats = {}

    # ----- 기본 통계 -----
    feats["mean"] = float(np.mean(x))
    feats["std"] = float(np.std(x))
    feats["min"] = float(np.min(x))
    feats["max"] = float(np.max(x))
    feats["range"] = float(feats["max"] - feats["min"])
    feats["q25"] = float(np.quantile(x, 0.25))
    feats["q50"] = float(np.quantile(x, 0.50))
    feats["q75"] = float(np.quantile(x, 0.75))
    feats["iqr"] = float(feats["q75"] - feats["q25"])
    feats["skew"] = float(skew(x))
    feats["kurtosis"] = float(kurtosis(x))

    # ----- 신호 기반 전역 피처 -----
    # zero crossing rate (전체 구간에서 비율)
    sign = np.sign(x)
    sign[sign == 0] = 1
    zc = np.sum(sign[1:] != sign[:-1])
    feats["zero_cross_rate"] = float(zc / (len(x) - 1))

    # peak 개수 (단순 1차 peak)
    mid = np.arange(1, len(x) - 1)
    cond = (x[mid] > x[mid - 1]) & (x[mid] > x[mid + 1])
    feats["num_peaks"] = float(np.sum(cond))

    # ----- tsfresh 유사 전역 피처 -----
    # abs_energy
    feats["abs_energy"] = float(np.sum(x ** 2))

    # cid_ce (complexity-invariant distance 기반 간단 버전)
    diff = np.diff(x)
    feats["cid_ce"] = float(np.sqrt(np.sum(diff ** 2)))

    # ratio_beyond_2_sigma
    mu = feats["mean"]
    sigma = feats["std"] + 1e-12
    feats["ratio_beyond_2_sigma"] = float(
        np.mean(np.abs(x - mu) > 2 * sigma)
    )

    # ----- spectral 피처 -----
    spec_feats = compute_spectral_features(x)
    feats.update(spec_feats)

    return feats


# ==================== 메인 루프 ====================

lstm_sequences = []   # (N, seq_len, n_features)
targets = []          # (N,)
tf_feature_rows = []  # Transformer용 row(dict)의 리스트

event_files = [f for f in os.listdir(BASE_DIR) if f.startswith("event_") and f.endswith(".csv")]
event_files = sorted(event_files)  # event_01, event_02, ...

print("Found event files:", event_files)

for ev_file in event_files:
    ev_path = os.path.join(BASE_DIR, ev_file)
    print(f"\n▶ Processing {ev_file} ...")
    
    df = pd.read_csv(ev_path)
    # 예상 컬럼: acoustic_data, time_to_failure
    signal = df["acoustic_data"].values
    ttf = df["time_to_failure"].values

    del df
    gc.collect()

    for seg_idx, seg_sig, seg_ttf in tqdm(
        segment_iterator(signal, ttf, SEGMENT_SIZE),
        desc=f"Segments in {ev_file}",
        leave=False
    ):
        # ----- LSTM 시퀀스 -----
        seq_features = build_lstm_sequence(seg_sig)
        lstm_sequences.append(seq_features)

        # ----- y (segment의 마지막 time_to_failure) -----
        y_val = float(seg_ttf[-1])
        targets.append(y_val)

        # ----- Transformer summary features -----
        tf_feats = compute_transformer_features(seg_sig)
        tf_feats["event_file"] = ev_file
        tf_feats["segment_index"] = int(seg_idx)
        tf_feats["y"] = y_val
        tf_feature_rows.append(tf_feats)

    # event별 메모리 정리
    del signal, ttf
    gc.collect()

# ==================== 배열/데이터프레임로 변환 및 저장 ====================

X_lstm = np.stack(lstm_sequences, axis=0)   # (N, seq_len, n_features)
y = np.array(targets, dtype=np.float32)

print("\nFinal shapes:")
print("X_lstm:", X_lstm.shape)
print("y:", y.shape)

# 저장
np.save(os.path.join(OUTPUT_DIR, "X_lstm.npy"), X_lstm)
np.save(os.path.join(OUTPUT_DIR, "y.npy"), y)

tf_df = pd.DataFrame(tf_feature_rows)
tf_path = os.path.join(OUTPUT_DIR, "transformer_features.parquet")
tf_df.to_parquet(tf_path, index=False)

print(f"\nSaved:")
print(f"- {os.path.join(OUTPUT_DIR, 'X_lstm.npy')}")
print(f"- {os.path.join(OUTPUT_DIR, 'y.npy')}")
print(f"- {tf_path}")


Found event files: ['event_01.csv', 'event_02.csv', 'event_03.csv', 'event_04.csv', 'event_05.csv', 'event_06.csv', 'event_07.csv', 'event_08.csv', 'event_09.csv', 'event_10.csv', 'event_11.csv', 'event_12.csv', 'event_13.csv', 'event_14.csv', 'event_15.csv', 'event_16.csv', 'event_17.csv']

▶ Processing event_01.csv ...


                                                 


▶ Processing event_02.csv ...


                                                  


▶ Processing event_03.csv ...


                                                  


▶ Processing event_04.csv ...


                                                  


▶ Processing event_05.csv ...


                                                  


▶ Processing event_06.csv ...


                                                  


▶ Processing event_07.csv ...


                                                  


▶ Processing event_08.csv ...


                                                  


▶ Processing event_09.csv ...


                                                  


▶ Processing event_10.csv ...


                                                  


▶ Processing event_11.csv ...


                                                  


▶ Processing event_12.csv ...


                                                  


▶ Processing event_13.csv ...


                                                  


▶ Processing event_14.csv ...


                                                  


▶ Processing event_15.csv ...


                                                  


▶ Processing event_16.csv ...


                                                  


▶ Processing event_17.csv ...


                                                 


Final shapes:
X_lstm: (4184, 1500, 9)
y: (4184,)

Saved:
- ./processed_features/X_lstm.npy
- ./processed_features/y.npy
- ./processed_features/transformer_features.parquet


In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# ================= 1. 데이터 로드 =================
X_lstm = np.load("./processed_features/X_lstm.npy")   # (N,1500,9)
y = np.load("./processed_features/y.npy")

tf_df = pd.read_parquet("./processed_features/transformer_features.parquet")
drop_cols = ["event_file", "segment_index", "y"]
feature_cols = [c for c in tf_df.columns if c not in drop_cols]

X_tf_raw = tf_df[feature_cols].values.astype("float32")
y_tf = tf_df["y"].values.astype("float32")

assert len(X_lstm) == len(X_tf_raw) == len(y)
assert np.allclose(y, y_tf)

indices = np.arange(len(y))
train_idx, valid_idx = train_test_split(indices, test_size=0.1, random_state=42)

X_lstm_train = X_lstm[train_idx]
X_lstm_valid = X_lstm[valid_idx]
y_train = y[train_idx]
y_valid = y[valid_idx]

X_tf_train_raw = X_tf_raw[train_idx]
X_tf_valid_raw = X_tf_raw[valid_idx]

print("Train size:", len(train_idx), " Valid size:", len(valid_idx))

# --------- NaN/inf 처리 + 스케일링 (Transformer용) ---------
X_tf_train_raw = np.nan_to_num(X_tf_train_raw, nan=0.0, posinf=0.0, neginf=0.0)
X_tf_valid_raw = np.nan_to_num(X_tf_valid_raw, nan=0.0, posinf=0.0, neginf=0.0)

scaler = StandardScaler()
X_tf_train = scaler.fit_transform(X_tf_train_raw)
X_tf_valid = scaler.transform(X_tf_valid_raw)

X_tf_train = np.nan_to_num(X_tf_train, nan=0.0, posinf=0.0, neginf=0.0)
X_tf_valid = np.nan_to_num(X_tf_valid, nan=0.0, posinf=0.0, neginf=0.0)

# ================= 2. Dataset =================
class MultiModalDataset(Dataset):
    def __init__(self, X_lstm, X_tf, y):
        self.X_lstm = torch.tensor(X_lstm, dtype=torch.float32)
        self.X_tf = torch.tensor(X_tf, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).view(-1, 1)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X_lstm[idx], self.X_tf[idx], self.y[idx]

train_ds = MultiModalDataset(X_lstm_train, X_tf_train, y_train)
valid_ds = MultiModalDataset(X_lstm_valid, X_tf_valid, y_valid)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=128, shuffle=False)

# ================= 3. 모델 정의 =================
class LSTMModel(nn.Module):
    def __init__(self, input_size=9, hidden_size=128, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.1
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]
        out = self.fc(out)
        return out

class TabularTransformer(nn.Module):
    def __init__(self, num_features, d_model=128, nhead=4,
                 num_layers=2, dim_feedforward=256):
        super().__init__()
        self.input_proj = nn.Linear(1, d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
        self.fc = nn.Sequential(
            nn.Linear(d_model, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        # x: (B, F)
        x = x.view(x.size(0), -1, 1)   # (B,F,1)
        h = self.input_proj(x)         # (B,F,d_model)
        h = self.encoder(h)            # (B,F,d_model)
        h = h.mean(dim=1)              # (B,d_model)
        out = self.fc(h)               # (B,1)
        return out

lstm_model = LSTMModel().to(device)
tf_model = TabularTransformer(num_features=X_tf_train.shape[1]).to(device)

criterion = nn.L1Loss()
opt_lstm = torch.optim.Adam(lstm_model.parameters(), lr=1e-3)
opt_tf = torch.optim.Adam(tf_model.parameters(), lr=5e-4)

# ================= 4. LSTM 학습 =================
print("\n===== LSTM Training =====")
EPOCHS_LSTM = 12
for epoch in range(EPOCHS_LSTM):
    lstm_model.train()
    train_loss = 0.0
    for xb_lstm, xb_tf, yb in train_loader:
        xb_lstm = xb_lstm.to(device)
        yb = yb.to(device)

        pred = lstm_model(xb_lstm)
        loss = criterion(pred, yb)

        opt_lstm.zero_grad()
        loss.backward()
        opt_lstm.step()
        train_loss += loss.item()

    lstm_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for xb_lstm, xb_tf, yb in valid_loader:
            xb_lstm = xb_lstm.to(device)
            yb = yb.to(device)
            pred = lstm_model(xb_lstm)
            val_loss += criterion(pred, yb).item()

    print(f"[LSTM][{epoch+1}] Train={train_loss/len(train_loader):.4f} "
          f"Valid={val_loss/len(valid_loader):.4f}")

# ================= 5. Transformer 학습 =================
print("\n===== Transformer Training (clean) =====")
EPOCHS_TF = 15
for epoch in range(EPOCHS_TF):
    tf_model.train()
    train_loss = 0.0
    for xb_lstm, xb_tf, yb in train_loader:
        xb_tf = xb_tf.to(device)
        yb = yb.to(device)

        pred = tf_model(xb_tf)
        loss = criterion(pred, yb)

        opt_tf.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(tf_model.parameters(), max_norm=5.0)
        opt_tf.step()
        train_loss += loss.item()

    tf_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for xb_lstm, xb_tf, yb in valid_loader:
            xb_tf = xb_tf.to(device)
            yb = yb.to(device)
            pred = tf_model(xb_tf)
            val_loss += criterion(pred, yb).item()

    print(f"[TF][{epoch+1}] Train={train_loss/len(train_loader):.4f} "
          f"Valid={val_loss/len(valid_loader):.4f}")

# ================= 6. 앙상블 MAE =================
lstm_model.eval()
tf_model.eval()

y_true, p_lstm, p_tf, p_ens = [], [], [], []

with torch.no_grad():
    for xb_lstm, xb_tf, yb in valid_loader:
        xb_lstm = xb_lstm.to(device)
        xb_tf = xb_tf.to(device)
        yb = yb.to(device)

        out_lstm = lstm_model(xb_lstm)
        out_tf = tf_model(xb_tf)
        out_ens = 0.5 * out_lstm + 0.5 * out_tf

        y_true.append(yb.cpu().numpy())
        p_lstm.append(out_lstm.cpu().numpy())
        p_tf.append(out_tf.cpu().numpy())
        p_ens.append(out_ens.cpu().numpy())

y_true = np.concatenate(y_true).ravel()
p_lstm = np.concatenate(p_lstm).ravel()
p_tf = np.concatenate(p_tf).ravel()
p_ens = np.concatenate(p_ens).ravel()

print("\n===== Validation MAE =====")
print("LSTM MAE       :", mean_absolute_error(y_true, p_lstm))
print("Transformer MAE:", mean_absolute_error(y_true, p_tf))
print("Ensemble MAE   :", mean_absolute_error(y_true, p_ens))


Using device: cuda
Train size: 3765  Valid size: 419

===== LSTM Training =====
[LSTM][1] Train=3.5964 Valid=3.1531
[LSTM][2] Train=3.0331 Valid=3.1694
[LSTM][3] Train=3.0301 Valid=3.1691
[LSTM][4] Train=3.0433 Valid=3.1199
[LSTM][5] Train=2.9181 Valid=2.9034
[LSTM][6] Train=2.7660 Valid=2.7305
[LSTM][7] Train=2.6673 Valid=2.5717
[LSTM][8] Train=2.8188 Valid=3.1058
[LSTM][9] Train=3.0010 Valid=3.0855
[LSTM][10] Train=2.9332 Valid=2.9890
[LSTM][11] Train=2.8175 Valid=2.7736
[LSTM][12] Train=2.6890 Valid=2.5755

===== Transformer Training (clean) =====
[TF][1] Train=3.2953 Valid=3.1514
[TF][2] Train=3.0018 Valid=3.0329
[TF][3] Train=2.8036 Valid=2.7142
[TF][4] Train=2.6527 Valid=2.6737
[TF][5] Train=2.6456 Valid=2.6129
[TF][6] Train=2.5709 Valid=2.5603
[TF][7] Train=2.5015 Valid=2.8208
[TF][8] Train=2.4882 Valid=2.5298
[TF][9] Train=2.4666 Valid=2.4064
[TF][10] Train=2.4326 Valid=2.4123
[TF][11] Train=2.3687 Valid=2.3880
[TF][12] Train=2.3802 Valid=2.4532
[TF][13] Train=2.3845 Valid=2.35

In [3]:
import os
import json
import numpy as np
import pandas as pd

os.makedirs("./results", exist_ok=True)

# ============ 1) metrics.json 저장 ============
metrics = {
    "lstm_mae": float(mean_absolute_error(y_true, p_lstm)),
    "transformer_mae": float(mean_absolute_error(y_true, p_tf)),
    "ensemble_mae": float(mean_absolute_error(y_true, p_ens)),
    "n_valid": int(len(y_true))
}

with open("./results/metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

print("Saved: ./results/metrics.json")

# ============ 2) validation_predictions.csv 저장 ============
meta_valid = tf_df.iloc[valid_idx][["event_file", "segment_index", "y"]].reset_index(drop=True)

df_out = meta_valid.copy()
df_out["y_true"] = y_true
df_out["y_pred_lstm"] = p_lstm
df_out["y_pred_tf"] = p_tf
df_out["y_pred_ensemble"] = p_ens

df_out.to_csv("./results/validation_predictions.csv", index=False)
print("Saved: ./results/validation_predictions.csv")

# ============ 3) final_report_summary.txt 저장 ============
report_text = f"""
[기계학습기초 프로젝트 결과 요약]

Validation Set (n={len(y_true)})

- LSTM MAE       : {metrics['lstm_mae']:.4f}
- Transformer MAE: {metrics['transformer_mae']:.4f}
- Ensemble MAE   : {metrics['ensemble_mae']:.4f}

Ensemble(LSTM+TF)의 성능이 가장 우수함을 확인.
"""

with open("./results/final_report_summary.txt", "w", encoding="utf-8") as f:
    f.write(report_text)

print("Saved: ./results/final_report_summary.txt")
print("\n✅ Output 저장 완료. 이제 Save Version 눌러서 영구 저장하세요.")


Saved: ./results/metrics.json
Saved: ./results/validation_predictions.csv
Saved: ./results/final_report_summary.txt

✅ Output 저장 완료. 이제 Save Version 눌러서 영구 저장하세요.
