In [1]:
import os, math, gc, json, random, glob, re
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

columns =  [
    "acc_chest_x","acc_chest_y","acc_chest_z",
    "ecg_1","ecg_2",
    "acc_ankle_x","acc_ankle_y","acc_ankle_z",
    "gyro_ankle_x","gyro_ankle_y","gyro_ankle_z",
    "mag_ankle_x","mag_ankle_y","mag_ankle_z",
    "acc_arm_x","acc_arm_y","acc_arm_z",
    "gyro_arm_x","gyro_arm_y","gyro_arm_z",
    "mag_arm_x","mag_arm_y","mag_arm_z",
    "label"
]

In [2]:
from google.colab import drive
drive.mount('/content/drive')

data_dir = "/content/drive/MyDrive/Colab Notebooks/MHEALTHDATASET"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
paths = sorted(glob.glob(os.path.join(data_dir, "*.log")))
print("find FILE :", len(paths))

find FILE : 10


In [4]:
df = []
for i in paths:
    df_t = pd.read_csv(i, sep=r"\s+", header=None, names=columns)
    num = re.search(r"subject(\d+)", i, re.IGNORECASE)
    df_t["subject"] = int(num.group(1)) if num else None
    df.append(df_t)

df = pd.concat(df, ignore_index=True)

print(df.shape)
print(df["subject"].unique())
print(df.head())

print(df.isnull().sum())

(1215745, 25)
[ 1 10  2  3  4  5  6  7  8  9]
   acc_chest_x  acc_chest_y  acc_chest_z     ecg_1     ecg_2  acc_ankle_x  \
0      -9.8184     0.009971      0.29563  0.004186  0.004186       2.1849   
1      -9.8489     0.524040      0.37348  0.004186  0.016745       2.3876   
2      -9.6602     0.181850      0.43742  0.016745  0.037677       2.4086   
3      -9.6507     0.214220      0.24033  0.079540  0.117220       2.1814   
4      -9.7030     0.303890      0.31156  0.221870  0.205130       2.4173   

   acc_ankle_y  acc_ankle_z  gyro_ankle_x  gyro_ankle_y  ...  acc_arm_y  \
0      -9.6967      0.63077      0.103900      -0.84053  ...    -4.5781   
1      -9.5080      0.68389      0.085343      -0.83865  ...    -4.3198   
2      -9.5674      0.68113      0.085343      -0.83865  ...    -4.2772   
3      -9.4301      0.55031      0.085343      -0.83865  ...    -4.3163   
4      -9.3889      0.71098      0.085343      -0.83865  ...    -4.1459   

   acc_arm_z  gyro_arm_x  gyro_arm_y  gy

In [5]:
feature_columns = [c for c in df.columns if c not in ["label","subject"]]

X = df[feature_columns].values.astype("float32")
y = df["label"].values.astype("int64")

mu = X.mean(axis=0, keepdims=True)
std = X.std(axis=0, keepdims=True) + 1e-8
X_norm = (X - mu) / std

print(X[:5, :5])
print(X.mean(axis=0)[:5])
print(X.std(axis=0)[:5])

print(X_norm[:5, :5])
print(X_norm.mean(axis=0)[:5])
print(X_norm.std(axis=0)[:5])

[[-9.8184e+00  9.9710e-03  2.9563e-01  4.1863e-03  4.1863e-03]
 [-9.8489e+00  5.2404e-01  3.7348e-01  4.1863e-03  1.6745e-02]
 [-9.6602e+00  1.8185e-01  4.3742e-01  1.6745e-02  3.7677e-02]
 [-9.6507e+00  2.1422e-01  2.4033e-01  7.9540e-02  1.1722e-01]
 [-9.7030e+00  3.0389e-01  3.1156e-01  2.2187e-01  2.0513e-01]]
[-8.5223637e+00 -2.1400130e-01 -1.0559497e+00 -5.1224027e-03
 -4.5064338e-03]
[4.0752907 2.1389537 3.5741613 0.7465967 0.7270393]
[[-0.31802315  0.10471115  0.37815297  0.01246818  0.01195635]
 [-0.32550713  0.3450478   0.3999343   0.01246818  0.0292301 ]
 [-0.27920374  0.18506773  0.41782382  0.02928944  0.05802085]
 [-0.27687252  0.20020129  0.3626808   0.11339777  0.16742758]
 [-0.28970605  0.24212366  0.38260996  0.3040362   0.28834265]]
[ 6.2092198e-07 -2.0081565e-08  2.8917453e-08  4.4681481e-09
 -1.3555056e-09]
[1.0000001 0.9999999 1.        1.        0.9999998]


In [6]:
# 슬라이딩 윈도우
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler

Fs=50
WINDOW_SIZE = Fs*4
STRIDE = Fs*2

def get_frames(X, y, WINDOW_SIZE, STRIDE):
    frames = []
    labels = []
    for start in range(0, len(X) - WINDOW_SIZE+1, STRIDE):
        end = start + WINDOW_SIZE
        sensor_x = X[start:end]
        sensor_y = y[start:end]
        main_label = Counter(sensor_y).most_common(1)[0][0]
        sensor_X = sensor_x.T  # (채널, 시간)
        frames.append(sensor_X)
        labels.append(main_label)
    return np.stack(frames), np.array(labels)

frames, labels = get_frames(X, y, WINDOW_SIZE, STRIDE)
print(frames.shape)
print(labels.shape)

print(df.head())

(12156, 23, 200)
(12156,)
   acc_chest_x  acc_chest_y  acc_chest_z     ecg_1     ecg_2  acc_ankle_x  \
0      -9.8184     0.009971      0.29563  0.004186  0.004186       2.1849   
1      -9.8489     0.524040      0.37348  0.004186  0.016745       2.3876   
2      -9.6602     0.181850      0.43742  0.016745  0.037677       2.4086   
3      -9.6507     0.214220      0.24033  0.079540  0.117220       2.1814   
4      -9.7030     0.303890      0.31156  0.221870  0.205130       2.4173   

   acc_ankle_y  acc_ankle_z  gyro_ankle_x  gyro_ankle_y  ...  acc_arm_y  \
0      -9.6967      0.63077      0.103900      -0.84053  ...    -4.5781   
1      -9.5080      0.68389      0.085343      -0.83865  ...    -4.3198   
2      -9.5674      0.68113      0.085343      -0.83865  ...    -4.2772   
3      -9.4301      0.55031      0.085343      -0.83865  ...    -4.3163   
4      -9.3889      0.71098      0.085343      -0.83865  ...    -4.1459   

   acc_arm_z  gyro_arm_x  gyro_arm_y  gyro_arm_z  mag_arm_x 

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class WindowDataset(Dataset):
    def __init__(self, X_window, y_label):
        self.X = torch.tensor(X_window, dtype=torch.float32)
        self.y = torch.tensor(y_label, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [8]:
from torch.utils.data import WeightedRandomSampler, DataLoader

Full_Dataset = WindowDataset(frames, labels)

n_classes = int(labels.max() + 1)
class_count = np.bincount(labels, minlength=n_classes)
sample_weights = 1.0 / (class_count[labels] + 1e-12)
sample_weights = torch.from_numpy(sample_weights).float()

sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True
)


BATCH_SIZE   = 256
NUM_WORKERS  = max(2, min(8, os.cpu_count() or 2))
PREFETCH     = 4

Full_Dataloader = DataLoader(
    Full_Dataset,
    batch_size=BATCH_SIZE,
    sampler=sampler,
    drop_last=False,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=PREFETCH
)

print("BATCH_SIZE:", BATCH_SIZE, "| NUM_WORKERS:", NUM_WORKERS, "| PREFETCH:", PREFETCH)
print("steps/epoch ≈", int(len(Full_Dataset)/BATCH_SIZE))

BATCH_SIZE: 256 | NUM_WORKERS: 2 | PREFETCH: 4
steps/epoch ≈ 47


In [9]:
from sklearn.model_selection import train_test_split
subjects = np.sort(df["subject"].unique())

print("사람", subjects)

random_num = np.random.default_rng(seed=42)
random_num.shuffle(subjects)
train = subjects[:8]
test = subjects[8:]
train_subj, val_subj = train_test_split(train, test_size=1, random_state=42)

print("Train subjects:", train_subj)
print("Val subjects:", val_subj)
print("Test subjects:", test)

train_df = df[df["subject"].isin(train_subj)]
val_df   = df[df["subject"].isin(val_subj)]
test_df  = df[df["subject"].isin(test)]

print("Train shape:", train_df.shape)
print("Val shape:", val_df.shape)
print("Test shape:", test_df.shape)

사람 [ 1  2  3  4  5  6  7  8  9 10]
Train subjects: [ 3  6 10  1  4  8  5]
Val subjects: [7]
Test subjects: [2 9]
Train shape: (845568, 25)
Val shape: (104448, 25)
Test shape: (265729, 25)


In [10]:
import torch.nn as nn

class CNNLSTM1D(nn.Module):

   #Conv1d로 특징 추출 (C -> F 채널), MaxPool로 T 로 줄인 뒤
   #LSTM이 (B, T', F) 최종 분류

    def __init__(self, in_ch=23, n_classes=12,
                 cnn_channels=(64, 128),
                 lstm_hidden=128, lstm_layers=2,
                 bidir=True, dropout=0.2):
        super().__init__()
        c1, c2 = cnn_channels

        # conv(B, C, T) -> (B, F, T)
        self.conv = nn.Sequential(
            nn.Conv1d(in_ch, c1, kernel_size=7, padding=3),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(c1, c2, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool1d(2),
        )

        # LSTM = (B, T, F)
        self.lstm = nn.LSTM(
            input_size=c2,
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=bidir,
            dropout=dropout if lstm_layers > 1 else 0.0
        )
        feat_dim = lstm_hidden * (2 if bidir else 1)
        self.fc = nn.Linear(feat_dim, n_classes)

    def forward(self, x):
        h = self.conv(x)
        h = h.permute(0, 2, 1)
        out, _ = self.lstm(h)
        h_last = out[:, -1, :]
        return self.fc(h_last)


In [11]:
import torch

device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
in_ch     = int(frames.shape[1])
n_classes = int(labels.max() + 1)

model = CNNLSTM1D(in_ch=in_ch, n_classes=n_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

def acc_from_logits(logits, y):
    return (logits.argmax(dim=1) == y).float().mean().item()

In [12]:
scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)

EPOCHS = 20
for epoch in range(1, EPOCHS+1):
    model.train()
    tot_loss = tot_acc = tot_n = 0

    for xb, yb in Full_Dataloader:
        xb = xb.to(device, non_blocking=USE_CUDA)
        yb = yb.to(device, non_blocking=USE_CUDA)

        optimizer.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast(enabled=USE_AMP):
            out  = model(xb)
            loss = criterion(out, yb)

        if USE_AMP:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()

        bs = yb.size(0)
        tot_loss += loss.item() * bs
        tot_acc  += (out.argmax(dim=1) == yb).float().sum().item()
        tot_n    += bs

    print(f"[{epoch:02d}/{EPOCHS}] loss={tot_loss/tot_n:.4f} acc={tot_acc/tot_n:.4f}")



[01/20] loss=0.9758 acc=0.7608
[02/20] loss=0.1957 acc=0.9519
[03/20] loss=0.1484 acc=0.9642
[04/20] loss=0.1200 acc=0.9706
[05/20] loss=0.1084 acc=0.9748
[06/20] loss=0.1086 acc=0.9726
[07/20] loss=0.1042 acc=0.9753
[08/20] loss=0.0942 acc=0.9790
[09/20] loss=0.0905 acc=0.9789
[10/20] loss=0.0927 acc=0.9777
[11/20] loss=0.1231 acc=0.9675
[12/20] loss=0.0842 acc=0.9817
[13/20] loss=0.0811 acc=0.9817
[14/20] loss=0.0930 acc=0.9794
[15/20] loss=0.0826 acc=0.9803
[16/20] loss=0.0868 acc=0.9806
[17/20] loss=0.0758 acc=0.9832
[18/20] loss=0.0818 acc=0.9812
[19/20] loss=0.0830 acc=0.9815
[20/20] loss=0.0867 acc=0.9798


In [13]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report

@torch.no_grad()
def get_preds(model, loader, device):
    model.eval()
    all_y, all_pred = [], []
    for xb, yb in loader:
        xb = xb.to(device)
        logits = model(xb)
        pred = logits.argmax(dim=1).cpu().numpy()
        all_pred.append(pred)
        all_y.append(yb.numpy())
    return np.concatenate(all_y), np.concatenate(all_pred)

y_true, y_pred = get_preds(model, Full_Dataloader, device)

print("F1 (macro)   :", f1_score(y_true, y_pred, average="macro"))
print("F1 (weighted):", f1_score(y_true, y_pred, average="weighted"))

cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix (counts):")
print(cm)

cm_norm = cm / (cm.sum(axis=1, keepdims=True) + 1e-12)
print("Confusion Matrix (row-normalized):")
print(np.round(cm_norm, 3))

print("\nClassification Report:")
print(classification_report(y_true, y_pred, digits=4))

F1 (macro)   : 0.9830802218735639
F1 (weighted): 0.9832185617358106
Confusion Matrix (counts):
[[766  37  10  15  20  13  11  13  17   5   5   8  11]
 [  0 880   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0 992   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 924   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0 962   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0 981   0   0   0   0   0   0   0]
 [  1   0   0   0   0   0 902   0   0   0   0   0   0]
 [  3   0   0   0   0   0   0 933   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0 888   0   0   0   0]
 [ 25   0   0   0   0   0   0   0   3 899   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0 948   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0 988   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0 896]]
Confusion Matrix (row-normalized):
[[0.823 0.04  0.011 0.016 0.021 0.014 0.012 0.014 0.018 0.005 0.005 0.009
  0.012]
 [0.    1.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
