In [2]:
import os, math, gc, json, random, glob, re
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

In [3]:
columns =  [
    "acc_chest_x","acc_chest_y","acc_chest_z",
    "ecg_1","ecg_2",
    "acc_ankle_x","acc_ankle_y","acc_ankle_z",
    "gyro_ankle_x","gyro_ankle_y","gyro_ankle_z",
    "mag_ankle_x","mag_ankle_y","mag_ankle_z",
    "acc_arm_x","acc_arm_y","acc_arm_z",
    "gyro_arm_x","gyro_arm_y","gyro_arm_z",
    "mag_arm_x","mag_arm_y","mag_arm_z",
    "label"
]

In [4]:
from google.colab import drive
drive.mount('/content/drive')

data_dir = "/content/drive/MyDrive/Colab Notebooks/MHEALTHDATASET"

Mounted at /content/drive


In [5]:
paths = sorted(glob.glob(os.path.join(data_dir, "*.log")))

print("find FILE :", len(paths))

find FILE : 10


In [6]:
df = []
for i in paths:
    df_t = pd.read_csv(i, sep=r"\s+", header=None, names=columns)

    #subject 번호 추출
    num = re.search(r"subject(\d+)", i, re.IGNORECASE)
    df_t["subject"] = int(num.group(1)) if num else None

    df.append(df_t)


df = pd.concat(df, ignore_index=True)

print(df.shape)
print(df["subject"].unique())
print(df.head())

(1215745, 25)
[ 1 10  2  3  4  5  6  7  8  9]
   acc_chest_x  acc_chest_y  acc_chest_z     ecg_1     ecg_2  acc_ankle_x  \
0      -9.8184     0.009971      0.29563  0.004186  0.004186       2.1849   
1      -9.8489     0.524040      0.37348  0.004186  0.016745       2.3876   
2      -9.6602     0.181850      0.43742  0.016745  0.037677       2.4086   
3      -9.6507     0.214220      0.24033  0.079540  0.117220       2.1814   
4      -9.7030     0.303890      0.31156  0.221870  0.205130       2.4173   

   acc_ankle_y  acc_ankle_z  gyro_ankle_x  gyro_ankle_y  ...  acc_arm_y  \
0      -9.6967      0.63077      0.103900      -0.84053  ...    -4.5781   
1      -9.5080      0.68389      0.085343      -0.83865  ...    -4.3198   
2      -9.5674      0.68113      0.085343      -0.83865  ...    -4.2772   
3      -9.4301      0.55031      0.085343      -0.83865  ...    -4.3163   
4      -9.3889      0.71098      0.085343      -0.83865  ...    -4.1459   

   acc_arm_z  gyro_arm_x  gyro_arm_y  gy

In [7]:
df.isnull().sum()

Unnamed: 0,0
acc_chest_x,0
acc_chest_y,0
acc_chest_z,0
ecg_1,0
ecg_2,0
acc_ankle_x,0
acc_ankle_y,0
acc_ankle_z,0
gyro_ankle_x,0
gyro_ankle_y,0


In [8]:
feature_columns = [c for c in df.columns if c not in ["label","subject"]]

X = df[feature_columns].values.astype("float32")
y = df["label"].values.astype("int64")

mu = X.mean(axis=0, keepdims=True)
std = X.std(axis=0, keepdims=True) + 1e-8
X_norm = (X - mu) / std

print(X[:5, :5])
print(X.mean(axis=0)[:5])
print(X.std(axis=0)[:5])

[[-9.8184e+00  9.9710e-03  2.9563e-01  4.1863e-03  4.1863e-03]
 [-9.8489e+00  5.2404e-01  3.7348e-01  4.1863e-03  1.6745e-02]
 [-9.6602e+00  1.8185e-01  4.3742e-01  1.6745e-02  3.7677e-02]
 [-9.6507e+00  2.1422e-01  2.4033e-01  7.9540e-02  1.1722e-01]
 [-9.7030e+00  3.0389e-01  3.1156e-01  2.2187e-01  2.0513e-01]]
[-8.5223637e+00 -2.1400130e-01 -1.0559497e+00 -5.1224027e-03
 -4.5064338e-03]
[4.0752907 2.1389537 3.5741613 0.7465967 0.7270393]


In [9]:
# 0->1 사이로 센서값 정규화
print(X_norm[:5, :5])
print(X_norm.mean(axis=0)[:5])
print(X_norm.std(axis=0)[:5])

[[-0.31802315  0.10471115  0.37815297  0.01246818  0.01195635]
 [-0.32550713  0.3450478   0.3999343   0.01246818  0.0292301 ]
 [-0.27920374  0.18506773  0.41782382  0.02928944  0.05802085]
 [-0.27687252  0.20020129  0.3626808   0.11339777  0.16742758]
 [-0.28970605  0.24212366  0.38260996  0.3040362   0.28834265]]
[ 6.2092198e-07 -2.0081565e-08  2.8917453e-08  4.4681481e-09
 -1.3555056e-09]
[1.0000001 0.9999999 1.        1.        0.9999998]


In [10]:
#슬라이딩 윈도우
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler

Fs=50
WINDOW_SIZE = Fs*4
STRIDE = Fs*2

def get_frames(X, y, WINDOW_SIZE, STRIDE):

    frames = []
    labels = []

    for start in range(0, len(X) - WINDOW_SIZE+1, STRIDE):
      end = start + WINDOW_SIZE

      sensor_x = X[start:end]
      sensor_y = y[start:end]
      main_label = Counter(sensor_y).most_common(1)[0][0] # 그 구간의 대표 레이블

      sensor_X = sensor_x.T # sensor(win,채널수) -> .T means: transform sensor(채널, 시간)

      frames.append(sensor_X)
      labels.append(main_label)
    return np.stack(frames), np.array(labels)

frames, labels = get_frames(X,y, WINDOW_SIZE, STRIDE) # 슬라이딩 윈도우 한 결과 frames와 labels에넣기
#frames는 (윈도우개수, 채널수, 길이) [모델에 들어갈 입력데이터] , labels는 윈도우에 붙여진 대표 라벨[모델이 맞춰야되는 정답]

print(frames.shape)
print(labels.shape)


(12156, 23, 200)
(12156,)


In [11]:
df.head()

Unnamed: 0,acc_chest_x,acc_chest_y,acc_chest_z,ecg_1,ecg_2,acc_ankle_x,acc_ankle_y,acc_ankle_z,gyro_ankle_x,gyro_ankle_y,...,acc_arm_y,acc_arm_z,gyro_arm_x,gyro_arm_y,gyro_arm_z,mag_arm_x,mag_arm_y,mag_arm_z,label,subject
0,-9.8184,0.009971,0.29563,0.004186,0.004186,2.1849,-9.6967,0.63077,0.1039,-0.84053,...,-4.5781,0.18776,-0.44902,-1.0103,0.034483,-2.35,-1.6102,-0.030899,0,1
1,-9.8489,0.52404,0.37348,0.004186,0.016745,2.3876,-9.508,0.68389,0.085343,-0.83865,...,-4.3198,0.023595,-0.44902,-1.0103,0.034483,-2.1632,-0.88254,0.32657,0,1
2,-9.6602,0.18185,0.43742,0.016745,0.037677,2.4086,-9.5674,0.68113,0.085343,-0.83865,...,-4.2772,0.27572,-0.44902,-1.0103,0.034483,-1.6175,-0.16562,-0.030693,0,1
3,-9.6507,0.21422,0.24033,0.07954,0.11722,2.1814,-9.4301,0.55031,0.085343,-0.83865,...,-4.3163,0.36752,-0.45686,-1.0082,0.025862,-1.0771,0.006945,-0.38262,0,1
4,-9.703,0.30389,0.31156,0.22187,0.20513,2.4173,-9.3889,0.71098,0.085343,-0.83865,...,-4.1459,0.40729,-0.45686,-1.0082,0.025862,-0.53684,0.1759,-1.0955,0,1


In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

class WindowDataset(Dataset):
  def __init__(self, X_window, y_label): #init-> 처음 한번만실행 + 데이터 받아오기
    self.X = torch.tensor(X_window, dtype=torch.float32)
    self.y = torch.tensor(y_label, dtype=torch.long)

  def __len__(self): #샘플 개수 세는 함수
    return len(self.y)

  def __getitem__(self, idx): #윈도우와 라벨 꺼내기 정의
    return self.X[idx], self.y[idx]



Full_Dataset = WindowDataset(frames,labels) #Full_Dataset -> (윈도우 데이터 + 라벨값 들어감)

Full_Dataloader = DataLoader(Full_Dataset, batch_size=64, shuffle=True) #batch 사이즈 64니까 -> 한번에 윈도우 64개씩 뽑아서 줌, shuffle은 매 학습 마다 데이터 꺼내는 순서를 섞어주는거.
# 그러면 이제 학습 루프에서 for xb, yb, in Full_Dataloarder 하면 자동으로 for문 돌면서 xb,yb에 (배치데이터, 배치라벨) 쭉쭉 넣어줌


x_sample, y_sample = next(iter(Full_Dataloader))
print(x_sample.shape) #(배치,센서채널,윈도우크기)
print(y_sample.shape)


torch.Size([64, 23, 200])
torch.Size([64])


In [13]:
from sklearn.model_selection import train_test_split

subjects = np.sort(df["subject"].unique())
print("사람", subjects)

사람 [ 1  2  3  4  5  6  7  8  9 10]


In [14]:
random_num = np.random.default_rng(seed=42)#난수 생성기 (어렵게 생각하지마셈)
random_num.shuffle(subjects)

#앞부터 8명 train넣고 뒤에서 2명 test 넣음
train = subjects[:8]
test = subjects[8:]

train_subj, val_subj = train_test_split(
    train,
    test_size=1,
    random_state=42
)

print("Train subjects:", train_subj)
print("Val subjects:", val_subj)
print("Test subjects:", test)

Train subjects: [ 3  6 10  1  4  8  5]
Val subjects: [7]
Test subjects: [2 9]


In [15]:
train_df = df[df["subject"].isin(train_subj)]
val_df   = df[df["subject"].isin(val_subj)]
test_df  = df[df["subject"].isin(test)]

In [16]:
print("Train shape:", train_df.shape)
print("Val shape:", val_df.shape)
print("Test shape:", test_df.shape)

Train shape: (845568, 25)
Val shape: (104448, 25)
Test shape: (265729, 25)


In [17]:
import torch.nn as nn

class CNN1D(nn.Module):
    def __init__(self, in_ch=23, n_classes=12):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(in_ch, 64, kernel_size=7, padding=3),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(64, 128, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool1d(2),
        )
        self.fc = nn.Linear(128, n_classes)

    def forward(self, x):
        h = self.conv(x)
        h = h.mean(dim=-1)
        return self.fc(h)

In [19]:
import numpy as np

if labels.min() != 0:
    print("Shifting labels to 0-based for CrossEntropyLoss.")
    labels = labels - labels.min()
    # Dataset/Dataloader 다시 만들어주기 (라벨이 바뀌었으니까)
    Full_Dataset = WindowDataset(frames, labels)
    Full_Dataloader = DataLoader(Full_Dataset, batch_size=64, shuffle=True)

In [20]:
import torch
import torch.nn as nn

device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
in_ch     = int(frames.shape[1])          # 채널 수 (보통 23)
n_classes = int(labels.max() + 1)         # 클래스 개수 (보통 12)

model = CNN1D(in_ch=in_ch, n_classes=n_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

def acc_from_logits(logits, y):
    return (logits.argmax(dim=1) == y).float().mean().item()

In [23]:
EPOCHS = 20
for epoch in range(1, EPOCHS+1):
    model.train()
    tot_loss = tot_acc = tot_n = 0

    for xb, yb in Full_Dataloader:
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        out  = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()

        bs = yb.size(0)
        tot_loss += loss.item() * bs
        tot_acc  += acc_from_logits(out, yb) * bs
        tot_n    += bs

    print(f"[{epoch:02d}/{EPOCHS}] loss={tot_loss/tot_n:.4f} acc={tot_acc/tot_n:.4f}")

[01/20] loss=0.1415 acc=0.9423
[02/20] loss=0.1426 acc=0.9420
[03/20] loss=0.1375 acc=0.9423
[04/20] loss=0.1419 acc=0.9411
[05/20] loss=0.1300 acc=0.9475
[06/20] loss=0.1280 acc=0.9461
[07/20] loss=0.1239 acc=0.9504
[08/20] loss=0.1237 acc=0.9506
[09/20] loss=0.1207 acc=0.9527
[10/20] loss=0.1169 acc=0.9521
[11/20] loss=0.1178 acc=0.9512
[12/20] loss=0.1115 acc=0.9553
[13/20] loss=0.1106 acc=0.9559
[14/20] loss=0.1072 acc=0.9571
[15/20] loss=0.1073 acc=0.9571
[16/20] loss=0.1016 acc=0.9581
[17/20] loss=0.1057 acc=0.9582
[18/20] loss=0.0950 acc=0.9614
[19/20] loss=0.0948 acc=0.9628
[20/20] loss=0.0973 acc=0.9608


In [24]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
import numpy as np
import torch

@torch.no_grad()
def get_preds(model, loader, device):
    model.eval()
    all_y, all_pred = [], []
    for xb, yb in loader:
        xb = xb.to(device)
        logits = model(xb)
        pred = logits.argmax(dim=1).cpu().numpy()
        all_pred.append(pred)
        all_y.append(yb.numpy())
    return np.concatenate(all_y), np.concatenate(all_pred)

y_true, y_pred = get_preds(model, Full_Dataloader, device)

# F1S
print("F1 (macro)   :", f1_score(y_true, y_pred, average="macro"))
print("F1 (weighted):", f1_score(y_true, y_pred, average="weighted"))

#CM
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix (counts):")
print(cm)

# CM 정규화
cm_norm = cm / (cm.sum(axis=1, keepdims=True) + 1e-12)
print("Confusion Matrix (row-normalized):")
print(np.round(cm_norm, 3))

print("\nClassification Report:")
print(classification_report(y_true, y_pred, digits=4))

F1 (macro)   : 0.9472421642173775
F1 (weighted): 0.96577588401592
Confusion Matrix (counts):
[[8310   66   88   77   31    8   27   43   10    9   31   14    4]
 [  16  293    0    0    0    0    0    0    0    0    0    0    0]
 [   2    0  304    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0  309    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0  306    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0  310    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0  282    0    0    0    0    0    0]
 [   1    0    0    0    0    0    0  293    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0  294    0    0    0    0]
 [   1    0    0    0    0    0    0    0    0  307    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0  309    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0  308    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0  103]]
Confusion Matrix (row-