Setting path of database

In [1]:
import os, ast, numpy as np, pandas as pd, wfdb
from scipy.signal import butter, filtfilt
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# === SỬA đường dẫn này cho đúng máy của bạn ===
PTBXL_ROOT = "../database/physionet.org/files/ptb-xl/1.0.1"  # nhớ sửa <user>
METADATA_CSV = os.path.join(PTBXL_ROOT, "ptbxl_database.csv")  # metadata
# 100 Hz dùng cột filename_lr, sẽ join với PTBXL_ROOT nên không cần WFDB_DIR riêng

assert os.path.exists(METADATA_CSV), "Sai đường dẫn METADATA_CSV"


PTB-XL load & lọc nhãn (Lead I, chỉ SR vs AFIB, cân bằng)

In [3]:
# 1.1 Đọc metadata
Y = pd.read_csv(METADATA_CSV)

# 1.2 Hàm lấy nhãn SR/AFIB từ cột scp_codes (kiểu chuỗi dict)
# PTB-XL dùng mã chẩn đoán: AFIB cho rung nhĩ; SR có thể được gán là 'SR', 'NSR' hoặc 'NORM' tuỳ phiên bản.
def label_from_scp_codes(scp_str):
    d = ast.literal_eval(scp_str) if isinstance(scp_str, str) else {}
    keys = set(d.keys())
    if "AFIB" in keys:
        return "AFIB"
    # các khả năng khác nhau cho sinus rhythm
    for k in ("SR", "NSR", "NORM", "SINUS"):
        if k in keys:
            return "SR"
    return None

Y["target"] = Y["scp_codes"].apply(label_from_scp_codes)
df = Y[Y["target"].isin(["SR", "AFIB"])].copy()

print("Tổng mẫu sau lọc label:", len(df), "— SR:", (df.target=="SR").sum(), "AFIB:", (df.target=="AFIB").sum())

# 1.3 Nạp sóng thô 100 Hz và lấy LEAD I
def read_lead_I_row(row):
    # filename_lr là đường dẫn tương đối tới bản ghi 100 Hz (không có đuôi .hea/.dat)
    rec_rel  = row["filename_lr"]                     # ví dụ: records100/00000/00001_lr
    rec_path = os.path.join(PTBXL_ROOT, rec_rel)

    # Ưu tiên rdrecord vì có .sig_name
    try:
        rec = wfdb.rdrecord(rec_path, physical=True)
        sig = rec.p_signal.astype(np.float32)         # (T, n_leads)
        lead_names = [s.upper() for s in rec.sig_name]
    except Exception:
        # Fallback: rdsamp trả (signals, fields_dict)
        sig, fields = wfdb.rdsamp(rec_path)
        sig = sig.astype(np.float32)
        lead_names = [s.upper() for s in fields.get("sig_name", [])]

    # Tìm chỉ số Lead I (dự phòng vài biến thể tên)
    idx = 0
    for cand in ("I", "LEAD I", "MLI", "MLII"):  # 'I' là chuẩn; các cand khác chỉ để phòng hờ
        if cand in lead_names:
            idx = lead_names.index(cand)
            break

    return sig[:, idx]

# Ví dụ: đọc thử 1 bản ghi
x_demo = read_lead_I_row(df.iloc[0])
print("Một record Lead I:", x_demo.shape)

# 1.4 (Sau này) khi đã có beats, ta sẽ cân bằng lớp bằng downsampling. Ở bước này mới lọc record theo nhãn.


Tổng mẫu sau lọc label: 19569 — SR: 18055 AFIB: 1514
Một record Lead I: (1000,)


Hàm xử lý tín hiệu & R-peak

In [5]:
FS = 100.0  # 100 Hz

def butter_highpass(data, fc=0.5, order=4, fs=FS):
    nyq = 0.5 * fs
    b, a = butter(order, fc/nyq, btype='high')
    return filtfilt(b, a, data)

def butter_lowpass(data, fc=41.0, order=3, fs=FS):
    nyq = 0.5 * fs
    b, a = butter(order, fc/nyq, btype='low')
    return filtfilt(b, a, data)

def preprocess_record(x):
    x = butter_highpass(x, 0.5, 4, FS)
    x = butter_lowpass(x, 41.0, 3, FS)
    return x

def detect_r_peaks(x, fs=FS, thresh_frac=0.5, refractory_ms=200):
    x = np.asarray(x)
    thr = thresh_frac * np.max(np.abs(x))
    refr = int(round(refractory_ms * fs / 1000.0))
    peaks, last = [], -refr
    for i in range(1, len(x)-1):
        if x[i] > thr and x[i] >= x[i-1] and x[i] >= x[i+1]:
            if i - last >= refr:
                peaks.append(i); last = i
    return np.array(peaks, dtype=int)

PRE, POST = 40, 60
def extract_beats(x, rpos, pre=PRE, post=POST):
    beats = []
    for r in rpos:
        a, b = r-pre, r+post
        if a >= 0 and b <= len(x):
            beats.append(x[a:b])
    return np.asarray(beats, dtype=np.float32)

def normalize_beats(X):
    Xn = np.empty_like(X)
    for i, b in enumerate(X):
        m, s = b.mean(), b.std() + 1e-8
        Xn[i] = (b - m) / s
    return Xn

Biến toàn bộ record → beats + label + (tuỳ chọn) patient_id

In [7]:
from tqdm.auto import tqdm

all_beats, all_labels, all_patients = [], [], []

for _, row in tqdm(df.iterrows(), total=len(df)):
    try:
        x = preprocess_record(read_lead_I_row(row))   # dùng hàm bạn đã có
        rpos = detect_r_peaks(x)
        beats = extract_beats(x, rpos)
        if len(beats) == 0:
            continue
        beats = normalize_beats(beats)

        all_beats.append(beats)
        y_val = 1 if row["target"] == "AFIB" else 0
        all_labels.append(np.full(len(beats), y_val, dtype=np.int64))
        pid = row.get("patient_id", row.get("ecg_id", -1))
        all_patients.append(np.full(len(beats), pid))
    except Exception as e:
        # nếu có record lỗi, bỏ qua để pipeline không dừng
        # print("skip", row.get("ecg_id", "?"), e)
        pass

X = np.vstack(all_beats)               # (N_beats, 100)
y = np.concatenate(all_labels)         # (N_beats,)
patients = np.concatenate(all_patients)

print("Beats tổng:", X.shape, " — SR:", (y==0).sum(), " AFIB:", (y==1).sum())
# thêm trục kênh cho Conv1D
X = X[..., None]                        # -> (N_beats, 100, 1)


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 19569/19569 [01:02<00:00, 315.22it/s]


Beats tổng: (217587, 100)  — SR: 198371  AFIB: 19216


Cân bằng lớp theo beat (downsample lớp nhiều hơn)

In [8]:
rng = np.random.default_rng(123)
idx0 = np.where(y==0)[0]
idx1 = np.where(y==1)[0]
m = min(len(idx0), len(idx1))
idx0 = rng.choice(idx0, size=m, replace=False)
idx1 = rng.choice(idx1, size=m, replace=False)
sel = np.sort(np.concatenate([idx0, idx1]))

Xb, yb, patients_b = X[sel], y[sel], patients[sel]
print("Sau cân bằng:", Xb.shape, "— mỗi lớp:", (yb==0).sum(), (yb==1).sum())


Sau cân bằng: (38432, 100, 1) — mỗi lớp: 19216 19216


Chia tập

In [9]:
def split_by_beats(X, y, val_ratio=0.2, seed=123):
    rng = np.random.default_rng(seed)
    N = len(X); perm = rng.permutation(N)
    nval = int(round(val_ratio*N))
    val_idx, tr_idx = perm[:nval], perm[nval:]
    return (X[tr_idx], y[tr_idx]), (X[val_idx], y[val_idx])

# (hoặc) theo patient:
def split_by_patient(X, y, patients, val_ratio=0.2, seed=123):
    rng = np.random.default_rng(seed)
    uniq = np.unique(patients); rng.shuffle(uniq)
    nval = int(round(val_ratio*len(uniq)))
    val_p = set(uniq[:nval]); tr_p = set(uniq[nval:])
    tr_idx = np.where(np.isin(patients, list(tr_p)))[0]
    val_idx = np.where(np.isin(patients, list(val_p)))[0]
    return (X[tr_idx], y[tr_idx]), (X[val_idx], y[val_idx])

# chọn 1 trong 2:
(Xtr, ytr), (Xva, yva) = split_by_beats(Xb, yb, 0.2, 123)
# (Xtr, ytr), (Xva, yva) = split_by_patient(Xb, yb, patients_b, 0.2, 123)

Xtr.shape, Xva.shape


((30746, 100, 1), (7686, 100, 1))

Train 1D-CNN

In [10]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def build_model(input_len=100, n_classes=2):
    inp = keras.Input(shape=(input_len, 1))
    x = layers.Conv1D(16, 5, padding='same', activation='relu')(inp)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(32, 5, padding='same', activation='relu')(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Flatten()(x)
    x = layers.Dense(32, activation='relu')(x)
    out = layers.Dense(n_classes, activation='softmax')(x)
    model = keras.Model(inp, out)
    model.compile(optimizer=keras.optimizers.Adam(1e-3),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

model = build_model()
model.summary()

hist = model.fit(Xtr, ytr, epochs=25, batch_size=128,
                 validation_data=(Xva, yva), verbose=2)


2025-08-23 04:15:29.202786: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-23 04:15:29.593976: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-23 04:15:29.594121: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-23 04:15:29.671443: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-23 04:15:29.834609: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-23 04:15:29.837922: I tensorflow/core/platform/cpu_feature_guard.cc:1

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100, 1)]          0         
                                                                 
 conv1d (Conv1D)             (None, 100, 16)           96        
                                                                 
 max_pooling1d (MaxPooling1  (None, 50, 16)            0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 50, 32)            2592      
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 25, 32)            0         
 g1D)                                                            
                                                                 
 dropout (Dropout)           (None, 25, 32)            0     

2025-08-23 04:15:32.892866: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-08-23 04:15:32.894679: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


241/241 - 2s - loss: 0.3311 - accuracy: 0.8585 - val_loss: 0.2615 - val_accuracy: 0.8968 - 2s/epoch - 8ms/step
Epoch 2/25
241/241 - 1s - loss: 0.2457 - accuracy: 0.9072 - val_loss: 0.2319 - val_accuracy: 0.9128 - 1s/epoch - 5ms/step
Epoch 3/25
241/241 - 1s - loss: 0.2295 - accuracy: 0.9130 - val_loss: 0.2218 - val_accuracy: 0.9183 - 1s/epoch - 5ms/step
Epoch 4/25
241/241 - 1s - loss: 0.2193 - accuracy: 0.9173 - val_loss: 0.2177 - val_accuracy: 0.9182 - 1s/epoch - 5ms/step
Epoch 5/25
241/241 - 1s - loss: 0.2117 - accuracy: 0.9215 - val_loss: 0.2140 - val_accuracy: 0.9188 - 1s/epoch - 5ms/step
Epoch 6/25
241/241 - 1s - loss: 0.2061 - accuracy: 0.9227 - val_loss: 0.2114 - val_accuracy: 0.9215 - 1s/epoch - 5ms/step
Epoch 7/25
241/241 - 1s - loss: 0.1997 - accuracy: 0.9265 - val_loss: 0.2068 - val_accuracy: 0.9228 - 1s/epoch - 5ms/step
Epoch 8/25
241/241 - 1s - loss: 0.1944 - accuracy: 0.9271 - val_loss: 0.2045 - val_accuracy: 0.9238 - 1s/epoch - 6ms/step
Epoch 9/25
241/241 - 1s - loss: 0.1

Đánh giá

In [11]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
yp = model.predict(Xva, verbose=0).argmax(axis=1)
print("Accuracy:", accuracy_score(yva, yp))
print("F1:", f1_score(yva, yp))
print(confusion_matrix(yva, yp))
print(classification_report(yva, yp, target_names=["SR","AFIB"]))


Accuracy: 0.9316939890710383
F1: 0.933552714846222
[[3473  302]
 [ 223 3688]]
              precision    recall  f1-score   support

          SR       0.94      0.92      0.93      3775
        AFIB       0.92      0.94      0.93      3911

    accuracy                           0.93      7686
   macro avg       0.93      0.93      0.93      7686
weighted avg       0.93      0.93      0.93      7686

