# Task 1: Arrhythmia Classification from ECG using 1D CNN

This notebook builds a complete ECG arrhythmia classifier using a 1D CNN. It downloads the Heartbeat dataset from Google Drive, preprocesses the signals, trains the model (on a GPU when one is available), and evaluates it with accuracy, precision, recall, F1-score, and a confusion matrix.

- Dataset: Google Drive (gdown): `https://drive.google.com/file/d/1xAs-CjlpuDqUT2EJUVR5cPuqTUdw2uQg/view?usp=sharing`
- Model: 1D CNN
- Framework: PyTorch
- Runtime: GPU (recommended)


In [None]:
# Install dependencies (Colab-safe)
import sys, subprocess, pkgutil

def pip_install(pkgs, import_names=None):
    """Install each requirement with pip unless its module can already be imported.

    Only importability is checked; the version constraint in the requirement
    string is not compared against the installed version.

    Args:
        pkgs: Iterable of pip requirement strings, e.g. ``'numpy>=1.24'``.
        import_names: Optional mapping from distribution name to importable
            module name, for packages where the two differ. It is merged over
            the built-in aliases.

    Raises:
        subprocess.CalledProcessError: If a ``pip install`` invocation fails.
    """
    import importlib
    import re

    # Distribution names whose importable module is named differently. Without
    # this, 'scikit-learn' never imports and pip runs on every execution.
    aliases = {'scikit-learn': 'sklearn'}
    if import_names:
        aliases.update(import_names)

    for p in pkgs:
        # Keep only the distribution name: drop version specifiers
        # (==, >=, <=, ~=, !=, <, >), extras ([...]) and environment markers (;).
        dist_name = re.split(r'[<>=!~\[;\s]', p.strip(), maxsplit=1)[0]
        # Hyphens are not valid in module names; underscores are the usual mapping.
        module_name = aliases.get(dist_name, dist_name.replace('-', '_'))
        try:
            importlib.import_module(module_name)
        except ImportError:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', p])

# Requirement strings handed to pip_install, one per line for easy editing.
REQUIRED_PACKAGES = [
    'gdown>=5.2.0',
    'numpy>=1.24',
    'pandas>=2.0',
    'scipy>=1.11',
    'matplotlib>=3.7',
    'seaborn>=0.13',
    'scikit-learn>=1.3',
    'torch>=2.2',
    'wfdb>=4.1.2',
]
pip_install(REQUIRED_PACKAGES)

import os, zipfile, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

# Report the runtime and pick the compute device: CUDA when present, CPU otherwise.
cuda_ok = torch.cuda.is_available()
print('Torch version:', torch.__version__)
print('CUDA available:', cuda_ok)
if cuda_ok:
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
DEVICE

In [None]:
# Download and extract dataset via gdown
import gdown, os, zipfile

GDRIVE_URL = 'https://drive.google.com/uc?id=1xAs-CjlpuDqUT2EJUVR5cPuqTUdw2uQg'
DATA_ROOT = 'ecg_data'
os.makedirs(DATA_ROOT, exist_ok=True)
zip_path = os.path.join(DATA_ROOT, 'heartbeat.zip')

# Download when the archive is missing OR is not a valid zip. is_zipfile returns
# False for a missing file, and also for a leftover partial/failed download,
# which a plain existence check would keep forever.
if not zipfile.is_zipfile(zip_path):
    gdown.download(GDRIVE_URL, zip_path, quiet=False)

# Fail with an actionable message instead of an opaque BadZipFile/FileNotFoundError.
if not zipfile.is_zipfile(zip_path):
    raise RuntimeError(
        f'Download did not produce a valid zip archive at {zip_path}; '
        'check the Google Drive link and its sharing permissions, then retry.'
    )

with zipfile.ZipFile(zip_path, 'r') as zf:
    zf.extractall(DATA_ROOT)

print('Extracted to:', os.listdir(DATA_ROOT))

## Data understanding and preprocessing

We load the ECG beat segments, pad or trim them to fixed-length windows, normalize each sample, and encode the class labels. The loader handles pre-segmented beats stored as `.csv` or `.npy` files (including one folder of `.npy` files per class). Segmentation from raw signal files is not implemented, so loading stops with an error if no such files are found.


In [None]:
# Utility: load signals depending on distributed format
# This block tries common formats: a) CSVs with beats and label column; b) npy arrays; c) folders per class
import glob

class_map = {}
X, y = [], []

# Format (a): CSV files whose columns are signal values followed by a label.
# NOTE(review): pd.read_csv treats the first row of each file as a header. If the
# CSVs are headerless, the first beat of every file is silently consumed as
# column names; confirm the file layout and pass header=None if that applies.
csv_files = glob.glob(os.path.join(DATA_ROOT, '**', '*.csv'), recursive=True)
if csv_files:
    import csv
    for f in csv_files:
        try:
            df = pd.read_csv(f)
            # Heuristic: last column is label, others are signal values
            if df.shape[1] > 1:
                sig = df.iloc[:, :-1].values.astype('float32')
                labels = df.iloc[:, -1].astype(str).values
                # One sample per row; rows and labels stay aligned by position.
                X.extend(sig)
                y.extend(labels)
        except Exception as e:
            print('Skip CSV', f, e)

# Format (b): a single .npy file holding a pickled dict with 'X' and 'y' entries.
if not X:
    npy_files = glob.glob(os.path.join(DATA_ROOT, '**', '*.npy'), recursive=True)
    for f in npy_files:
        try:
            # SECURITY: allow_pickle=True unpickles arbitrary objects, which can
            # execute code. Only use this on archives from a trusted source.
            arr = np.load(f, allow_pickle=True)
            # np.save stores a dict as a 0-d object array, so np.load never hands
            # back a dict directly; unwrap it before the isinstance check.
            if isinstance(arr, np.ndarray) and arr.dtype == object and arr.ndim == 0:
                arr = arr.item()
            if isinstance(arr, dict) and 'X' in arr and 'y' in arr:
                # Build both lists first so a failure cannot leave X and y mismatched.
                loaded_X = [np.asarray(x).astype('float32') for x in arr['X']]
                loaded_y = [str(v) for v in arr['y']]
                X, y = loaded_X, loaded_y
                break
        except Exception as e:
            print('Skip NPY', f, e)

# Format (c): one sub-folder per class, each containing .npy files that hold
# either a single beat (1-D) or several beats (2-D, one per row).
if not X:
    subdirs = [d for d in glob.glob(os.path.join(DATA_ROOT, '*')) if os.path.isdir(d)]
    for d in subdirs:
        class_name = os.path.basename(d)
        npys = glob.glob(os.path.join(d, '*.npy'))
        for f in npys:
            try:
                arr = np.load(f)
                if arr.ndim == 1:
                    X.append(arr.astype('float32'))
                    y.append(class_name)
                elif arr.ndim == 2:
                    X.extend(row.astype('float32') for row in arr)
                    y.extend([class_name] * arr.shape[0])
            except Exception as e:
                print('Skip NPY', f, e)

print('Loaded samples:', len(X))
# Explicit raise rather than assert: asserts are stripped under `python -O`.
if len(X) != len(y) or len(X) == 0:
    raise RuntimeError('No samples found; verify dataset structure after extraction.')

# Fixed window length every beat is padded or trimmed to
TARGET_LEN = 187  # common for heartbeat segmentation datasets


def pad_or_trim(signal, target_len=TARGET_LEN):
    """Return `signal` as a float32 vector of exactly `target_len` samples.

    Multi-dimensional input is flattened first. Longer signals are cut to
    their first `target_len` samples; shorter ones are right-padded with zeros.
    """
    samples = np.asarray(signal, dtype='float32')
    if samples.ndim > 1:
        samples = samples.reshape(-1)
    n_samples = len(samples)
    if n_samples < target_len:
        padded = np.zeros(target_len, dtype='float32')
        padded[:n_samples] = samples
        return padded
    return samples[:target_len]

# Bring every beat to the common length and stack them into one 2-D array
X = np.stack(list(map(pad_or_trim, X)))

# Per-sample standardization: subtract each row's mean and divide by its
# standard deviation (the small epsilon guards against a near-zero std)
means = np.mean(X, axis=1, keepdims=True)
stds = np.std(X, axis=1, keepdims=True) + 1e-6
X = (X - means) / stds

# Map the sorted distinct label strings to consecutive integer ids
unique_labels = sorted(set(y))
label_to_id = dict(zip(unique_labels, range(len(unique_labels))))
id_to_label = dict(enumerate(unique_labels))
y_ids = np.array([label_to_id[label] for label in y], dtype='int64')

print('Classes:', unique_labels)
print('X shape:', X.shape, 'y shape:', y_ids.shape)

In [None]:
# Hold out 20% of the samples for validation; stratifying on the label ids keeps
# the class proportions the same in both parts, and the fixed seed makes the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_ids,
    test_size=0.2,
    random_state=42,
    stratify=y_ids,
)

class ECGDataset(Dataset):
    """Dataset over NumPy arrays of signals (`X`) and integer labels (`y`).

    The arrays are converted once to float32 / int64 tensors; each item is a
    `(signal, label)` pair where the signal has a leading channel axis added.
    """

    def __init__(self, X, y):
        self.X = torch.from_numpy(X).to(torch.float32)
        self.y = torch.from_numpy(y).to(torch.int64)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        # Conv1d expects (C, L); insert a single channel in front of the signal
        signal = torch.unsqueeze(self.X[idx], 0)
        label = self.y[idx]
        return signal, label

# Wrap both splits; only the training loader shuffles.
train_ds = ECGDataset(X_train, y_train)
val_ds = ECGDataset(X_val, y_val)

# Options shared by both loaders
loader_kwargs = dict(num_workers=2, pin_memory=True)
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, batch_size=256, shuffle=False, **loader_kwargs)

len(train_ds), len(val_ds)

In [None]:
# Define 1D CNN model
class ECGCNN(nn.Module):
    """1D CNN classifier for single-channel signals.

    Three Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d(2) stages (32, 64 and 128
    channels) are followed by global average pooling and a two-layer MLP head.
    Because the head pools to length 1, the network does not depend on a fixed
    input length.

    Args:
        num_classes: Number of output logits.

    Input: tensor of shape (batch, 1, length).
    Output: tensor of shape (batch, num_classes) holding unnormalized logits.
    """

    def __init__(self, num_classes: int):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=7, padding=3),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(32, 64, kernel_size=5, padding=2),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.MaxPool1d(2),
        )
        # No dummy forward pass here: the pooled feature length is not needed
        # (AdaptiveAvgPool1d collapses it), and running one in training mode
        # would update the BatchNorm running statistics before any real data
        # is seen.
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Return class logits for a batch of signals."""
        x = self.features(x)
        x = self.classifier(x)
        return x

# One output logit per distinct label
num_classes = len(unique_labels)
model = ECGCNN(num_classes)
model = model.to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Halve the learning rate once the monitored metric (higher is better) has not
# improved for more than 3 consecutive epochs
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=3,
)
model

In [None]:
# Training loop with validation metrics
from sklearn.metrics import f1_score

def evaluate(model, loader):
    """Run `model` over `loader` without gradients and score its predictions.

    Puts the model in eval mode (it is not switched back afterwards).

    Returns:
        Tuple `(accuracy, precision, recall, f1, y_true, y_pred)`, where
        precision/recall/F1 are support-weighted averages and the last two
        items are NumPy arrays of true and predicted class ids.
    """
    model.eval()
    pred_chunks = []
    true_chunks = []
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)
            # Predicted class = index of the largest logit
            batch_preds = model(inputs).argmax(dim=1)
            pred_chunks.append(batch_preds.cpu().numpy())
            true_chunks.append(targets.cpu().numpy())
    y_true = np.concatenate(true_chunks)
    y_pred = np.concatenate(pred_chunks)
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    return accuracy, precision, recall, f1, y_true, y_pred

EPOCHS = 20
best_f1 = 0.0
history = []
for epoch in range(1, EPOCHS + 1):
    # ---- one pass over the training data ----
    model.train()
    running_loss = 0.0
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(DEVICE)
        batch_y = batch_y.to(DEVICE)
        optimizer.zero_grad(set_to_none=True)
        loss = criterion(model(batch_x), batch_y)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; scale by batch size for a per-sample average
        running_loss += loss.item() * batch_x.size(0)
    avg_loss = running_loss / len(train_ds)

    # ---- validation; the LR scheduler is stepped on validation F1 ----
    acc, pr, rc, f1, y_true, y_pred = evaluate(model, val_loader)
    scheduler.step(f1)
    history.append({'epoch': epoch, 'loss': avg_loss, 'val_acc': acc, 'val_f1': f1})
    print(f"Epoch {epoch:02d} | loss {avg_loss:.4f} | val_acc {acc:.4f} | val_f1 {f1:.4f}")

    # Save a checkpoint whenever validation F1 reaches a new high
    improved = f1 > best_f1
    if improved:
        best_f1 = f1
        torch.save({'model_state': model.state_dict(), 'labels': id_to_label}, 'best_ecg_cnn.pt')

print('Best F1:', best_f1)

In [None]:
# Evaluation: confusion matrix and classification report

# Training saves the weights with the best validation F1 to this file, but the
# in-memory model holds the LAST epoch. Restore the saved weights (when the
# checkpoint exists) so the reported metrics describe the best model.
BEST_CKPT_PATH = 'best_ecg_cnn.pt'
if os.path.exists(BEST_CKPT_PATH):
    checkpoint = torch.load(BEST_CKPT_PATH, map_location=DEVICE)
    model.load_state_dict(checkpoint['model_state'])

acc, pr, rc, f1, y_true, y_pred = evaluate(model, val_loader)
print('Final Validation Metrics:')
print('Accuracy:', acc)
print('Precision:', pr)
print('Recall:', rc)
print('F1-score:', f1)

# Pass the full list of class ids explicitly. Without it, scikit-learn infers the
# classes from y_true/y_pred, so a class absent from both would shrink the matrix
# and misalign it with unique_labels (and make classification_report reject
# target_names for having the wrong length).
class_ids = list(range(len(unique_labels)))

cm = confusion_matrix(y_true, y_pred, labels=class_ids)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=unique_labels, yticklabels=unique_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

print('Classification Report:')
print(classification_report(y_true, y_pred, labels=class_ids, target_names=unique_labels, zero_division=0))

### Notes on modeling choices

- 1D CNN captures local morphological features of heartbeats efficiently.
- Per-sample normalization stabilizes training across varying amplitudes.
- Weighted-average metrics (per-class scores weighted by class support) are reported to account for class imbalance. Consider a class-weighted loss or focal loss if the classes are strongly imbalanced.
- You can experiment with CNN+LSTM by adding a recurrent block after convolution features for temporal context.
