# Task 1.4 – ECG - data reduction
Reduce the data with

1.4 Data Reduction

Step 1: Load the original raw training data

In [2]:
import pickle
from collections import Counter

# Load original split training data
with open("../data/split_data.pkl", "rb") as f:
    X_train_split, X_val_split, y_train_split, y_val_split = pickle.load(f)

print(f"Full training samples: {len(X_train_split)}")
print("Class distribution:", Counter(y_train_split))


Full training samples: 4943
Class distribution: Counter({0: 2910, 2: 1412, 1: 439, 3: 182})


STEP 2: Stratified Sampling (10%, 25%, 50%)

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

def stratified_sample(X, y, percent):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=1 - percent, random_state=42)
    for train_idx, _ in sss.split(X, y):
        X_sample = [X[i] for i in train_idx]
        y_sample = [y[i] for i in train_idx]
        return X_sample, y_sample

# Create reduced subsets
X_train_50, y_train_50 = stratified_sample(X_train_split, y_train_split, 0.5)
X_train_25, y_train_25 = stratified_sample(X_train_split, y_train_split, 0.25)
X_train_10, y_train_10 = stratified_sample(X_train_split, y_train_split, 0.10)

print(f"50% samples: {len(X_train_50)}")
print(f"25% samples: {len(X_train_25)}")
print(f"10% samples: {len(X_train_10)}")


50% samples: 2471
25% samples: 1235
10% samples: 494


STEP 3: Apply STFT to Each Reduced Subset

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class STFTLayer(nn.Module):
    def __init__(self, n_fft=64, hop_length=16):
        super().__init__()
        self.n_fft = n_fft
        self.hop_length = hop_length

    def forward(self, x):
        stft_tensors, max_t = [], 0

        # 1) compute STFTs and track longest time axis
        for signal in x:
            sig = torch.tensor(signal, dtype=torch.float32)
            m = torch.stft(
                sig,
                n_fft=self.n_fft,
                hop_length=self.hop_length,
                return_complex=True
            ).abs()
            stft_tensors.append(m)
            max_t = max(max_t, m.shape[1])

        # 2) round up length to nearest multiple of 4 (so pooling matches)
        if max_t % 4 != 0:
            max_t = ((max_t + 3) // 4) * 4   # e.g. 505→508→512

        # 3) right-pad every tensor to this common length
        padded = []
        for m in stft_tensors:
            pad = max_t - m.shape[1]
            padded.append(F.pad(m, (0, pad)))  # (left, right) on last dim

        return torch.stack(padded)



# Instantiate STFT layer
stft_layer = STFTLayer()

# Apply STFT to reduced subsets
X_train_50_stft = stft_layer(X_train_50)
X_train_25_stft = stft_layer(X_train_25)
X_train_10_stft = stft_layer(X_train_10)


print("50% shape:", X_train_50_stft.shape)
print("25% shape:", X_train_25_stft.shape)
print("10% shape:", X_train_10_stft.shape)

  return _VF.stft(  # type: ignore[attr-defined]


50% shape: torch.Size([2471, 33, 1144])
25% shape: torch.Size([1235, 33, 1140])
10% shape: torch.Size([494, 33, 1136])


STEP 4: Train Model on Each Reduced Subset

In [5]:
# 4.1. Apply STFT to the validation set (if not already done):

X_val_stft = stft_layer(X_val_split)

In [6]:
# 4.2. Import your DataLoader creator

import sys
sys.path.append("../1.1_dataset_exploration/src")
from dataset import create_spectrogram_dataloaders

In [7]:
# 4.3. Define a wrapper function to train and return accuracy

from model import ECGCNN

def train_on_subset(X_train_stft, y_train, X_val_stft, y_val, title=""):
    train_loader, val_loader = create_spectrogram_dataloaders(
        X_train_stft, y_train, X_val_stft, y_val, batch_size=32, augment=False)

    
    input_shape = (1, *X_train_stft.shape[1:])  # dynamically determine input shape
    # model = ECGCNN(input_shape=input_shape)
    
    model = ECGCNN()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(5):  # keep short for quick testing
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = loss_fn(model(xb), yb)
            loss.backward()
            optimizer.step()

    # Evaluate
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb).argmax(1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)

    acc = correct / total
    print(f"{title} - Val Accuracy: {acc:.4f}")
    return model, acc


In [12]:
model_50, acc_50 = train_on_subset(X_train_50_stft, y_train_50, X_val_stft, y_val_split, "50 %")
model_25, acc_25 = train_on_subset(X_train_25_stft, y_train_25, X_val_stft, y_val_split, "25 %")
model_10, acc_10 = train_on_subset(X_train_10_stft, y_train_10, X_val_stft, y_val_split, "10 %")


50 % - Val Accuracy: 0.5890


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x73216 and 72960x64)

STEP 5: Plot Accuracy vs Dataset Size

In [9]:
import matplotlib.pyplot as plt

# X-axis: dataset sizes
sizes = [10, 25, 50]
accuracies = [acc_50]

# Plot
plt.figure(figsize=(6, 4))
plt.plot(sizes, accuracies, marker='o')
plt.title("Validation Accuracy vs Reduced Dataset Size")
plt.xlabel("Training Dataset Size (%)")
plt.ylabel("Validation Accuracy")
plt.grid(True)
plt.xticks(sizes)
plt.ylim(0, 1)
plt.show()


NameError: name 'acc_50' is not defined

STEP 6: Generate reduced.csv from the 25% model

In [10]:
# Apply STFT to test data
X_test_stft = stft_layer(X_test_raw)
X_test_stft = X_test_stft.unsqueeze(1).to(device)

# Predict using 25% model
model_25.eval()
with torch.no_grad():
    preds = model_25(X_test_stft).argmax(dim=1).cpu().numpy()

# Save to reduced.csv
pd.DataFrame(preds, columns=["label"]).to_csv("reduced.csv", index=False)
print("✅ reduced.csv created successfully.")


NameError: name 'X_test_raw' is not defined