In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
import biosppy



### Load data

In [26]:
# Load data 

def load_data(train_path, test_path):
    """Read the train/test CSV files and return variable-length signals.

    Both files are read with pandas using the ``id`` column as index. In the
    training file the first remaining column is taken as the label and every
    other column as signal samples.

    Args:
    - train_path: Path of the training CSV.
    - test_path: Path of the test CSV.

    Returns:
    - Tuple ``(X_train, y_train, X_test)`` where the X values are the outputs
      of ``transform_data`` and ``y_train`` is the label column as an array.
    """
    train_df = pd.read_csv(train_path, index_col="id")
    test_df = pd.read_csv(test_path, index_col="id")

    labels = train_df.iloc[:, 0].values
    train_signals = train_df.iloc[:, 1:]

    return transform_data(train_signals), labels, transform_data(test_df)

def transform_data(df):
    """Convert each DataFrame row into a NaN-free float32 array.

    Args:
    - df: DataFrame with one signal per row; NaN entries are dropped per row.

    Returns:
    - 1-D object array of shape (len(df),) whose elements are 1-D float32
      arrays (possibly of different lengths).
    """
    rows = [row.dropna().to_numpy(dtype='float32') for _, row in df.iterrows()]

    # Fill a preallocated object array element by element: np.array(rows,
    # dtype=object) would silently build a 2-D array whenever all rows happen
    # to have the same length.
    signals = np.empty(len(rows), dtype=object)
    for idx, samples in enumerate(rows):
        signals[idx] = samples
    return signals

# Read both CSV files once and keep the raw, variable-length signals around.
X_train_raw, y_train_raw, X_test_raw = load_data("train.csv", "test.csv")

# Report how many signals / labels were loaded.
shape_report = [
    "X_train_raw shape: ", X_train_raw.shape,
    "\ny_train shape", y_train_raw.shape,
    "\nX_test shape", X_test_raw.shape,
]
print(*shape_report)

X_train_raw shape:  (5117,) 
y_train shape (5117,) 
X_test shape (3411,)


## Preprocess data

#### R-peak detection

In [45]:
from biosppy.signals import ecg

def _biosppy_rpeaks(signal, sampling_rate):
    """Default R-peak detector: run BioSPPy's ECG pipeline and return its 'rpeaks'."""
    return ecg.ecg(signal=signal, sampling_rate=sampling_rate, show=False)['rpeaks']


def _extract_centered_window(signal, center, window_size):
    """Return exactly `window_size` samples with signal[center] at index window_size // 2.

    Parts of the window that fall outside the signal are zero-padded on the
    side where they are missing, so the beat keeps its position in the window.
    """
    start = int(center) - window_size // 2
    stop = start + window_size
    pad_left = max(0, -start)
    pad_right = max(0, stop - len(signal))
    window = signal[max(0, start):min(len(signal), stop)]
    if pad_left or pad_right:
        window = np.pad(window, (pad_left, pad_right), 'constant')
    return window


def preprocess_ecg_with_rpeaks(ecg_signals, sampling_rate=300, window_size=300, rpeak_detector=None):
    """Cut every ECG signal into fixed-size windows centred on its R-peaks.

    Args:
    - ecg_signals: Iterable of 1D arrays, one per recording.
    - sampling_rate: Sampling rate passed to the R-peak detector.
    - window_size: Number of samples per extracted window.
    - rpeak_detector: Optional callable ``(signal, sampling_rate) -> peak indices``.
      Defaults to BioSPPy's ``ecg.ecg(...)['rpeaks']``.

    Returns:
    - segments: Array of shape (total_segments, window_size); all windows of
      all signals stacked in signal order.
    - segment_counts: List with the number of windows extracted per signal.

    Note: windows clipped at the start of a signal are padded on the left
    (previously all padding went to the right, which moved the R-peak away
    from the window centre).
    """
    detect = _biosppy_rpeaks if rpeak_detector is None else rpeak_detector

    processed_signals = []
    segment_counts = []  # number of windows contributed by each signal

    for signal in ecg_signals:
        rpeaks = detect(signal, sampling_rate)
        windows = [_extract_centered_window(signal, peak, window_size) for peak in rpeaks]
        processed_signals.extend(windows)
        segment_counts.append(len(windows))

    if not processed_signals:
        # Keep the result 2-D even when no beat was found anywhere.
        return np.empty((0, window_size)), segment_counts
    return np.array(processed_signals), segment_counts

# Segment both splits around their R-peaks.
X_train_segments, segment_counts_train = preprocess_ecg_with_rpeaks(X_train_raw)
X_test_segments, segment_counts_test = preprocess_ecg_with_rpeaks(X_test_raw)

# Cache the results on disk so the detection step need not be repeated.
for cache_path, cached_value in (
    ("X_train_segments.npy", X_train_segments),
    ("segment_counts_train.npy", segment_counts_train),
    ("X_test_segments.npy", X_test_segments),
    ("segment_counts_test.npy", segment_counts_test),
):
    np.save(cache_path, cached_value)



In [48]:
# Restore the cached segments and per-signal segment counts from disk.
(
    X_train_segments,
    segment_counts_train,
    X_test_segments,
    segment_counts_test,
) = (
    np.load(cache_path)
    for cache_path in (
        "X_train_segments.npy",
        "segment_counts_train.npy",
        "X_test_segments.npy",
        "segment_counts_test.npy",
    )
)

In [None]:
# # Aggregate segments into a single feature vector per signal

# def aggregate_segments(X_segments, segment_counts):
#     aggregated_features = []
#     start = 0
#     for count in segment_counts:
#         # Extract segments for the current signal
#         signal_segments = X_segments[start:start+count]
#         # Aggregate features (mean across all segments for simplicity)
#         aggregated_features.append(np.mean(signal_segments, axis=0))  # Example: Mean
#         start += count
#     return np.array(aggregated_features)

# # Aggregate training and test data
# X_train_aggregated = aggregate_segments(X_train_segments, segment_counts_train)
# X_test_aggregated = aggregate_segments(X_test_segments, segment_counts_test)

# # Add channel dimension for CNNs
# X_train_aggregated = X_train_aggregated[..., np.newaxis]
# X_test_aggregated = X_test_aggregated[..., np.newaxis]

# np.save("X_train_aggregated.npy", X_train_aggregated)
# np.save("X_test_aggregated.npy", X_test_aggregated)

# print("Aggregated X_train shape:", X_train_aggregated.shape)
# print("y_train shape:", y_train_raw.shape)  # Should match now
    

Aggregated X_train shape: (5117, 300, 1)
y_train shape: (5117,)


### Option 2

In [70]:
def get_segment_labels(segment_counts):
    """
    Generate, for every segment, the index of the signal it was cut from.

    Args:
    - segment_counts: List or array where each value represents the number of segments per signal.

    Returns:
    - segment_labels: Integer array of shape (total_segments,), containing the signal index for each segment.
      The dtype is integer even for empty input, so the result is always usable as an index array.

    Raises:
    - ValueError: If any count is negative (raised by ``np.repeat``).
    """
    counts = np.asarray(segment_counts, dtype=np.intp).reshape(-1)
    # Signal i is repeated counts[i] times; signals with zero segments are skipped.
    return np.repeat(np.arange(counts.size), counts)

# Map every segment back to the index of the signal it came from.
segment_labels_train = get_segment_labels(segment_counts_train)
segment_labels_test = get_segment_labels(segment_counts_test)

# Sanity check: each label array must have one entry per segment.
for caption, array in (
    ("X_train_segments shape:", X_train_segments),
    ("segment_labels_train shape:", segment_labels_train),
    ("X_test_segments shape:", X_test_segments),
    ("segment_labels_test shape:", segment_labels_test),
):
    print(caption, array.shape)

X_train_segments shape: (190030, 300)
segment_labels_train shape: (190030,)
X_test_segments shape: (126442, 300)
segment_labels_test shape: (126442,)


In [67]:
# Give every segment the class label of its parent signal (fancy indexing
# with the per-segment signal indices).
expanded_y_train = y_train_raw[segment_labels_train]

# Both arrays must have the same length; the labels are still 1-D class ids
# here (one-hot encoding happens later).
for caption, array in (
    ("Expanded X_train shape:", X_train_segments),  # e.g. (190030, 300)
    ("Expanded y_train shape:", expanded_y_train),  # e.g. (190030,)
):
    print(caption, array.shape)


Expanded X_train shape: (190030, 300)
Expanded y_train shape: (190030,)


In [None]:
# # Add channel dimension to X_train and X_test
# X_train_segments = X_train_segments[..., np.newaxis]  # Shape: (190030, 300, 1)
# X_test_segments = X_test_segments[..., np.newaxis]    # Shape: (num_test_segments, 300, 1)

# print("X_train_segments shape:", X_train_segments.shape)
# print("X_test_segments shape:", X_test_segments.shape)




X_train_segments shape: (190030, 300, 1, 1)
X_test_segments shape: (126442, 300, 1, 1)


In [83]:
# Give both segment arrays an explicit trailing channel axis:
# (n_segments, window_size, 1). Using reshape (instead of np.newaxis) makes
# the cell idempotent: re-running it cannot stack extra singleton axes, and
# train and test always end up with the same rank.
X_train_segments = X_train_segments.reshape(X_train_segments.shape[0], X_train_segments.shape[1], 1)
X_test_segments = X_test_segments.reshape(X_test_segments.shape[0], X_test_segments.shape[1], 1)

In [84]:
# Persist the segments together with their signal indices. Note that this
# overwrites the X_*_segments.npy files written by the R-peak cell.
for cache_path, cached_value in (
    ("X_train_segments.npy", X_train_segments),
    ("segment_labels_train.npy", segment_labels_train),
    ("X_test_segments.npy", X_test_segments),
    ("segment_labels_test.npy", segment_labels_test),
):
    np.save(cache_path, cached_value)

#### Padding and scaling

In [2]:
# Load the cached R-peak segments (X_train_segments.npy / X_test_segments.npy)
# as the model inputs.
X_train, X_test = (
    np.load(cache_path)
    for cache_path in ("X_train_segments.npy", "X_test_segments.npy")
)

NameError: name 'np' is not defined

In [86]:
# Show the shapes of the arrays that go into scaling and training.
shape_report = [
    "X_train shape: ", X_train.shape,
    "\nX_test shape", X_test.shape,
]
print(*shape_report)

X_train shape:  (190030, 300, 1) 
X_test shape (126442, 300, 1)


In [87]:
# Standardise the inputs. The scaler only accepts 2-D input, so every array is
# flattened to (n_samples * sequence_length, n_channels), scaled, and reshaped
# back to its original shape.
# NOTE(review): this cell rebinds X_train / X_test; re-running it scales
# already-scaled data, so reload the arrays first when iterating.

# Reshape to 2D: (n_samples * sequence_length, 1)
X_train_reshaped = X_train.reshape(-1, X_train.shape[-1])

# Apply StandardScaler (statistics are fitted on the training data only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_reshaped)

# Reshape back to 3D: (n_samples, sequence_length, 1)
X_train = X_train_scaled.reshape(X_train.shape)

# Repeat for X_test, reusing the statistics fitted on the training data
X_test_reshaped = X_test.reshape(-1, X_test.shape[-1])
X_test_scaled = scaler.transform(X_test_reshaped)
X_test = X_test_scaled.reshape(X_test.shape)


#### One-Hot encode labels

In [92]:
# Turn the per-segment class ids into one-hot rows for the softmax output.
encoder = OneHotEncoder(sparse_output=False)
label_column = expanded_y_train.reshape(-1, 1)  # the encoder expects a 2-D column
y_train = encoder.fit_transform(label_column)


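Break up signals into fixed-size windows (`split_into_windows` defaults to 500 samples; the cell below overrides this with `window_size = 1000`)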

In [1]:
def split_into_windows(signal, window_size=500, step_size=1000):
    """
    Split a signal into overlapping or non-overlapping windows.

    Only complete windows are returned; trailing samples that do not fill a
    whole window are dropped. With the default arguments the step (1000) is
    larger than the window (500), so the samples between consecutive windows
    are skipped.

    Args:
    - signal: Array-like whose first axis is time (1D, or 2D with channels last).
    - window_size: Length of each window (must be positive).
    - step_size: Step size between consecutive windows (must be positive).

    Returns:
    - windows: Array of shape (n_windows, window_size, ...). When the signal is
      shorter than one window the result has shape (0, window_size, ...), so it
      can still be concatenated with the windows of other signals.

    Raises:
    - ValueError: If window_size or step_size is not positive.
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive")

    signal = np.asarray(signal)
    starts = range(0, len(signal) - window_size + 1, step_size)
    windows = [signal[start:start + window_size] for start in starts]

    if not windows:
        # np.array([]) would be 1-D and break np.concatenate with real windows.
        return np.empty((0, window_size) + signal.shape[1:], dtype=signal.dtype)
    return np.array(windows)

# Apply to training and test data
# NOTE(review): X_train_combined, y_train_updated and X_test_combined are not
# defined anywhere in the visible notebook — this cell depends on state from
# cells that are not shown (or were deleted); confirm before running.
window_size = 1000
step_size = 1000

# Window every signal and stack all windows along the first axis.
X_train_split = np.concatenate([split_into_windows(signal, window_size, step_size) for signal in X_train_combined])
# NOTE(review): each label is repeated the same number of times (integer
# division of total windows by number of labels). That only lines up with
# X_train_split if every signal yields the same number of windows — verify.
y_train_split = np.repeat(y_train_updated, X_train_split.shape[0] // y_train_updated.shape[0], axis=0)

X_test_split = np.concatenate([split_into_windows(signal, window_size, step_size) for signal in X_test_combined])

print("New X_train_split shape:", X_train_split.shape)  # Example: (n_samples, window_size, 2)
print("New y_train_split shape:", y_train_split.shape)
print("New X_test_split shape:", X_test_split.shape)

NameError: name 'np' is not defined

#### Compute class weights


In [93]:
# Handle class imbalance
#
# The model is trained on segments, not on whole signals, so the balanced
# weights are derived from the per-segment labels (expanded_y_train). Using
# the per-signal labels would ignore that signals contribute different
# numbers of segments.
segment_classes = np.unique(expanded_y_train)

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=segment_classes,
    y=expanded_y_train
)

# Key the weights by position in the sorted class list.
# NOTE(review): presumably this matches the one-hot column order used for
# y_train — confirm before enabling class_weight in model.fit.
class_weights = dict(enumerate(class_weights))

## Define the CNN model

In [94]:
# Convolutional feature extractor: two blocks of (Conv1D + BatchNorm) x 2,
# each followed by max-pooling and dropout. Only the very first layer
# declares the input shape (300 samples, 1 channel).
feature_layers = [
    # First Convolution Block
    Conv1D(filters=128, kernel_size=10, activation='relu', input_shape=(300, 1)),
    BatchNormalization(),
    Conv1D(filters=128, kernel_size=10, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.2),

    # Second Convolution Block
    Conv1D(filters=128, kernel_size=10, activation='relu'),
    BatchNormalization(),
    Conv1D(filters=128, kernel_size=10, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.2),
]

# Classifier head: flatten, one hidden dense layer, softmax over 4 classes.
head_layers = [
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(4, activation='softmax'),
]

model = Sequential(feature_layers + head_layers)

# Compile with Adam and the loss matching the one-hot encoded targets.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_22 (Conv1D)          (None, 291, 128)          1408      
                                                                 
 batch_normalization_20 (Bat  (None, 291, 128)         512       
 chNormalization)                                                
                                                                 
 conv1d_23 (Conv1D)          (None, 282, 128)          163968    
                                                                 
 batch_normalization_21 (Bat  (None, 282, 128)         512       
 chNormalization)                                                
                                                                 
 max_pooling1d_12 (MaxPoolin  (None, 141, 128)         0         
 g1D)                                                            
                                                      

In [95]:
# Final check of the arrays handed to model.fit.
for caption, array in (("X_train shape:", X_train), ("y_train shape:", y_train)):
    print(caption, array.shape)


X_train shape: (190030, 300, 1)
y_train shape: (190030, 4)


## Model training

In [96]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Callbacks: stop once val_loss has not improved for 10 epochs (restoring the
# best weights) and halve the learning rate after 5 stagnant epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-6,
)

# Training configuration.
# NOTE(review): Keras documents validation_split as holding out the last
# fraction of the arrays before shuffling — confirm this is the intended
# validation set.
fit_options = dict(
    validation_split=0.2,
    epochs=100,
    batch_size=512,
    # class_weight=class_weights,  # Handle class imbalance
    callbacks=[early_stopping, lr_scheduler],
)

# Train the model and persist it for the submission step.
history = model.fit(X_train, y_train, **fit_options)

model.save("model.h5")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


In [112]:
from scipy.stats import mode

def aggregate_predictions_mode(segment_predictions, segment_labels, num_signals=None, default_class=0):
    """
    Aggregate segment predictions to produce signal-level predictions using mode.

    Votes are counted in one pass with a (num_signals, num_classes) table
    instead of scanning all segments once per signal. Ties are resolved in
    favour of the smallest class id.

    Args:
    - segment_predictions: Array of shape (num_segments,), predicted class labels
      (non-negative integers) for each segment.
    - segment_labels: Array of shape (num_segments,), signal indices for each segment.
    - num_signals: Optional total number of signals. Defaults to
      ``max(segment_labels) + 1``; pass it explicitly when trailing signals may
      have no segments at all.
    - default_class: Class assigned to signals that have no segments.

    Returns:
    - signal_predictions: Integer array of shape (num_signals,), aggregated
      predictions (most frequent class).

    Raises:
    - ValueError: If the two inputs differ in length or contain negative values.
    """
    segment_predictions = np.asarray(segment_predictions).reshape(-1).astype(np.intp)
    segment_labels = np.asarray(segment_labels).reshape(-1).astype(np.intp)

    if segment_predictions.shape != segment_labels.shape:
        raise ValueError("segment_predictions and segment_labels must have the same length")
    if segment_labels.size and (segment_labels.min() < 0 or segment_predictions.min() < 0):
        raise ValueError("signal indices and class labels must be non-negative")

    if num_signals is None:
        num_signals = int(segment_labels.max()) + 1 if segment_labels.size else 0
    num_classes = int(segment_predictions.max()) + 1 if segment_predictions.size else 1

    # votes[s, c] = number of segments of signal s that were predicted as class c.
    votes = np.zeros((num_signals, num_classes), dtype=np.int64)
    np.add.at(votes, (segment_labels, segment_predictions), 1)

    # argmax returns the first maximum, i.e. the smallest class id on ties.
    signal_predictions = votes.argmax(axis=1)
    signal_predictions[votes.sum(axis=1) == 0] = default_class
    return signal_predictions

# Step 0: Predict per-segment class probabilities.
# Use X_test (the scaled copy of the test segments) so the inputs get the
# same standardisation as the training data; X_test_segments holds the
# unscaled values.
segment_predictions = model.predict(X_test)

# Step 1: Predict segment-level classes
segment_class_predictions = np.argmax(segment_predictions, axis=1)  # Convert probabilities to class labels

# Step 2: Aggregate to signal-level predictions
signal_predictions = aggregate_predictions_mode(segment_class_predictions, segment_labels_test)

# Check results
print("Signal-level predictions shape:", signal_predictions.shape)  # Should match the number of signals





Signal-level predictions shape: (3411,)


In [115]:
# Pair every test signal (numbered 0..n-1 in file order) with its prediction.
# NOTE(review): assumes the ids in test.csv are exactly 0..n-1 — confirm.
test_ids = np.arange(X_test_raw.shape[0])
submission_data = np.vstack((test_ids, signal_predictions)).T
# Save as a CSV file
np.savetxt(
    "submission.csv",
    submission_data,
    fmt="%d",
    delimiter=",",
    header="id,y",
    comments="",
)

In [107]:
# Inspect the shape of the signal-level predictions.
print(np.shape(signal_predictions))

(3411, 2)


## Create submission


In [None]:
def create_submission(model, X_test, filename="submission.csv", segment_labels=None):
    """Predict on X_test and write an ``id,y`` CSV file.

    Args:
    - model: Fitted model whose ``predict`` returns per-class probabilities.
    - X_test: Model inputs, one row per sample (or per segment).
    - filename: Path of the CSV file to write.
    - segment_labels: Optional array with the signal index of every row of
      X_test. When given, the per-segment predictions are reduced to one
      prediction per signal (majority vote) so the file contains one row per
      signal instead of one row per segment.

    Side effects:
    - Writes ``filename`` and prints a confirmation message.

    NOTE(review): ids are written as 0..n-1 in input order — confirm this
    matches the ids expected for the test set.
    """
    # Get predictions as probabilities
    prob_preds = model.predict(X_test)

    # Convert probabilities to class labels
    class_preds = np.argmax(prob_preds, axis=1)

    # Collapse segment-level predictions to signal level when requested.
    if segment_labels is not None:
        class_preds = aggregate_predictions_mode(class_preds, segment_labels)
    class_preds = np.asarray(class_preds).reshape(-1)

    # Create an array with IDs and corresponding predictions
    submission_data = np.vstack((np.arange(class_preds.shape[0]), class_preds)).T

    # Save as a CSV file
    np.savetxt(filename, submission_data, delimiter=",", header="id,y", comments="", fmt="%d")
    print(f"Submission file saved as {filename}")


# Create submission
# X_test holds R-peak segments, so pass the segment -> signal mapping;
# otherwise the file would contain one row per segment rather than per signal.
model = tf.keras.models.load_model("model.h5")
create_submission(model, X_test, segment_labels=segment_labels_test)

Submission file saved as submission.csv


In [4]:
# Report which GPUs TensorFlow can see.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPUs detected.")
else:
    print(f"GPUs detected: {len(gpus)}")
    for device in gpus:
        print(device)

GPUs detected: 1
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
