In [65]:
# General Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [66]:
# Read the train and test CSVs for the 'aortaP' (a_*) and 'brachP' (b_*) files.
def _read_without_saved_index(csv_path):
    """Load ``csv_path`` and discard its 'Unnamed: 0' column."""
    return pd.read_csv(csv_path).drop('Unnamed: 0', axis=1)


a_train = _read_without_saved_index('aortaP_train_data.csv')
b_train = _read_without_saved_index('brachP_train_data.csv')

a_test = _read_without_saved_index('aortaP_test_data.csv')
b_test = _read_without_saved_index('brachP_test_data.csv')

# The test frames receive a constant placeholder 'target' column so that they
# can be pushed through the same code paths as the training frames.
for _test_frame in (a_test, b_test):
    _test_frame['target'] = 0

In [67]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Function to smooth data using a moving average
def smooth_data(df, window_size=5):
    """
    Apply a trailing moving average to smooth each row in a DataFrame.

    Each output value is the mean of the current value and up to
    ``window_size - 1`` values to its left in the same row. Because
    ``min_periods=1`` is used, the first values of a row are averaged over
    the shorter window that is available instead of becoming NaN.

    Parameters:
    - df: The DataFrame to smooth (one series per row, numeric columns)
    - window_size: Size of the moving window

    Returns:
    - A smoothed DataFrame with the same index and columns as ``df``
    """
    # Rolling windows run down the rows of a frame, so transpose, smooth every
    # series in one vectorized call, and transpose back. This replaces a
    # per-row Python lambda and yields the same values.
    smoothed_df = df.T.rolling(window=window_size, min_periods=1).mean().T
    return smoothed_df

# Fill in the NaNs
def _fill_missing_features(df):
    """
    Fill NaNs in every column of ``df`` except 'target'.

    Each row is linearly interpolated along its columns and any NaNs that
    remain afterwards are back-filled. The 'target' column is excluded so the
    label value can neither be used as an interpolation anchor for the
    neighbouring feature columns nor be modified itself.
    """
    filled = df.copy()
    feature_cols = filled.columns.drop('target')
    filled[feature_cols] = (
        filled[feature_cols]
        .interpolate(method='linear', axis=1)
        .bfill(axis=1)
    )
    return filled


def _scale_train_and_test(train_features, test_features):
    """
    Min-max scale ``train_features`` and apply the SAME scaling to ``test_features``.

    The scaler is fitted on the training features only; the test features are
    transformed with the training minimum/maximum, so test values may fall
    outside [0, 1]. Plain arrays are passed to the scaler so that only the
    number of columns (not their labels) has to agree between the two frames.

    Returns:
    - (train_scaled, test_scaled, fitted_scaler)
    """
    fitted_scaler = MinMaxScaler()
    train_scaled = pd.DataFrame(
        fitted_scaler.fit_transform(train_features.to_numpy()),
        columns=train_features.columns,
        index=train_features.index,
    )
    test_scaled = pd.DataFrame(
        fitted_scaler.transform(test_features.to_numpy()),
        columns=test_features.columns,
        index=test_features.index,
    )
    return train_scaled, test_scaled, fitted_scaler


# Interpolate all of the rows for a and b, then backward fill any remaining NaNs
a_train = _fill_missing_features(a_train)
b_train = _fill_missing_features(b_train)

a_test = _fill_missing_features(a_test)
b_test = _fill_missing_features(b_test)

# Apply smoothing
a_train_smoothed = smooth_data(a_train.drop(columns=['target']), window_size=3)
b_train_smoothed = smooth_data(b_train.drop(columns=['target']), window_size=3)

a_test_smoothed = smooth_data(a_test.drop(columns=['target']), window_size=3)
b_test_smoothed = smooth_data(b_test.drop(columns=['target']), window_size=3)

# Add the target column back after smoothing
a_train_smoothed['target'] = a_train['target']
b_train_smoothed['target'] = b_train['target']

a_test_smoothed['target'] = a_test['target']
b_test_smoothed['target'] = b_test['target']

# Normalize only the columns except target. One scaler per signal, fitted on
# the training split and reused for the matching test split.
a_train_scaled, a_test_scaled, a_scaler = _scale_train_and_test(
    a_train_smoothed.drop(columns=['target']),
    a_test_smoothed.drop(columns=['target']),
)

b_train_scaled, b_test_scaled, b_scaler = _scale_train_and_test(
    b_train_smoothed.drop(columns=['target']),
    b_test_smoothed.drop(columns=['target']),
)

# Add the target column back to the DataFrame (only on the b side, so the
# combined frame below carries exactly one 'target' column, placed last)
b_train_scaled['target'] = b_train['target']

b_test_scaled['target'] = b_test['target']

# Make a combined dataframe: a features, then b features, then target
c_train = pd.concat([a_train_scaled, b_train_scaled], axis=1)
c_test = pd.concat([a_test_scaled, b_test_scaled], axis=1)


In [68]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
# Partition the combined training frame.
# Step 1: stratified 90/10 split into a model-development set and a holdout set,
# each re-indexed from zero.
dev_set, holdout_set = (
    part.reset_index(drop=True)
    for part in train_test_split(
        c_train, test_size=0.1, random_state=42, stratify=c_train['target']
    )
)

# Step 2: stratified 80/10/10 train/val/test split of the development set.
# First carve off 20%, then cut that 20% in half.
train_set, temp_set = train_test_split(
    dev_set, test_size=0.2, random_state=42, stratify=dev_set['target']
)
val_set, test_set = train_test_split(
    temp_set, test_size=0.5, random_state=42, stratify=temp_set['target']
)

# Re-index the three partitions from zero
train_set, val_set, test_set = (
    part.reset_index(drop=True) for part in (train_set, val_set, test_set)
)

# Step 3: stratified 5-fold cross-validation over train_set
# (each fold trains on 4/5 of the rows and tests on the remaining 1/5).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Materialise every fold as a (train_fold, test_fold) pair of DataFrames
folds = []
for fold_idx, (train_index, test_index) in enumerate(skf.split(train_set, train_set['target'])):
    train_fold, test_fold = (
        train_set.iloc[rows].reset_index(drop=True)
        for rows in (train_index, test_index)
    )
    folds.append((train_fold, test_fold))
    print(f"Fold {fold_idx + 1}: Train size = {len(train_fold)}, Test size = {len(test_fold)}")

Fold 1: Train size = 2015, Test size = 504
Fold 2: Train size = 2015, Test size = 504
Fold 3: Train size = 2015, Test size = 504
Fold 4: Train size = 2015, Test size = 504
Fold 5: Train size = 2016, Test size = 503


In [69]:
import torch
from torch.utils.data import Dataset, DataLoader

# Create dataset
class ABDataset(Dataset):
    """
    Two-channel dataset built from a combined A/B DataFrame.

    The frame is split at ``shape[1] // 2``: the left part is channel A, the
    right part is channel B plus a 'target' column. Each sample is returned as
    a ``(2, n_features)`` float32 tensor together with a cumulative
    ("first target + 1 entries are 1") label vector of length ``num_classes``.
    """

    def __init__(self, c_train, num_classes=6):
        """
        Args:
            c_train (pd.DataFrame): Time series data for A and B. The right
                half must contain a column named 'target'.
            num_classes (int): Length of the label vector returned per sample.

        Raises:
            ValueError: If channel A and channel B (without 'target') do not
                have the same shape, since they could not be stacked.
        """
        self.c_train = c_train
        self.num_classes = num_classes

        half = c_train.shape[1] // 2
        self.a_train = c_train.iloc[:, :half]
        self.b_train = c_train.iloc[:, half:]

        # Convert to arrays once, so __getitem__ does cheap array indexing
        # instead of a pandas row lookup for every single sample.
        a_values = self.a_train.to_numpy(dtype=np.float32)
        b_values = self.b_train.drop(columns=['target']).to_numpy(dtype=np.float32)
        if a_values.shape != b_values.shape:
            raise ValueError(
                f"Channel A has shape {a_values.shape} but channel B has shape "
                f"{b_values.shape}; both channels must match to be stacked."
            )

        # Shape (n_samples, 2, n_features): channel 0 = A, channel 1 = B
        self._signals = np.stack([a_values, b_values], axis=1)
        # The target column lives in the B half of the combined frame
        self._targets = self.b_train['target'].to_numpy().astype(int)

    def __len__(self):
        """
        Return the number of samples (rows) in the dataset.
        """
        return len(self.a_train)

    def __getitem__(self, idx):
        """
        Get a single sample (two-channel signal and label vector).

        Args:
            idx (int): Index for retrieving a sample.

        Returns:
            tuple: ``(signal, label)`` where ``signal`` is a float32 tensor of
            shape ``(2, n_features)`` and ``label`` is a float32 tensor of
            shape ``(num_classes,)`` whose first ``target + 1`` entries are 1.
        """
        target = self._targets[idx]

        # Cumulative encoding: target t -> t + 1 leading ones
        label = np.zeros(self.num_classes)
        label[:(target + 1)] = 1

        # torch.tensor copies, so callers cannot mutate the cached arrays
        combined_tensor = torch.tensor(self._signals[idx], dtype=torch.float32)
        target_tensor = torch.tensor(label, dtype=torch.float32)

        return combined_tensor, target_tensor


In [70]:
import torch
import torch.nn as nn
import torch.optim as optim

class ABCNN(nn.Module):
    """
    1-D CNN with three conv -> batch-norm -> ReLU -> average-pool -> dropout
    stages followed by a single linear layer that emits ``num_classes`` logits.

    No activation is applied to the output; pair it with a loss that expects
    raw logits.
    """

    def __init__(self, input_channels=2, num_classes=6, input_length=336):
        """
        Args:
            input_channels (int): Number of channels in the input signal.
            num_classes (int): Number of output logits.
            input_length (int): Number of time steps per input signal. The
                default of 336 yields the 256 * 28 features the linear layer
                was originally hard-coded for.

        Raises:
            ValueError: If ``input_length`` is too short to survive the three
                pooling stages.
        """
        super().__init__()

        # First convolutional layer (length-preserving conv, then pooling)
        self.conv1 = nn.Conv1d(input_channels, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.dropout1 = nn.Dropout(0.2)
        self.pool1 = nn.AvgPool1d(kernel_size=2, stride=3)

        # Second convolutional layer
        self.conv2 = nn.Conv1d(64, 128, kernel_size=5, padding=2)
        self.bn2 = nn.BatchNorm1d(128)
        self.dropout2 = nn.Dropout(0.3)
        self.pool2 = nn.AvgPool1d(kernel_size=2, stride=2)

        # Third convolutional layer
        self.conv3 = nn.Conv1d(128, 256, kernel_size=7, padding=3)
        self.bn3 = nn.BatchNorm1d(256)
        self.dropout3 = nn.Dropout(0.4)
        self.pool3 = nn.AvgPool1d(kernel_size=2, stride=2)

        # The convolutions keep the length unchanged (padding = (kernel - 1) / 2),
        # so only the three pooling layers shrink the sequence.
        pooled_length = input_length
        for kernel_size, stride in ((2, 3), (2, 2), (2, 2)):
            pooled_length = self._pooled_length(pooled_length, kernel_size, stride)
        if pooled_length < 1:
            raise ValueError(
                f"input_length={input_length} is too short for three pooling stages"
            )

        self.fc = nn.Linear(256 * pooled_length, num_classes)

    @staticmethod
    def _pooled_length(length, kernel_size, stride):
        """Output length of an unpadded, floor-mode 1-D pooling layer."""
        return (length - kernel_size) // stride + 1

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Batch of shape (batch, input_channels, input_length).

        Returns:
            torch.Tensor: Raw logits of shape (batch, num_classes).
        """
        # Convolutional layers
        x = torch.relu(self.bn1(self.conv1(x)))
        x = self.dropout1(self.pool1(x))

        x = torch.relu(self.bn2(self.conv2(x)))
        x = self.dropout2(self.pool2(x))

        x = torch.relu(self.bn3(self.conv3(x)))
        x = self.dropout3(self.pool3(x))

        # Flatten everything except the batch dimension for the linear layer
        x = torch.flatten(x, start_dim=1)

        # Fully connected layer (logits, no activation)
        x = self.fc(x)

        return x

In [71]:
# Wrap every split in an ABDataset
train_dataset, val_dataset, test_dataset = (
    ABDataset(frame) for frame in (train_set, val_set, test_set)
)
dev_dataset, holdout_dataset, big_test = (
    ABDataset(frame) for frame in (dev_set, holdout_set, c_test)
)


def _batched(dataset, shuffle):
    """Build a DataLoader with batches of 64 samples."""
    return DataLoader(dataset, batch_size=64, shuffle=shuffle)


# Only the loaders that are trained on are shuffled; evaluation loaders keep row order
train_loader = _batched(train_dataset, shuffle=True)
val_loader = _batched(val_dataset, shuffle=False)
test_loader = _batched(test_dataset, shuffle=False)

dev_loader = _batched(dev_dataset, shuffle=True)
holdout_loader = _batched(holdout_dataset, shuffle=False)

big_test_loader = _batched(big_test, shuffle=False)


# One (train_loader, test_loader) pair per cross-validation fold
fold_loaders = []

for fold_idx, (train_fold, test_fold) in enumerate(folds):
    # Datasets for this fold
    train_fold_dataset, test_fold_dataset = ABDataset(train_fold), ABDataset(test_fold)

    # Batched loaders for this fold
    train_fold_loader = _batched(train_fold_dataset, shuffle=True)
    test_fold_loader = _batched(test_fold_dataset, shuffle=False)

    fold_loaders.append((train_fold_loader, test_fold_loader))
    print(f"DataLoaders for Fold {fold_idx + 1} created: Train Loader size = {len(train_fold_loader)}, Test Loader size = {len(test_fold_loader)}")

DataLoaders for Fold 1 created: Train Loader size = 32, Test Loader size = 8
DataLoaders for Fold 2 created: Train Loader size = 32, Test Loader size = 8
DataLoaders for Fold 3 created: Train Loader size = 32, Test Loader size = 8
DataLoaders for Fold 4 created: Train Loader size = 32, Test Loader size = 8
DataLoaders for Fold 5 created: Train Loader size = 32, Test Loader size = 8


In [15]:
# CROSS VALIDATION

import torch
import torch.optim as optim
import torch.nn as nn
import pandas as pd
from torch.optim.lr_scheduler import StepLR
from torchsummary import summary

import copy

# Predictions / true labels collected from the last epoch of every fold
all_predictions = []
all_true_labels = []

best_model = None
# Loop through the 5 folds
for fold_idx, (train_fold_loader, val_fold_loader) in enumerate(fold_loaders):
    print(f"\nStarting Fold {fold_idx + 1}")
    # Reset per fold, so "best" below always means "best within this fold"
    best_val_loss = float('inf')

    # Fresh model, optimizer and scheduler for each fold
    model = ABCNN()
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total number of trainable parameters: {total_params}")
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-3)
    scheduler = StepLR(optimizer, step_size=7, gamma=0.5)
    criterion = torch.nn.BCEWithLogitsLoss()
    num_epochs = 45
    # Train and validate for the specified number of epochs
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        for c_batch, target_batch in train_fold_loader:
            optimizer.zero_grad()
            outputs = model(c_batch)
            loss = criterion(outputs, target_batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_fold_loader)
        print(f"Fold {fold_idx + 1}, Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}")

        # Validation phase
        model.eval()
        val_loss = 0.0
        correct_normal = 0
        total_samples = 0
        off_by_1_count = 0
        off_by_2_count = 0

        with torch.no_grad():
            for c_batch, target_batch in val_fold_loader:
                outputs = model(c_batch)
                loss = criterion(outputs, target_batch)
                val_loss += loss.item()

                # Threshold each logit independently at probability 0.5
                predicted = torch.sigmoid(outputs) > 0.5
                predicted_bits = predicted.int()
                target_bits = target_batch.int()

                # Save predictions and true labels (number of active outputs
                # per sample) for the last epoch only
                if epoch == num_epochs - 1:
                    all_predictions.extend(predicted_bits.sum(dim=1).cpu().tolist())
                    all_true_labels.extend(target_bits.sum(dim=1).cpu().tolist())

                # Track accuracy: exact match of all outputs, and the number
                # of samples whose outputs differ in exactly 1 or 2 positions
                mismatches = (predicted_bits - target_bits).abs().sum(dim=1)
                correct_normal += (mismatches == 0).sum().item()
                off_by_1_count += (mismatches == 1).sum().item()
                off_by_2_count += (mismatches == 2).sum().item()
                total_samples += target_batch.size(0)

        # Calculate validation metrics
        avg_val_loss = val_loss / len(val_fold_loader)
        normal_accuracy = correct_normal / total_samples
        print(f"Fold {fold_idx + 1}, Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}")
        print(f"Normal Accuracy: {normal_accuracy:.4f}")
        print(f"Off-by-1 Accuracy: {(off_by_1_count / total_samples):.4f}")
        print(f"Off-by-2 Accuracy: {(off_by_2_count / total_samples):.4f}")

        # Save the best model for this fold
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), f"best_model_fold_{fold_idx + 1}.pth")
            print(f"Saved best model for Fold {fold_idx + 1} with validation loss {best_val_loss:.4f}")
            # Snapshot the weights: keeping a bare reference to `model` would
            # be silently overwritten by every later training step.
            best_model = copy.deepcopy(model)
        # Step the learning rate scheduler
        scheduler.step()

# Save predictions and true labels for all folds
predictions_df = pd.DataFrame({
    "True Labels": all_true_labels,
    "Predictions": all_predictions
})
predictions_df.to_csv("crossval_predictions.csv", index=False)
print("Predictions saved to crossval_predictions.csv")


Starting Fold 1
Total number of trainable parameters: 315078


KeyboardInterrupt: 

In [None]:
# Final Train and Pred

import torch
import torch.optim as optim
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
import pandas as pd

# Initialize model
model = ABCNN()
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total number of trainable parameters: {total_params}")

# Define optimizer, scheduler, and loss function
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.001)
scheduler = StepLR(optimizer, step_size=20, gamma=0.3)
criterion = nn.BCEWithLogitsLoss()
num_epochs = 30

# Holdout predictions / labels belonging to the best (saved) checkpoint
all_predictions = []
all_true_labels = []
best_val_loss = float('inf')
best_model_path = "best_model.pth"

# Training and validation loop
for epoch in range(num_epochs):
    # Training phase
    model.train()
    running_loss = 0.0
    for c_batch, target_batch in dev_loader:
        optimizer.zero_grad()
        outputs = model(c_batch)
        loss = criterion(outputs, target_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    avg_train_loss = running_loss / len(dev_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}")

    # Validation phase on holdout set
    model.eval()
    val_loss = 0.0
    correct_normal = 0
    total_samples = 0
    off_by_1_count = 0
    off_by_2_count = 0
    # Collected per epoch so the saved CSV holds one row per holdout sample
    # instead of one row per sample per epoch.
    epoch_predictions = []
    epoch_true_labels = []

    with torch.no_grad():
        for c_batch, target_batch in holdout_loader:
            outputs = model(c_batch)
            loss = criterion(outputs, target_batch)
            val_loss += loss.item()

            # Threshold each logit independently at probability 0.5
            predicted = torch.sigmoid(outputs) > 0.5
            predicted_bits = predicted.int()
            target_bits = target_batch.int()

            # Number of active outputs per sample
            epoch_predictions.extend(predicted_bits.sum(dim=1).cpu().tolist())
            epoch_true_labels.extend(target_bits.sum(dim=1).cpu().tolist())

            # Track accuracy: exact match of all outputs, and the number of
            # samples whose outputs differ in exactly 1 or 2 positions
            mismatches = (predicted_bits - target_bits).abs().sum(dim=1)
            correct_normal += (mismatches == 0).sum().item()
            off_by_1_count += (mismatches == 1).sum().item()
            off_by_2_count += (mismatches == 2).sum().item()
            total_samples += target_batch.size(0)

    avg_val_loss = val_loss / len(holdout_loader)
    normal_accuracy = correct_normal / total_samples
    print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}")
    print(f"Normal Accuracy: {normal_accuracy:.4f}")
    print(f"Off-by-1 Accuracy: {(off_by_1_count / total_samples):.4f}")
    print(f"Off-by-2 Accuracy: {(off_by_2_count / total_samples):.4f}")

    # Save the best model, together with the holdout predictions it produced
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), best_model_path)
        print(f"Saved best model with validation loss {best_val_loss:.4f}")
        all_predictions = epoch_predictions
        all_true_labels = epoch_true_labels

    scheduler.step()

# Save predictions and true labels for holdout set
predictions_df = pd.DataFrame({
    "True Labels": all_true_labels,
    "Predictions": all_predictions
})
predictions_df.to_csv("train_test_predictions.csv", index=False)
print("Predictions saved to train_test_predictions.csv")


# Load the best model
print("\nLoading the best model for big test predictions...")
model = ABCNN()
model.load_state_dict(torch.load(best_model_path))
model.eval()

# Generate predictions for big test loader
print("\nGenerating predictions for big test set...")
big_test_predictions = []

with torch.no_grad():
    # The labels yielded by this loader are placeholders and are ignored
    for idx, (c_batch, target_batch) in enumerate(big_test_loader):
        try:
            # Run the model and make predictions
            outputs = model(c_batch)
            predicted = torch.sigmoid(outputs) > 0.5
            # Sum the binary output along the dimension (e.g., [1,1,1,0,0,0] -> 3)
            summed_predictions = predicted.int().sum(dim=1).cpu().numpy()
        except Exception as e:
            # The submission is keyed by row position, so silently skipping a
            # batch would shift every later prediction onto the wrong row.
            raise RuntimeError(f"Error processing batch {idx}: {e}") from e
        big_test_predictions.extend(summed_predictions.tolist())

if len(big_test_predictions) != len(big_test_loader.dataset):
    raise RuntimeError(
        f"Expected {len(big_test_loader.dataset)} predictions but produced "
        f"{len(big_test_predictions)}"
    )

# Save big test set predictions as a single number
big_test_df = pd.DataFrame(big_test_predictions, columns=["Prediction"])
big_test_df.to_csv("big_pred.csv", index=False)
print("Big test set predictions saved to big_pred.csv")

# Convert to JSON format. The count of active outputs is target + 1, so
# subtract 1; a sample with no active output would become -1, which is not a
# valid class, so it is clipped to class 0.
predicted_classes = (big_test_df['Prediction'].values - 1).clip(min=0)
json_data = pd.Series(predicted_classes).to_json(orient='index')

# Specify the file path to save the JSON data
file_path = 'Deep Breathe_output.json'

# Write the JSON data to the file
with open(file_path, 'w') as f:
    f.write(json_data)


Total number of trainable parameters: 315078
Epoch 1/30, Training Loss: 0.4178
Epoch 1/30, Validation Loss: 0.3800
Normal Accuracy: 0.3029
Off-by-1 Accuracy: 0.4400
Off-by-2 Accuracy: 0.2000
Saved best model with validation loss 0.3800
Epoch 2/30, Training Loss: 0.3074
Epoch 2/30, Validation Loss: 0.2858
Normal Accuracy: 0.3514
Off-by-1 Accuracy: 0.4971
Off-by-2 Accuracy: 0.1371
Saved best model with validation loss 0.2858
