<a href="https://colab.research.google.com/github/daria-dot/ECG-mortality-risk-detection/blob/main/ecg_colab_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Phase 1: Data Loading & Environment Setup (Google Colab)\n\nThis notebook walks you through all the steps for Phase 1
#of our hackathon project.
#We will:  Set up the Google Colab environment and connect Google Drive.\n2.  Download the `exams.csv`
#label file from the `CODE-15%` dataset.
#  Load the labels into a `pandas` DataFrame.\n4.  Download the **first part** of the large HDF5
#ECG data to create a data-loading function.\n5.
# Inspect the HDF5 data to confirm its shape and format.
# Explain how to save this notebook to your new GitHub repo."

In [None]:
!pip install h5py wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=7acd6f268633b6870f4ee9e1279441ed24e9695aad08e6a61d4c43ff4e092d20
  Stored in directory: /root/.cache/pip/wheels/01/46/3b/e29ffbe4ebe614ff224bad40fc6a5773a67a163251585a13a9
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [3]:
# Install h5py, the library needed to read HDF5 files
!pip install h5py

# Import all the libraries we'll need
import os
import pandas as pd
import numpy as np
import h5py
import glob # This will help us find all our zip files
from google.colab import drive



In [6]:
# This is the command that connects to your Drive
# A pop-up will ask for your permission.
print("Connecting to Google Drive...")
drive.mount('/content/drive')
print("Google Drive connected!")

Connecting to Google Drive...
Mounted at /content/drive
Google Drive connected!


In [8]:
GDRIVE_PROJECT_PATH = '/content/drive/MyDrive/ecg-colab-project'
# ---
# These paths are built from the main path
LABELS_CSV_PATH = os.path.join(GDRIVE_PROJECT_PATH, 'exams.csv')
HDF5_DATA_DIR = os.path.join(GDRIVE_PROJECT_PATH, 'hdf5_data')

In [9]:

print("Loading labels from exams.csv...")
df_labels = pd.read_csv(LABELS_CSV_PATH)

# Display the first 5 rows to confirm it loaded
print("--- First 5 rows of exams.csv ---")
print(df_labels.head())

# Display info to confirm we see 'death' and 'timey'
print("\n--- DataFrame Info ---")
df_labels.info()

print(f"\nExams with mortality data: {df_labels['death'].notna().sum()}")

Loading labels from exams.csv...
--- First 5 rows of exams.csv ---
   exam_id  age  is_male  nn_predicted_age  1dAVb   RBBB   LBBB     SB     ST  \
0  1169160   38     True         40.160484  False  False  False  False  False   
1  2873686   73     True         67.059440  False  False  False  False  False   
2   168405   67     True         79.621740  False  False  False  False  False   
3   271011   41     True         69.750260  False  False  False  False  False   
4   384368   73     True         78.873460  False  False  False  False  False   

      AF  patient_id  death     timey  normal_ecg         trace_file  
0  False      523632  False  2.098628        True  exams_part13.hdf5  
1  False     1724173  False  6.657529       False  exams_part13.hdf5  
2   True       51421  False  4.282188       False  exams_part13.hdf5  
3  False     1737282  False  4.038353        True  exams_part13.hdf5  
4  False      331652  False  3.786298       False  exams_part13.hdf5  

--- DataFrame Info 

In [11]:
#currently only have 1 zip file but will keep loop for potential later developments

print(f"Looking for .zip files in: {HDF5_DATA_DIR}")
zip_files = glob.glob(os.path.join(HDF5_DATA_DIR, "*.zip"))
zip_files.sort()

if not zip_files:
    print("No .zip files found. Are they in the right folder?")
else:
    print(f"Found {len(zip_files)} zip files. Starting unzip process...")

# Loop over each zip file and unzip it
for file_path in zip_files:
    print(f"Unzipping {os.path.basename(file_path)}...")

    # -q (quiet), -o (overwrite), -d (destination)
    !unzip -q -o "{file_path}" -d "{HDF5_DATA_DIR}"

    print(f"Finished unzipping {os.path.basename(file_path)}")

print("All files unzipped!")

Looking for .zip files in: /content/drive/MyDrive/ecg-colab-project/hdf5_data
Found 1 zip files. Starting unzip process...
Unzipping exams_part0.zip...
Finished unzipping exams_part0.zip
All files unzipped!


In [12]:
# We'll just check the first HDF5 file
PART0_HDF5_PATH = os.path.join(HDF5_DATA_DIR, 'exams_part0.hdf5')

print(f"Opening HDF5 file at: {PART0_HDF5_PATH}")

try:
    with h5py.File(PART0_HDF5_PATH, 'r') as hf:
        # HDF5 files are like dictionaries. Let's see the keys.
        print("Keys in the HDF5 file:", list(hf.keys()))

        # Get the 'tracings' dataset (this is the ECG data)
        tracings = hf['tracings']
        # Get the 'exam_id' dataset (this links tracings to our CSV)
        exam_ids_hdf5 = hf['exam_id']

        print("\n--- Dataset Shapes ---")
        print(f"Tracings dataset shape: {tracings.shape} (Exams, Samples, Leads)")
        print(f"Exam IDs dataset shape: {exam_ids_hdf5.shape}")

        # --- Load one sample ECG ---
        print("\n--- Loading One Sample ECG ---")
        sample_exam_id = exam_ids_hdf5[0]
        sample_ecg_tracing = tracings[0] # Load the first ECG

        print(f"Loaded ECG for exam_id: {sample_exam_id}")
        print(f"ECG data shape: {sample_ecg_tracing.shape}")

        # Verify it matches the expected shape (4096 samples, 12 leads)
        assert sample_ecg_tracing.shape == (4096, 12)

        print("\n--- PHASE 1 COMPLETE! ---")
        print("Data is in Google Drive, unzipped, and readable.")

except Exception as e:
    print(f"An error occurred. Did Step 4 (unzip) fail? Error: {e}")

Opening HDF5 file at: /content/drive/MyDrive/ecg-colab-project/hdf5_data/exams_part0.hdf5
Keys in the HDF5 file: ['exam_id', 'tracings']

--- Dataset Shapes ---
Tracings dataset shape: (20001, 4096, 12) (Exams, Samples, Leads)
Exam IDs dataset shape: (20001,)

--- Loading One Sample ECG ---
Loaded ECG for exam_id: 590673
ECG data shape: (4096, 12)

--- PHASE 1 COMPLETE! ---
Data is in Google Drive, unzipped, and readable.


PHASE 2: Pre-Processing










In [13]:
import numpy as np

# Original leads from CODE-15% (all 12)
# Indices: 0   1    2     3     4     5    6   7   8   9   10  11
LEADS_ALL = ['DI','DII','DIII','AVR','AVL','AVF','V1','V2','V3','V4','V5','V6']

# Leads required by the Lancet paper (8)
LEADS_REQUIRED = ['DI','DII','V1','V2','V3','V4','V5','V6']

# This line of code automatically finds the correct indices
LEAD_INDICES = [LEADS_ALL.index(lead) for lead in LEADS_REQUIRED]

print(f"Original number of leads: {len(LEADS_ALL)}")
print(f"Required number of leads: {len(LEADS_REQUIRED)}")
print(f"Column indices to keep: {LEAD_INDICES}")

# The output should be: [0, 1, 6, 7, 8, 9, 10, 11]

Original number of leads: 12
Required number of leads: 8
Column indices to keep: [0, 1, 6, 7, 8, 9, 10, 11]


In [14]:
# We'll re-use the HDF5 file path from Phase 1
# (Make sure you've run the cell that defines HDF5_DATA_DIR)
PART0_HDF5_PATH = os.path.join(HDF5_DATA_DIR, 'exams_part0.hdf5')

print(f"Opening HDF5 file: {PART0_HDF5_PATH}")

try:
    with h5py.File(PART0_HDF5_PATH, 'r') as hf:
        # Load the first ECG tracing from the file
        sample_ecg_12_lead = hf['tracings'][0]

        print(f"Original ECG shape: {sample_ecg_12_lead.shape}")

        # --- THIS IS THE PRE-PROCESSING STEP ---
        # We use numpy indexing to select only the columns (leads) we want
        sample_ecg_8_lead = sample_ecg_12_lead[:, LEAD_INDICES]
        # ---

        print(f"Processed ECG shape: {sample_ecg_8_lead.shape}")

        # Verify it matches the expected shape (4096, 8)
        assert sample_ecg_8_lead.shape == (4096, 8)

        print("\n--- PHASE 2 COMPLETE! ---")
        print("We can successfully pre-process a 12-lead ECG into an 8-lead one.")

except Exception as e:
    print(f"An error occurred: {e}")

Opening HDF5 file: /content/drive/MyDrive/ecg-colab-project/hdf5_data/exams_part0.hdf5
Original ECG shape: (4096, 12)
Processed ECG shape: (4096, 8)

--- PHASE 2 COMPLETE! ---
We can successfully pre-process a 12-lead ECG into an 8-lead one.


What Phase 3 Does: Engineering the "Survival" Label
This phase is the most critical part of the project. It's where we translate our simple "did they die?" label into a format that a sophisticated survival model can understand.

The problem is we can't just ask the model to predict "yes/no" for death. A patient who lived for 9 years and then died is very different from a patient who died in 9 days. We need to teach the model about the element of time.

To do this, we are replicating the "discrete-time survival" method from the Lancet paper.

Hereâ€™s the process:

We "Discretize" Time: Instead of viewing time as a single number (like 800 days), we chop the 10-year follow-up period into 120 small, equal "intervals" (like 120 months).

We Create y_true (The "Label" Array): We create a "scorecard" for every single patient that is 120 slots long. Each slot j in the scorecard represents a time interval (e.g., month j).

We put a 1 in a slot if the patient survived that interval.

We put a 0 in a slot if the patient died in that interval.

For example, a patient who died in the 10th interval would have a y_true array that looks like: [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...]

We Create y_mask (The "Mask" Array): This is just as important. We need to tell the model which parts of the scorecard to pay attention to.

Many patients in the dataset didn't die; they were just "lost to follow-up" after a certain time (e.g., 5 years). We know they survived for 5 years, but we have no idea what happened in year 6.

The y_mask array tells the model which slots are "valid." We put a 1 in a slot if we have data for that interval and a 0 if we don't.

This mask ensures the model only learns from the data we actually have and isn't punished for "wrong" guesses on intervals where we have no information.

In summary: Phase 3 converts two simple columns (death and timey) into two detailed arrays (y_true and y_mask) that will teach our model to predict the probability of survival for each month over a 10-year period.

In [15]:
# --- 1. Define Survival Time Intervals ---
# We'll use 120 intervals, representing 10 years (1 interval per month)
N_INTERVALS = 120
DAYS_PER_YEAR = 365.25
N_YEARS = 10
INTERVAL_LENGTH_DAYS = (DAYS_PER_YEAR * N_YEARS) / N_INTERVALS

print(f"--- Survival Model Setup ---")
print(f"Total intervals: {N_INTERVALS}")
print(f"Time per interval: {INTERVAL_LENGTH_DAYS:.2f} days (approx 1 month)")

# --- 2. Filter for Relevant Labels ---
# The mortality data (death, timey) is only present for the first exam of each patient.
# We must drop all the rows where 'death' is NaN (Not a Number).
df_survival = df_labels[df_labels['death'].notna()].copy()

# Convert 'death' (True/False) to an integer (1/0)
df_survival['death'] = df_survival['death'].astype(int)

# Convert 'timey' (follow-up time) from days to our new interval index
# We use floor() to get the index.
# e.g., 50 days / 30.4 days/interval = 1.64 -> interval index 1
df_survival['interval_index'] = (df_survival['timey'] / INTERVAL_LENGTH_DAYS).apply(np.floor).astype(int)

# Cap the interval at the maximum (119)
# If a patient lived 12 years, we only have data up to the 10-year mark (index 119)
df_survival['interval_index'] = df_survival['interval_index'].clip(upper=N_INTERVALS - 1)


print(f"\n--- Label Filtering ---")
print(f"Original label count: {len(df_labels)}")
print(f"Survival label count: {len(df_survival)}")
print("\n--- Processed Survival DataFrame (Head) ---")
print(df_survival[['exam_id', 'death', 'timey', 'interval_index']].head())

--- Survival Model Setup ---
Total intervals: 120
Time per interval: 30.44 days (approx 1 month)

--- Label Filtering ---
Original label count: 345779
Survival label count: 233647

--- Processed Survival DataFrame (Head) ---
   exam_id  death     timey  interval_index
0  1169160      0  2.098628               0
1  2873686      0  6.657529               0
2   168405      0  4.282188               0
3   271011      0  4.038353               0
4   384368      0  3.786298               0


In [18]:
# Get the two columns we need from our new DataFrame
event_observed = df_survival['death'].values
time_observed = df_survival['interval_index'].values

# Create empty arrays to hold our new labels
# Shape: (number_of_patients, number_of_intervals)
y_true = np.zeros((len(df_survival), N_INTERVALS), dtype=np.int32)
y_mask = np.zeros((len(df_survival), N_INTERVALS), dtype=np.int32)

print(f"Created empty label arrays with shape: {y_true.shape}")

# Now, loop through each patient and create their specific label arrays
for i in range(len(df_survival)):
    event = event_observed[i]  # 1 if died, 0 if lived
    time = time_observed[i]    # The interval index (e.g., 49)

    # 1. Create the 'y_true' label array
    # We mark '1' (survived) for all intervals up to their event time
    y_true[i, :time] = 1

    # If they DIED (event=1), we must mark their final interval as '0' (died)
    if event == 1:
        y_true[i, time] = 0 # They did NOT survive this interval

    # 2. Create the 'y_mask' (attention) array
    # The model should pay attention to all intervals UP TO AND INCLUDING the event time.
    y_mask[i, :time + 1] = 1

print("Successfully created y_true and y_mask arrays.")




# --- Robust Verification ---
print("\n--- Robust Verification ---")

# 1. Find the first patient who DIED (event=1)
try:
    patient_died_idx = np.where(event_observed == 1)[0][0]
    died_time = time_observed[patient_died_idx]

    print(f"\nFound a patient who DIED (event=1) at interval {died_time} (index {patient_died_idx}).")
    print("Showing intervals around their event time:")
    # Slicing to show 5 intervals before and 5 after
    print(f"y_true (Label): {y_true[patient_died_idx, max(0, died_time-5) : died_time+5]}")
    print(f"y_mask (Mask): {y_mask[patient_died_idx, max(0, died_time-5) : died_time+5]}")
    # The 'y_true' should end with a '0'
    # The 'y_mask' should end with a '1' at the same spot

except IndexError:
    print("\nCould not find any patients who died in the dataset (this is unlikely).")


# 2. Find the first patient who LIVED (event=0)
try:
    patient_lived_idx = np.where(event_observed == 0)[0][0]
    lived_time = time_observed[patient_lived_idx]

    print(f"\nFound a patient who LIVED (event=0) at interval {lived_time} (index {patient_lived_idx}).")
    print("Showing intervals around their event time:")
    # Slicing to show 5 intervals before and 5 after
    print(f"y_true (Label): {y_true[patient_lived_idx, max(0, lived_time-5) : lived_time+5]}")
    print(f"y_mask (Mask): {y_mask[patient_lived_idx, max(0, lived_time-5) : lived_time+5]}")
    # The 'y_true' should be all '1's
    # The 'y_mask' should end with a '1' at the same spot as 'y_true'

except IndexError:
    print("\nCould not find any patients who lived/were censored in the dataset (this is unlikely).")


print("\n--- PHASE 3 COMPLETE! ---")

Created empty label arrays with shape: (233647, 120)
Successfully created y_true and y_mask arrays.

--- Robust Verification ---

Found a patient who DIED (event=1) at interval 0 (index 29).
Showing intervals around their event time:
y_true (Label): [0 0 0 0 0]
y_mask (Mask): [1 0 0 0 0]

Found a patient who LIVED (event=0) at interval 0 (index 0).
Showing intervals around their event time:
y_true (Label): [0 0 0 0 0]
y_mask (Mask): [1 0 0 0 0]

--- PHASE 3 COMPLETE! ---
