<a href="https://colab.research.google.com/github/daria-dot/ECG-mortality-risk-detection/blob/main/ecg_colab_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Phase 1: Data Loading & Environment Setup (Google Colab)
#
# This notebook walks you through all the steps for Phase 1 of our hackathon project.
# We will:
#   1. Set up the Google Colab environment and connect Google Drive.
#   2. Download the `exams.csv` label file from the `CODE-15%` dataset.
#   3. Load the labels into a `pandas` DataFrame.
#   4. Download the **first part** of the large HDF5 ECG data to create a data-loading function.
#   5. Inspect the HDF5 data to confirm its shape and format.
#   6. Explain how to save this notebook to your new GitHub repo.

In [None]:
!pip install h5py wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=7acd6f268633b6870f4ee9e1279441ed24e9695aad08e6a61d4c43ff4e092d20
  Stored in directory: /root/.cache/pip/wheels/01/46/3b/e29ffbe4ebe614ff224bad40fc6a5773a67a163251585a13a9
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [3]:
# Install h5py, the library needed to read HDF5 files
!pip install h5py

# Import all the libraries we'll need
import os
import pandas as pd
import numpy as np
import h5py
import glob # This will help us find all our zip files
from google.colab import drive



In [6]:
# Mount Google Drive into the Colab filesystem.
# Colab shows a pop-up asking for permission before the mount completes.
print("Connecting to Google Drive...")
drive.mount(mountpoint='/content/drive')
print("Google Drive connected!")

Connecting to Google Drive...
Mounted at /content/drive
Google Drive connected!


In [8]:
# Root of the project folder inside the mounted Google Drive.
GDRIVE_PROJECT_PATH = '/content/drive/MyDrive/ecg-colab-project'

# Both locations below are derived from the root, so only the line above
# needs editing if the project folder moves.
HDF5_DATA_DIR = os.path.join(GDRIVE_PROJECT_PATH, 'hdf5_data')    # HDF5 archives and tracings
LABELS_CSV_PATH = os.path.join(GDRIVE_PROJECT_PATH, 'exams.csv')  # exam-level label table

In [9]:

# Read the exam-level label table from the project folder.
print("Loading labels from exams.csv...")
df_labels = pd.read_csv(LABELS_CSV_PATH)

# Header plus a preview of the top rows, as a quick check that the load worked.
print("--- First 5 rows of exams.csv ---", df_labels.head(), sep="\n")

# Column and dtype summary; the 'death' and 'timey' columns should be listed.
print("\n--- DataFrame Info ---")
df_labels.info()

# Series.count() counts the non-missing entries of the 'death' column.
print(f"\nExams with mortality data: {df_labels['death'].count()}")

Loading labels from exams.csv...
--- First 5 rows of exams.csv ---
   exam_id  age  is_male  nn_predicted_age  1dAVb   RBBB   LBBB     SB     ST  \
0  1169160   38     True         40.160484  False  False  False  False  False   
1  2873686   73     True         67.059440  False  False  False  False  False   
2   168405   67     True         79.621740  False  False  False  False  False   
3   271011   41     True         69.750260  False  False  False  False  False   
4   384368   73     True         78.873460  False  False  False  False  False   

      AF  patient_id  death     timey  normal_ecg         trace_file  
0  False      523632  False  2.098628        True  exams_part13.hdf5  
1  False     1724173  False  6.657529       False  exams_part13.hdf5  
2   True       51421  False  4.282188       False  exams_part13.hdf5  
3  False     1737282  False  4.038353        True  exams_part13.hdf5  
4  False      331652  False  3.786298       False  exams_part13.hdf5  

--- DataFrame Info 

In [11]:
#currently only have 1 zip file but will keep loop for potential later developments

print(f"Looking for .zip files in: {HDF5_DATA_DIR}")
zip_files = glob.glob(os.path.join(HDF5_DATA_DIR, "*.zip"))
zip_files.sort()

if not zip_files:
    print("No .zip files found. Are they in the right folder?")
else:
    print(f"Found {len(zip_files)} zip files. Starting unzip process...")

# Loop over each zip file and unzip it
for file_path in zip_files:
    print(f"Unzipping {os.path.basename(file_path)}...")

    # -q (quiet), -o (overwrite), -d (destination)
    !unzip -q -o "{file_path}" -d "{HDF5_DATA_DIR}"

    print(f"Finished unzipping {os.path.basename(file_path)}")

print("All files unzipped!")

Looking for .zip files in: /content/drive/MyDrive/ecg-colab-project/hdf5_data
Found 1 zip files. Starting unzip process...
Unzipping exams_part0.zip...
Finished unzipping exams_part0.zip
All files unzipped!


In [12]:
# Sanity-check the first HDF5 part: list its datasets, print their shapes and
# load a single ECG to confirm the file is readable and laid out as expected.
PART0_HDF5_PATH = os.path.join(HDF5_DATA_DIR, 'exams_part0.hdf5')

# Shape every individual tracing is expected to have: (samples, leads).
EXPECTED_ECG_SHAPE = (4096, 12)

print(f"Opening HDF5 file at: {PART0_HDF5_PATH}")

try:
    with h5py.File(PART0_HDF5_PATH, 'r') as hf:
        # HDF5 files are like dictionaries. Let's see the keys.
        print("Keys in the HDF5 file:", list(hf.keys()))

        # Get the 'tracings' dataset (this is the ECG data)
        tracings = hf['tracings']
        # Get the 'exam_id' dataset (this links tracings to our CSV)
        exam_ids_hdf5 = hf['exam_id']

        print("\n--- Dataset Shapes ---")
        print(f"Tracings dataset shape: {tracings.shape} (Exams, Samples, Leads)")
        print(f"Exam IDs dataset shape: {exam_ids_hdf5.shape}")

        # --- Load one sample ECG ---
        print("\n--- Loading One Sample ECG ---")
        sample_exam_id = exam_ids_hdf5[0]
        sample_ecg_tracing = tracings[0]  # Load the first ECG

        print(f"Loaded ECG for exam_id: {sample_exam_id}")
        print(f"ECG data shape: {sample_ecg_tracing.shape}")

        # Verify it matches the expected shape (4096 samples, 12 leads).
        # A mismatch is a data problem, not an unzip problem, so it is allowed
        # to propagate with its own message instead of being caught below.
        assert sample_ecg_tracing.shape == EXPECTED_ECG_SHAPE, (
            f"Unexpected ECG shape {sample_ecg_tracing.shape}; "
            f"expected {EXPECTED_ECG_SHAPE}"
        )

        print("\n--- PHASE 1 COMPLETE! ---")
        print("Data is in Google Drive, unzipped, and readable.")

except OSError as e:
    # Only file-level failures (missing or unreadable file) get the unzip hint.
    # FileNotFoundError is a subclass of OSError. Any other exception, such as
    # a missing dataset key or the shape assertion, surfaces unchanged.
    print(f"An error occurred. Did Step 4 (unzip) fail? Error: {e}")

Opening HDF5 file at: /content/drive/MyDrive/ecg-colab-project/hdf5_data/exams_part0.hdf5
Keys in the HDF5 file: ['exam_id', 'tracings']

--- Dataset Shapes ---
Tracings dataset shape: (20001, 4096, 12) (Exams, Samples, Leads)
Exam IDs dataset shape: (20001,)

--- Loading One Sample ECG ---
Loaded ECG for exam_id: 590673
ECG data shape: (4096, 12)

--- PHASE 1 COMPLETE! ---
Data is in Google Drive, unzipped, and readable.
