# Statistics of the CODE-15% Dataset
I will use the overview csv files for this.

Things to look for
- split into chagas yes / no
- age, sex distribution
- 


In [1]:
import pandas as pd
# merge the two csv files over the columns 'exam_id' and 'patient_id'
exams_df = pd.read_csv('../code15_hdf5/exams.csv')
chagas_df = pd.read_csv('../code15_hdf5/code15_chagas_labels.csv')


merged_df = pd.merge(exams_df, chagas_df, on=['exam_id', 'patient_id'])

# Move the added columns to index 5
cols = list(merged_df.columns)
cols.insert(10, cols.pop(cols.index('chagas')))
merged_df = merged_df[cols]

merged_path = '../code15_hdf5/code15_merged.csv'

merged_df.to_csv(merged_path, index=False)

merged_df.head()

Unnamed: 0,exam_id,age,is_male,nn_predicted_age,1dAVb,RBBB,LBBB,SB,ST,AF,chagas,patient_id,death,timey,normal_ecg,trace_file
0,1169160,38,True,40.160484,False,False,False,False,False,False,False,523632,False,2.098628,True,exams_part13.hdf5
1,2873686,73,True,67.05944,False,False,False,False,False,False,False,1724173,False,6.657529,False,exams_part13.hdf5
2,168405,67,True,79.62174,False,False,False,False,False,True,False,51421,False,4.282188,False,exams_part13.hdf5
3,271011,41,True,69.75026,False,False,False,False,False,False,False,1737282,False,4.038353,True,exams_part13.hdf5
4,384368,73,True,78.87346,False,False,False,False,False,False,False,331652,False,3.786298,False,exams_part13.hdf5


In [2]:
import altair as alt

age_box = alt.Chart(merged_path).mark_boxplot().encode(
    x='is_male:N',
    y='age:Q'
)

alt.hconcat(
    age_box.transform_filter(chagas='False').properties(title='No Chagas'),
    age_box.transform_filter(chagas='True').properties(title='Chagas')
)

In [3]:
base_density = alt.Chart(merged_path, width=300).transform_density(
    'age',
    as_=['age', 'density'],
    groupby=['chagas'],
    extent=[15, 100]
).mark_area().encode(
    x='age:Q',
    y='density:Q',
).facet(
    'chagas:N',
    columns=2
)

alt.vconcat(
    base_density.transform_filter(is_male='False').properties(title='Non-males'),
    base_density.transform_filter(is_male='True').properties(title='Males')
)

In [4]:
age_hist = alt.Chart(merged_path).mark_bar().encode(
    alt.X('age:Q').bin(maxbins=20),
    y='count()',
    color="chagas:O"
)

ch_f = age_hist.transform_filter(chagas='True', is_male='False')
no_ch_f = age_hist.transform_filter(chagas='False', is_male='False')
ch_m = age_hist.transform_filter(chagas='True', is_male='True')
no_ch_m = age_hist.transform_filter(chagas='False', is_male='True')

alt.hconcat(
    (no_ch_m + ch_m).properties(title='Binned Age (Male)'),
    (no_ch_f + ch_f).properties(title='Binned Age (Not Male)')
)

In [5]:
# print out hw many chagas we have in merged_df

print(merged_df['chagas'].value_counts())
print(merged_df['is_male'].value_counts())

# Select only the abnormality columns
abnormality_cols = ["1dAVb", "RBBB", "LBBB", "SB", "ST", "AF", "normal_ecg", "chagas"]


# Count occurrences of each abnormality (sum over rows)
abnormality_counts = merged_df[abnormality_cols].sum().reset_index()
abnormality_counts.columns = ["Abnormality", "Count"]

alt.Chart(abnormality_counts).mark_bar().encode(
    x=alt.X("Abnormality:N", title="ECG Abnormality"),
    y=alt.Y("Count:Q", title="Number of Cases"),
    color=alt.Color("Abnormality:N", legend=None),
    tooltip=["Abnormality", "Count"]
).properties(
    title="Counts of ECG Abnormalities",
    width=400,
)


chagas
False    336863
True       6561
Name: count, dtype: int64
is_male
False    205046
True     138378
Name: count, dtype: int64


In [11]:
import os
import wfdb
from tqdm import tqdm
from collections import Counter
import numpy as np

# Path to the folder containing the .hea files
wfdb_folder = '../code15_wfdb/'

# Storage for consistency checks
channel_orders = []
sampling_rates = []
num_channels_list = []
num_samples_list = []

total_channel_means = np.zeros(12)
total_channel_stds = np.zeros(12)
num_files = 0

# Process each .hea file in the folder
for file in tqdm(os.listdir(wfdb_folder)):
    if file.endswith(".hea"):
        record_name = os.path.splitext(file)[0]  # Remove .hea extension
        
        try:
            header = wfdb.rdheader(os.path.join(wfdb_folder, record_name))
            record = wfdb.rdrecord(os.path.join(wfdb_folder, record_name))
            
            # Extract key information
            num_channels_list.append(len(header.sig_name))  # Number of leads
            sampling_rates.append(header.fs)  # Sampling frequency
            channel_orders.append(tuple(header.sig_name))  # Channel order

            signal_data = record.p_signal # signal matrix(shape: [num_samples, num_channels])
            
            # Extract the number of samples per signal (First row, last number)
            with open(os.path.join(wfdb_folder, file), "r") as f:
                first_line = f.readline().strip().split()
                num_samples = int(first_line[3])  # Extract the 4th value (4096)
                num_samples_list.append(num_samples)

            # Accumulate sum of means per channel
            total_channel_means += signal_data.mean(axis=0)
            total_channel_stds += signal_data.std(axis=0)

            num_files += 1  # Count processed files

        except Exception as e:
            print(f"⚠️ Error reading {file}: {e}")

# Check consistency for each attribute
def check_consistency(attribute_name, values):
    count = Counter(values)
    if len(count) == 1:
        print(f"✅ All files have the same {attribute_name}: {list(count.keys())[0]}")
    else:
        print(f"⚠️ Inconsistent {attribute_name} detected: {count}")

# Perform consistency checks
check_consistency("number of channels", num_channels_list)
check_consistency("sampling rate", sampling_rates)
check_consistency("channel order", channel_orders)
check_consistency("number of samples per signal", num_samples_list)

# Compute the final mean per channel
if num_files > 0:
    overall_channel_means = total_channel_means / num_files
    overall_channel_stds = total_channel_stds / num_files
    print(f"✅ Mean / STD ECG signal per channel across {num_files} files:")
    
    # Print mean values for each channel
    for i, mean_value in enumerate(overall_channel_means):
        print(f"Channel {i+1}: Mean {mean_value:.4f} mV, STD {overall_channel_stds[i]:.4f} mV")

else:
    print("⚠️ No valid ECG files processed.")


  0%|          | 0/39802 [00:00<?, ?it/s]

100%|██████████| 39802/39802 [01:10<00:00, 566.31it/s]

✅ All files have the same number of channels: 12
✅ All files have the same sampling rate: 400
✅ All files have the same channel order: ('I', 'II', 'III', 'AVR', 'AVL', 'AVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6')
⚠️ Inconsistent number of samples per signal detected: Counter({2934: 11624, 4096: 7529, 3874: 57, 4000: 7, 3022: 4, 3941: 4, 3994: 4, 2520: 4, 3265: 3, 4044: 3, 2978: 3, 3074: 3, 3196: 3, 2976: 3, 3057: 3, 2980: 3, 3026: 3, 3014: 3, 4078: 3, 2845: 3, 3972: 3, 3117: 2, 2793: 2, 3270: 2, 3262: 2, 4037: 2, 2998: 2, 1848: 2, 4073: 2, 4090: 2, 2974: 2, 3452: 2, 3960: 2, 2092: 2, 4077: 2, 3008: 2, 3158: 2, 3944: 2, 1912: 2, 3205: 2, 4058: 2, 3657: 2, 3697: 2, 3077: 2, 3106: 2, 3920: 2, 3081: 2, 3024: 2, 3744: 2, 3113: 2, 4021: 2, 3016: 2, 4018: 2, 3966: 2, 3206: 2, 3904: 2, 3801: 2, 2852: 2, 3021: 2, 3532: 2, 3120: 2, 4048: 2, 4045: 2, 3276: 2, 3069: 2, 3029: 2, 2972: 2, 3788: 2, 3974: 2, 2458: 2, 2120: 2, 3302: 2, 4062: 2, 3808: 2, 4002: 2, 3921: 2, 2081: 2, 2997: 2, 3221: 2, 3193: 




In [16]:
import pandas as pd
import altair as alt

# Create a DataFrame with the mean values per channel
channel_stats_df = pd.DataFrame({
    "Channel": np.arange(1, 13),
    "Mean Value (mV)": overall_channel_means,
    "STD Value (mV)": overall_channel_stds
})

# Create a chart for mean and std combined
std_bars = alt.Chart(channel_stats_df).mark_errorbar(ticks=True).encode(
    x='Channel:O',
    y='Mean Value (mV)',
    yError='STD Value (mV)'
)

points = alt.Chart(channel_stats_df).mark_point(filled=True, size=100).encode(
    x='Channel:O',
    y='Mean Value (mV)',
    color=alt.value('black')
)

std_bars + points

