In [1]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
path = r'/work3/saima/shhs_processed'
cvd_summary = pd.read_csv(os.path.join(path, 'shhs-cvd-summary-dataset-0.19.0.csv'))
path = os.path.join(path, 'shhs1_processed_all_directories')
av_subjs = [int(x.split('_')[1]) for x in os.listdir(path)]
cvd_summary = cvd_summary[cvd_summary['nsrrid'].isin(av_subjs)]

timeLimit = 365*10

In [3]:
baseline_Params = ['prev_mi', 'prev_mip', 'prev_stk', 'prev_revpro', 'prev_ang', 'prev_chf']
healthy_Params = ['chf', 'mi', 'stroke', 'mip', 'mi_fatal', 'stk_fatal', 'angina']

In [4]:
healthy_at_baseline = cvd_summary[
    cvd_summary[baseline_Params].eq(0).all(axis=1)]['nsrrid'].values

healthy_after_baseline = cvd_summary[cvd_summary['nsrrid'].isin(
    healthy_at_baseline)][cvd_summary[healthy_Params].eq(0).all(axis=1)]['nsrrid'].values

stroke_after_baseline = cvd_summary[cvd_summary['nsrrid'].isin(healthy_at_baseline)][(
    cvd_summary['stroke'] > 0)][(cvd_summary['stk_date'] < timeLimit)]['nsrrid'].values

strokeFatal_after_baseline = cvd_summary[cvd_summary['nsrrid'].isin(healthy_at_baseline)][(
    cvd_summary['stk_fatal'] > 0)][(cvd_summary['stk_date'] < timeLimit)]['nsrrid'].values

chf_after_baseline = cvd_summary[cvd_summary['nsrrid'].isin(healthy_at_baseline)][(
    cvd_summary['chf'] > 0)][(cvd_summary['chf_date'] < timeLimit)]['nsrrid'].values

mi_after_baseline = cvd_summary[cvd_summary['nsrrid'].isin(healthy_at_baseline)][(
    cvd_summary['mi'] > 0)][(cvd_summary['mi_date'] < timeLimit)]['nsrrid'].values

cvdDeath_after_baseline = cvd_summary[cvd_summary['nsrrid'].isin(healthy_at_baseline)][(
    cvd_summary['cvd_death'] > 0)][(cvd_summary['censdate'] < timeLimit)]['nsrrid'].values

allCause_after_baseline = cvd_summary[cvd_summary['nsrrid'].isin(healthy_at_baseline)][(
    cvd_summary['vital'] == 0)][(cvd_summary['censdate'] < timeLimit)]['nsrrid'].values

In [5]:

# Create a dictionary of categories and their counts
categories = {
    'Healthy at baseline': len(healthy_at_baseline),
    'Remained healthy': len(healthy_after_baseline),
    'Developed stroke': len(stroke_after_baseline), 
    'Fatal stroke': len(strokeFatal_after_baseline),
    'Developed CHF': len(chf_after_baseline),
    'Developed MI': len(mi_after_baseline),
    'CVD death': len(cvdDeath_after_baseline),
    'All-cause mortality': len(allCause_after_baseline)
}

print("Number of subjects in each category:")
print("-" * 40)
# Print each category with aligned counts
for category, count in categories.items():
    print(f"{category:<25} : {count:>6,d}")

Number of subjects in each category:
----------------------------------------
Healthy at baseline       :    928
Remained healthy          :    533
Developed stroke          :    114
Fatal stroke              :     15
Developed CHF             :    212
Developed MI              :    132
CVD death                 :    133
All-cause mortality       :    370


### Define Train-Test Split 

In [8]:
train_split, test_split = 0.2, 0.05

np.random.seed(42) 

# Get 20% of each category for training
train_healthy = np.random.choice(healthy_after_baseline, size=int(train_split*len(healthy_after_baseline)), replace=False)
train_stroke = np.random.choice(stroke_after_baseline, size=int(train_split*len(stroke_after_baseline)), replace=False)
train_strokeFatal = np.random.choice(strokeFatal_after_baseline, size=int(train_split*len(strokeFatal_after_baseline)), replace=False)
train_chf = np.random.choice(chf_after_baseline, size=int(train_split*len(chf_after_baseline)), replace=False)
train_mi = np.random.choice(mi_after_baseline, size=int(train_split*len(mi_after_baseline)), replace=False)
train_cvdDeath = np.random.choice(cvdDeath_after_baseline, size=int(train_split*len(cvdDeath_after_baseline)), replace=False)
train_allCause = np.random.choice(allCause_after_baseline, size=int(train_split*len(allCause_after_baseline)), replace=False)

train_subjects = np.concatenate([
    train_healthy, train_stroke, train_strokeFatal, train_chf,
    train_mi, train_cvdDeath, train_allCause
])


remaining_healthy = np.setdiff1d(healthy_after_baseline, train_healthy)
remaining_stroke = np.setdiff1d(stroke_after_baseline, train_stroke)
remaining_strokeFatal = np.setdiff1d(strokeFatal_after_baseline, train_strokeFatal)
remaining_chf = np.setdiff1d(chf_after_baseline, train_chf)
remaining_mi = np.setdiff1d(mi_after_baseline, train_mi)
remaining_cvdDeath = np.setdiff1d(cvdDeath_after_baseline, train_cvdDeath)
remaining_allCause = np.setdiff1d(allCause_after_baseline, train_allCause)

remaining_subjects = np.concatenate([
    remaining_healthy, remaining_stroke, remaining_strokeFatal, remaining_chf,
    remaining_mi, remaining_cvdDeath, remaining_allCause
])

# Get 10% of remaining subjects from each category for testing
test_healthy = np.random.choice(remaining_healthy, size=int(test_split*len(healthy_after_baseline)), replace=False)
test_stroke = np.random.choice(remaining_stroke, size=int(test_split*len(stroke_after_baseline)), replace=False)
test_strokeFatal = np.random.choice(remaining_strokeFatal, size=int(test_split*len(strokeFatal_after_baseline)), replace=False)
test_chf = np.random.choice(remaining_chf, size=int(test_split*len(chf_after_baseline)), replace=False)
test_mi = np.random.choice(remaining_mi, size=int(test_split*len(mi_after_baseline)), replace=False)
test_cvdDeath = np.random.choice(remaining_cvdDeath, size=int(test_split*len(cvdDeath_after_baseline)), replace=False)
test_allCause = np.random.choice(remaining_allCause, size=int(test_split*len(allCause_after_baseline)), replace=False)

# Combine all test samples into one array
test_subjects = np.concatenate([
    test_healthy, test_stroke, test_strokeFatal, test_chf,
    test_mi, test_cvdDeath, test_allCause
])


# Create a dictionary of category data
categories = {
    'Healthy': (train_healthy, test_healthy, remaining_healthy),
    'Stroke': (train_stroke, test_stroke, remaining_stroke), 
    'Stroke Fatal': (train_strokeFatal, test_strokeFatal, remaining_strokeFatal),
    'CHF': (train_chf, test_chf, remaining_chf),
    'MI': (train_mi, test_mi, remaining_mi),
    'CVD Death': (train_cvdDeath, test_cvdDeath, remaining_cvdDeath),
    'All Cause': (train_allCause, test_allCause, remaining_allCause)
}

print("Category Sizes:")
print("-" * 65)
print(f"{'Category':<15} {'Train':>8} {'Test':>8} {'Remaining':>10} {'Total':>10}")
print("-" * 65)

total_train = 0
total_test = 0
total_remaining = 0

for category, (train, test, remaining) in categories.items():
    remaining_count = len(remaining) - len(test)
    total = len(train) + len(test) + remaining_count
    print(f"{category:<15} {len(train):>8} {len(test):>8} {remaining_count:>10} {total:>10}")
    
    total_train += len(train)
    total_test += len(test) 
    total_remaining += remaining_count

print("-" * 65)
print(f"{'Total':<15} {total_train:>8} {total_test:>8} {total_remaining:>10} {total_train + total_test + total_remaining:>10}")

Category Sizes:
-----------------------------------------------------------------
Category           Train     Test  Remaining      Total
-----------------------------------------------------------------
Healthy              106       26        401        533
Stroke                22        5         87        114
Stroke Fatal           3        0         12         15
CHF                   42       10        160        212
MI                    26        6        100        132
CVD Death             26        6        101        133
All Cause             74       18        278        370
-----------------------------------------------------------------
Total                299       71       1139       1509


In [9]:
train_array = []
for i in train_subjects:
    age = cvd_summary[cvd_summary['nsrrid'] == i]['age_s1'].values[0]
    for to_do in os.listdir(os.path.join(path, 'shhs1_'+str(i))):
        if os.path.exists(os.path.join(path, 'shhs1_'+str(i), to_do)):
            ecgShape = np.load(os.path.join(path, 'shhs1_'+str(i), to_do)).shape[0]
            train_array.extend([[i, to_do, x, 1 if i in train_healthy else 0, age] for x in range(ecgShape)])
    
test_array = []
for i in test_subjects:  
    age = cvd_summary[cvd_summary['nsrrid'] == i]['age_s1'].values[0]
    for to_do in os.listdir(os.path.join(path, 'shhs1_'+str(i))):
        if os.path.exists(os.path.join(path, 'shhs1_'+str(i), to_do)):
            ecgShape = np.load(os.path.join(path, 'shhs1_'+str(i), to_do)).shape[0]
            test_array.extend([[i, to_do, x, 1 if i in train_healthy else 0, age] for x in range(ecgShape)])
            
remaining_array = []
for i in remaining_subjects:  
    age = cvd_summary[cvd_summary['nsrrid'] == i]['age_s1'].values[0]
    for to_do in os.listdir(os.path.join(path, 'shhs1_'+str(i))):
        if os.path.exists(os.path.join(path, 'shhs1_'+str(i), to_do)):
            ecgShape = np.load(os.path.join(path, 'shhs1_'+str(i), to_do)).shape[0]
            remaining_array.extend([[i, to_do, x, 1 if i in remaining_healthy else 0, age] for x in range(ecgShape)])

train_array = [[int(item) if str(item).isdigit() else str(item) for item in sublist] for sublist in train_array]
test_array = [[int(item) if str(item).isdigit() else str(item) for item in sublist] for sublist in test_array]
remaining_array = [[int(item) if str(item).isdigit() else str(item) for item in sublist] for sublist in remaining_array]

len(train_array), len(test_array), len(remaining_array)

(38893, 8795, 153172)

In [10]:
import json
with open('test_splits.json', 'w') as f:
    json.dump({
        'remaining': remaining_array,
    }, f)


In [47]:
import json
with open('train_splits.json', 'w') as f:
    json.dump({
        'train': train_array,
        'test': test_array, 
    }, f)
