In [19]:
import os, glob
import numpy as np
import pandas as pd

In [20]:
# Root directory where the NSRR data is stored; fail fast if it is missing.
root_dir = '/data'
assert os.path.isdir(root_dir)

# Output directory for the split table.
# ``os.makedirs`` also creates the missing parent ("output"), which a plain
# ``os.mkdir`` cannot do, and ``exist_ok=True`` keeps the cell safe to re-run.
out_dir = 'output/split/'
os.makedirs(out_dir, exist_ok=True)

In [25]:
# Load the SHHS descriptor table, normalise the columns and tag each subject
# with a split label ("training" / "testing" / "excluded").

# NOTE(review): this path is relative to the working directory and does not
# use ``root_dir`` defined in the configuration cell -- confirm which location
# is intended.
desc_dir = 'data/datasets/shhs2-dataset-0.17.0.csv'
# NOTE(review): 'age_s1' is loaded for visit-2 rows -- presumably the age at
# visit 1; confirm whether a visit-2 age column should be used instead.
usecols = ['nsrrid', 'visitnumber', 'gender', 'age_s1', 'overall_shhs2',
           'race', 'bmi_s2', 'ahi_a0h3', 'htnderv_s2'] # diabetes is only in shh1?

df_shhs = pd.read_csv(desc_dir, usecols=usecols)

# Rename columns
df_shhs = df_shhs.rename(columns={'nsrrid': 'subj',
                                  'age_s1': 'age',
                                  'overall_shhs2': 'overall',
                                  'bmi_s2': 'bmi',
                                  'ahi_a0h3': 'ahi',
                                  'htnderv_s2': 'hypertension',
                                  })

# Map the numeric race codes to labels. The result is assigned back instead of
# calling ``replace(..., inplace=True)`` on the column selection: that chained
# in-place form is deprecated and does not update the frame under pandas
# Copy-on-Write.
df_shhs['race'] = df_shhs['race'].replace({1: 'caucasian', 2: 'african', 3: 'other'})
# NOTE(review): the original cell then set race to 'hispanic' where
# ``race == 1``. After the mapping above no row can equal 1, so that statement
# never matched and has been removed. If a Hispanic label is wanted it needs a
# separate ethnicity flag, which is not part of ``usecols`` -- TODO confirm.
df_shhs = df_shhs.rename(columns={'race': 'ethnicity'})

# Keep only "Excellent" quality study
# print(df_shhs[df_shhs['overall'] < 6].shape[0], 
#       'subjects with bad PSG data quality will be removed.')
# df_shhs = df_shhs[df_shhs['overall'] >= 6]

# Binary sex indicator: 1 where gender == 1, otherwise 0
df_shhs['male'] = (df_shhs['gender'] == 1).astype(int)

# Keep only the rows recorded with visitnumber == 2.
# ``.copy()`` gives an independent frame so the column assignments below do not
# raise SettingWithCopyWarning.
df_shhs = df_shhs[df_shhs['visitnumber'] == 2].copy()

# Convert the subject id to a string (left-padded with zeros to at least
# 4 characters) and use it as the index
df_shhs['subj'] = df_shhs['subj'].astype(str).str.zfill(4)
df_shhs = df_shhs.set_index('subj')

# # Define training / testing
# # Keep only a random subset of 600 subjects for training to avoid dataset imbalance
# df_shhs["set"] = "excluded"
# idx_train = df_shhs.sample(n=600, replace=False, random_state=42).index
# idx_test = np.setdiff1d(df_shhs.index, idx_train)
# # Now we keep 100 random participants of ``idx_test`` for testing
# rs = np.random.RandomState(42)
# idx_test = rs.choice(idx_test, size=100, replace=False)
# df_shhs.loc[idx_train, "set"] = "training"
# df_shhs.loc[idx_test, "set"] = "testing"


# ---- Test with 3 ----
# Hard-coded three-subject split: everything is "excluded" except the ids
# listed below. ``.loc`` raises KeyError if any of these ids is absent.
df_shhs["set"] = "excluded"
idx_train = ['200077', '200078']
idx_test = ['200079']
# Not used by the hard-coded split; only the commented-out random split above
# draws from it. Kept so the name stays defined exactly as before.
rs = np.random.RandomState(42)
df_shhs.loc[idx_train, "set"] = "training"
df_shhs.loc[idx_test, "set"] = "testing"
# -------- end --------

# Export demographics to CSV file
# df_shhs['dataset'] = 'SHHS'
# df_shhs.to_csv(out_dir + "demo_nsrr_shhs.csv")

print(df_shhs.shape[0], 'subjects remaining')
print(df_shhs['set'].value_counts())
df_shhs.head(10)

4080 subjects remaining
excluded    4077
training       2
testing        1
Name: set, dtype: int64


Unnamed: 0_level_0,overall,hypertension,bmi,visitnumber,ahi,gender,ethnicity,age,male,set
subj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
200077,5.0,0.0,23.388687,2.0,9.73822,1.0,caucasian,41.0,1,training
200078,5.0,1.0,30.211833,2.0,19.685039,1.0,caucasian,54.0,1,training
200079,6.0,0.0,35.45105,2.0,26.0,2.0,other,56.0,0,testing
200080,5.0,0.0,32.645673,2.0,12.45,1.0,caucasian,54.0,1,excluded
200081,6.0,0.0,31.644286,2.0,2.632794,2.0,caucasian,40.0,0,excluded
200082,7.0,0.0,28.546713,2.0,2.843602,1.0,caucasian,40.0,1,excluded
200083,,0.0,,2.0,,1.0,caucasian,54.0,1,excluded
200084,,0.0,,2.0,,2.0,other,51.0,0,excluded
200086,6.0,0.0,23.225432,2.0,22.258065,1.0,caucasian,68.0,1,excluded
200088,6.0,0.0,26.060246,2.0,3.910112,1.0,caucasian,44.0,1,excluded


In [26]:
# Drop the helper columns that are no longer needed. Reassigning (rather than
# ``inplace=True``) together with ``errors='ignore'`` lets the cell be re-run
# without a KeyError once the columns are already gone.
df_shhs = df_shhs.drop(columns=['gender', 'visitnumber'], errors='ignore')
# df_shhs = df_shhs.set_index("dataset", append=True).reorder_levels(["dataset", "subj"])

# Remove "excluded"; ``.copy()`` so the assignment below works on an
# independent frame and does not raise SettingWithCopyWarning.
df_shhs = df_shhs[df_shhs["set"] != "excluded"].copy()

df_shhs['hypertension'] = df_shhs['hypertension'].astype(float)
# Printed explicitly: as a bare mid-cell expression the counts were computed
# and then silently discarded.
print(df_shhs['hypertension'].value_counts(dropna=False))

df_shhs

Unnamed: 0_level_0,overall,hypertension,bmi,ahi,ethnicity,age,male,set
subj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
200077,5.0,0.0,23.388687,9.73822,caucasian,41.0,1,training
200078,5.0,1.0,30.211833,19.685039,caucasian,54.0,1,training
200079,6.0,0.0,35.45105,26.0,other,56.0,0,testing


In [27]:
# Select the columns in their final order; any column not listed here
# (e.g. 'overall') is dropped from the frame.
cols_order = [
    'age',
    'male',
    'bmi',
    'ahi',
    'ethnicity',
    'set',
    'hypertension',
]
df_shhs = df_shhs.loc[:, cols_order]
# Preview the first rows with numeric values rounded to two decimals
df_shhs.head(5).round(decimals=2)

Unnamed: 0_level_0,age,male,bmi,ahi,ethnicity,set,hypertension
subj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
200077,41.0,1,23.39,9.74,caucasian,training,0.0
200078,54.0,1,30.21,19.69,caucasian,training,1.0
200079,56.0,0,35.45,26.0,other,testing,0.0


In [28]:
# Export the split table to <out_dir>/shhs_split.csv, keeping the ``subj``
# index as the first column. ``os.path.join`` builds the same path as before
# and also stays correct if ``out_dir`` is later written without a trailing
# slash.
df_shhs.to_csv(os.path.join(out_dir, "shhs_split.csv"), index=True)

In [29]:
# Display the frame that was exported to shhs_split.csv (unrounded values)
df_shhs

Unnamed: 0_level_0,age,male,bmi,ahi,ethnicity,set,hypertension
subj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
200077,41.0,1,23.388687,9.73822,caucasian,training,0.0
200078,54.0,1,30.211833,19.685039,caucasian,training,1.0
200079,56.0,0,35.45105,26.0,other,testing,0.0
