# Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Splits

Splits adapted from Glocker et al. [Risk of Bias in Chest Radiography Deep Learning Foundation Models]

In [2]:
# Glocker et al. split: 
train = 76205
valid = 12673
test=38240

sum = train + valid + test
print("Total: ", sum)

print("Train: ", train/sum)
print("Valid: ", valid/sum)
print("Test: ", test/sum)

Total:  127118
Train:  0.5994823707106782
Valid:  0.09969477178684372
Test:  0.300822857502478


In [3]:
df = pd.read_csv('processed_data/chexpert_plus_240401_cleaned.csv')

In [4]:
bias_var = ["sex", "race", "health_insurance"]

### Create splits

In [5]:
# Combine the bias variables into a single column for stratification
df['bias_combined'] = df[['sex', 'race', 'insurance_type']].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

# Step 1: Split into train (60%) and temp (40%) while stratifying
train_df, temp_df = train_test_split(df, test_size=0.4, stratify=df['bias_combined'], random_state=42)

# Step 2: Split temp into validation (20%) and test (20%) while stratifying
valid_df, test_df = train_test_split(temp_df, test_size=0.9, stratify=temp_df['bias_combined'], random_state=42)

# Drop the combined bias column if not needed
train_df = train_df.drop(columns=['bias_combined'])
valid_df = valid_df.drop(columns=['bias_combined'])
test_df = test_df.drop(columns=['bias_combined'])

# Print the number of samples in each set (optional)
print(f"Training set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

### sex distribution

In [23]:
# Print overall race distribution
print("### OVERALL DATA ###")
print(df['sex'].value_counts().to_frame('Count'))
print(df['sex'].value_counts(normalize=True).to_frame('Percentage') * 100)
print("\n")

# Print training set race distribution
print("### TRAIN SET ###")
print(train_df['sex'].value_counts().to_frame('Count'))
print(train_df['sex'].value_counts(normalize=True).to_frame('Percentage') * 100)
print("\n")

# Print test set race distribution
print("### TEST SET ###")
print(test_df['sex'].value_counts().to_frame('Count'))
print(test_df['sex'].value_counts(normalize=True).to_frame('Percentage') * 100)
print("\n")

# Print validation set race distribution
print("### VALIDATION SET ###")
print(valid_df['sex'].value_counts().to_frame('Count'))
print(valid_df['sex'].value_counts(normalize=True).to_frame('Percentage') * 100)


### OVERALL DATA ###
        Count
sex          
Male    65060
Female  47045
        Percentage
sex               
Male     58.034878
Female   41.965122


### TRAIN SET ###
        Count
sex          
Male    39035
Female  28228
        Percentage
sex               
Male     58.033391
Female   41.966609


### TEST SET ###
        Count
sex          
Male    23423
Female  16935
        Percentage
sex               
Male     58.038059
Female   41.961941


### VALIDATION SET ###
        Count
sex          
Male     2602
Female   1882
        Percentage
sex               
Male     58.028546
Female   41.971454


### race distribution

In [24]:
# Print overall race distribution
print("### OVERALL DATA ###")
print(df['race'].value_counts().to_frame('Count'))
print(df['race'].value_counts(normalize=True).to_frame('Percentage') * 100)
print("\n")

# Print training set race distribution
print("### TRAIN SET ###")
print(train_df['race'].value_counts().to_frame('Count'))
print(train_df['race'].value_counts(normalize=True).to_frame('Percentage') * 100)
print("\n")

# Print test set race distribution
print("### TEST SET ###")
print(test_df['race'].value_counts().to_frame('Count'))
print(test_df['race'].value_counts(normalize=True).to_frame('Percentage') * 100)
print("\n")

# Print validation set race distribution
print("### VALIDATION SET ###")
print(valid_df['race'].value_counts().to_frame('Count'))
print(valid_df['race'].value_counts(normalize=True).to_frame('Percentage') * 100)


### OVERALL DATA ###
       Count
race        
White  87681
Asian  16420
Black   8004
       Percentage
race             
White   78.213282
Asian   14.646983
Black    7.139735


### TRAIN SET ###
       Count
race        
White  52609
Asian   9851
Black   4803
       Percentage
race             
White   78.213877
Asian   14.645496
Black    7.140627


### TEST SET ###
       Count
race        
White  31565
Asian   5912
Black   2881
       Percentage
race             
White   78.212498
Asian   14.648892
Black    7.138609


### VALIDATION SET ###
       Count
race        
White   3507
Asian    657
Black    320
       Percentage
race             
White   78.211418
Asian   14.652096
Black    7.136485


### insurance type

In [25]:
# Print overall race distribution
print("### OVERALL DATA ###")
print(df['insurance_type'].value_counts().to_frame('Count'))
print(df['insurance_type'].value_counts(normalize=True).to_frame('Percentage') * 100)
print("\n")

# Print training set race distribution
print("### TRAIN SET ###")
print(train_df['insurance_type'].value_counts().to_frame('Count'))
print(train_df['insurance_type'].value_counts(normalize=True).to_frame('Percentage') * 100)
print("\n")

# Print test set race distribution
print("### TEST SET ###")
print(test_df['insurance_type'].value_counts().to_frame('Count'))
print(test_df['insurance_type'].value_counts(normalize=True).to_frame('Percentage') * 100)
print("\n")

# Print validation set race distribution
print("### VALIDATION SET ###")
print(valid_df['insurance_type'].value_counts().to_frame('Count'))
print(valid_df['insurance_type'].value_counts(normalize=True).to_frame('Percentage') * 100)


### OVERALL DATA ###
                   Count
insurance_type          
Medicare           71875
Private Insurance  30590
Medicaid            9640
                   Percentage
insurance_type               
Medicare            64.114000
Private Insurance   27.286919
Medicaid             8.599081


### TRAIN SET ###
                   Count
insurance_type          
Medicare           43124
Private Insurance  18355
Medicaid            5784
                   Percentage
insurance_type               
Medicare            64.112514
Private Insurance   27.288405
Medicaid             8.599081


### TEST SET ###
                   Count
insurance_type          
Medicare           25877
Private Insurance  11011
Medicaid            3470
                   Percentage
insurance_type               
Medicare            64.118638
Private Insurance   27.283314
Medicaid             8.598047


### VALIDATION SET ###
                   Count
insurance_type          
Medicare            2874
Private Insuran

### save data frames

In [26]:
# df drop columns with Unnamed
train_df = train_df.loc[:, ~train_df.columns.str.contains('^Unnamed')]
valid_df = valid_df.loc[:, ~valid_df.columns.str.contains('^Unnamed')]
test_df = test_df.loc[:, ~test_df.columns.str.contains('^Unnamed')]
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]


In [27]:
# save
train_df.to_csv('processed_data/chexpert_plus_240401_train.csv', index=False)
valid_df.to_csv('processed_data/chexpert_plus_240401_valid.csv', index=False)
test_df.to_csv('processed_data/chexpert_plus_240401_test.csv', index=False)
# df.to_csv('/data4/lfay/chexpert_plus_240401_cleaned.csv', index=False)