# Imports

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Splits

Splits adapted from Glocker et al., *Risk of Bias in Chest Radiography Deep Learning Foundation Models*. The next cell computes the train/validation/test proportions of their split for reference.

In [5]:
# Glocker et al. split: number of samples in each partition.
train = 76205
valid = 12673
test = 38240

# Named `total` rather than `sum` so the built-in sum() is not shadowed
# for the rest of the kernel session.
total = train + valid + test
print("Total: ", total)

# Fraction of the reference dataset that falls into each partition.
print("Train: ", train/total)
print("Valid: ", valid/total)
print("Test: ", test/total)

Total:  127118
Train:  0.5994823707106782
Valid:  0.09969477178684372
Test:  0.300822857502478


In [6]:
# Cleaned table with the sex label (per the file name); `df` is the frame that the
# following cells split and summarise.
sex_csv_path = 'final_data/chexpert_plus_240401_cleaned_label_sex.csv'
df = pd.read_csv(sex_csv_path)


### Create splits

In [7]:

# Step 1: keep 60% of the rows for training and hold out the remaining 40%,
# stratified on `sex` so both parts keep the overall class proportions.
train_df, temp_df = train_test_split(df, test_size=0.4, stratify=df['sex'], random_state=42)

# Step 2: split the 40% hold-out 10/90, again stratified on `sex`:
#   validation = 0.1 * 0.4 = 4% of all rows, test = 0.9 * 0.4 = 36% of all rows.
# NOTE(review): the Glocker et al. reference split is roughly 60/10/30; matching it
# would need test_size=0.75 here. Confirm that 60/4/36 is intentional before changing,
# because the saved CSVs depend on this value.
valid_df, test_df = train_test_split(temp_df, test_size=0.9, stratify=temp_df['sex'], random_state=42)

# Sanity check: the three partitions together account for every row of df.
assert len(train_df) + len(valid_df) + len(test_df) == len(df), "splits must partition df"

# Report the size of each partition.
print(f"Training set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")


Training set: 114511 samples
Validation set: 7634 samples
Test set: 68708 samples


### sex distribution

In [8]:
# Print the absolute counts and percentages of each `sex` value for the full table
# and for every split, so the stratification can be checked by eye.
sex_sections = [
    ("### OVERALL DATA ###", df),
    ("### TRAIN SET ###", train_df),
    ("### TEST SET ###", test_df),
    ("### VALIDATION SET ###", valid_df),
]
for position, (header, frame) in enumerate(sex_sections):
    if position > 0:
        # Blank-line separator between consecutive sections (none after the last one).
        print("\n")
    print(header)
    print(frame['sex'].value_counts().to_frame('Count'))
    print(frame['sex'].value_counts(normalize=True).to_frame('Percentage') * 100)


### OVERALL DATA ###
      Count
sex        
0    112034
1     78819
     Percentage
sex            
0     58.701723
1     41.298277


### TRAIN SET ###
     Count
sex       
0    67220
1    47291
     Percentage
sex            
0     58.701784
1     41.298216


### TEST SET ###
     Count
sex       
0    40333
1    28375
     Percentage
sex            
0     58.702043
1     41.297957


### VALIDATION SET ###
     Count
sex       
0     4481
1     3153
     Percentage
sex            
0      58.69793
1      41.30207


### saving df_sex

In [9]:
# Remove every column whose name starts with "Unnamed" from the three splits and
# from the full table (presumably index columns left over from an earlier CSV
# round-trip — TODO confirm).
train_df, valid_df, test_df, df = (
    frame.loc[:, ~frame.columns.str.contains('^Unnamed')]
    for frame in (train_df, valid_df, test_df, df)
)


In [10]:
# Write each sex-stratified split to its own CSV, without the DataFrame index.
sex_outputs = [
    (train_df, 'final_data/chexpert_plus_240401_train_sex.csv'),
    (valid_df, 'final_data/chexpert_plus_240401_valid_sex.csv'),
    (test_df, 'final_data/chexpert_plus_240401_test_sex.csv'),
]
for split_frame, csv_path in sex_outputs:
    split_frame.to_csv(csv_path, index=False)

### race distribution

In [11]:
# Rebinds `df`: from here on it holds the table with the race label (per the file
# name), replacing the frame loaded earlier in the notebook.
race_csv_path = 'final_data/chexpert_plus_240401_cleaned_label_race.csv'
df = pd.read_csv(race_csv_path)

In [12]:
# Step 1: keep 60% of the rows for training and hold out the remaining 40%,
# stratified on `race` so both parts keep the overall class proportions.
train_df, temp_df = train_test_split(df, test_size=0.4, stratify=df['race'], random_state=42)

# Step 2: split the 40% hold-out 10/90, again stratified on `race`:
#   validation = 0.1 * 0.4 = 4% of all rows, test = 0.9 * 0.4 = 36% of all rows.
# NOTE(review): the Glocker et al. reference split is roughly 60/10/30; matching it
# would need test_size=0.75 here. Confirm that 60/4/36 is intentional before changing,
# because the saved CSVs depend on this value.
valid_df, test_df = train_test_split(temp_df, test_size=0.9, stratify=temp_df['race'], random_state=42)

# Sanity check: the three partitions together account for every row of df.
assert len(train_df) + len(valid_df) + len(test_df) == len(df), "splits must partition df"

# Report the size of each partition.
print(f"Training set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")


Training set: 83194 samples
Validation set: 5546 samples
Test set: 49918 samples


In [13]:
# Tabulate the `race` categories (counts, then percentages) for the full table and
# for each split, in the order: overall, train, test, validation.
race_views = {
    "### OVERALL DATA ###": df,
    "### TRAIN SET ###": train_df,
    "### TEST SET ###": test_df,
    "### VALIDATION SET ###": valid_df,
}
final_header = "### VALIDATION SET ###"
for header, subset in race_views.items():
    print(header)
    print(subset['race'].value_counts().to_frame('Count'))
    print(subset['race'].value_counts(normalize=True).to_frame('Percentage') * 100)
    if header != final_header:
        # Blank-line separator after every section except the last.
        print("\n")


### OVERALL DATA ###
       Count
race        
0     108502
1      20012
2      10144
      Percentage
race            
0      78.251525
1      14.432633
2       7.315842


### TRAIN SET ###
      Count
race       
0     65101
1     12007
2      6086
      Percentage
race            
0      78.252037
1      14.432531
2       7.315431


### TEST SET ###
      Count
race       
0     39061
1      7205
2      3652
      Percentage
race            
0      78.250331
1      14.433671
2       7.315998


### VALIDATION SET ###
      Count
race       
0      4340
1       800
2       406
      Percentage
race            
0      78.254598
1      14.424811
2       7.320591


### saving df_race

In [14]:
# Remove every column whose name starts with "Unnamed" from the three splits and
# from the full table (presumably index columns left over from an earlier CSV
# round-trip — TODO confirm).
train_df, valid_df, test_df, df = (
    frame.loc[:, ~frame.columns.str.contains('^Unnamed')]
    for frame in (train_df, valid_df, test_df, df)
)


In [15]:
# Write each race-stratified split to its own CSV, without the DataFrame index.
race_outputs = [
    (train_df, 'final_data/chexpert_plus_240401_train_race.csv'),
    (valid_df, 'final_data/chexpert_plus_240401_valid_race.csv'),
    (test_df, 'final_data/chexpert_plus_240401_test_race.csv'),
]
for split_frame, csv_path in race_outputs:
    split_frame.to_csv(csv_path, index=False)

### insurance type

In [20]:
# Rebinds `df`: from here on it holds the table read from the "label_health" file,
# whose `insurance_type` column is used by the following cells.
insurance_csv_path = 'final_data/chexpert_plus_240401_cleaned_label_health.csv'
df = pd.read_csv(insurance_csv_path)

In [21]:
# Step 1: keep 60% of the rows for training and hold out the remaining 40%,
# stratified on `insurance_type` so both parts keep the overall class proportions.
train_df, temp_df = train_test_split(df, test_size=0.4, stratify=df['insurance_type'], random_state=42)

# Step 2: split the 40% hold-out 10/90, again stratified on `insurance_type`:
#   validation = 0.1 * 0.4 = 4% of all rows, test = 0.9 * 0.4 = 36% of all rows.
# NOTE(review): the Glocker et al. reference split is roughly 60/10/30; matching it
# would need test_size=0.75 here. Confirm that 60/4/36 is intentional before changing,
# because the saved CSVs depend on this value.
valid_df, test_df = train_test_split(temp_df, test_size=0.9, stratify=temp_df['insurance_type'], random_state=42)

# Sanity check: the three partitions together account for every row of df.
assert len(train_df) + len(valid_df) + len(test_df) == len(df), "splits must partition df"

# Report the size of each partition.
print(f"Training set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")

Training set: 90273 samples
Validation set: 6018 samples
Test set: 54165 samples


In [22]:
# Print the absolute counts and percentages of each `insurance_type` value for the
# full table and for every split, so the stratification can be checked by eye.
insurance_sections = [
    ("### OVERALL DATA ###", df),
    ("### TRAIN SET ###", train_df),
    ("### TEST SET ###", test_df),
    ("### VALIDATION SET ###", valid_df),
]
for position, (header, frame) in enumerate(insurance_sections):
    if position > 0:
        # Blank-line separator between consecutive sections (none after the last one).
        print("\n")
    print(header)
    print(frame['insurance_type'].value_counts().to_frame('Count'))
    print(frame['insurance_type'].value_counts(normalize=True).to_frame('Percentage') * 100)



### OVERALL DATA ###
                Count
insurance_type       
1               91316
2               40881
0               18259
                Percentage
insurance_type            
1                60.692827
2                27.171399
0                12.135774


### TRAIN SET ###
                Count
insurance_type       
1               54789
2               24529
0               10955
                Percentage
insurance_type            
1                60.692566
2                27.172023
0                12.135411


### TEST SET ###
                Count
insurance_type       
1               32874
2               14717
0                6574
                Percentage
insurance_type            
1                60.692329
2                27.170682
0                12.136989


### VALIDATION SET ###
                Count
insurance_type       
1                3653
2                1635
0                 730
                Percentage
insurance_type            
1               

### save df_insurance_type

In [29]:
# Remove every column whose name starts with "Unnamed" from the three splits and
# from the full table (presumably index columns left over from an earlier CSV
# round-trip — TODO confirm).
train_df, valid_df, test_df, df = (
    frame.loc[:, ~frame.columns.str.contains('^Unnamed')]
    for frame in (train_df, valid_df, test_df, df)
)


In [23]:
# Write each insurance_type-stratified split to its own CSV, without the DataFrame index.
insurance_outputs = [
    (train_df, 'final_data/chexpert_plus_240401_train_insurance_type.csv'),
    (valid_df, 'final_data/chexpert_plus_240401_valid_insurance_type.csv'),
    (test_df, 'final_data/chexpert_plus_240401_test_insurance_type.csv'),
]
for split_frame, csv_path in insurance_outputs:
    split_frame.to_csv(csv_path, index=False)