# Imports

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Splits

Splits adapted from Glocker et al. [Risk of Bias in Chest Radiography Deep Learning Foundation Models]

In [17]:
# Split sizes reported by Glocker et al., printed below as fractions of
# their combined total.
train = 76205
valid = 12673
test = 38240

# Named `total` rather than `sum` so the built-in sum() is not shadowed for
# the rest of the kernel session.
total = train + valid + test
print("Total: ", total)

print("Train: ", train/total)
print("Valid: ", valid/total)
print("Test: ", test/total)

Total:  127118
Train:  0.5994823707106782
Valid:  0.09969477178684372
Test:  0.300822857502478


In [18]:
# Load the cleaned label table; every split below is drawn from this frame.
df = pd.read_csv(filepath_or_buffer='final_data/chexpert_plus_240401_cleaned_label.csv')

### Create splits

In [19]:

# Step 1: keep 60% of the rows for training and hold out the other 40% as a
# temporary pool. Stratifying on 'sex' keeps its class proportions the same
# in both parts; the fixed random_state makes the split reproducible.
train_df, temp_df = train_test_split(
    df, test_size=0.4, stratify=df['sex'], random_state=42
)

# Step 2: divide the temporary pool 10% / 90% into validation and test,
# i.e. roughly 4% and 36% of the full dataset.
# NOTE(review): the Glocker et al. split this notebook adapts is about
# 60/10/30, which would need test_size=0.75 here — confirm that the
# 4% / 36% outcome is intentional before changing it.
# NOTE(review): rows are split independently of each other; if the table holds
# several rows per patient, one patient can land in more than one subset —
# confirm whether a grouped split is required.
valid_df, test_df = train_test_split(
    temp_df, test_size=0.9, stratify=temp_df['sex'], random_state=42
)

# Sanity check: the three subsets together must account for every row of df.
assert len(train_df) + len(valid_df) + len(test_df) == len(df)

# Report the size of each subset.
print(f"Training set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")


Training set: 67263 samples
Validation set: 4484 samples
Test set: 40358 samples


### sex distribution

In [20]:
# Show how 'sex' is distributed in the full dataset and in each split, as
# absolute counts and as percentages, so the splits can be compared against
# the overall proportions.
sex_report = (
    ("### OVERALL DATA ###", df),
    ("### TRAIN SET ###", train_df),
    ("### TEST SET ###", test_df),
    ("### VALIDATION SET ###", valid_df),
)
for section_no, (heading, frame) in enumerate(sex_report):
    if section_no:
        # Separator between sections; nothing is printed after the last one.
        print("\n")
    print(heading)
    sex_column = frame['sex']
    print(sex_column.value_counts().to_frame('Count'))
    print(sex_column.value_counts(normalize=True).to_frame('Percentage') * 100)


### OVERALL DATA ###
     Count
sex       
0    65060
1    47045
     Percentage
sex            
0     58.034878
1     41.965122


### TRAIN SET ###
     Count
sex       
0    39036
1    28227
     Percentage
sex            
0     58.034878
1     41.965122


### TEST SET ###
     Count
sex       
0    23422
1    16936
     Percentage
sex            
0     58.035582
1     41.964418


### VALIDATION SET ###
     Count
sex       
0     2602
1     1882
     Percentage
sex            
0     58.028546
1     41.971454


### saving df_sex

In [21]:
# Strip every column whose name starts with 'Unnamed' from the three splits
# and from the full frame (presumably index columns left over from an
# earlier CSV export — verify).
train_df, valid_df, test_df, df = (
    frame.loc[:, ~frame.columns.str.contains('^Unnamed')]
    for frame in (train_df, valid_df, test_df, df)
)


In [22]:
# Write the sex-stratified splits to CSV, one file per subset, without the
# DataFrame index.
for frame, csv_path in (
    (train_df, 'final_data/chexpert_plus_240401_train_sex.csv'),
    (valid_df, 'final_data/chexpert_plus_240401_valid_sex.csv'),
    (test_df, 'final_data/chexpert_plus_240401_test_sex.csv'),
):
    frame.to_csv(csv_path, index=False)

### race distribution

In [23]:
# Step 1: keep 60% of the rows for training and hold out the other 40% as a
# temporary pool. Stratifying on 'race' keeps its class proportions the same
# in both parts; the fixed random_state makes the split reproducible.
train_df, temp_df = train_test_split(
    df, test_size=0.4, stratify=df['race'], random_state=42
)

# Step 2: divide the temporary pool 10% / 90% into validation and test,
# i.e. roughly 4% and 36% of the full dataset.
# NOTE(review): the Glocker et al. split this notebook adapts is about
# 60/10/30, which would need test_size=0.75 here — confirm that the
# 4% / 36% outcome is intentional before changing it.
# NOTE(review): rows are split independently of each other; if the table holds
# several rows per patient, one patient can land in more than one subset —
# confirm whether a grouped split is required.
valid_df, test_df = train_test_split(
    temp_df, test_size=0.9, stratify=temp_df['race'], random_state=42
)

# Sanity check: the three subsets together must account for every row of df.
assert len(train_df) + len(valid_df) + len(test_df) == len(df)

# Report the size of each subset.
print(f"Training set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")


Training set: 67263 samples
Validation set: 4484 samples
Test set: 40358 samples


In [24]:
# Show how 'race' is distributed in the full dataset and in each split, as
# absolute counts and as percentages, so the splits can be compared against
# the overall proportions.
race_report = (
    ("### OVERALL DATA ###", df),
    ("### TRAIN SET ###", train_df),
    ("### TEST SET ###", test_df),
    ("### VALIDATION SET ###", valid_df),
)
for section_no, (heading, frame) in enumerate(race_report):
    if section_no:
        # Separator between sections; nothing is printed after the last one.
        print("\n")
    print(heading)
    race_column = frame['race']
    print(race_column.value_counts().to_frame('Count'))
    print(race_column.value_counts(normalize=True).to_frame('Percentage') * 100)


### OVERALL DATA ###
      Count
race       
0     87681
1     16420
2      8004
      Percentage
race            
0      78.213282
1      14.646983
2       7.139735


### TRAIN SET ###
      Count
race       
0     52609
1      9852
2      4802
      Percentage
race            
0      78.213877
1      14.646983
2       7.139140


### TEST SET ###
      Count
race       
0     31565
1      5911
2      2882
      Percentage
race            
0      78.212498
1      14.646415
2       7.141087


### VALIDATION SET ###
      Count
race       
0      3507
1       657
2       320
      Percentage
race            
0      78.211418
1      14.652096
2       7.136485


### saving df_race

In [25]:
# Strip every column whose name starts with 'Unnamed' from the three splits
# and from the full frame (presumably index columns left over from an
# earlier CSV export — verify).
train_df, valid_df, test_df, df = (
    frame.loc[:, ~frame.columns.str.contains('^Unnamed')]
    for frame in (train_df, valid_df, test_df, df)
)


In [26]:
# Write the race-stratified splits to CSV, one file per subset, without the
# DataFrame index.
for frame, csv_path in (
    (train_df, 'final_data/chexpert_plus_240401_train_race.csv'),
    (valid_df, 'final_data/chexpert_plus_240401_valid_race.csv'),
    (test_df, 'final_data/chexpert_plus_240401_test_race.csv'),
):
    frame.to_csv(csv_path, index=False)

### insurance type

In [27]:
# Step 1: keep 60% of the rows for training and hold out the other 40% as a
# temporary pool. Stratifying on 'insurance_type' keeps its class proportions
# the same in both parts; the fixed random_state makes the split reproducible.
train_df, temp_df = train_test_split(
    df, test_size=0.4, stratify=df['insurance_type'], random_state=42
)

# Step 2: divide the temporary pool 10% / 90% into validation and test,
# i.e. roughly 4% and 36% of the full dataset.
# NOTE(review): the Glocker et al. split this notebook adapts is about
# 60/10/30, which would need test_size=0.75 here — confirm that the
# 4% / 36% outcome is intentional before changing it.
# NOTE(review): rows are split independently of each other; if the table holds
# several rows per patient, one patient can land in more than one subset —
# confirm whether a grouped split is required.
valid_df, test_df = train_test_split(
    temp_df, test_size=0.9, stratify=temp_df['insurance_type'], random_state=42
)

# Sanity check: the three subsets together must account for every row of df.
assert len(train_df) + len(valid_df) + len(test_df) == len(df)

# Report the size of each subset.
print(f"Training set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")

Training set: 67263 samples
Validation set: 4484 samples
Test set: 40358 samples


In [28]:
# Show how 'insurance_type' is distributed in the full dataset and in each
# split, as absolute counts and as percentages, so the splits can be compared
# against the overall proportions.
insurance_report = (
    ("### OVERALL DATA ###", df),
    ("### TRAIN SET ###", train_df),
    ("### TEST SET ###", test_df),
    ("### VALIDATION SET ###", valid_df),
)
for section_no, (heading, frame) in enumerate(insurance_report):
    if section_no:
        # Separator between sections; nothing is printed after the last one.
        print("\n")
    print(heading)
    insurance_column = frame['insurance_type']
    print(insurance_column.value_counts().to_frame('Count'))
    print(insurance_column.value_counts(normalize=True).to_frame('Percentage') * 100)


### OVERALL DATA ###
                Count
insurance_type       
1               71875
2               30590
0                9640
                Percentage
insurance_type            
1                64.114000
2                27.286919
0                 8.599081


### TRAIN SET ###
                Count
insurance_type       
1               43125
2               18354
0                5784
                Percentage
insurance_type            
1                64.114000
2                27.286919
0                 8.599081


### TEST SET ###
                Count
insurance_type       
1               25875
2               11013
0                3470
                Percentage
insurance_type            
1                64.113683
2                27.288270
0                 8.598047


### VALIDATION SET ###
                Count
insurance_type       
1                2875
2                1223
0                 386
                Percentage
insurance_type            
1               

### save df_insurance_type

In [29]:
# Strip every column whose name starts with 'Unnamed' from the three splits
# and from the full frame (presumably index columns left over from an
# earlier CSV export — verify).
train_df, valid_df, test_df, df = (
    frame.loc[:, ~frame.columns.str.contains('^Unnamed')]
    for frame in (train_df, valid_df, test_df, df)
)


In [30]:
# Write the insurance-type-stratified splits to CSV, one file per subset,
# without the DataFrame index.
for frame, csv_path in (
    (train_df, 'final_data/chexpert_plus_240401_train_insurance_type.csv'),
    (valid_df, 'final_data/chexpert_plus_240401_valid_insurance_type.csv'),
    (test_df, 'final_data/chexpert_plus_240401_test_insurance_type.csv'),
):
    frame.to_csv(csv_path, index=False)