In [1]:
import pandas as pd
import numpy as np
import os
import sys
from tqdm import tqdm


**I demonstrated that the following assumptions are all TRUE**

- There are no new site IDs in the test dataset.

- Patient IDs in train and test sets do not overlap

- Image IDs in train and test sets do not overlap

- There are no new laterality values in test dataset. (R: 27439, L: 27267)

- There are new machine IDs in the test dataset. (This was already raised by @abebe9849.)

- There are no new view values in test dataset. (MLO: 27903, CC: 26765, AT: 19, LM: 10, ML: 8, LMO: 1)

- No. of images/patient are all >= 4 (train: True)

- Site ID is always the same for each patient. (train: True) 

- Age is always the same for each patient. (train: True)

- There is no overlap of machine IDs between the two sites in the test dataset. (train: True)

- Some patients underwent mammography with multiple machines. (train: only 1 patient 22637 with more than 1 machine)

- No. of images in site ID 1 > No. of images in site ID 2.

- All patients have CC and MLO images for both sides

- More than 40% of images are from machine ID 49 (43% for train dataset) (by @kaggleqrdl)

- Mean age of patients is between 56-61 (58.6 for train set), and patients in site 1 are younger than those in site 2. (by @kaggleqrdl)

- 1-2% of patients use implants (1.4% for train set).(by @kaggleqrdl)

- Positive rate is between 0.0204-0.0256 (0.0206 for train dataset). (shown by @zzy990106 and others)

- The age column contains NaN values, while the other columns do not.

### Group strategy

- Group by 'patient_id'
- Positive rate is between 0.0204-0.0256 (0.0206 for train dataset)
- There are new machine IDs in the test dataset --> group by machine_id as well (does the model generalize to unseen machines or not?)
- No. of images in site ID 1 > No. of images in site ID 2.
- All patients have CC and MLO images for both sides
- More than 40% of images are from machine ID 49 (43% for train dataset)
- Mean age of patients is between 56-61 (58.6 for train set), and patients in site 1 are younger than those in site 2
- 1-2% of patients use implants (1.4% for train set)

In [8]:
def fold_check(train_df, val_df):
    """Sanity-check one cross-validation fold and summarise its validation split.

    Args:
        train_df: Training rows of the fold; needs ``patient_id`` and
            ``machine_id`` columns.
        val_df: Validation rows of the fold; needs ``patient_id``, ``cancer``,
            ``machine_id``, ``site_id``, ``age`` and ``implant`` columns.

    Returns:
        dict: Statistics of the validation split, in this key order:
        ``val_pos_sample_num``, ``val_pos_patient_num``,
        ``val_pos_sample_percent``, ``val_pos_patient_percent`` (both are
        fractions in [0, 1], not percentages), ``val_machine_ids``,
        ``train_machine_ids``, ``not_in_train_machine_ids`` (all three sorted),
        ``mean_site_id``, ``num_machine_id_49``, ``mean_age``,
        ``mean_age_site1``, ``mean_age_site2`` and ``implant_pct``.

    Raises:
        AssertionError: If any patient appears in both splits, or if the mean
            age of site 1 is not strictly lower than that of site 2 in the
            validation split (this also triggers when one of the sites has no
            rows, because the mean is then NaN).
    """
    # A patient must never be on both sides of the split (label leakage).
    train_patients = set(train_df.patient_id.unique())
    val_patients = set(val_df.patient_id.unique())
    leaked_patients = val_patients & train_patients
    assert not leaked_patients, (
        f'{len(leaked_patients)} patient(s) appear in both train and val splits'
    )

    ret = {}
    num_samples = len(val_df)
    num_patients = val_df.patient_id.nunique()

    # Positive rate at image (sample) level and at patient level.
    ret['val_pos_sample_num'] = val_df.cancer.sum()
    ret['val_pos_patient_num'] = val_df[val_df.cancer == 1].patient_id.nunique()
    ret['val_pos_sample_percent'] = ret['val_pos_sample_num'] / num_samples
    ret['val_pos_patient_percent'] = ret['val_pos_patient_num'] / num_patients

    # Machine coverage: which validation machines were never seen in training.
    val_machine_ids = sorted(val_df.machine_id.unique())
    train_machine_ids = sorted(train_df.machine_id.unique())
    # Sorted so the report is deterministic (set iteration order is arbitrary).
    not_in_train_machine_ids = sorted(set(val_machine_ids) - set(train_machine_ids))
    ret['val_machine_ids'] = val_machine_ids
    ret['train_machine_ids'] = train_machine_ids
    ret['not_in_train_machine_ids'] = not_in_train_machine_ids

    ret['mean_site_id'] = val_df.site_id.mean()

    ret['num_machine_id_49'] = int((val_df.machine_id == 49).sum())
    # NOTE(review): check below was disabled in the original; presumably too
    # strict for individual folds -- confirm before re-enabling.
    # assert ret['num_machine_id_49'] / num_samples > 0.4

    ret['mean_age'] = val_df.age.mean()
    ret['mean_age_site1'] = val_df[val_df.site_id == 1].age.mean()
    ret['mean_age_site2'] = val_df[val_df.site_id == 2].age.mean()
    assert ret['mean_age_site1'] < ret['mean_age_site2'], (
        'expected site 1 patients to be younger on average than site 2 patients'
    )

    ret['implant_pct'] = val_df[val_df.implant == 1].patient_id.nunique() / num_patients
    # NOTE(review): disabled in the original as well -- confirm intent.
    # assert 0.01 < ret['implant_pct'] < 0.02
    return ret

In [3]:
# Full training metadata table; every later cell splits this frame.
TRAIN_META_CSV = '../datasets/train_full_meta.csv'
df = pd.read_csv(TRAIN_META_CSV)
df

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,...,__StudyInstanceUID,__RescaleType,__PartialView,__BitsStored,__CompressionForce,__ExposureControlModeDescription,__WindowCenterList,__WindowCenterListLength,__WindowWidthList,__WindowWidthListLength
0,2,10006,462822612,L,CC,61.0,0,0,0,,...,1.2.840.10009.1.2.3.10006,US,NO,16,,,"[1802.31, 1802.31, 2020.704, 1583.916]",4,"[1091.97, 1091.97, 1091.97, 1091.97]",4
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,...,1.2.840.10009.1.2.3.10006,US,NO,16,,,"[1802.31, 1802.31, 2020.704, 1583.916]",4,"[1091.97, 1091.97, 1091.97, 1091.97]",4
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,...,1.2.840.10009.1.2.3.10006,US,NO,16,,,"[1802.31, 1802.31, 2020.704, 1583.916]",4,"[1091.97, 1091.97, 1091.97, 1091.97]",4
3,2,10006,1874946579,R,CC,61.0,0,0,0,,...,1.2.840.10009.1.2.3.10006,US,NO,16,,,"[1802.31, 1802.31, 2020.704, 1583.916]",4,"[1091.97, 1091.97, 1091.97, 1091.97]",4
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,...,1.2.840.10009.1.2.3.10011,US,NO,12,,,[2048.0],1,[4096.0],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54701,1,9973,1729524723,R,MLO,43.0,0,0,0,1.0,...,1.2.840.10009.1.2.3.9973,US,,12,130.7771,AutoFilter,[2047.0],1,[4096.0],1
54702,1,9989,63473691,L,MLO,60.0,0,0,0,,...,1.2.840.10009.1.2.3.9989,US,,12,40.0000,AOP standard RECTANGLE 1062 mm 490 mm 180 mm ...,"[2616.0, 2676.0, 2496.0]",3,"[900.0, 750.0, 1050.0]",3
54703,1,9989,1078943060,L,CC,60.0,0,0,0,,...,1.2.840.10009.1.2.3.9989,US,,12,40.0000,AOP standard RECTANGLE 1092 mm 370 mm 180 mm ...,"[2614.0, 2668.0, 2512.0]",3,"[900.0, 750.0, 1050.0]",3
54704,1,9989,398038886,R,MLO,60.0,0,0,0,0.0,...,1.2.840.10009.1.2.3.9989,US,,12,60.0000,AOP standard RECTANGLE 822 mm 490 mm 180 mm 2...,"[2638.0, 2674.0, 2572.0]",3,"[900.0, 750.0, 1050.0]",3


In [4]:
from sklearn.model_selection import StratifiedGroupKFold

In [5]:
# Build and save the final 5-fold split: stratified on the cancer label and
# grouped by patient_id so that no patient straddles train and validation.
# random_state=651 is the seed reported as best by the seed-search cell of
# this notebook (lowest spread of per-fold patient-level positive rate).
spliter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=651)

SAVE_DIR = '../datasets/cv/v1'
os.makedirs(SAVE_DIR, exist_ok=True)

ret = []
for i, (train_idxs, val_idxs) in enumerate(spliter.split(df, df.cancer, groups=df.patient_id)):
    print(f"Fold {i}:")
    # split() yields positional row indices, so select with .iloc; .loc is
    # label-based and is only equivalent while df keeps a default RangeIndex.
    fold_train_df = df.iloc[train_idxs].reset_index(drop=True)
    fold_val_df = df.iloc[val_idxs].reset_index(drop=True)
    print(len(fold_train_df), len(fold_val_df))
    save_fold_train_path = os.path.join(SAVE_DIR, f'train_fold_{i}.csv')
    save_fold_val_path = os.path.join(SAVE_DIR, f'val_fold_{i}.csv')

    # Validate the fold (raises AssertionError on failure) before writing it.
    fold_ret = fold_check(fold_train_df, fold_val_df)
    print(fold_ret)
    ret.append(fold_ret)

    # save
    fold_train_df.to_csv(save_fold_train_path, index=False)
    fold_val_df.to_csv(save_fold_val_path, index=False)

    print('\n--------------------\n\n\n')


# Transposed report: one statistic per heading, one line per fold.
for k in ret[0]:
    print(k)
    for fold_ret in ret:
        print('\t', fold_ret[k])
        
    

Fold 0:
43786 10920
{'val_pos_sample_num': 232, 'val_pos_patient_num': 96, 'val_pos_sample_percent': 0.021245421245421246, 'val_pos_patient_percent': 0.04031919361612768, 'val_machine_ids': [21, 29, 48, 49, 93, 170, 190, 210, 216], 'train_machine_ids': [21, 29, 48, 49, 93, 170, 190, 197, 210, 216], 'not_in_train_machine_ids': [], 'mean_site_id': 1.4586080586080585, 'num_machine_id_49': 4701, 'mean_age': 58.645167201099405, 'mean_age_site1': 57.437447096664975, 'mean_age_site2': 60.069688498402556, 'implant_pct': 0.013019739605207897}

--------------------



Fold 1:
43707 10999
{'val_pos_sample_num': 231, 'val_pos_patient_num': 96, 'val_pos_sample_percent': 0.021001909264478587, 'val_pos_patient_percent': 0.040285354595048256, 'val_machine_ids': [21, 29, 48, 49, 93, 170, 190, 210, 216], 'train_machine_ids': [21, 29, 48, 49, 93, 170, 190, 197, 210, 216], 'not_in_train_machine_ids': [], 'mean_site_id': 1.4562232930266388, 'num_machine_id_49': 4836, 'mean_age': 58.559439286364466, 'mean_a

### Group strategy

- Group by 'patient_id'
- Positive rate is between 0.0204-0.0256 (0.0206 for train dataset)
- There are new machine IDs in the test dataset --> group by machine_id as well (does the model generalize to unseen machines or not?)
- No. of images in site ID 1 > No. of images in site ID 2.
- All patients have CC and MLO images for both sides
- More than 40% of images are from machine ID 49 (43% for train dataset)
- Mean age of patients is between 56-61 (58.6 for train set), and patients in site 1 are younger than those in site 2
- 1-2% of patients use implants (1.4% for train set)

In [9]:
# Seed search: for each random_state in [0, 1000) build the same 5-fold
# stratified/grouped split and record the standard deviation of the
# patient-level positive rate across folds. stds[rs] belongs to seed rs.
# WARNING: expensive -- the recorded run took about one hour for 1000 seeds.
# raise Exception()  # uncomment to stop "Run All" before this cell

# Loop-invariant setup, hoisted out of the seed loop. Nothing is written to
# this directory by this cell.
SAVE_DIR = '../datasets/kfolds/v1'
os.makedirs(SAVE_DIR, exist_ok=True)

stds = []
for rs in tqdm(range(0, 1000)):
    spliter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=rs)

    ret = []
    for train_idxs, val_idxs in spliter.split(df, df.cancer, groups=df.patient_id):
        # split() yields positional row indices, so select with .iloc; .loc
        # is label-based and only matches while df has a default RangeIndex.
        fold_train_df = df.iloc[train_idxs].reset_index(drop=True)
        fold_val_df = df.iloc[val_idxs].reset_index(drop=True)

        fold_ret = fold_check(fold_train_df, fold_val_df)
        ret.append(fold_ret)

    # Spread of the patient-level positive rate over the five folds.
    pos_rates = np.array([fold_ret['val_pos_patient_percent'] for fold_ret in ret])
    stds.append(pos_rates.std())
        
    

100%|██████████████████████████████████████████████| 1000/1000 [1:04:06<00:00,  3.85s/it]


In [10]:
# Position in `stds` equals the random_state that was tried, so the index of
# the smallest spread is the best seed.
stds = np.asarray(stds)
stds.argmin()

651

In [11]:
# Rank every seed by its spread, most balanced first: [seed, std] pairs.
arr = sorted(
    ([seed, spread] for seed, spread in enumerate(stds)),
    key=lambda pair: pair[1],
)
arr

[[651, 0.00047692136437274736],
 [736, 0.0005969950163190125],
 [305, 0.0007014315463993865],
 [642, 0.0008119308381310349],
 [57, 0.0008184074340428579],
 [829, 0.0008259986714313722],
 [906, 0.0008297984821937578],
 [551, 0.0008300175823733803],
 [634, 0.0008534634261213658],
 [514, 0.0008657029837682894],
 [51, 0.0008837809898818351],
 [70, 0.0009208652724228226],
 [307, 0.0009741679135529846],
 [0, 0.0009757002554474786],
 [417, 0.0010320012870848256],
 [498, 0.0010413633957035938],
 [879, 0.0010745501654207553],
 [178, 0.0010852871085821007],
 [935, 0.0010861811378509241],
 [859, 0.0011261402922476487],
 [508, 0.0011314709276874758],
 [567, 0.001139924840200041],
 [923, 0.0011410765284485492],
 [810, 0.001176888371724143],
 [780, 0.001206059088665997],
 [15, 0.0012224539856763436],
 [761, 0.0012251099660282236],
 [757, 0.001225418678884027],
 [603, 0.0012915942086251788],
 [311, 0.0012999313167543526],
 [958, 0.0013119375454217167],
 [939, 0.0013173936666945843],
 [779, 0.00132409