# Settings

In [1]:
# Seed reused for NumPy's global RNG and for the shuffled K-fold splitter.
SEED = 42
# Set to True when running on Google Colab: enables the Drive mount/copy cell
# and switches the paths and fold count to the Colab configuration.
COLAB = False

# Get the data

## Note
If you are running this notebook on Colab or any other GPU provider, you need to download the dataset yourself.
<br>
We used Google Drive to store the dataset. You can find a few useful commands below.

In [2]:
if COLAB:
    # Colab-only module, imported inside the guard so local runs never need it.
    from google.colab import drive
    # Expose Google Drive under /content/drive.
    drive.mount('/content/drive')

    # Copy the dataset archive from Drive into the working directory and unpack it.
    # These are shell commands, so the backslash escapes the space in "My Drive".
    # NOTE(review): the relative ./drive path presumably relies on the working
    # directory being /content -- confirm.
    !cp ./drive/My\ Drive/data2.tar.gz .
    !tar xzf data2.tar.gz

In [3]:
# Default configuration for a local run on the small sample dataset.
DATA_PATH = '../data/'                              # root folder holding the patient directories
PATIENT_REGEX_STR = r'^\.\./data\/P[0-9]{1,3}$'     # matches a patient directory directly under DATA_PATH
N_FOLDS = 2                                         # number of cross-validation folds
SPLIT_NAME = 'sample_split.csv'                     # file name of the generated split table
SPLIT_DIR = '../split/'                             # directory the split table is written to

if COLAB:
    # Colab configuration: data unpacked next to the notebook, full fold count,
    # and the split table stored on the mounted Google Drive.
    DATA_PATH = './data/'
    PATIENT_REGEX_STR = r'^\./data\/P[0-9]{1,3}$'
    N_FOLDS = 10
    SPLIT_NAME = 'split.csv'
    # This value is handed to Python file APIs, not to a shell, so the space in
    # "My Drive" must NOT be backslash-escaped: Python would keep the backslash
    # and point at a non-existent "My\ Drive" directory.
    SPLIT_DIR = './drive/My Drive/Code/CV/'

# Get the stats about the dataset

In [4]:
import re
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

# Compiled patterns: one for patient directories (full path), one for the
# examination entries inside a patient directory (prefix match on the name).
patient_regex = re.compile(PATIENT_REGEX_STR)
examination_regex = re.compile(r'P[0-9]{1,3}')

# Seed NumPy's global random generator.
np.random.seed(SEED)

# Walk the data tree and, for every directory whose path matches the patient
# pattern, record its name relative to DATA_PATH and how many of its entries
# look like examinations.
# NOTE(review): os.walk does not guarantee a traversal order, so the row order
# below (and anything derived from it) may differ between machines -- confirm
# whether a sorted order is wanted.
patient_names = []
exam_counts = []
for current_dir, _subdirs, _filenames in os.walk(DATA_PATH):
    if not patient_regex.match(current_dir):
        continue

    # Strip the DATA_PATH prefix so only the patient folder name remains.
    patient_names.append(current_dir[len(DATA_PATH):])
    exam_counts.append(
        sum(1 for entry in os.listdir(current_dir) if examination_regex.match(entry))
    )

examination_dirs = np.array(patient_names)
num_examinations = np.array(exam_counts)

# Display the per-patient examination counts.
pd.DataFrame({
    'examination_dirs': examination_dirs,
    'num_examinations': num_examinations,
})

Unnamed: 0,examination_dirs,num_examinations
0,P4,1
1,P3,1
2,P2,1
3,P1,2


In [5]:
# Patient positions ordered by examination count, largest first
# (ascending argsort, reversed).
indexes = np.argsort(num_examinations)[::-1]

# Number of patients with more than one examination (not used further in this cell).
num_many_exam_patients = (num_examinations > 1).sum()

kfold_method = KFold(N_FOLDS, shuffle=True, random_state=SEED)

# The N_FOLDS patients with the most examinations are reserved as one "seed"
# patient per fold; every remaining patient is distributed by the shuffled K-fold.
# Each fold therefore starts with a multi-examination patient only when at
# least N_FOLDS such patients exist.
split_indexes = indexes[N_FOLDS:]

results = []
for fold_no, (_train_part, held_out) in enumerate(kfold_method.split(split_indexes), start=1):
    seed_patient = indexes[N_FOLDS - fold_no]   # fold 1 takes the last reserved patient, fold N the first
    remainder = split_indexes[held_out]         # this fold's share of the non-reserved patients

    patients = [examination_dirs[seed_patient], *examination_dirs[remainder]]
    num_exams = [num_examinations[seed_patient], *num_examinations[remainder]]

    results.append({
        'split': fold_no,
        'patients': ",".join(str(name) for name in patients),
        'num_examinations': ",".join(str(count) for count in num_exams),
        'num_samples': sum(num_exams),
    })

# One row per fold: patients, their examination counts, and the fold's total.
cv_df = pd.DataFrame(results)
cv_df

Unnamed: 0,num_examinations,num_samples,patients,split
0,11,2,"P2,P4",1
1,21,3,"P1,P3",2


In [6]:
# Persist the fold table as UTF-8 CSV without the DataFrame index.
# SPLIT_DIR is expected to end with a path separator, as it is concatenated directly.
split_csv_path = SPLIT_DIR + SPLIT_NAME
cv_df.to_csv(split_csv_path, index=False, encoding='utf-8')