In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the label table from the original BRSET dataset and the table that
# ships with the embeddings archive.
brset_labels = pd.read_csv('labels.csv')
brset_embed = pd.read_csv('embeddings.csv')

# Carry the published split and the binary DR label over to the label table.
# NOTE(review): the columns are aligned on the row index, so this assumes both
# files list the same images in the same order -- TODO confirm.
brset_labels = brset_labels.assign(
    split=brset_embed['split'],
    DR_2=brset_embed['DR_2'],
)

In [3]:
# Measure patient leakage in the published split: the fraction of test-set
# patients that also have at least one image in the train set.
is_train = brset_labels['split'] == 'train'
is_test = brset_labels['split'] == 'test'

train_patient_set = set(brset_labels.loc[is_train, 'patient_id'])
test_patient_set = set(brset_labels.loc[is_test, 'patient_id'])

overlap = train_patient_set & test_patient_set
len(overlap) / len(test_patient_set)

0.8426240652617267

In [4]:
# Fraction of patients whose two eyes carry the same DR_2 label.
# One row per patient, one column per exam_eye value; pivot_table averages
# DR_2 when a patient has several images for the same eye.
# NOTE(review): the left/right column names rely on the sort order of the
# exam_eye values -- TODO confirm against the dataset's coding. The agreement
# rate below does not depend on which column is which.
pivot_df = (
    brset_labels[['patient_id', 'exam_eye', 'DR_2']]
    .pivot_table(index='patient_id', columns='exam_eye', values='DR_2')
    .dropna()  # drop patients that only have an exam for a single eye
    .set_axis(['DR_2_left', 'DR_2_right'], axis=1)
)
pivot_df['DR_2_left'].eq(pivot_df['DR_2_right']).mean()

0.9879138202837625

In [5]:

# Build a patient-level table holding the mean DR_2 label over each patient's
# images, then split at the patient level so no patient spans two subsets.
# Stratifying on the mean of DR_2 tries to distribute the few patients whose
# eyes have differing labels evenly across the subsets.
patient_data = brset_labels.groupby('patient_id')[['DR_2']].mean()
patient_ids = patient_data.reset_index()['patient_id']

# First split: hold out 20% of the patients as the test set.
train_val_patients, test_patients = train_test_split(
    patient_ids,
    test_size=0.2,
    random_state=42,
    stratify=patient_data['DR_2'],
)

# Second split: carve a validation set out of the remainder, sized as the same
# share of the remaining patients that the test set represents.
val_fraction = len(test_patients) / len(train_val_patients)
train_patients, val_patients = train_test_split(
    train_val_patients,
    test_size=val_fraction,
    random_state=42,
    stratify=patient_data.loc[train_val_patients, 'DR_2'],
)

train_patients, val_patients, test_patients = (
    set(train_patients), set(val_patients), set(test_patients)
)
print(len(train_patients), len(val_patients), len(test_patients))

# Sanity check: share of test patients that also appear in the train set.
overlap = train_patients & test_patients
print(len(overlap)/len(test_patients))

5113 1706 1705
0.0


In [6]:
# Per-image split table: the split copied from the label table is kept as
# `embeddings_split`, next to the new patient-level `split`.
split_df = brset_labels[['image_id', 'patient_id']].assign(
    embeddings_split=brset_labels['split'],
    split='train',  # default; overridden below for val/test patients
)

# Applied in order, so 'test' is written last and would take precedence if a
# patient id were present in both sets.
for split_name, patients in (('val', val_patients), ('test', test_patients)):
    split_df.loc[split_df['patient_id'].isin(patients), 'split'] = split_name

In [7]:
# Write the per-image split assignments to disk without the DataFrame index.
split_df.to_csv(path_or_buf='split.csv', index=False)